diff --git "a/big_vision/configs/proj/image_text/SigLIP_demo.ipynb" "b/big_vision/configs/proj/image_text/SigLIP_demo.ipynb"
new file mode 100644--- /dev/null
+++ "b/big_vision/configs/proj/image_text/SigLIP_demo.ipynb"
@@ -0,0 +1,1022 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "gpuType": "T4"
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "accelerator": "GPU"
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# General information\n",
+ "\n",
+ "Example colab for SigLIP models described in [the SigLIP paper](https://arxiv.org/abs/2303.15343).\n",
+ "\n",
+ "**These models are not official Google products and were trained and released for research purposes.**\n",
+ "\n",
+ "If you find our model(s) useful for your research, consider citing\n",
+ "\n",
+ "```\n",
+ "@article{zhai2023sigmoid,\n",
+ " title={Sigmoid loss for language image pre-training},\n",
+ " author={Zhai, Xiaohua and Mustafa, Basil and Kolesnikov, Alexander and Beyer, Lucas},\n",
+ " journal={International Conference on Computer Vision ({ICCV})},\n",
+ " year={2023}\n",
+ "}\n",
+ "```\n",
+ "\n",
+ "If you use our released models in your products, we would appreciate any direct feedback. You can reach us at xzhai@google.com, basilm@google.com, akolesnikov@google.com and lbeyer@google.com.\n",
+ "\n",
+ "\n",
+ "Only the models explicitly marked with `i18n` in the name are expected to perform reasonably well on non-English data."
+ ],
+ "metadata": {
+ "id": "wR53lePHuiP-"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@markdown # Environment setup\n",
+ "#@markdown **IMPORTANT NOTE**: Modern jax (>0.4) does not support the Colab TPU\n",
+ "#@markdown anymore, so don't select TPU runtime here. CPU and GPU work and are both fast enough.\n",
+ "\n",
+ "# Refuse to run on the Colab TPU runtime; on a GPU runtime, print the device status.\n",
+ "import os\n",
+ "if 'COLAB_TPU_ADDR' in os.environ:\n",
+ "  # Raise a proper exception: raising a bare string is itself a TypeError in\n",
+ "  # Python 3 and would hide the message below.\n",
+ "  raise RuntimeError(\"TPU colab not supported.\")\n",
+ "elif 'NVIDIA_PRODUCT_NAME' in os.environ:\n",
+ "  !nvidia-smi\n",
+ "import jax\n",
+ "jax.devices()\n",
+ "\n",
+ "\n",
+ "# Get latest version of big_vision codebase.\n",
+ "!git clone --quiet --branch=main --depth=1 https://github.com/google-research/big_vision\n",
+ "!cd big_vision && git pull --rebase --quiet\n",
+ "!pip -q install -r big_vision/big_vision/requirements.txt\n",
+ "# Gives us ~2x faster gsutil cp to get the model checkpoints.\n",
+ "!pip3 -q install --no-cache-dir -U crcmod\n",
+ "\n",
+ "# The cells below import `big_vision.*` relative to the repository root.\n",
+ "%cd big_vision\n",
+ "\n",
+ "\n",
+ "import numpy as np\n",
+ "import matplotlib as mpl\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "%matplotlib inline\n",
+ "%config InlineBackend.figure_format = 'retina'\n",
+ "\n",
+ "import jax.numpy as jnp\n",
+ "import ml_collections\n",
+ "\n",
+ "from google.colab.output import _publish as publish"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "kXSdSXVg2PAI",
+ "outputId": "ba908946-0cd3-4468-9034-cd108529986f",
+ "cellView": "form"
+ },
+ "execution_count": 1,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Thu Sep 28 09:08:47 2023 \n",
+ "+-----------------------------------------------------------------------------+\n",
+ "| NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 |\n",
+ "|-------------------------------+----------------------+----------------------+\n",
+ "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
+ "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
+ "| | | MIG M. |\n",
+ "|===============================+======================+======================|\n",
+ "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
+ "| N/A 75C P8 14W / 70W | 0MiB / 15360MiB | 0% Default |\n",
+ "| | | N/A |\n",
+ "+-------------------------------+----------------------+----------------------+\n",
+ " \n",
+ "+-----------------------------------------------------------------------------+\n",
+ "| Processes: |\n",
+ "| GPU GI CI PID Type Process name GPU Memory |\n",
+ "| ID ID Usage |\n",
+ "|=============================================================================|\n",
+ "| No running processes found |\n",
+ "+-----------------------------------------------------------------------------+\n",
+ "fatal: destination path 'big_vision' already exists and is not an empty directory.\n",
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
+ " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
+ " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "/content/big_vision\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Choose and load model, perform inference"
+ ],
+ "metadata": {
+ "id": "byHpmgAO6inM"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Pick your hero: (WHEN CHANGING THIS, RERUN IMAGE/TEXT EMBEDDING CELLS)\n",
+ "# Give this cell 1-3mins.\n",
+ "\n",
+ "# VARIANT, RES = 'B/16', 224\n",
+ "# VARIANT, RES = 'B/16', 256\n",
+ "# VARIANT, RES = 'B/16', 384\n",
+ "# VARIANT, RES = 'B/16', 512\n",
+ "# VARIANT, RES = 'L/16', 256\n",
+ "VARIANT, RES = 'L/16', 384\n",
+ "# VARIANT, RES = 'So400m/14', 224\n",
+ "# VARIANT, RES = 'So400m/14', 384\n",
+ "# VARIANT, RES = 'B/16-i18n', 256\n",
+ "# VARIANT, RES = 'So400m/16-i18n', 256\n",
+ "\n",
+ "# (variant, resolution) -> (checkpoint file, text tower variant, embedding dim,\n",
+ "# text sequence length, vocabulary size).\n",
+ "CKPT, TXTVARIANT, EMBDIM, SEQLEN, VOCAB = {\n",
+ "  ('B/16', 224): ('webli_en_b16_224_63724782.npz', 'B', 768, 64, 32_000),\n",
+ "  ('B/16', 256): ('webli_en_b16_256_60500360.npz', 'B', 768, 64, 32_000),\n",
+ "  ('B/16', 384): ('webli_en_b16_384_68578854.npz', 'B', 768, 64, 32_000),\n",
+ "  ('B/16', 512): ('webli_en_b16_512_68580893.npz', 'B', 768, 64, 32_000),\n",
+ "  ('L/16', 256): ('webli_en_l16_256_60552751.npz', 'L', 1024, 64, 32_000),\n",
+ "  ('L/16', 384): ('webli_en_l16_384_63634585.npz', 'L', 1024, 64, 32_000),\n",
+ "  ('So400m/14', 224): ('webli_en_so400m_224_57633886.npz', 'So400m', 1152, 16, 32_000),\n",
+ "  ('So400m/14', 384): ('webli_en_so400m_384_58765454.npz', 'So400m', 1152, 64, 32_000),\n",
+ "  ('B/16-i18n', 256): ('webli_i18n_b16_256_66117334.npz', 'B', 768, 64, 250_000),\n",
+ "  ('So400m/16-i18n', 256): ('webli_i18n_so400m_16_256_78061115.npz', 'So400m', 1152, 64, 250_000),\n",
+ "  # Legacy spelling of the entry above, kept so existing selections keep working.\n",
+ "  ('So400m/16', 256): ('webli_i18n_so400m_16_256_78061115.npz', 'So400m', 1152, 64, 250_000),\n",
+ "}[VARIANT, RES]\n",
+ "\n",
+ "# It is significantly faster to first copy the checkpoint (30s vs 8m30 for B and 1m vs ??? for L)\n",
+ "!test -f /tmp/{CKPT} || gsutil cp gs://big_vision/siglip/{CKPT} /tmp/\n",
+ "\n",
+ "# The `-i18n` suffix only selects the checkpoint and vocabulary above; the image\n",
+ "# tower below is configured with the bare variant name.\n",
+ "if VARIANT.endswith('-i18n'):\n",
+ "  VARIANT = VARIANT[:-len('-i18n')]\n",
+ "\n",
+ "import big_vision.models.proj.image_text.two_towers as model_mod\n",
+ "\n",
+ "model_cfg = ml_collections.ConfigDict()\n",
+ "model_cfg.image_model = 'vit'  # TODO(lbeyer): remove later, default\n",
+ "model_cfg.text_model = 'proj.image_text.text_transformer'  # TODO(lbeyer): remove later, default\n",
+ "model_cfg.image = dict(variant=VARIANT, pool_type='map')\n",
+ "model_cfg.text = dict(variant=TXTVARIANT, vocab_size=VOCAB)\n",
+ "model_cfg.out_dim = (None, EMBDIM)  # (image_out_dim, text_out_dim)\n",
+ "model_cfg.bias_init = -10.0\n",
+ "model_cfg.temperature_init = 10.0\n",
+ "\n",
+ "model = model_mod.Model(**model_cfg)\n",
+ "\n",
+ "# Using `init_params` is slower but will lead to `load` below performing sanity-checks.\n",
+ "# init_params = jax.jit(model.init, backend=\"cpu\")(jax.random.PRNGKey(42), jnp.zeros([1, RES, RES, 3], jnp.float32), jnp.zeros([1, SEQLEN], jnp.int32))['params']\n",
+ "init_params = None  # Faster but bypasses loading sanity-checks.\n",
+ "\n",
+ "params = model_mod.load(init_params, f'/tmp/{CKPT}', model_cfg)"
+ ],
+ "metadata": {
+ "id": "0DsOabGD7MRG",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "5afc9f52-7eb4-4a0d-b681-3ab5945ce9b4"
+ },
+ "execution_count": 2,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Copying gs://big_vision/siglip/webli_i18n_b16_256_66117334.npz...\n",
+ "- [1 files][ 1.3 GiB/ 1.3 GiB] 45.3 MiB/s \n",
+ "Operation completed over 1 objects/1.3 GiB. \n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Load and embed images\n",
+ "\n",
+ "import big_vision.pp.builder as pp_builder\n",
+ "import big_vision.pp.ops_general\n",
+ "import big_vision.pp.ops_image\n",
+ "import big_vision.pp.ops_text\n",
+ "# Import the submodule explicitly: a bare `import PIL` does not guarantee that\n",
+ "# `PIL.Image` has been loaded.\n",
+ "import PIL.Image\n",
+ "\n",
+ "!wget -q https://cdn.openai.com/multimodal-neurons/assets/apple/apple-ipod.jpg\n",
+ "!wget -q https://cdn.openai.com/multimodal-neurons/assets/apple/apple-blank.jpg\n",
+ "!wget -q 'https://images.unsplash.com/photo-1566467021888-b03548769dd1?ixlib=rb-4.0.3&q=85&fm=jpg&crop=entropy&cs=srgb&dl=svetlana-gumerova-hQHm2D1fH70-unsplash.jpg&w=640' -O cold_drink.jpg\n",
+ "!wget -q 'https://images.rawpixel.com/image_1300/czNmcy1wcml2YXRlL3Jhd3BpeGVsX2ltYWdlcy93ZWJzaXRlX2NvbnRlbnQvbHIvdXB3azU4ODU5NzY1LXdpa2ltZWRpYS1pbWFnZS1rb3diMmhkeC5qcGc.jpg' -O hot_drink.jpg\n",
+ "!wget -q https://storage.googleapis.com/big_vision/siglip/authors.jpg\n",
+ "!wget -q https://storage.googleapis.com/big_vision/siglip/siglip.jpg\n",
+ "!wget -q https://storage.googleapis.com/big_vision/siglip/caffeine.jpg\n",
+ "!wget -q https://storage.googleapis.com/big_vision/siglip/robosign.jpg\n",
+ "!wget -q https://storage.googleapis.com/big_vision/siglip/fried_fish.jpeg\n",
+ "!wget -q 'https://pbs.twimg.com/media/FTyEyxyXsAAyKPc?format=jpg&name=small' -O cow_beach.jpg\n",
+ "!wget -q 'https://storage.googleapis.com/big_vision/siglip/cow_beach2.jpg' -O cow_beach2.jpg\n",
+ "!wget -q 'https://pbs.twimg.com/media/Frb6NIEXwAA8-fI?format=jpg&name=medium' -O mountain_view.jpg\n",
+ "\n",
+ "\n",
+ "images = [PIL.Image.open(fname) for fname in [\n",
+ "    'apple-ipod.jpg',\n",
+ "    'apple-blank.jpg',\n",
+ "    'cold_drink.jpg',\n",
+ "    'hot_drink.jpg',\n",
+ "    'caffeine.jpg',\n",
+ "    'siglip.jpg',\n",
+ "    'authors.jpg',\n",
+ "    'robosign.jpg',\n",
+ "    'cow_beach.jpg',\n",
+ "    'cow_beach2.jpg',\n",
+ "    'mountain_view.jpg',\n",
+ "]]\n",
+ "\n",
+ "# Resize every image to RES and rescale pixel values to [-1, 1] before stacking.\n",
+ "pp_img = pp_builder.get_preprocess_fn(f'resize({RES})|value_range(-1, 1)')\n",
+ "imgs = np.array([pp_img({'image': np.array(image)})['image'] for image in images])\n",
+ "# Only images are passed (text input is None), so the second return value is discarded.\n",
+ "zimg, _, out = model.apply({'params': params}, imgs, None)\n",
+ "\n",
+ "print(imgs.shape, zimg.shape)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "xmuXfCfBjgeF",
+ "outputId": "3627819b-007e-4107-e1f4-06b7ad3ac03a"
+ },
+ "execution_count": 10,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "(11, 384, 384, 3) (11, 1024)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Tokenize and embed texts\n",
+ "\n",
+ "texts = [\n",
+ " 'an apple',\n",
+ " 'a picture of an apple',\n",
+ " 'an ipod',\n",
+ " 'granny smith',\n",
+ " 'an apple with a note saying \"ipod\"',\n",
+ " 'a cold drink on a hot day',\n",
+ " 'a hot drink on a cold day',\n",
+ " 'a photo of a cold drink on a hot day',\n",
+ " 'a photo of a hot drink on a cold day',\n",
+ " #\n",
+ " 'a photo of two guys in need of caffeine',\n",
+ " 'a photo of two guys in need of water',\n",
+ " 'a photo of the SigLIP authors',\n",
+ " 'a photo of a rock band',\n",
+ " 'a photo of researchers at Google Brain',\n",
+ " 'a photo of researchers at OpenAI',\n",
+ " #\n",
+ " 'a robot on a sign',\n",
+ " 'a photo of a robot on a sign',\n",
+ " 'an empty street',\n",
+ " 'autumn in Toronto',\n",
+ " 'a photo of autumn in Toronto',\n",
+ " 'a photo of Toronto in autumn',\n",
+ " 'a photo of Toronto in summer',\n",
+ " 'autumn in Singapore',\n",
+ " #\n",
+ " 'cow',\n",
+ " 'a cow in a tuxedo',\n",
+ " 'a cow on the beach',\n",
+ " 'a cow in the prairie',\n",
+ " #\n",
+ " 'the real mountain view',\n",
+ " 'Zürich',\n",
+ " 'San Francisco',\n",
+ " 'a picture of a laptop with the lockscreen on, a cup of cappucino, salt and pepper grinders. The view through the window reveals lake Zürich and the Alps in the background of the city.',\n",
+ "]\n",
+ "\n",
+ "TOKENIZERS = {\n",
+ " 32_000: 'c4_en',\n",
+ " 250_000: 'mc4',\n",
+ "}\n",
+ "pp_txt = pp_builder.get_preprocess_fn(f'tokenize(max_len={SEQLEN}, model=\"{TOKENIZERS[VOCAB]}\", eos=\"sticky\", pad_value=1, inkey=\"text\")')\n",
+ "txts = np.array([pp_txt({'text': text})['labels'] for text in texts])\n",
+ "_, ztxt, out = model.apply({'params': params}, None, txts)\n",
+ "\n",
+ "print(txts.shape, ztxt.shape)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "KGrpkRTtjU-L",
+ "outputId": "7c43b56e-cd53-4801-b1e3-66774368a1d2"
+ },
+ "execution_count": 11,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "(31, 64) (31, 1024)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Every image/text pair gets its own sigmoid probability from the scaled, shifted similarity.\n",
+ "print(f\"Learned temperature {out['t'].item():.1f}, learned bias: {out['b'].item():.1f}\")\n",
+ "probs = jax.nn.sigmoid((zimg @ ztxt.T) * out['t'] + out['b'])\n",
+ "# Show the first two texts for image 0, one per line.\n",
+ "print(*[f\"{probs[0][i]:.1%} that image 0 is '{texts[i]}'\" for i in (0, 1)], sep='\\n')"
+ ],
+ "metadata": {
+ "id": "TIdAVw9VGEAw",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "22fc0d9a-8986-4679-ca89-6e4330a55c6e"
+ },
+ "execution_count": 12,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Learned temperature 118.2, learned bias: -12.7\n",
+ "10.4% that image 0 is 'an apple'\n",
+ "42.8% that image 0 is 'a picture of an apple'\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# @title Pretty demo (code)\n",
+ "from IPython.display import Javascript\n",
+ "\n",
+ "DEMO_IMG_SIZE = 96\n",
+ "\n",
+ "import base64\n",
+ "import io\n",
+ "\n",
+ "def bv2rgb(bv_img):\n",
+ "  \"\"\"Rescales an array via `x * 127.5 + 127.5` and casts it to uint8.\"\"\"\n",
+ "  rescaled = bv_img * 127.5 + 127.5\n",
+ "  return rescaled.astype(np.uint8)\n",
+ "\n",
+ "def html_img(*, enc_img=None, pixels=None, id=None, size=100, max_size=None, max_height=None, style=\"\"):\n",
+ " if enc_img is None and pixels is not None:\n",
+ " with io.BytesIO() as buf:\n",
+ " PIL.Image.fromarray(np.asarray(pixels)).save(buf, format=\"JPEG\")\n",
+ " enc_img = buf.getvalue()\n",
+ "\n",
+ " img_data = base64.b64encode(np.ascontiguousarray(enc_img)).decode('ascii')\n",
+ "\n",
+ " id_spec = f'id={id}' if id else ''\n",
+ " if size is not None:\n",
+ " style_spec = f'style=\"{style}; width: {size}px; height: {size}px\"'\n",
+ " elif max_size is not None:\n",
+ " style_spec = f'style=\"{style}; width: auto; height: auto; max-width: {max_size}px; max-height: {max_size}px;\"'\n",
+ " elif max_height is not None:\n",
+ " style_spec = f'style=\"{style}; object-fit: cover; width: auto; height: {max_height}px;\"'\n",
+ " else: style_spec = ''\n",
+ "\n",
+ " return f'
'\n",
+ "\n",
+ "\n",
+ "def make_table(zimg, ztxt, out):\n",
+ " # The default learnable bias is a little conservative. Play around with it!\n",
+ " t, b = out['t'].item(), out['b'].item()\n",
+ " tempered_logits = zimg @ ztxt.T * t\n",
+ " probs = 1 / (1 + np.exp(-tempered_logits - b))\n",
+ " publish.javascript(f\"var logits = {tempered_logits.tolist()};\")\n",
+ "\n",
+ " def color(p):\n",
+ " return mpl.colors.rgb2hex(mpl.cm.Greens(p / 2)) if p >= 0.01 else \"transparent\"\n",
+ "\n",
+ " publish.javascript(f\"var cmap = {[color(x) for x in np.linspace(0, 1, 50)]};\")\n",
+ " def cell(x, iimg, itxt):\n",
+ " return f\"
{x * 100:>4.0f}% \"\n",
+ "\n",
+ " html = f'''\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " '''\n",
+ "\n",
+ " html += \"\\n\"\n",
+ " html += \"\"\n",
+ " html += \"\".join([f\"\" + html_img(pixels=bv2rgb(img), size=DEMO_IMG_SIZE) for img in imgs])\n",
+ " html += \" | \"\n",
+ " for itxt, txt in enumerate(texts):\n",
+ " html += f\" | \" + \"\".join([cell(probs[iimg, itxt], iimg, itxt) for iimg in range(len(imgs))]) + f\"{txt}\"\n",
+ "\n",
+ " publish.css(r\"\"\"\n",
+ " table {\n",
+ " border-collapse: collapse;\n",
+ " }\n",
+ "\n",
+ " tr {\n",
+ " border: 1px transparent;\n",
+ " }\n",
+ "\n",
+ " tr:nth-child(odd) {\n",
+ " background-color: #F5F5F5;\n",
+ " }\n",
+ "\n",
+ " tr:hover {\n",
+ " background-color: lightyellow;\n",
+ " border: 1px solid black;\n",
+ " }\n",
+ "\n",
+ " td.pct {\n",
+ " text-align: center;\n",
+ " }\n",
+ " \"\"\")\n",
+ " publish.html(html)\n",
+ "\n",
+ " # JS code to compute and write all probs from the logits.\n",
+ " display(Javascript('''\n",
+ " function update(b) {\n",
+ " for(var iimg = 0; iimg < logits.length; iimg++) {\n",
+ " for(var itxt = 0; itxt < logits[iimg].length; itxt++) {\n",
+ " const el = document.getElementById(`p_${iimg}_${itxt}`);\n",
+ " const p = Math.round(100 / (1 + Math.exp(-logits[iimg][itxt] - b)));\n",
+ " const pad = p < 10.0 ? ' ' : p < 100.0 ? ' ' : ''\n",
+ " el.innerHTML = pad + (p).toFixed(0) + '%';\n",
+ "\n",
+ " const td = document.getElementById(`td_${iimg}_${itxt}`);\n",
+ " const c = cmap[Math.round(p / 100 * (cmap.length - 1))];\n",
+ " td.style.backgroundColor = c;\n",
+ " }\n",
+ " }\n",
+ " }\n",
+ " '''))\n",
+ "\n",
+ " # JS code to connect the bias value slider\n",
+ " display(Javascript('''\n",
+ " const value = document.querySelector(\"#value\");\n",
+ " const input = document.querySelector(\"#b\");\n",
+ " value.textContent = input.value;\n",
+ " input.addEventListener(\"input\", (event) => {\n",
+ " value.textContent = event.target.value;\n",
+ " update(event.target.value);\n",
+ " });\n",
+ " '''))\n",
+ "\n",
+ " # Make the cell output as large as the table to avoid annoying scrollbars.\n",
+ " display(Javascript(f'update({b})'))\n",
+ " display(Javascript('google.colab.output.resizeIframeToContent()'))"
+ ],
+ "metadata": {
+ "cellView": "form",
+ "id": "eolOc7vd_ZSj"
+ },
+ "execution_count": 7,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "make_table(zimg, ztxt, out)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 767
+ },
+ "id": "mt5BIywzzA6c",
+ "outputId": "3b06cfb9-a3da-42d7-8caf-d5366d058f8b"
+ },
+ "execution_count": 14,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "application/javascript": [
+ "var logits = [[10.509522438049316, 12.372017860412598, 13.07434368133545, 9.578202247619629, 21.19094467163086, 1.310517430305481, 1.2763848304748535, 3.0990359783172607, 2.360225200653076, -3.670855760574341, -4.780072212219238, -1.4530967473983765, -3.3108861446380615, -3.8945610523223877, -4.378420829772949, 0.35140618681907654, 2.7228779792785645, -6.806382656097412, -3.9012961387634277, -1.7843879461288452, -4.578653812408447, -7.306142807006836, -1.253274917602539, -1.8402824401855469, -6.329799175262451, -9.506726264953613, -5.78713846206665, -1.6370103359222412, -9.404793739318848, -4.342881202697754, -13.128281593322754], [12.365941047668457, 13.45022964477539, 0.9843839406967163, 12.809731483459473, 6.767915725708008, 2.808335304260254, 1.050551414489746, 3.6161491870880127, 1.152547001838684, -7.214369297027588, -5.146897792816162, -6.283102035522461, -11.463550567626953, -7.751645565032959, -11.252680778503418, -9.319047927856445, -8.11094856262207, -8.898587226867676, -2.15217661857605, -0.10237424820661545, -3.6214966773986816, -12.085700035095215, -1.599789023399353, -1.7422595024108887, -7.456813335418701, -8.457598686218262, -5.5325212478637695, -2.4997880458831787, -8.217476844787598, -8.986675262451172, -10.336335182189941], [-1.1052173376083374, -1.3570473194122314, -3.8713269233703613, 2.3654367923736572, -9.037796020507812, 11.620930671691895, 2.1417031288146973, 13.036051750183105, -0.11228565871715546, 0.33224615454673767, 3.9813454151153564, -6.005640029907227, -5.856462001800537, -7.669452667236328, -9.974565505981445, -11.242084503173828, -12.130292892456055, -5.630223274230957, -5.570030689239502, -6.117311000823975, -7.32966423034668, -5.952571392059326, 0.4303727447986603, -0.5507297515869141, -7.554576873779297, -3.3274905681610107, -3.4397053718566895, 0.9088093638420105, -4.845495700836182, -7.663942337036133, -10.328642845153809], [1.1323682069778442, 1.3157405853271484, 0.828519880771637, -1.6223008632659912, 
-7.967062950134277, 4.090002059936523, 14.007913589477539, 6.785359859466553, 16.369604110717773, 1.524818778038025, -4.911859035491943, -9.018620491027832, -9.306066513061523, -8.402979850769043, -11.57016658782959, -9.890503883361816, -10.68331527709961, -5.442021369934082, 4.999141216278076, 5.106411933898926, 4.015860080718994, -12.08991527557373, 6.171087741851807, -1.0262863636016846, -8.962656021118164, -6.404715538024902, -4.912563323974609, -2.5522496700286865, -6.039242744445801, -10.613517761230469, -6.997122287750244], [-3.4062156677246094, -3.2604005336761475, -4.109685897827148, -4.58593225479126, -9.489058494567871, 1.6483688354492188, 2.376404047012329, 0.7108156681060791, 0.5808579921722412, 17.98756980895996, 9.364227294921875, 1.8207945823669434, -6.545583724975586, 3.3331942558288574, 2.5704448223114014, -7.702937602996826, -9.870623588562012, -1.303507924079895, -5.957301616668701, -6.226568222045898, -6.917541980743408, -7.621560573577881, -0.5124773979187012, -2.2896718978881836, -12.721405029296875, -6.885163307189941, -9.90884780883789, -1.4125298261642456, 2.3772332668304443, -5.4370293617248535, -1.6405099630355835], [-3.2013378143310547, -3.3440065383911133, -1.2165169715881348, -4.172476291656494, -5.278318881988525, -2.3818702697753906, -3.210822582244873, -3.580622911453247, -5.1373138427734375, -1.7848750352859497, -1.4050911664962769, 16.463136672973633, -1.4766411781311035, 16.46843147277832, 11.259382247924805, -1.0086976289749146, -1.908290982246399, -4.666292667388916, -2.9601247310638428, -2.0503976345062256, -1.600439190864563, -1.4223682880401611, -2.251126289367676, -4.444605827331543, -9.10830020904541, -10.853714942932129, -11.52085018157959, -1.6640691757202148, 2.193969964981079, 2.127061367034912, -4.728240013122559], [-0.5153040289878845, -1.290441632270813, -1.3887863159179688, -2.88513445854187, -8.828889846801758, 1.3482768535614014, 0.010438825935125351, -0.6988681554794312, -2.9927048683166504, 2.8313045501708984, 
2.5383071899414062, 6.094320297241211, -1.2357840538024902, 19.095901489257812, 12.049205780029297, -2.1667087078094482, -3.2871627807617188, -4.000303268432617, -2.7362473011016846, -1.7782089710235596, -1.643406629562378, -4.0933918952941895, -2.1210238933563232, -3.1019272804260254, -8.912919998168945, -8.04006290435791, -10.427931785583496, 0.8204227089881897, -1.7909467220306396, -0.8497583270072937, -5.065787315368652], [-1.4752472639083862, -0.13337232172489166, 1.7657679319381714, -2.7154576778411865, -2.644958257675171, -1.401767373085022, 0.21228086948394775, -0.5131799578666687, 1.4820858240127563, -2.5781843662261963, 3.075222969055176, -2.9382081031799316, -7.704923152923584, -3.6199238300323486, -3.213698625564575, 10.677529335021973, 12.515663146972656, 3.690605401992798, 10.979350090026855, 12.963836669921875, 11.986873626708984, 4.023745059967041, 0.9576215744018555, -4.142323970794678, -7.46238374710083, -9.735015869140625, -8.231826782226562, -1.0106267929077148, -2.2898473739624023, -2.2792820930480957, -6.5174055099487305], [-0.3335295617580414, 1.2584013938903809, -1.2919337749481201, -2.0686888694763184, -11.050207138061523, 5.148484706878662, 0.46310505270957947, 4.050027847290039, -1.6178984642028809, -6.791775703430176, -2.2926063537597656, -7.568892002105713, -10.240560531616211, -7.8912248611450195, -11.374415397644043, -7.808314323425293, -7.384036540985107, -5.577442646026611, -4.582977771759033, -4.019510746002197, -5.569993019104004, -2.2238216400146484, -0.21682055294513702, 12.080615043640137, 6.551390647888184, 17.416383743286133, 8.308161735534668, -0.3994586169719696, -1.8691462278366089, -2.187755823135376, -4.866983413696289], [-2.294813394546509, -1.4864670038223267, -1.4635752439498901, -2.9900710582733154, -14.971826553344727, 4.747520446777344, -0.9042328000068665, 3.1032114028930664, -3.679764747619629, -5.160387992858887, -1.1286523342132568, -7.035560607910156, -6.664344787597656, -7.769715309143066, -10.94699478149414, 
-6.526098251342773, -6.273430347442627, -6.723901271820068, -5.448723316192627, -5.721604824066162, -7.575157165527344, -4.370161056518555, -1.393196702003479, 11.913715362548828, 17.861845016479492, 15.086359024047852, 6.581197261810303, -0.31534600257873535, -2.1320040225982666, -4.305175304412842, -7.700469970703125], [-2.552478790283203, -1.305349349975586, 0.03923465311527252, -5.891383647918701, -7.833784580230713, 1.2974026203155518, 5.689708709716797, 2.8017938137054443, 7.800131320953369, -0.12797383964061737, -4.34028434753418, -4.815661430358887, -8.476018905639648, -1.2871994972229004, -1.1152652502059937, -6.992332458496094, -7.258864402770996, 0.09565334022045135, -6.82894229888916, -5.026597023010254, -3.2372162342071533, -7.9831085205078125, -3.8290252685546875, -0.595430850982666, -5.086977005004883, -4.143807888031006, -5.033395290374756, 4.200597763061523, 6.196822166442871, -4.807774066925049, 23.876855850219727]];\n",
+ "//# sourceURL=js_5e545691b3"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "application/javascript": [
+ "var cmap = ['transparent', '#f6fcf4', '#f4fbf2', '#f3faf0', '#f1faee', '#f0f9ec', '#eff9eb', '#edf8e9', '#ecf8e8', '#eaf7e6', '#e8f6e4', '#e7f6e3', '#e5f5e1', '#e4f5df', '#e1f3dc', '#def2d9', '#dcf2d7', '#daf0d4', '#d7efd1', '#d5efcf', '#d2edcc', '#d0edca', '#cdecc7', '#cbeac4', '#c9eac2', '#c6e8bf', '#c3e7bc', '#c0e6b9', '#bce4b5', '#bae3b3', '#b6e2af', '#b4e1ad', '#b0dfaa', '#acdea6', '#aadda4', '#a7dba0', '#a3da9d', '#a0d99b', '#9cd797', '#99d595', '#95d391', '#91d28e', '#8ed08b', '#8ace88', '#87cd86', '#83cb82', '#7fc97f', '#7cc87c', '#78c679', '#73c476'];\n",
+ "//# sourceURL=js_b212ab59e1"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "text/html": [
+ ""
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " |  |  |  |  |  |  |  |  |  |  | | 10% | 43% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | an apple | 43% | 69% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | a picture of an apple | 60% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | an ipod | 4% | 54% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | granny smith | 100% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | an apple with a note saying \"ipod\" | 0% | 0% | 26% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | a cold drink on a hot day | 0% | 0% | 0% | 79% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | a hot drink on a cold day | 0% | 0% | 59% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | a photo of a cold drink on a hot day | 0% | 0% | 0% | 98% | 0% | 0% | 0% | 0% | 0% | 0% | 1% | a photo of a hot drink on a cold day | 0% | 0% | 0% | 0% | 100% | 0% | 0% | 0% | 0% | 0% | 0% | a photo of two guys in need of caffeine | 0% | 0% | 0% | 0% | 4% | 0% | 0% | 0% | 0% | 0% | 0% | a photo of two guys in need of water | 0% | 0% | 0% | 0% | 0% | 98% | 0% | 0% | 0% | 0% | 0% | a photo of the SigLIP authors | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | a photo of a rock band | 0% | 0% | 0% | 0% | 0% | 98% | 100% | 0% | 0% | 0% | 0% | a photo of researchers at Google Brain | 0% | 0% | 0% | 0% | 0% | 20% | 35% | 0% | 0% | 0% | 0% | a photo of researchers at OpenAI | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 12% | 0% | 0% | 0% | a robot on a sign | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 46% | 0% | 0% | 0% | a photo of a robot on a sign | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | an empty street | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 16% | 0% | 0% | 0% | autumn in Toronto | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 57% | 0% | 0% | 0% | a photo of autumn in Toronto | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 34% | 0% | 0% | 0% | a photo of Toronto in autumn | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | a photo of Toronto in summer | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | autumn in Singapore | 0% | 0% | 0% | 0% | 
0% | 0% | 0% | 0% | 36% | 32% | 0% | cow | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 99% | 0% | a cow in a tuxedo | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 99% | 92% | 0% | a cow on the beach | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 1% | 0% | 0% | a cow in the prairie | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | the real mountain view | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | Zürich | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | San Francisco | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 100% | a picture of a laptop with the lockscreen on, a cup of cappucino, salt and pepper grinders. The view through the window reveals lake Zürich and the Alps in the background of the city."
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "application/javascript": [
+ "\n",
+ " function update(b) {\n",
+ " for(var iimg = 0; iimg < logits.length; iimg++) {\n",
+ " for(var itxt = 0; itxt < logits[iimg].length; itxt++) {\n",
+ " const el = document.getElementById(`p_${iimg}_${itxt}`);\n",
+ " const p = Math.round(100 / (1 + Math.exp(-logits[iimg][itxt] - b)));\n",
+ " const pad = p < 10.0 ? ' ' : p < 100.0 ? ' ' : ''\n",
+ " el.innerHTML = pad + (p).toFixed(0) + '%';\n",
+ "\n",
+ " const td = document.getElementById(`td_${iimg}_${itxt}`);\n",
+ " const c = cmap[Math.round(p / 100 * (cmap.length - 1))];\n",
+ " td.style.backgroundColor = c;\n",
+ " }\n",
+ " }\n",
+ " }\n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "application/javascript": [
+ "\n",
+ " const value = document.querySelector(\"#value\");\n",
+ " const input = document.querySelector(\"#b\");\n",
+ " value.textContent = input.value;\n",
+ " input.addEventListener(\"input\", (event) => {\n",
+ " value.textContent = event.target.value;\n",
+ " update(event.target.value);\n",
+ " });\n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "application/javascript": [
+ "update(-12.661874771118164)"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "application/javascript": [
+ "google.colab.output.resizeIframeToContent()"
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# More international examples (choose i18n model for this)"
+ ],
+ "metadata": {
+ "id": "f5lIiaD700UK"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Load and embed images\n",
+ "\n",
+ "import big_vision.pp.builder as pp_builder\n",
+ "import big_vision.pp.ops_general\n",
+ "import big_vision.pp.ops_image\n",
+ "import big_vision.pp.ops_text\n",
+ "# Import the submodule explicitly: a bare `import PIL` does not guarantee that\n",
+ "# `PIL.Image` has been loaded.\n",
+ "import PIL.Image\n",
+ "\n",
+ "!wget -q 'https://live.staticflickr.com/4152/5189547658_3b2a7126cb_b.jpg' -O ants_climbing_a_tree_food.jpg\n",
+ "!wget -q 'https://storage.googleapis.com/big_vision/siglip/pexels-poranimm-athithawatthee-842401.jpg' -O ants_climbing_tree.jpg\n",
+ "!wget -q 'https://images.rawpixel.com/image_1300/cHJpdmF0ZS9zdGF0aWMvaW1hZ2Uvd2Vic2l0ZS8yMDIyLTA0L2xyL3B4OTE3NDYyLWltYWdlLWt3eW8ydmxrLmpwZw.jpg' -O lion_head.jpg\n",
+ "!wget -q 'https://images.rawpixel.com/image_1300/cHJpdmF0ZS9sci9pbWFnZXMvd2Vic2l0ZS8yMDIzLTA5L3Jhd3BpeGVsX29mZmljZV8yN19taW5pbWFsX3NpbXBsZV9fbGlvbl9fcGFwZXJfY29sbGFnZV9taW5pbWFsX183OGRlOGU3OS02ZTE3LTQ2YzAtYTUyOS02ZDAxM2YzNDg0OWVfMi5qcGc.jpg' -O lion_head_red.jpg\n",
+ "!wget -q https://live.staticflickr.com/232/551040940_87299a85ec_h.jpg -O meat_ball.jpg\n",
+ "# Downloaded here as well so this cell does not depend on the English demo cell having run.\n",
+ "!wget -q https://storage.googleapis.com/big_vision/siglip/fried_fish.jpeg -O fried_fish.jpeg\n",
+ "!wget -q https://storage.googleapis.com/big_vision/siglip/squirrel_fish.jpg -O squirrel_fish.jpg\n",
+ "!wget -q 'https://ideogram.ai/api/images/direct/F3lMxBprSk6ligq5Vy3XSw' -O squirrel_fish2.jpg\n",
+ "!wget -q 'https://pbs.twimg.com/media/FTyEyxyXsAAyKPc?format=jpg&name=small' -O cow_beach.jpg\n",
+ "!wget -q 'https://storage.googleapis.com/big_vision/siglip/cow_beach2.jpg' -O cow_beach2.jpg\n",
+ "\n",
+ "\n",
+ "images = [PIL.Image.open(fname) for fname in [\n",
+ "    'ants_climbing_a_tree_food.jpg',\n",
+ "    'ants_climbing_tree.jpg',\n",
+ "    'meat_ball.jpg',\n",
+ "    'lion_head.jpg',\n",
+ "    'lion_head_red.jpg',\n",
+ "    'fried_fish.jpeg',\n",
+ "    'squirrel_fish.jpg',\n",
+ "    'squirrel_fish2.jpg',\n",
+ "    'cow_beach.jpg',\n",
+ "    'cow_beach2.jpg',\n",
+ "]]\n",
+ "\n",
+ "# Resize every image to RES and rescale pixel values to [-1, 1] before stacking.\n",
+ "pp_img = pp_builder.get_preprocess_fn(f'resize({RES})|value_range(-1, 1)')\n",
+ "imgs = np.array([pp_img({'image': np.array(image)})['image'] for image in images])\n",
+ "# Only images are passed (text input is None), so the second return value is discarded.\n",
+ "zimg, _, out = model.apply({'params': params}, imgs, None)\n",
+ "\n",
+ "print(imgs.shape, zimg.shape)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "YsK74v2J04Xp",
+ "outputId": "63f024ad-205c-4dd3-a5af-4dfd5ff198ca"
+ },
+ "execution_count": 4,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.10/dist-packages/tensorflow_addons/utils/tfa_eol_msg.py:23: UserWarning: \n",
+ "\n",
+ "TensorFlow Addons (TFA) has ended development and introduction of new features.\n",
+ "TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.\n",
+ "Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). \n",
+ "\n",
+ "For more information see: https://github.com/tensorflow/addons/issues/2807 \n",
+ "\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "(10, 256, 256, 3) (10, 768)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Tokenize and embed texts\n",
+ "\n",
+ "# Candidate captions, listed in groups that mirror the example images above.\n",
+ "texts = [\n",
+ " # Group 1: \"ants climbing a tree\".\n",
+ " '蚂蚁上树',\n",
+ " '肉末粉丝',\n",
+ " 'ants climbing a tree',\n",
+ " 'minced pork rice noodle',\n",
+ " # Group 2: \"red burned lion head\".\n",
+ " '红烧狮子头',\n",
+ " 'red burned lion head',\n",
+ " 'lion head',\n",
+ " 'meat ball with soy sauce',\n",
+ " # Group 3: \"squirrel mandarinfish\".\n",
+ " '松鼠鳜鱼',\n",
+ " 'squirrel',\n",
+ " 'squirrel and fish',\n",
+ " 'squirrel mandarinfish',\n",
+ " 'squirrel mandarin fish',\n",
+ " 'sweet and sour mandarin fish',\n",
+ " # Group 4: cow captions in several languages.\n",
+ " 'cow',\n",
+ " 'a cow in a tuxedo',\n",
+ " 'a cow on the beach',\n",
+ " 'a cow in the prairie',\n",
+ " 'une vache sur la plage',\n",
+ " 'eine Kuh am Strand',\n",
+ " 'วัวอยู่ที่ชายหาด',\n",
+ " '一只躺在沙滩上的牛',\n",
+ " '一只沙滩上的牛',\n",
+ " 'корова на пляже',\n",
+ " 'بقرة على الشاطئ',\n",
+ "]\n",
+ "\n",
+ "# Maps the VOCAB value to the tokenizer model name passed to the tokenize op.\n",
+ "TOKENIZERS = {32_000: 'c4_en', 250_000: 'mc4'}\n",
+ "tokenizer_name = TOKENIZERS[VOCAB]\n",
+ "pp_txt = pp_builder.get_preprocess_fn(\n",
+ " f'tokenize(max_len={SEQLEN}, model=\"{tokenizer_name}\", eos=\"sticky\", pad_value=1, inkey=\"text\")'\n",
+ ")\n",
+ "\n",
+ "# Tokenize each caption and stack the resulting 'labels' entries into one array.\n",
+ "txts = np.array([\n",
+ " pp_txt({'text': text})['labels']\n",
+ " for text in texts\n",
+ "])\n",
+ "# The image input is None here; only the text embedding (second return value) is kept.\n",
+ "_, ztxt, out = model.apply({'params': params}, None, txts)\n",
+ "\n",
+ "print(txts.shape, ztxt.shape)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "dAzAuYJh1eQ3",
+ "outputId": "6c07c1a2-c236-4b68-b7e3-f92dcc070fcc"
+ },
+ "execution_count": 5,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "(25, 64) (25, 768)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "make_table(zimg, ztxt, out)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 633
+ },
+ "id": "JlMwn6K1-62i",
+ "outputId": "6b8fa113-06f3-492c-ffa7-942d4799cae3"
+ },
+ "execution_count": 8,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "application/javascript": [
+ "var logits = [[15.194855690002441, 14.548081398010254, 4.362802505493164, 8.915352821350098, 0.12249733507633209, -1.8669313192367554, -2.1026358604431152, 4.83571195602417, -1.48772132396698, -2.885380744934082, -3.757584571838379, -9.74438190460205, -6.739628791809082, 1.0982742309570312, -1.8383992910385132, -8.639388084411621, -8.514564514160156, -8.664950370788574, -9.010446548461914, -8.695591926574707, -0.29446348547935486, -2.3145699501037598, 0.3301776945590973, -9.183826446533203, -7.548545837402344], [3.1235272884368896, -2.662849187850952, 15.499628067016602, -5.6270432472229, -8.800381660461426, -5.2857537269592285, -4.901862621307373, -8.64078426361084, -8.457619667053223, -0.7642378211021423, -6.292320251464844, -6.919025421142578, -5.699285984039307, -6.146625518798828, -1.7575650215148926, -9.384129524230957, -6.215198040008545, -6.763903617858887, -6.789668560028076, -6.646523952484131, 2.078498125076294, 0.1571565568447113, 1.2640687227249146, -4.958133697509766, -4.504084587097168], [2.4513118267059326, 3.711794853210449, -2.7506296634674072, 6.2139153480529785, 12.623679161071777, -2.242187261581421, -0.873506486415863, 12.75291633605957, 5.779244422912598, -3.411043405532837, -2.7684485912323, 0.8032691478729248, 2.4132730960845947, 10.139656066894531, -1.5548374652862549, -7.363276481628418, -10.937602043151855, -10.354545593261719, -12.12853717803955, -11.330802917480469, -3.7032158374786377, -4.167450428009033, -2.857227087020874, -12.429163932800293, -10.023411750793457], [-7.848373889923096, -8.82786750793457, -4.246535301208496, -11.672212600708008, -4.754408836364746, 5.023717403411865, 10.245930671691895, -9.671830177307129, -5.305540561676025, 0.939210832118988, -3.7660276889801025, -6.9834089279174805, -5.540616512298584, -7.520627498626709, 0.6897578239440918, -4.008193016052246, -3.137038230895996, -2.492392063140869, -3.349771022796631, -2.571514129638672, -0.5961494445800781, 1.920261025428772, -0.5972135066986084, 
-3.192373275756836, -2.797152280807495], [-7.591951370239258, -9.57149887084961, -7.410569667816162, -10.887884140014648, -2.1018383502960205, 10.839365005493164, 12.306414604187012, -8.755990028381348, -6.4970011711120605, 1.732677698135376, -1.484777808189392, -3.788830280303955, -2.954533338546753, -4.137475967407227, 1.2805907726287842, -4.848579406738281, -4.63262939453125, -4.869859218597412, -4.654362201690674, -4.7860589027404785, -0.6505587697029114, -0.741170346736908, -1.2220640182495117, -5.068485260009766, -4.302990913391113], [0.38381102681159973, -0.5291793346405029, -4.558042049407959, -0.798613965511322, 1.3992505073547363, -3.269932508468628, -2.243269205093384, 3.4091484546661377, 13.690838813781738, -3.199730396270752, 2.4068713188171387, 4.793602466583252, 6.522286415100098, 12.24045467376709, -0.973887026309967, -5.842926025390625, -8.813263893127441, -10.347548484802246, -10.193572044372559, -9.09493350982666, 0.17290785908699036, -2.690534830093384, 0.4429348409175873, -10.299919128417969, -7.2381591796875], [-11.066581726074219, -10.138232231140137, -5.7180986404418945, -11.073030471801758, -9.701227188110352, 1.2774648666381836, 0.6818075776100159, -11.766871452331543, 7.582111358642578, 6.539462089538574, 13.692913055419922, 11.608633041381836, 12.523263931274414, 2.838015556335449, 0.06712919473648071, -8.434947967529297, -5.371018409729004, -7.046348571777344, -5.160297393798828, -4.178375244140625, -1.4383944272994995, -1.4511940479278564, -0.826172947883606, -4.657361030578613, -4.185240745544434], [-3.598116874694824, -6.576178073883057, -2.7102479934692383, -8.999201774597168, -6.829661846160889, -5.066120147705078, -1.7694122791290283, -7.724926471710205, 0.23896828293800354, 11.48562240600586, 18.98163414001465, 10.054450035095215, 10.879026412963867, -0.23405185341835022, 1.1370410919189453, -4.135552406311035, -0.34031882882118225, -1.2078852653503418, -1.5318009853363037, -3.0245869159698486, -0.7356898188591003, 
2.346902847290039, 1.158348560333252, -1.281561017036438, -1.2338509559631348], [-9.843914985656738, -9.799589157104492, -6.7716383934021, -9.883660316467285, -12.059309005737305, -6.143594264984131, -3.1696691513061523, -7.953651428222656, -14.6300048828125, -5.153632164001465, -9.101214408874512, -8.86422061920166, -7.411843299865723, -9.261401176452637, 12.271851539611816, 7.439639091491699, 19.08420181274414, 9.05471420288086, 18.37834930419922, 18.505441665649414, 14.171286582946777, 12.338602066040039, 14.924001693725586, 17.368127822875977, 17.931604385375977], [-9.439372062683105, -8.37105941772461, -9.730523109436035, -9.263359069824219, -7.634936809539795, -5.775638580322266, -0.2548319399356842, -6.097734451293945, -12.719864845275879, -5.2038702964782715, -8.733600616455078, -8.040817260742188, -6.40618896484375, -8.534762382507324, 11.509172439575195, 18.91118049621582, 14.150744438171387, 6.8233747482299805, 13.563973426818848, 13.099942207336426, 10.563776016235352, 10.233851432800293, 11.005309104919434, 15.13718032836914, 14.48193359375]];\n",
+ "//# sourceURL=js_ca0f68d49c"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "application/javascript": [
+ "var cmap = ['transparent', '#f6fcf4', '#f4fbf2', '#f3faf0', '#f1faee', '#f0f9ec', '#eff9eb', '#edf8e9', '#ecf8e8', '#eaf7e6', '#e8f6e4', '#e7f6e3', '#e5f5e1', '#e4f5df', '#e1f3dc', '#def2d9', '#dcf2d7', '#daf0d4', '#d7efd1', '#d5efcf', '#d2edcc', '#d0edca', '#cdecc7', '#cbeac4', '#c9eac2', '#c6e8bf', '#c3e7bc', '#c0e6b9', '#bce4b5', '#bae3b3', '#b6e2af', '#b4e1ad', '#b0dfaa', '#acdea6', '#aadda4', '#a7dba0', '#a3da9d', '#a0d99b', '#9cd797', '#99d595', '#95d391', '#91d28e', '#8ed08b', '#8ace88', '#87cd86', '#83cb82', '#7fc97f', '#7cc87c', '#78c679', '#73c476'];\n",
+ "//# sourceURL=js_b212ab59e1"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "text/html": [
+ ""
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " |  |  |  |  |  |  |  |  |  | | 91% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 蚂蚁上树 | 84% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 肉末粉丝 | 0% | 93% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | ants climbing a tree | 2% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | minced pork rice noodle | 0% | 0% | 43% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 红烧狮子头 | 0% | 0% | 0% | 0% | 11% | 0% | 0% | 0% | 0% | 0% | red burned lion head | 0% | 0% | 0% | 7% | 36% | 0% | 0% | 0% | 0% | 0% | lion head | 0% | 0% | 47% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | meat ball with soy sauce | 0% | 0% | 0% | 0% | 0% | 69% | 0% | 0% | 0% | 0% | 松鼠鳜鱼 | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 20% | 0% | 0% | squirrel | 0% | 0% | 0% | 0% | 0% | 0% | 69% | 100% | 0% | 0% | squirrel and fish | 0% | 0% | 0% | 0% | 0% | 0% | 22% | 6% | 0% | 0% | squirrel mandarinfish | 0% | 0% | 0% | 0% | 0% | 0% | 41% | 12% | 0% | 0% | squirrel mandarin fish | 0% | 0% | 6% | 0% | 0% | 34% | 0% | 0% | 0% | 0% | sweet and sour mandarin fish | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 35% | 20% | cow | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 100% | a cow in a tuxedo | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 100% | 78% | a cow on the beach | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 2% | 0% | a cow in the prairie | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 100% | 66% | une vache sur la plage | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 100% | 55% | eine Kuh am Strand | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 78% | 9% | วัวอยู่ที่ชายหาด | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 37% | 7% | 一只躺在沙滩上的牛 | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 88% | 13% | 一只沙滩上的牛 | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 99% | 90% | корова на пляже | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 99% | 83% | بقرة على الشاطئ"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "application/javascript": [
+ "\n",
+ " function update(b) {\n",
+ " for(var iimg = 0; iimg < logits.length; iimg++) {\n",
+ " for(var itxt = 0; itxt < logits[iimg].length; itxt++) {\n",
+ " const el = document.getElementById(`p_${iimg}_${itxt}`);\n",
+ " const p = Math.round(100 / (1 + Math.exp(-logits[iimg][itxt] - b)));\n",
+ " const pad = p < 10.0 ? ' ' : p < 100.0 ? ' ' : ''\n",
+ " el.innerHTML = pad + (p).toFixed(0) + '%';\n",
+ "\n",
+ " const td = document.getElementById(`td_${iimg}_${itxt}`);\n",
+ " const c = cmap[Math.round(p / 100 * (cmap.length - 1))];\n",
+ " td.style.backgroundColor = c;\n",
+ " }\n",
+ " }\n",
+ " }\n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "application/javascript": [
+ "\n",
+ " const value = document.querySelector(\"#value\");\n",
+ " const input = document.querySelector(\"#b\");\n",
+ " value.textContent = input.value;\n",
+ " input.addEventListener(\"input\", (event) => {\n",
+ " value.textContent = event.target.value;\n",
+ " update(event.target.value);\n",
+ " });\n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "application/javascript": [
+ "update(-12.885268211364746)"
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "application/javascript": [
+ "google.colab.output.resizeIframeToContent()"
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Explanation for non-Chinese speakers:\n",
+ "\n",
+ "- The first dish is literally called \"ants climbing a tree\" in Chinese.\n",
+ "- The second dish is literally called \"red burned lion head\" in Chinese.\n",
+ "- The third dish is literally called \"squirrel mandarinfish\" in Chinese.\n",
+ "\n",
+ "We are looking for more interesting examples that highlight culture-language aspects and where a non-EN model should \"get it\" while an EN-only model does not."
+ ],
+ "metadata": {
+ "id": "bNGoftU3y4UQ"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Example image credits\n",
+ "\n",
+ "- The apple and apple + iPod images are from OpenAI.\n",
+ "- [Cold drink on hot day](https://unsplash.com/fr/photos/hQHm2D1fH70).\n",
+ "- [Hot drink on cold day](https://www.rawpixel.com/image/3282934).\n",
+ "- Cows on beach were created by Chitwan Saharia using the Imagen model and shared with permission.\n",
+ "- [\"ants climbing a tree\" noodles](https://www.flickr.com/photos/avlxyz/5189547658)\n",
+ "- [actual ants climbing on a tree](https://www.pexels.com/photo/macro-photo-of-five-orange-ants-842401/)\n",
+ "- [real lion head](https://www.rawpixel.com/image/5941715/free-public-domain-cc0-photo)\n",
+ "- [cartoon red lion head](https://www.rawpixel.com/image/12447997/image-texture-paper-png)\n",
+ "- Collaged [squirrel](https://www.pexels.com/photo/brown-squirrel-47547/) and [fish](https://zh.wikipedia.org/zh-hans/%E9%B3%9C%E9%B1%BC) images.\n",
+ "- cartoon [squirrel and fish](https://ideogram.ai/g/zgoma01ASS21U1YwIC7MrA/2) generated by [ideogram.ai](http://ideogram.ai) [with permission](https://x.com/ideogram_ai/status/1697428471184515316?s=20).\n",
+ "- The remaining pictures are personal photos taken by the authors, long after the models were trained."
+ ],
+ "metadata": {
+ "id": "etDZ3sl4kZ_q"
+ }
+ }
+ ]
+}
| | | |