clip

Sleeping

File size: 8,893 Bytes

5185219
 
c81898a
 
0779f15
b59b1d0
5185219
c81898a
9033a8d
0779f15
 
 
3e50eda
 
a55de09
aae8769
fbe7708
c81898a
a55de09
aae8769
 
 
 
0779f15
aae8769
 
 
 
 
 
 
 
 
 
 
 
 
a55de09
 
c81898a
aae8769
 
0779f15
 
5185219
a55de09
 
aae8769
5185219
 
 
 
 
 
 
 
5b1c1bd
 
 
 
 
 
 
 
 
 
 
5185219
aae8769
5185219
5b1c1bd
 
aae8769
5b1c1bd
 
 
aae8769
5b1c1bd
aae8769
5b1c1bd
 
 
5185219
 
 
aae8769
 
5b1c1bd
 
 
5185219
 
a55de09
 
 
 
5185219
a55de09
 
 
 
c81898a
a55de09
5639350
ff968d5
09a1cac
ff968d5
09a1cac
ba03fb2
09a1cac
a55de09
 
74074fa
09a1cac
 
 
74074fa
 
aae8769
 
 
 
 
 
c81898a
 
a55de09
 
c81898a
 
 
 
ff968d5
 
 
 
 
 
 
 
c81898a
aae8769
 
555584f
aae8769
555584f
7600dc3
586f7e5
55fea56
586f7e5
c81898a
 
 
 
 
 
a55de09
 
 
 
5639350
74074fa
0779f15
3e50eda
0779f15
74074fa
a55de09
5185219
 
 
 
a55de09
aae8769
 
9033a8d
 
 
 
aae8769
 
0779f15
aae8769
 
8d49b55
aae8769
 
 
 
 
a55de09
aae8769
0779f15
aae8769
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5185219
 
 
 
aae8769
5185219
 
aae8769
 
0779f15
3e50eda
5185219
a55de09

from html import escape
import re
import streamlit as st
import pandas as pd, numpy as np
import torch
from transformers import CLIPProcessor, CLIPModel
from st_clickable_images import clickable_images

# Suffixes of the CLIP checkpoints to load; load() appends each one to
# "openai/clip-vit-". The last entry is the model main() searches with.
# Variants currently disabled: "base-patch32", "base-patch16", "large-patch14".
MODEL_NAMES = ["large-patch14-336"]


@st.cache(allow_output_mutation=True)
def load():
    """Load the metadata tables, CLIP models/processors and image embeddings.

    Corpora are keyed by integer id: 0 is read from ``data.csv`` /
    ``embeddings-vit-<name>.npy`` and 1 from ``data2.csv`` /
    ``embeddings2-vit-<name>.npy``. Every embedding row is divided by its
    L2 norm so that a dot product with a unit-length query is a cosine
    similarity.

    Returns:
        A ``(models, processors, df, embeddings)`` tuple where ``models``,
        ``processors`` and ``embeddings`` are keyed by model name (and
        ``embeddings[name]`` further by corpus id) and ``df`` is keyed by
        corpus id.
    """
    frames = {0: pd.read_csv("data.csv"), 1: pd.read_csv("data2.csv")}
    models, processors, embeddings = {}, {}, {}

    for name in MODEL_NAMES:
        # Model is put in eval mode since it is only used for inference.
        models[name] = CLIPModel.from_pretrained(f"openai/clip-vit-{name}").eval()
        processors[name] = CLIPProcessor.from_pretrained(f"openai/clip-vit-{name}")

        raw_by_corpus = {
            0: np.load(f"embeddings-vit-{name}.npy"),
            1: np.load(f"embeddings2-vit-{name}.npy"),
        }
        # Normalise each row to unit length.
        embeddings[name] = {
            corpus_id: matrix / np.linalg.norm(matrix, axis=1, keepdims=True)
            for corpus_id, matrix in raw_by_corpus.items()
        }

    return models, processors, frames, embeddings


# Attribution line appended to every result tooltip, keyed by corpus id
# (0 = Unsplash, 1 = TMDB).
source = {
    0: "\nSource: Unsplash",
    1: "\nSource: The Movie Database (TMDB)",
}

# Module-level state shared by the search functions below; load() is wrapped
# in st.cache.
models, processors, df, embeddings = load()


def compute_text_embeddings(list_of_strings, name):
    """Embed texts with the CLIP model ``name`` and L2-normalise each row.

    Args:
        list_of_strings: Texts to embed; they are padded to a common length.
        name: Key into the module-level ``processors`` / ``models`` dicts.

    Returns:
        A NumPy array with one unit-length row per input string.
    """
    tokens = processors[name](text=list_of_strings, return_tensors="pt", padding=True)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        features = models[name].get_text_features(**tokens)
    vectors = features.detach().numpy()
    row_norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    return vectors / row_norms


# Matches an image reference such as "[Unsplash:123] optional extra text".
_IMAGE_REFERENCE = re.compile(r"\[(Movies|Unsplash):(\d{1,5})\](.*)")


def _normalized_similarities(corpus_embeddings, query_embeddings):
    """Return per-query similarity columns shifted by median and scaled by max.

    The result has one row per corpus image and one column per query
    embedding. Centring each column on its median and dividing by its
    maximum puts the columns on a comparable scale before they are combined.
    """
    scores = corpus_embeddings @ query_embeddings.T
    scores = scores - np.median(scores, axis=0)
    return scores / np.max(scores, axis=0, keepdims=True)


def image_search(query, corpus, name, n_results=24):
    """Rank the images of ``corpus`` against a free-form query.

    Query syntax:
      * ``;`` separates several positive terms; an image must match all of
        them (the per-term scores are combined with a minimum).
      * A term of the form ``[Unsplash:<idx>]`` or ``[Movies:<idx>]`` uses the
        stored embedding of that image as the query; any text after the
        bracket is added as one more positive term.
      * Everything after ``EXCLUDING `` is a ``;``-separated list of negative
        terms whose (clipped at zero) best score is subtracted.

    Blank terms are ignored, and an image reference whose index is outside
    the referenced corpus is skipped instead of crashing the search.

    Args:
        query: The raw query string.
        corpus: ``"Unsplash"`` selects corpus 0; any other value selects 1.
        name: Model name, a key of the module-level ``embeddings`` dict.
        n_results: Maximum number of results to return.

    Returns:
        A list of ``(path, tooltip, index)`` tuples, best match first. The
        list is empty when the query contains no usable term.
    """
    k = 0 if corpus == "Unsplash" else 1
    corpus_embeddings = embeddings[name][k]
    positive_part, *negative_parts = query.split("EXCLUDING ")

    positive_chunks = []
    for positive_query in positive_part.split(";"):
        term = positive_query.strip()
        if not term:
            # e.g. a trailing ";" -- embedding an empty string would only
            # add noise to the min-combination below.
            continue
        match = _IMAGE_REFERENCE.match(term)
        if match is None:
            positive_chunks.append(compute_text_embeddings([positive_query], name))
            continue
        corpus2, idx, remainder = match.groups()
        idx, remainder = int(idx), remainder.strip()
        k2 = 0 if corpus2 == "Unsplash" else 1
        reference_embeddings = embeddings[name][k2]
        # An out-of-range index would produce an empty slice and make the
        # reductions below fail on a zero-size axis.
        if idx < reference_embeddings.shape[0]:
            positive_chunks.append(reference_embeddings[idx : idx + 1, :])
        if remainder:
            positive_chunks.append(compute_text_embeddings([remainder], name))

    negative_queries = [
        term for term in " ".join(negative_parts).split(";") if term.strip()
    ]

    if not positive_chunks and not negative_queries:
        return []

    dot_product = 0
    if positive_chunks:
        positive_embeddings = np.concatenate(positive_chunks, axis=0)
        # An image must be close to every positive term, hence the minimum.
        dot_product = np.min(
            _normalized_similarities(corpus_embeddings, positive_embeddings), axis=1
        )

    if negative_queries:
        negative_embeddings = compute_text_embeddings(negative_queries, name)
        penalties = _normalized_similarities(corpus_embeddings, negative_embeddings)
        # Only above-median similarity to a negative term is penalised.
        dot_product = dot_product - np.max(np.maximum(penalties, 0), axis=1)

    # Highest scores first, at most n_results of them.
    results = np.argsort(dot_product)[-1 : -n_results - 1 : -1]
    frame = df[k]
    return [
        (
            frame.iloc[i]["path"],
            frame.iloc[i]["tooltip"] + source[k],
            i,
        )
        for i in results
    ]


# Sidebar introduction (Markdown, shown in Japanese): page title, a hint to
# press Enter after typing a query, and credits for the libraries and image
# sources the app is built on.
description = """
# 意味による画像検索

**検索語を入力してから Enter キーを押してください**

*OpenAI の [CLIP](https://openai.com/blog/clip/) model, 🤗 Hugging Face's [transformers library](https://huggingface.co/transformers/), [Streamlit](https://streamlit.io/), [Unsplash](https://unsplash.com/) の 25k images と [The Movie Database (TMDB)](https://www.themoviedb.org/) の 8k images を使用して構築しています。*

*Vladimir Haltakov の [Unsplash Image Search](https://github.com/haltakov/natural-language-image-search) と Travis Hoppe の [Alph, The Sacred River](https://github.com/thoppe/alph-the-sacred-river) 　に触発されました。*
"""

# Advanced-usage notes (Markdown, shown in Japanese) for the sidebar expander:
# click an image to search by it, combine terms with ";", and put negative
# terms after "EXCLUDING".
howto = """
- 画像をクリックすると、それをクエリとして使用し、類似画像を検索できます。
- 複数の検索語を組み合わせることができます(区切り文字として「**;**」を使用します)。
- 検索語に　「**EXCLUDING**」　が含まれている場合、その右側の部分が否定クエリとして使用されます。
"""

# CSS applied to the container of the clickable image grid: a centred,
# wrapping flex row.
div_style = {"display": "flex", "justify-content": "center", "flex-wrap": "wrap"}


def main():
    """Render the search page and turn clicks on results into new queries.

    The page consists of a sidebar (description and usage notes), a query
    text box, a corpus selector and a grid of clickable result images.
    Clicking an image stores ``[<corpus>:<index>]`` in
    ``st.session_state["query"]`` and reruns the script so that the text box
    is re-created with that value.

    Every handled click is recorded in ``st.session_state["last_clicked"]``.
    The image component keeps reporting the same click on subsequent reruns
    as long as its key (query + corpus + model) is unchanged, so without
    this record a click that does not change the query text -- for example
    clicking the image that already is the query -- would rerun forever.
    """
    st.markdown(
        """
              <style>
              .block-container{
                max-width: 1200px;
              }
              div.row-widget.stRadio > div{
                flex-direction:row;
                display: flex;
                justify-content: center;
              }
              div.row-widget.stRadio > div > label{
                margin-left: 5px;
                margin-right: 5px;
              }
              .row-widget {
                margin-top: -25px;
              }
              section>div:first-child {
                padding-top: 30px;
              }
              div.reportview-container > section:first-child{
                max-width: 320px;
              }
              #MainMenu {
                visibility: hidden;
              }
              footer {
                visibility: hidden;
              }
              </style>""",
        unsafe_allow_html=True,
    )
    st.sidebar.markdown(description)
    with st.sidebar.expander("高度な使用方法"):
        st.markdown(howto)

    # Side-by-side comparison of two models is switched off: MODEL_NAMES loads
    # a single model, so the other entries of models_dict are not available.
    # mode = st.sidebar.selectbox(
    #    "", ["Results for ViT-L/14@336px", "Comparison of 2 models"], index=0
    # )
    compare_models = False  # "Comparison" in mode

    _, c, _ = st.columns((1, 3, 1))
    # The text box has no key, so changing its default value (after a click)
    # re-creates it showing the new query.
    if "query" in st.session_state:
        initial_query = st.session_state["query"]
    else:
        initial_query = "clouds at sunset"
    query = c.text_input("", value=initial_query)
    corpus = st.radio("", ["Unsplash", "Movies"])

    models_dict = {
        "ViT-B/32 (quicker)": "base-patch32",
        "ViT-B/16 (average)": "base-patch16",
        # "ViT-L/14 (slow)": "large-patch14",
        "ViT-L/14@336px (slower)": "large-patch14-336",
    }

    if compare_models:
        c1, c2 = st.columns((1, 1))
        selection1 = c1.selectbox("", models_dict.keys(), index=0)
        selection2 = c2.selectbox("", models_dict.keys(), index=2)
        name1 = models_dict[selection1]
        name2 = models_dict[selection2]
    else:
        name1 = MODEL_NAMES[-1]

    def show_grid(results, height, key):
        """Draw one grid of results; return the clicked position or -1."""
        return clickable_images(
            [result[0] for result in results],
            titles=[result[1] for result in results],
            div_style=div_style,
            img_style={"margin": "2px", "height": height},
            key=key,
        )

    if len(query) > 0:
        results1 = image_search(query, corpus, name1)
        if compare_models:
            with c1:
                clicked1 = show_grid(results1, "150px", query + corpus + name1 + "1")
            results2 = image_search(query, corpus, name2)
            with c2:
                clicked2 = show_grid(results2, "150px", query + corpus + name2 + "2")
        else:
            clicked1 = show_grid(results1, "200px", query + corpus + name1 + "1")
            clicked2 = -1

        if clicked2 >= 0 or clicked1 >= 0:
            # Identify the click by the grid it happened in and its position,
            # so the same click is acted upon only once.
            click_id = (query, corpus, clicked1, clicked2)
            already_handled = (
                "last_clicked" in st.session_state
                and st.session_state["last_clicked"] == click_id
            )
            if not already_handled:
                st.session_state["last_clicked"] = click_id
                if clicked1 >= 0:
                    st.session_state["query"] = f"[{corpus}:{results1[clicked1][2]}]"
                # elif clicked2 >= 0:
                #    st.session_state["query"] = f"[{corpus}:{results2[clicked2][2]}]"
                st.experimental_rerun()


# Build the page only when this file is executed as the main script, not
# when it is imported.
if __name__ == "__main__":
    main()