Shreemit committed on
Commit 6beecf5
1 Parent(s): 26e374b

Uploaded files

Data/embeddings.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b6ab0cf2f12332d3208f00bc0c3964374e2a3aadb22bf251005f5d0e05674ba
+ size 133
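The three added lines are a Git LFS pointer, not the pickle payload itself: the pointer records only the spec version, the object's sha256, and its size, and the real file is fetched through Git LFS. A small sketch (not part of the commit) of reading those fields:

```python
# Parse the Git LFS pointer fields (version, oid, size) from Data/embeddings.pkl.
# This only inspects the pointer text; fetching the actual object still goes
# through Git LFS (e.g. `git lfs pull`).
def read_lfs_pointer(path="Data/embeddings.pkl"):
    fields = {}
    with open(path, "r") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

pointer = read_lfs_pointer()
print(pointer["oid"], pointer["size"])  # e.g. "sha256:1b6a..." and "133"
```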
embeddings_demo.py ADDED
@@ -0,0 +1,359 @@
+ import streamlit as st
+ import numpy as np
+ import numpy.linalg as la
+ import pickle
+ import os
+ import gdown
+ from sentence_transformers import SentenceTransformer
+ import matplotlib.pyplot as plt
+ import math
+ #import streamlit_analytics
+
+
+ # Compute Cosine Similarity
+ def cosine_similarity(x, y):
+     """
+     Exponentiated cosine similarity: exp(x.y / max(||x||*||y||, 1)).
+     Zero vectors map to exp(-1) so unknown words still get a score.
+     """
+
+     x_arr = np.array(x)
+     y_arr = np.array(y)
+     if la.norm(x_arr) == 0 or la.norm(y_arr) == 0:
+         return math.exp(-1)
+     else:
+         return math.exp(np.dot(x_arr, y_arr) / (max(la.norm(x_arr) * la.norm(y_arr), 1)))
+
+
+ # Function to Load Glove Embeddings
+ def load_glove_embeddings(glove_path="Data/embeddings.pkl"):
+
+     with open(glove_path, "rb") as f:
+         embeddings_dict = pickle.load(f, encoding="latin1")
+
+     return embeddings_dict
+
+
+ def get_model_id_gdrive(model_type):
+
+     if model_type == "25d":
+         word_index_id = "13qMXs3-oB9C6kfSRMwbAtzda9xuAUtt8"
+         embeddings_id = "1-RXcfBvWyE-Av3ZHLcyJVsps0RYRRr_2"
+     elif model_type == "50d":
+         embeddings_id = "1DBaVpJsitQ1qxtUvV1Kz7ThDc3az16kZ"
+         word_index_id = "1rB4ksHyHZ9skes-fJHMa2Z8J1Qa7awQ9"
+     elif model_type == "100d":
+         word_index_id = "1-oWV0LqG3fmrozRZ7WB1jzeTJHRUI3mq"
+         embeddings_id = "1SRHfX130_6Znz7zbdfqboKosz-PfNvNp"
+
+
+     return word_index_id, embeddings_id
+
+
+
+ def download_glove_embeddings_gdrive(model_type):
+     # Get glove embeddings from google drive
+
+     word_index_id, embeddings_id = get_model_id_gdrive(model_type)
+
+     # Use gdown to get files from google drive
+     embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"
+     word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"
+
+     # Download word_index pickle file
+     print("Downloading word index dictionary....\n")
+     gdown.download(id=word_index_id, output=word_index_temp, quiet=False)
+
+     # Download embeddings numpy file
+     print("Downloading embeddings...\n\n")
+     gdown.download(id=embeddings_id, output=embeddings_temp, quiet=False)
+
+ #@st.cache_data()
+ def load_glove_embeddings_gdrive(model_type):
+
+     word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"
+     embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"
+
+     # Load word index dictionary
+     word_index_dict = pickle.load(open(word_index_temp, "rb"), encoding="latin")
+
+     # Load embeddings numpy
+     embeddings = np.load(embeddings_temp)
+
+     return word_index_dict, embeddings
+
+ @st.cache_resource()
+ def load_sentence_transformer_model(model_name):
+
+     sentenceTransformer = SentenceTransformer(model_name)
+     return sentenceTransformer
+
+
+ def get_sentence_transformer_embeddings(sentence, model_name="all-MiniLM-L6-v2"):
+
+     # 384 dimensional embedding
+     # Default model: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
+
+     sentenceTransformer = load_sentence_transformer_model(model_name)
+
+     try:
+         return sentenceTransformer.encode(sentence)
+     except Exception:
+         # Fall back to a zero vector if encoding fails
+         if model_name == "all-MiniLM-L6-v2":
+             return np.zeros(384)
+         else:
+             return np.zeros(512)
+
+ def get_result_from_gpt(sentence, gpt_model="3.5"):
+
+     ### GPT Authentication ###
+
+     pass
+
+     ###
+
+ def get_glove_embeddings(word, word_index_dict, embeddings, model_type):
+     """
+     Get glove embedding for a single word
+     """
+
+     if word.lower() in word_index_dict:
+         return embeddings[word_index_dict[word.lower()]]
+     else:
+         return np.zeros(int(model_type.split("d")[0]))
+
+
+
+ # Get Averaged Glove Embedding of a sentence
+ def averaged_glove_embeddings(sentence, embeddings_dict):
+     words = sentence.split(" ")
+     glove_embedding = np.zeros(50)
+     count_words = 0
+     for word in words:
+         word = word.lower()
+         if word in embeddings_dict:
+             glove_embedding += embeddings_dict[word]
+             count_words += 1
+
+     return glove_embedding / max(count_words, 1)
+
+
+ def averaged_glove_embeddings_gdrive(sentence, word_index_dict, embeddings, model_type="50d"):
+     words = sentence.split(" ")
+     embedding = np.zeros(int(model_type.split("d")[0]))
+     count_words = 0
+     for word in words:
+         if word in word_index_dict:
+             embedding += embeddings[word_index_dict[word]]
+             count_words += 1
+
+     return embedding / max(count_words, 1)
+
+ def get_category_embeddings(embeddings_metadata):
+     model_name = embeddings_metadata["model_name"]
+     st.session_state["cat_embed_" + model_name] = {}
+     for category in st.session_state.categories.split(" "):
+         if model_name:
+             if not category in st.session_state["cat_embed_" + model_name]:
+                 st.session_state["cat_embed_" + model_name][category] = get_sentence_transformer_embeddings(category, model_name=model_name)
+         else:
+             if not category in st.session_state["cat_embed_" + model_name]:
+                 st.session_state["cat_embed_" + model_name][category] = get_sentence_transformer_embeddings(category)
+
+
+ def update_category_embeddings(embeddings_metadata):
+
+     get_category_embeddings(embeddings_metadata)
+
+
+ def get_sorted_cosine_similarity(input_sentence, embeddings_metadata):
+
+     categories = st.session_state.categories.split(" ")
+     cosine_sim = {}
+     if embeddings_metadata["embedding_model"] == "glove":
+         word_index_dict = embeddings_metadata["word_index_dict"]
+         embeddings = embeddings_metadata["embeddings"]
+         model_type = embeddings_metadata["model_type"]
+
+         input_embedding = averaged_glove_embeddings_gdrive(st.session_state.text_search, word_index_dict, embeddings, model_type)
+
+         for index in range(len(categories)):
+             cosine_sim[index] = cosine_similarity(input_embedding, get_glove_embeddings(categories[index], word_index_dict, embeddings, model_type))
+     else:
+         model_name = embeddings_metadata["model_name"]
+         if not "cat_embed_" + model_name in st.session_state:
+             get_category_embeddings(embeddings_metadata)
+
+         category_embeddings = st.session_state["cat_embed_" + model_name]
+
+         print("text_search = ", st.session_state.text_search)
+         if model_name:
+             input_embedding = get_sentence_transformer_embeddings(st.session_state.text_search, model_name=model_name)
+         else:
+             input_embedding = get_sentence_transformer_embeddings(st.session_state.text_search)
+         for index in range(len(categories)):
+             #cosine_sim[index] = cosine_similarity(input_embedding, get_sentence_transformer_embeddings(categories[index], model_name=model_name))
+
+             # Update category embeddings if category not found
+             if not categories[index] in category_embeddings:
+                 update_category_embeddings(embeddings_metadata)
+                 category_embeddings = st.session_state["cat_embed_" + model_name]
+             cosine_sim[index] = cosine_similarity(input_embedding, category_embeddings[categories[index]])
+
+
+
+
+     sorted_cosine_sim = sorted(cosine_sim.items(), key=lambda x: x[1], reverse=True)
+
+     return sorted_cosine_sim
+
+
+ def plot_piechart(sorted_cosine_scores_items):
+     sorted_cosine_scores = np.array([sorted_cosine_scores_items[index][1] for index in range(len(sorted_cosine_scores_items))])
+     categories = st.session_state.categories.split(" ")
+     categories_sorted = [categories[sorted_cosine_scores_items[index][0]] for index in range(len(sorted_cosine_scores_items))]
+     fig, ax = plt.subplots()
+     ax.pie(sorted_cosine_scores, labels=categories_sorted, autopct='%1.1f%%')
+     st.pyplot(fig)  # Figure
+
+ def plot_piechart_helper(sorted_cosine_scores_items):
+     sorted_cosine_scores = np.array([sorted_cosine_scores_items[index][1] for index in range(len(sorted_cosine_scores_items))])
+     categories = st.session_state.categories.split(" ")
+     categories_sorted = [categories[sorted_cosine_scores_items[index][0]] for index in range(len(sorted_cosine_scores_items))]
+     fig, ax = plt.subplots(figsize=(3, 3))
+     my_explode = np.zeros(len(categories_sorted))
+     my_explode[0] = 0.2
+     if len(categories_sorted) == 3:
+         my_explode[1] = 0.1  # explode the second slice slightly
+     elif len(categories_sorted) > 3:
+         my_explode[2] = 0.05
+     ax.pie(sorted_cosine_scores, labels=categories_sorted, autopct='%1.1f%%', explode=my_explode)
+
+     return fig
+
+ def plot_piecharts(sorted_cosine_scores_models):
+
+     scores_list = []
+     categories = st.session_state.categories.split(" ")
+     index = 0
+     for model in sorted_cosine_scores_models:
+         scores_list.append(sorted_cosine_scores_models[model])
+         #scores_list[index] = np.array([scores_list[index][ind2][1] for ind2 in range(len(scores_list[index]))])
+         index += 1
+
+     if len(sorted_cosine_scores_models) == 2:
+         fig, (ax1, ax2) = plt.subplots(2)
+
+         categories_sorted = [categories[scores_list[0][index][0]] for index in range(len(scores_list[0]))]
+         sorted_scores = np.array([scores_list[0][index][1] for index in range(len(scores_list[0]))])
+         ax1.pie(sorted_scores, labels=categories_sorted, autopct='%1.1f%%')
+
+         categories_sorted = [categories[scores_list[1][index][0]] for index in range(len(scores_list[1]))]
+         sorted_scores = np.array([scores_list[1][index][1] for index in range(len(scores_list[1]))])
+         ax2.pie(sorted_scores, labels=categories_sorted, autopct='%1.1f%%')
+
+     st.pyplot(fig)
+
+ def plot_alatirchart(sorted_cosine_scores_models):
+
+
+     models = list(sorted_cosine_scores_models.keys())
+     tabs = st.tabs(models)
+     figs = {}
+     for model in models:
+         figs[model] = plot_piechart_helper(sorted_cosine_scores_models[model])
+
+     for index in range(len(tabs)):
+         with tabs[index]:
+             st.pyplot(figs[models[index]])
+
+
+
+ # Text Search
+ #with streamlit_analytics.track():
+
+ # ---------------------
+ # Common part
+ # ---------------------
+ st.sidebar.title('GloVe Twitter')
+ st.sidebar.markdown("""
+ GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Pretrained on
+ 2 billion tweets with a vocabulary size of 1.2 million. Download from [Stanford NLP](http://nlp.stanford.edu/data/glove.twitter.27B.zip).
+
+ Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. *GloVe: Global Vectors for Word Representation*.
+ """)
+
+ model_type = st.sidebar.selectbox(
+     'Choose the model',
+     ('25d', '50d'),
+     index=1
+ )
+
+
+
+ st.title("Search Based Retrieval Demo")
+ st.subheader("Pass in space-separated categories you want this search demo to be about.")
+ #st.selectbox(label="Pick the categories you want this search demo to be about...",
+ #             options=("Flowers Colors Cars Weather Food", "Chocolate Milk", "Anger Joy Sad Frustration Worry Happiness", "Positive Negative"),
+ #             key="categories"
+ #             )
+ st.text_input(label="Categories", key="categories", value="Flowers Colors Cars Weather Food")
+ print(st.session_state["categories"])
+ print(type(st.session_state["categories"]))
+ #print("Categories = ", categories)
+ #st.session_state.categories = categories
+
+ st.subheader("Pass in an input word or even a sentence")
+ text_search = st.text_input(label="Input your sentence", key="text_search", value="Roses are red, trucks are blue, and Seattle is grey right now")
+ #st.session_state.text_search = text_search
+
+ # Download glove embeddings if they don't already exist locally
+ embeddings_path = "embeddings_" + str(model_type) + "_temp.npy"
+ word_index_dict_path = "word_index_dict_" + str(model_type) + "_temp.pkl"
+ if not os.path.isfile(embeddings_path) or not os.path.isfile(word_index_dict_path):
+     print("Model type = ", model_type)
+     glove_path = "Data/glove_" + str(model_type) + ".pkl"
+     print("glove_path = ", glove_path)
+
+     # Download embeddings from google drive
+     with st.spinner("Downloading glove embeddings..."):
+         download_glove_embeddings_gdrive(model_type)
+
+
+ # Load glove embeddings
+ word_index_dict, embeddings = load_glove_embeddings_gdrive(model_type)
+
+
+
+
+ # Find closest word to an input word
+ if st.session_state.text_search:
+
+     # Glove embeddings
+     print("Glove Embedding")
+     embeddings_metadata = {"embedding_model": "glove", "word_index_dict": word_index_dict, "embeddings": embeddings, "model_type": model_type}
+     with st.spinner("Obtaining Cosine similarity for Glove..."):
+         sorted_cosine_sim_glove = get_sorted_cosine_similarity(st.session_state.text_search, embeddings_metadata)
+
+
+     # Sentence transformer embeddings
+     print("Sentence Transformer Embedding")
+     embeddings_metadata = {"embedding_model": "transformers", "model_name": ""}
+     with st.spinner("Obtaining Cosine similarity for 384d sentence transformer..."):
+         sorted_cosine_sim_transformer = get_sorted_cosine_similarity(st.session_state.text_search, embeddings_metadata)
+
+
+     # Results and Plot Pie Chart for Glove
+     print("Categories are: ", st.session_state.categories)
+     st.subheader("Closest word I have between: " + st.session_state.categories + " as per different Embeddings")
+
+     print(sorted_cosine_sim_glove)
+     print(sorted_cosine_sim_transformer)
+     #print(sorted_distilbert)
+     # Tabbed pie charts, one per embedding model
+     plot_alatirchart({"glove_" + str(model_type): sorted_cosine_sim_glove,
+                       "sentence_transformer_384": sorted_cosine_sim_transformer})
+                       #"distilbert_512": sorted_distilbert})
+
+     st.write("")
+     st.write("Demo developed by [Dr. Karthik Mohan](https://www.linkedin.com/in/karthik-mohan-72a4b323/)")
+
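For readers skimming the diff, here is a minimal, self-contained sketch of the ranking logic embeddings_demo.py implements: average the word vectors of the query, score it against each category with the exponentiated cosine similarity above, and sort. The 2-d vectors below are made up for illustration; the app itself uses the downloaded 25d/50d GloVe vectors and sentence-transformer embeddings.

```python
import math
import numpy as np
import numpy.linalg as la

def cosine_similarity(x, y):
    # Same exponentiated cosine similarity as in embeddings_demo.py
    x_arr, y_arr = np.array(x), np.array(y)
    if la.norm(x_arr) == 0 or la.norm(y_arr) == 0:
        return math.exp(-1)
    return math.exp(np.dot(x_arr, y_arr) / max(la.norm(x_arr) * la.norm(y_arr), 1))

# Toy 2-d "embeddings" standing in for the real GloVe vectors
toy_embeddings = {
    "roses":   np.array([0.9, 0.1]),
    "flowers": np.array([0.8, 0.2]),
    "cars":    np.array([0.1, 0.9]),
}

def averaged_embedding(sentence):
    # Average the vectors of known words, as averaged_glove_embeddings_gdrive does
    known = [w for w in sentence.lower().split(" ") if w in toy_embeddings]
    if not known:
        return np.zeros(2)
    return sum(toy_embeddings[w] for w in known) / len(known)

query = averaged_embedding("roses are red")
categories = ["flowers", "cars"]
scores = {i: cosine_similarity(query, toy_embeddings[c]) for i, c in enumerate(categories)}
ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
print([(categories[i], round(s, 3)) for i, s in ranked])  # "flowers" ranks first
```

The same sort-by-score step is what get_sorted_cosine_similarity returns to the pie-chart plotting code.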
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ gdown==4.7.1
+ sentence_transformers
+ matplotlib
+ click<=8.0.4
test_search_bar.py ADDED
@@ -0,0 +1,64 @@
+ import streamlit as st
+ import numpy as np
+ import numpy.linalg as la
+ import pickle
+ #import streamlit_analytics
+
+
+ # Compute Cosine Similarity
+ def cosine_similarity(x, y):
+
+     x_arr = np.array(x)
+     y_arr = np.array(y)
+     return np.dot(x_arr, y_arr) / (la.norm(x_arr) * la.norm(y_arr))
+
+
+ # Function to Load Glove Embeddings
+ def load_glove_embeddings(glove_path="Data/embeddings.pkl"):
+
+     with open(glove_path, "rb") as f:
+         embeddings_dict = pickle.load(f)
+
+     return embeddings_dict
+
+ # Get Averaged Glove Embedding of a sentence
+ def averaged_glove_embeddings(sentence, embeddings_dict):
+     words = sentence.split(" ")
+     glove_embedding = np.zeros(50)
+     count_words = 0
+     for word in words:
+         if word in embeddings_dict:
+             glove_embedding += embeddings_dict[word]
+             count_words += 1
+
+     return glove_embedding / max(count_words, 1)
+
+ # Load glove embeddings
+ glove_embeddings = load_glove_embeddings()
+
+ # Gold standard words to search from
+ gold_words = ["flower", "mountain", "tree", "car", "building"]
+
+ # Text Search
+ #with streamlit_analytics.track():
+ st.title("Search Based Retrieval Demo")
+ st.subheader("Pass in an input word or even a sentence (e.g. jasmine or mount adams)")
+ text_search = st.text_input("", value="")
+
+
+ # Find closest word to an input word
+ if text_search:
+     input_embedding = averaged_glove_embeddings(text_search, glove_embeddings)
+     cosine_sim = {}
+     for index in range(len(gold_words)):
+         cosine_sim[index] = cosine_similarity(input_embedding, glove_embeddings[gold_words[index]])
+
+     print(cosine_sim)
+     sorted_cosine_sim = sorted(cosine_sim.items(), key=lambda x: x[1], reverse=True)
+
+     st.write("(My search uses glove embeddings)")
+     st.write("Closest word I have between flower, mountain, tree, car and building for your input is: ")
+     st.subheader(gold_words[sorted_cosine_sim[0][0]])
+     st.write("")
+     st.write("Demo developed by Dr. Karthik Mohan")
+
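Because Data/embeddings.pkl is committed only as a Git LFS pointer, test_search_bar.py has no usable vectors until the real pickle is fetched. A hedged sketch, not part of the commit, that writes a placeholder file so the script can at least start locally; the dict-of-50-d-vectors layout is inferred from the lookups above, and the random vectors are stand-ins, not real GloVe embeddings:

```python
# Not part of the commit: fabricate a stand-in Data/embeddings.pkl so
# test_search_bar.py runs without the LFS-tracked file. The real pickle is
# assumed to map lowercase words to 50-d vectors, which is what
# averaged_glove_embeddings() and glove_embeddings[gold_words[i]] expect.
import os
import pickle
import numpy as np

words = ["flower", "mountain", "tree", "car", "building", "jasmine", "mount", "adams"]
rng = np.random.default_rng(0)
dummy = {w: rng.normal(size=50) for w in words}  # random stand-ins, not real GloVe vectors

os.makedirs("Data", exist_ok=True)
with open("Data/embeddings.pkl", "wb") as f:
    pickle.dump(dummy, f)
```

With that file in place, `streamlit run test_search_bar.py` launches the app, though the rankings are meaningless until the pointer is resolved to the real GloVe pickle.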