HarryLee committed
Commit c306d7d · 1 Parent(s): 0d2a873

Update app.py

Files changed (1): app.py (+65, -29)
app.py CHANGED
@@ -9,7 +9,6 @@ import gzip
 import os
 import torch
 import pickle
-import yake
 
 ############
 ## Main page
@@ -36,7 +35,7 @@ user_query = st.text_input("Enter a query for the generated text: e.g., gift, ho
 # Add selectbox in streamlit
 option1 = st.sidebar.selectbox(
     'Which transformers model would you like to be selected?',
-    ('multi-qa-MiniLM-L6-cos-v1','louis030195/multi-qa-MiniLM-L6-cos-v1-de-ecommerce','null'))
+    ('multi-qa-MiniLM-L6-cos-v1','null','null'))
 
 option2 = st.sidebar.selectbox(
     'Which cross-encoder model would you like to be selected?',
@@ -65,20 +64,52 @@ with open(embedding_cache_path, "rb") as fIn:
     passages = cache_data['sentences']
     corpus_embeddings = cache_data['embeddings']
 
-kw_extractor = yake.KeywordExtractor()
-language = "en"
-max_ngram_size = 3
-deduplication_threshold = 0.9
-numOfKeywords = 20
-custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)
+from rank_bm25 import BM25Okapi
+from sklearn.feature_extraction import _stop_words
+import string
+from tqdm.autonotebook import tqdm
+import numpy as np
+
+
+# We lower case our text and remove stop-words from indexing
+def bm25_tokenizer(text):
+    tokenized_doc = []
+    for token in text.lower().split():
+        token = token.strip(string.punctuation)
+        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
+            tokenized_doc.append(token)
+    return tokenized_doc
 
 # This function will search all wikipedia articles for passages that
 # answer the query
 def search(query):
-    st.write("Input question:", query)
+    print("Input query:", query)
+    total_qe = []
+
+    ##### BM25 search (lexical search) #####
+    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
+    top_n = np.argpartition(bm25_scores, -5)[-5:]
+    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
+    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
+
+    #print("Top-10 lexical search (BM25) hits")
+    qe_string = []
+    for hit in bm25_hits[0:1000]:
+        if passages[hit['corpus_id']].replace("\n", " ") not in qe_string:
+            qe_string.append(passages[hit['corpus_id']].replace("\n", ""))
+
+    sub_string = []
+    for item in qe_string:
+        for sub_item in item.split(","):
+            sub_string.append(sub_item)
+    #print(sub_string)
+    total_qe.append(sub_string)
+
     ##### Semantic Search #####
     # Encode the query using the bi-encoder and find potentially relevant passages
     query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
+    query_embedding = query_embedding.cuda()
     hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)
     hits = hits[0] # Get the hits for the first query
 
@@ -91,28 +122,33 @@ def search(query):
     for idx in range(len(cross_scores)):
         hits[idx]['cross-score'] = cross_scores[idx]
 
-    # Output of top-N hits from bi-encoder
-    #st.write("\n-------------------------\n")
-    #st.subheader("Top-N Bi-Encoder Retrieval hits")
-    #hits = sorted(hits, key=lambda x: x['score'], reverse=True)
-    #for hit in hits[0:maxtags_sidebar]:
-    #    st.write("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))
-
-    # Output of top-N hits from re-ranker
-    st.write("\n-------------------------\n")
-    st.subheader("Top-N Cross-Encoder Re-ranker hits")
+    # Output of top-10 hits from bi-encoder
+    #print("\n-------------------------\n")
+    #print("Top-N Bi-Encoder Retrieval hits")
+    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
+    qe_string = []
+    for hit in hits[0:1000]:
+        if passages[hit['corpus_id']].replace("\n", " ") not in qe_string:
+            qe_string.append(passages[hit['corpus_id']].replace("\n", ""))
+    #print(qe_string)
+    total_qe.append(qe_string)
+
+    # Output of top-10 hits from re-ranker
+    #print("\n-------------------------\n")
+    #print("Top-N Cross-Encoder Re-ranker hits")
     hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
-    #for hit in hits[0:maxtags_sidebar]:
-    #    st.write("\t{:.3f}\t{}".format(hit['cross-score'], passages[hit['corpus_id']].replace("\n", " ")))
-    hit_res = []
+    qe_string = []
     for hit in hits[0:1000]:
-        q = passages[hit['corpus_id']].replace("\n", " ")
-        if q not in hit_res:
-            hit_res.append(q)
-    for res in hit_res[0:maxtags_sidebar]:
-        keywords = custom_kw_extractor.extract_keywords(res)
-        for kw in keywords:
-            st.write(kw)
+        if passages[hit['corpus_id']].replace("\n", " ") not in qe_string:
+            qe_string.append(passages[hit['corpus_id']].replace("\n", ""))
+    #print(qe_string)
+    total_qe.append(qe_string)
+
+    # Total Results
+    total_qe.append(qe_string)
+    print("E-Commerce Query Expansion Results: \n")
+    print(total_qe)
+
 
 st.write("## Results:")
 if st.button('Generated Expansion'):
 
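The new search() body calls bm25.get_scores(...), but the bm25 index itself is never constructed anywhere in this diff; it presumably happens elsewhere in app.py. A minimal sketch of the setup the code appears to assume, building the index once at startup from the cached passages with rank_bm25's BM25Okapi (the variable tokenized_corpus is illustrative, not from the commit):

    # Assumed setup (not shown in this commit): tokenize every cached passage
    # and build the BM25 index that search() queries via bm25.get_scores().
    tokenized_corpus = [bm25_tokenizer(passage) for passage in passages]
    bm25 = BM25Okapi(tokenized_corpus)

With the index in place, np.argpartition(bm25_scores, -5)[-5:] picks the indices of the five best-scoring passages without sorting the whole score array; only those five hits are then sorted by score.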
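The added query_embedding.cuda() call moves the query embedding to the GPU unconditionally, which fails on CPU-only hosts. A guarded variant (an assumption on my part, not part of this commit) keeps the app runnable either way:

    # Only move the query embedding to the GPU when CUDA is available;
    # corpus_embeddings must live on the same device for semantic_search.
    if torch.cuda.is_available():
        query_embedding = query_embedding.cuda()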
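The same deduplication loop now appears three times (BM25 hits, bi-encoder hits, cross-encoder hits), and the final # Total Results block appends the cross-encoder's qe_string a second time, so that list lands in total_qe twice. Note also that each loop tests membership with replace("\n", " ") but stores with replace("\n", ""), so a multi-line passage never matches its stored form and can be appended repeatedly. A possible consolidation into one helper (the name dedupe_passages is illustrative, not from the commit):

    def dedupe_passages(ranked_hits, passages, limit=1000):
        # Collect each hit's passage text once, preserving rank order.
        # Uses one normalization for both the membership test and the
        # stored value, unlike the inlined loops above.
        seen = []
        for hit in ranked_hits[0:limit]:
            text = passages[hit['corpus_id']].replace("\n", " ")
            if text not in seen:
                seen.append(text)
        return seen

Each of the three blocks would then reduce to a single call, e.g. total_qe.append(dedupe_passages(hits, passages)).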