yinlinfu HarryLee committed on
Commit
051dae2
·
0 Parent(s):

Duplicate from HarryLee/QueryExpansionForEtsy

Browse files

Co-authored-by: harryhe <[email protected]>

Files changed (7) hide show
  1. .gitattributes +35 -0
  2. README.md +13 -0
  3. app.py +304 -0
  4. etsy-embeddings-cpu.pkl +3 -0
  5. etsy-shop-LLC.png +0 -0
  6. requirements.txt +9 -0
  7. top.png +0 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ 000000000001.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: QueryExpansion
3
+ emoji: 👁
4
+ colorFrom: pink
5
+ colorTo: indigo
6
+ sdk: streamlit
7
+ sdk_version: 1.17.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: HarryLee/QueryExpansionForEtsy
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from streamlit_tags import st_tags, st_tags_sidebar
3
+ from keytotext import pipeline
4
+ from PIL import Image
5
+
6
+ import json
7
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
8
+ import gzip
9
+ import os
10
+ import torch
11
+ import pickle
12
+ import random
13
+ import numpy as np
14
+
15
+ ############
16
+ ## Main page
17
+ ############
18
+
19
# --- Page header -------------------------------------------------------------
st.write("# Demonstration for Etsy Query Expansion(Etsy-QE)")

st.markdown("***Idea is to build a model which will take query as inputs and generate expansion information as outputs.***")
image = Image.open('etsy-shop-LLC.png')
st.image(image)

# --- Sidebar controls and query input ----------------------------------------
st.sidebar.write("# Top-N Selection")
# Caps how many expansion phrases are displayed (range 1..20, default 1).
maxtags_sidebar = st.sidebar.slider('Number of query allowed?', 1, 20, 1, key='ehikwegrjifbwreuk')

user_query = st.text_input("Enter a query for the generated text: e.g., gift, home decoration ...")

# NOTE(review): the 'null' choices are forwarded unchanged to the model
# constructors below; they look like placeholders -- confirm they resolve to
# real models or remove them.
option1 = st.sidebar.selectbox(
    'Which transformers model would you like to be selected?',
    ('multi-qa-MiniLM-L6-cos-v1','null','null'))

option2 = st.sidebar.selectbox(
    'Which cross-encoder model would you like to be selected?',
    ('cross-encoder/ms-marco-MiniLM-L-6-v2','null','null'))

# --- Models ------------------------------------------------------------------
# The bi-encoder embeds the query for semantic search over the corpus.
bi_encoder = SentenceTransformer(option1, device='cpu')
bi_encoder.max_seq_length = 256  # Truncate long passages to 256 tokens
top_k = 32  # Number of passages retrieved with the bi-encoder

# The cross-encoder re-scores the top_k passages returned by the bi-encoder.
cross_encoder = CrossEncoder(option2, device='cpu')

# --- Pre-computed corpus embeddings ------------------------------------------
embedding_cache_path = 'etsy-embeddings-cpu.pkl'
print("Load pre-computed embeddings from disc")
# SECURITY NOTE: pickle.load can execute arbitrary code; this is only
# acceptable while the cache file is a trusted artifact shipped with the app.
with open(embedding_cache_path, "rb") as fIn:
    cache_data = pickle.load(fIn)
passages = cache_data['sentences']
corpus_embeddings = cache_data['embeddings']

# Report success only once the models and the embedding cache are in memory.
st.sidebar.success("Load Successfully!")
68
+
69
+ from rank_bm25 import BM25Okapi
70
+ from sklearn.feature_extraction import _stop_words
71
+ import string
72
+ from tqdm.autonotebook import tqdm
73
+ import numpy as np
74
+ import re
75
+
76
+ import yake
77
+
78
# YAKE keyword-extraction settings; every value below is forwarded by keyword
# to yake.KeywordExtractor.
language = "en"
max_ngram_size = 3
deduplication_threshold = 0.9
deduplication_algo = 'seqm'
windowSize = 3
numOfKeywords = 3

_yake_settings = dict(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_threshold,
    dedupFunc=deduplication_algo,
    windowsSize=windowSize,
    top=numOfKeywords,
    features=None,
)

# Shared extractor instance, built once at import time.
custom_kw_extractor = yake.KeywordExtractor(**_yake_settings)
86
+
87
def bm25_tokenizer(text):
    """Split *text* into lower-cased terms for BM25 indexing and querying.

    The text is lower-cased and split on whitespace, and each piece has its
    leading and trailing ASCII punctuation stripped.  Pieces that end up
    empty, or that appear in scikit-learn's English stop-word list, are
    dropped.

    Returns the surviving terms as a list, in their original order.
    """
    stripped = (piece.strip(string.punctuation) for piece in text.lower().split())
    return [
        term
        for term in stripped
        if term and term not in _stop_words.ENGLISH_STOP_WORDS
    ]
96
+
97
# Tokenize every passage once (tqdm shows progress) and build the BM25 index
# used for lexical retrieval.
tokenized_corpus = [bm25_tokenizer(passage) for passage in tqdm(passages)]
bm25 = BM25Okapi(tokenized_corpus)
102
+
103
def word_len(s):
    """Count the non-empty pieces of *s* when split on single spaces.

    Only the space character acts as a separator; tabs and newlines do not.
    """
    return sum(1 for piece in s.split(' ') if piece)
105
+
106
+
107
def search(query):
    """Expand *query* and render the top phrases in the Streamlit page.

    Candidate phrases come from ``search_nolog`` (BM25 lexical hits,
    bi-encoder hits and cross-encoder re-ranked hits over the pre-loaded
    passages).  They are lower-cased, de-duplicated while keeping their
    retrieval order, and any phrase equal to the query itself is dropped.
    The first ``maxtags_sidebar`` phrases are written to the page.

    Args:
        query: The user's search string.

    Returns:
        The full list of lower-cased, unique expansion phrases (not only the
        displayed prefix).
    """
    print("Input query:", query)

    # The retrieval/cleaning pipeline is identical to search_nolog, so reuse
    # it instead of keeping a second copy in sync.
    candidates = search_nolog(query)

    st.write("E-Commerce Query Expansion Results: \n")

    # Lower-case BEFORE de-duplicating and filtering so that case variants
    # collapse and the query cannot slip through in a different case.
    # dict.fromkeys keeps first-seen order, which makes the displayed top-N
    # deterministic (a set would give an arbitrary order).
    query_key = query.strip().lower()
    lowered = (phrase.lower() for phrase in candidates)
    Lowercasing_list = [phrase for phrase in dict.fromkeys(lowered) if phrase != query_key]

    st.write(Lowercasing_list[0:maxtags_sidebar])

    return Lowercasing_list
201
+
202
def _unique_passages(hits):
    """Return the passage text of each hit, newlines removed, without repeats."""
    texts = []
    for hit in hits:
        # Normalize once so the membership test and the stored value agree;
        # checking one form and storing another never detects duplicates for
        # passages that contain newlines.
        text = passages[hit['corpus_id']].replace("\n", "")
        if text not in texts:
            texts.append(text)
    return texts


def _to_phrases(candidates):
    """Clean raw candidate strings and keep only multi-word phrases."""
    phrases = []
    for raw in candidates:
        # Replace every character that is not a digit or in U+0041..U+007A
        # with a space, then turn each pair of spaces into a newline.
        rs = re.sub("([^\u0030-\u0039\u0041-\u007a])", ' ', raw)
        cleaned = re.sub("\x20\x20", "\n", rs).strip()
        if len(cleaned) > 20:
            # Long strings are replaced by their extracted keywords.
            phrases.extend(kw[0] for kw in custom_kw_extractor.extract_keywords(cleaned))
        else:
            phrases.append(cleaned)
    return [phrase for phrase in phrases if word_len(phrase) > 1]


def search_nolog(query, bm25_top_n=5):
    """Collect query-expansion candidates for *query* without writing to the UI.

    Three candidate sources are combined, in this order:

    1. BM25 lexical search: the ``bm25_top_n`` best-scoring passages, each
       split on commas.
    2. Bi-encoder semantic search: up to ``top_k`` passages ordered by
       bi-encoder score.
    3. The same passages ordered by cross-encoder score.

    Every candidate is then cleaned, strings longer than 20 characters are
    replaced by their YAKE keywords, and only phrases with more than one
    space-separated word are kept.

    Args:
        query: The user's search string.
        bm25_top_n: How many BM25 hits to use (default 5).  Clamped to the
            corpus size so a small corpus does not raise.

    Returns:
        A list of candidate phrases in retrieval order.  It may contain
        repeats across the three sources; callers de-duplicate.
    """
    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    n_lexical = min(bm25_top_n, len(bm25_scores))
    if n_lexical > 0:
        top_n = np.argpartition(bm25_scores, -n_lexical)[-n_lexical:]
    else:
        top_n = []
    bm25_hits = sorted(
        ({'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n),
        key=lambda hit: hit['score'],
        reverse=True,
    )
    lexical_terms = [
        part
        for text in _unique_passages(bm25_hits)
        for part in text.split(",")
    ]

    ##### Semantic search #####
    # Encode the query with the bi-encoder and fetch the closest passages.
    query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first (only) query

    ##### Re-ranking #####
    # Score every retrieved passage against the query with the cross-encoder.
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp) if cross_inp else []
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    by_bi_encoder = sorted(hits, key=lambda hit: hit['score'], reverse=True)
    by_cross_encoder = sorted(hits, key=lambda hit: hit['cross-score'], reverse=True)

    # The re-ranked list is included once; the original added it twice, which
    # only produced repeats that callers removed again.
    candidates = (
        lexical_terms
        + _unique_passages(by_bi_encoder)
        + _unique_passages(by_cross_encoder)
    )
    return _to_phrases(candidates)
278
+
279
def reranking():
    """Show the expansion phrases for ``user_query`` and a reordered copy.

    Candidates come from ``search_nolog``; they are lower-cased,
    de-duplicated in retrieval order, and phrases equal to the query are
    removed.  The first ``maxtags_sidebar`` phrases are displayed, followed
    by the same phrases in shuffled order.

    Returns:
        The shuffled list that was displayed under "Reranking Results".
    """
    candidates = search_nolog(query=user_query)

    # Lower-case first so case variants collapse and the query is filtered
    # regardless of case; dict.fromkeys keeps first-seen order so the
    # displayed prefix is deterministic (a set gives an arbitrary order).
    query_key = user_query.strip().lower()
    lowered = (phrase.lower() for phrase in candidates)
    Lowercasing_list = [phrase for phrase in dict.fromkeys(lowered) if phrase != query_key]

    st.write("E-Commerce Query Expansion Results: \n")
    st.write(Lowercasing_list[0:maxtags_sidebar])

    reres = list(Lowercasing_list[0:maxtags_sidebar])
    # NOTE(review): this is a fixed-seed shuffle, not a score-based re-rank --
    # presumably a placeholder; confirm the intended ranking signal.
    # A private RandomState yields the same permutation as seeding the global
    # generator with 7, without resetting NumPy's global random state.
    np.random.RandomState(7).shuffle(reres)
    st.write("Reranking Results: \n")
    st.write(reres)

    return reres
296
+
297
st.write("## Results:")

# Each button runs its pipeline only when a non-blank query was entered; a
# blank query would otherwise be scored against the corpus and produce
# meaningless expansions.
if st.button('Generated Expansion'):
    if user_query.strip():
        out_res = search(query=user_query)
    else:
        st.warning("Please enter a query first.")

if st.button('Rerank'):
    if user_query.strip():
        out_res = reranking()
    else:
        st.warning("Please enter a query first.")
etsy-embeddings-cpu.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a8eb36f4ec40a7d1cb382376afc38cac7caed6104bbaf5a8b28f8a98ba18cb5
3
+ size 456491627
etsy-shop-LLC.png ADDED
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ streamlit==0.82.0
2
+ streamlit_tags
3
+ pyarrow
4
+ keytotext
5
+ opencv-python-headless
6
+ sentence-transformers
7
+ rank_bm25
8
+ yake
9
+ altair==4.0
top.png ADDED