Commit: 7f029b5
Parent(s): ea0dd40
Now accepts .zip files as input. Moved the semantic search options bar. Minor API mode changes.
Files changed:
- .dockerignore +1 -1
- .gitignore +1 -1
- app.py +9 -6
- search_funcs/bm25_functions.py +3 -4
- search_funcs/helper_functions.py +50 -7
- search_funcs/semantic_functions.py +36 -61
- search_funcs/spacy_search_funcs.py +4 -6
.dockerignore
CHANGED
@@ -1,7 +1,6 @@
 *.csv
 *.pyc
 *.cpython-311.pyc
-*.cpython-310.pyc
 *.bat
 *.json
 *.xlsx
@@ -16,6 +15,7 @@
 *.pkl
 *.pkl.gz
 *.pem
+*.zip
 docs/*
 build/*
 dist/*
.gitignore
CHANGED
@@ -1,7 +1,6 @@
 *.csv
 *.pyc
 *.cpython-311.pyc
-*.cpython-310.pyc
 *.bat
 *.json
 *.xlsx
@@ -18,6 +17,7 @@
 *.pem
 *.json.out
 *.env
+*.zip
 docs/*
 build/*
 dist/*
app.py
CHANGED
@@ -78,7 +78,7 @@ depends on factors such as the type of documents or queries. Information taken f
 current_source = gr.Textbox(label="Current data source(s)", value="None")

 with gr.Accordion(label = "Load in data", open=True):
-    in_bm25_file = gr.File(label="Upload data for keyword search", file_count= 'multiple', file_types =['.parquet', '.csv', '.pkl', '.pkl.gz'])
+    in_bm25_file = gr.File(label="Upload data for keyword search", file_count= 'multiple', file_types =['.parquet', '.csv', '.pkl', '.pkl.gz', '.zip'])
     with gr.Row():
         in_bm25_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
         load_bm25_data_button = gr.Button(value="Load data")
@@ -107,7 +107,7 @@ depends on factors such as the type of documents or queries. Information taken f
 current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")

 with gr.Accordion("Load in data", open = True):
-    in_semantic_file = gr.File(label="Upload data file for semantic search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.npy', '.npz', '.pkl', '.pkl.gz'])
+    in_semantic_file = gr.File(label="Upload data file for semantic search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.npy', '.npz', '.pkl', '.pkl.gz', '.zip'])

     with gr.Row():
         in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
@@ -115,6 +115,9 @@ depends on factors such as the type of documents or queries. Information taken f

 semantic_load_progress = gr.Textbox(label="Load progress")

+with gr.Accordion(label="Semantic search options", open = False):
+    semantic_min_distance = gr.Slider(label = "Minimum distance score for search result to be included", value = 0.2, minimum=0, maximum=0.95, step=0.01)
+
 semantic_query = gr.Textbox(label="Enter semantic search query here")
 semantic_submit = gr.Button(value="Start semantic search", variant="primary")

@@ -146,8 +149,7 @@ depends on factors such as the type of documents or queries. Information taken f
 in_search_param_button = gr.Button(value="Load search parameters (Need to click this if you changed anything above)")
 with gr.Accordion(label="Fuzzy search options", open = False):
     no_spelling_mistakes = gr.Slider(label = "Number of spelling mistakes allowed in fuzzy search", value = 1, minimum=1, maximum=4, step=1)
-
-semantic_min_distance = gr.Slider(label = "Minimum distance score for search result to be included", value = 0.6, minimum=0, maximum=0.95, step=0.01)
+
 with gr.Accordion(label = "Join on additional dataframes to results", open = False):
     in_join_file = gr.File(label="Upload your data to join here")
     in_join_message = gr.Textbox(label="Join file load progress")
@@ -180,7 +182,7 @@ depends on factors such as the type of documents or queries. Information taken f

 ### BM25 SEARCH ###
 # Update dropdowns upon initial file load
-in_bm25_file.
+in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, prepared_keyword_data_state, orig_keyword_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, load_finished_message, current_source, in_bm25_file], api_name="keyword_data_load")
 in_join_file.change(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])

 # Load in BM25 data
@@ -197,7 +199,8 @@ depends on factors such as the type of documents or queries. Information taken f
 ### SEMANTIC SEARCH ###

 # Load in a csv/excel file for semantic search
-in_semantic_file.
+in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, semantic_load_progress, current_source_semantic, in_semantic_file], api_name="semantic_data_load")
+
 load_semantic_data_button.click(
     csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[semantic_input_document_format, semantic_load_progress, output_file_state], api_name="convert_texts_to_documents").\
     then(docs_to_bge_embed_np_array, inputs=[semantic_input_document_format, in_semantic_file, output_file_state, in_clean_data, embeddings_state, embeddings_model_name_state, embeddings_model_loc_state, return_intermediate_files, embeddings_compress], outputs=[semantic_load_progress, embeddings_state, semantic_output_file, output_file_state, embeddings_model_state], api_name="embed_documents")
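The new wiring above uses Gradio's .upload() event, which fires as soon as files are selected, and gives each step an api_name so it can be called as a named endpoint in API mode. A minimal sketch of the pattern follows; the component layout and loader body here are hypothetical stand-ins, not the app's full argument lists:

import gradio as gr

def initial_data_load(files):
    # Hypothetical stand-in for the app's loader: just report what arrived.
    # Depending on Gradio version, items are path strings or objects with .name
    names = [getattr(f, "name", str(f)) for f in files] if files else []
    return "Loaded: " + (", ".join(names) or "nothing")

with gr.Blocks() as demo:
    in_file = gr.File(label="Upload data", file_count='multiple',
                      file_types=['.parquet', '.csv', '.zip'])
    status = gr.Textbox(label="Load progress")
    # .upload runs the function on file selection; api_name mirrors the
    # endpoint name registered in the real app
    in_file.upload(initial_data_load, inputs=[in_file], outputs=[status],
                   api_name="keyword_data_load")

# demo.launch()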
search_funcs/bm25_functions.py
CHANGED
@@ -685,10 +685,9 @@ def bm25_search(

     output_files.append(results_df_name)

-    csv_output_file = output_folder + "keyword_search_result_" + today_rev + "_" + query_str_file + ".csv"
-    results_df_out.to_csv(csv_output_file, index=None)
-
-    output_files.append(csv_output_file)
+    #csv_output_file = output_folder + "keyword_search_result_" + today_rev + "_" + query_str_file + ".csv"
+    #results_df_out.to_csv(csv_output_file, index=None)
+    #output_files.append(csv_output_file)

     print("Returning results")

search_funcs/helper_functions.py
CHANGED
@@ -6,6 +6,7 @@ import os
 import shutil
 import getpass
 import gzip
+import zipfile
 import pickle
 import numpy as np

@@ -177,7 +178,40 @@ def read_file(filename):

     return file

-def initial_data_load(in_file:List[str]):
+def process_zip_files(file_list, progress=gr.Progress(track_tqdm=True)):
+    """
+    Processes a list of file names, unzipping any ZIP files found
+    and adding the extracted file names to the list.
+
+    Args:
+        file_list: A list of file names (strings).
+    """
+    progress(0.1, desc="Unzipping zip files")
+
+    i = 0
+    while i < len(file_list): # Use 'while' for dynamic list changes
+        file_path = file_list[i]
+
+        if file_path.endswith(".zip"):
+            try:
+                zip_dir = os.path.dirname(file_path) or "." # Get zip file's directory or use current if none
+                with zipfile.ZipFile(file_path, 'r') as zip_ref:
+                    zip_ref.extractall(zip_dir) # Extract to zip's directory
+                    #print("List of files in zip:", zip_ref.namelist())
+                    extracted_files = [os.path.join(zip_dir, name) for name in zip_ref.namelist()]
+                    file_list.extend(extracted_files)
+
+            except zipfile.BadZipFile:
+                print(f"Warning: '{file_path}' is not a valid zip file.")
+
+        i += 1
+
+    file_list = [file for file in file_list if not file.endswith(".zip")]
+    print("file_list after files in zip extracted:", file_list)
+
+    return file_list
+
+def initial_data_load(in_file:List[str], progress = gr.Progress(track_tqdm=True)):
     '''
     When file is loaded, update the column dropdown choices and relevant state variables
     '''
@@ -192,10 +226,15 @@ def initial_data_load(in_file:List[str]):

     file_list = [string.name for string in in_file]

-    #
+    # If a zip file is loaded, unzip it and add the file names to the file_list
+    file_list = process_zip_files(file_list)
+
+    #print("File_list that makes it to main data load function:", file_list)
+
+    progress(0.3, desc="Loading in data files")

     data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "search_index" not in string.lower()]
-    print(data_file_names)
+    print("Data file names:", data_file_names)

     if not data_file_names:
         out_message = "Please load in at least one csv/Excel/parquet data file."
@@ -204,9 +243,10 @@ def initial_data_load(in_file:List[str]):

     # This if you have loaded in a documents object for the semantic search
     if "pkl" in data_file_names[0]:
+        print("Document object for semantic search:", data_file_names[0])
         df = read_file(data_file_names[0])
         new_choices = list(df[0].metadata.keys()) #["Documents"] #["page_contents"] +
-        current_source = get_file_path_end_with_ext(data_file_names[0])
+        current_source = get_file_path_end_with_ext(data_file_names[0])

     # This if you have loaded in a csv/parquets/xlsx
     else:
@@ -231,11 +271,14 @@ def initial_data_load(in_file:List[str]):

     concat_choices.extend(new_choices)

+    progress(0.6, desc="Loading in embedding/search index files")
+
     # Check if there is a search index file already
-    index_file_names = [string for string in file_list if "gz" in string.lower()]
+    index_file_names = [string for string in file_list if ".gz" in string.lower()]

     if index_file_names:
         index_file_name = index_file_names[0]
+        print("Search index file name found:", index_file_name)
         index_load = read_file(index_file_name)

     embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
@@ -254,10 +297,10 @@ def initial_data_load(in_file:List[str]):
     if tokenised_file_names:
         tokenised_load = read_file(tokenised_file_names[0])

-    out_message = "Initial data
+    out_message = "Initial data load successful. Next, choose a data column to search in the drop down above, then click 'Load data'"
     print(out_message)

-    return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, df, index_load, embed_load, tokenised_load, out_message, current_source
+    return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, df, index_load, embed_load, tokenised_load, out_message, current_source, file_list

 def put_columns_in_join_df(in_file:str):
     '''
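The heart of the helper_functions.py change is process_zip_files, which walks the uploaded file list, extracts each ZIP next to where it sits, appends the extracted members to the list, and finally drops the .zip entries themselves. A standard-library re-creation of that behaviour (no Gradio progress hook; the function name here is illustrative):

import os
import zipfile

def expand_zips(file_list):
    # Index-based loop because the list grows while we iterate over it
    i = 0
    while i < len(file_list):
        path = file_list[i]
        if path.endswith(".zip"):
            try:
                out_dir = os.path.dirname(path) or "."  # extract next to the archive
                with zipfile.ZipFile(path) as zf:
                    zf.extractall(out_dir)
                    file_list.extend(os.path.join(out_dir, n) for n in zf.namelist())
            except zipfile.BadZipFile:
                print(f"Warning: '{path}' is not a valid zip file.")
        i += 1
    # The archives themselves are not data files, so filter them out
    return [f for f in file_list if not f.endswith(".zip")]

So expand_zips(["inputs/archive.zip", "notes.parquet"]) would return notes.parquet plus the archive's extracted members, with the .zip entry removed.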
search_funcs/semantic_functions.py
CHANGED
@@ -1,4 +1,3 @@
-import os
 import time
 import pandas as pd
 from typing import Type
@@ -116,20 +115,15 @@ def docs_to_bge_embed_np_array(

     if "bge" in embeddings_model_name:
         print("Embedding with BGE model")
-        if embeddings_compress == "No":
-            print("Embedding with full fp32 precision")
-            embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, normalize_embeddings=True)
-        else:
-            print("Embedding with int8 precision")
-            embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, normalize_embeddings=True, precision="int8")
     else:
         print("Embedding with MiniLM-L6-v2 model")
-        ...
+
+    if embeddings_compress == "No":
+        print("Embedding with full fp32 precision")
+        embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size)
+    else:
+        print("Embedding with int8 precision")
+        embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, precision="int8")

     toc = time.perf_counter()
     time_out = f"The embedding took {toc - tic:0.1f} seconds"
@@ -288,60 +282,43 @@ def bge_semantic_search(

     # Encode the query using the sentence transformer and convert to a PyTorch tensor
     if "bge" in embeddings_model_name:
-        ...
-        #query = query_fp32
-        query = quantize_embeddings(
-            query_fp32,
-            precision="int8",
-            calibration_embeddings=embeddings)
-
-        else:
-            query = embeddings_model.encode(query_str, normalize_embeddings=True)
-
-    # Get cosine similarities
-    cosine_similarities = query @ embeddings.T
-        ...
+        print("Comparing similarity using BGE model")
     else:
-        ...
-        query_fp32 = embeddings_model.encode(query_str, normalize_embeddings=True)
-
-        #query = query_fp32
-        query = quantize_embeddings(
-            query_fp32,
-            precision="int8",
-            calibration_embeddings=embeddings)
-        else:
-            query = embeddings_model.encode(query_str, normalize_embeddings=True)
-        ...
-        cosine_similarities = (expanded_query_fp32 @ normalized_embeddings.T)
-
-        # Flatten the tensor to a 1D array
-        cosine_similarities = cosine_similarities.flatten()
+        print("Comparing similarity using MiniLM-L6-v2 model")
+
+    if embeddings_compress == "Yes":
+        query_fp32 = embeddings_model.encode(query_str)
+
+        # Using a query as int8 doesn't actually seem to work
+        # query_int8 = quantize_embeddings(
+        #     query_fp32, precision="int8", calibration_embeddings=embeddings
+        # )
+    else:
+        query_fp32 = embeddings_model.encode(query_str)
+
+    #print("query:", query_fp32)
+    #print("embeddings:", embeddings)
+
+    # Normalise embeddings
+    query = query_fp32.astype('float32')
+
+    query_norm = np.linalg.norm(query)
+    normalized_query = query / query_norm
+
+    embeddings = embeddings.astype('float32')
+
+    embeddings_norm = np.linalg.norm(embeddings, axis=1, keepdims=True) # Keep dims to allow broadcasting
+    normalized_embeddings = embeddings / embeddings_norm
+
+    #print("normalized_query:", normalized_query)
+    #print("normalized_embeddings:", normalized_embeddings)
+
+    cosine_similarities = (normalized_query @ normalized_embeddings.T)
+
+    #print("Initial cosine similarities:", cosine_similarities)

     # Create a Pandas Series
     cosine_similarities_series = pd.Series(cosine_similarities)
@@ -379,14 +356,12 @@ def bge_semantic_search(

     #results_df_out.to_excel(results_df_name, index= None)
     results_first_text = results_df_out.iloc[0, 1]
-
     output_files.append(results_df_name)

-    csv_output_file = output_folder + "semantic_search_result_" + today_rev + "_" + query_str_file + ".csv"
-    results_df_out.to_csv(csv_output_file, index=None)
-
-    output_files.append(csv_output_file)
+    #csv_output_file = output_folder + "semantic_search_result_" + today_rev + "_" + query_str_file + ".csv"
+    #results_df_out.to_csv(csv_output_file, index=None)
+    #output_files.append(csv_output_file)

     print("Returning results")

-    return results_first_text,
+    return results_first_text, output_files
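The rewritten similarity step above no longer quantizes the query (the diff notes that an int8 query "doesn't actually seem to work"); instead it encodes the query at full precision, L2-normalises both query and corpus embeddings, and takes a dot product, which for unit vectors is exactly cosine similarity. A toy numpy check of that arithmetic, with invented vectors:

import numpy as np

query = np.array([1.0, 2.0, 2.0], dtype="float32")         # invented query embedding
embeddings = np.array([[1.0, 2.0, 2.0],                    # row 0: same direction as query
                       [2.0, 0.0, 0.0]], dtype="float32")  # row 1: different direction

normalized_query = query / np.linalg.norm(query)
normalized_embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

cosine_similarities = normalized_query @ normalized_embeddings.T
print(cosine_similarities)  # [1.0, 0.333...]: the aligned row scores a perfect 1.0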
search_funcs/spacy_search_funcs.py
CHANGED
@@ -1,7 +1,3 @@
-import spacy
-spacy.prefer_gpu()
-from spacy.cli.download import download
-from spacy.matcher import Matcher
 import numpy as np
 import gradio as gr
 import pandas as pd
@@ -13,11 +9,13 @@ PandasDataFrame = Type[pd.DataFrame]

 today_rev = datetime.now().strftime("%Y%m%d")

-
-
 def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
     ''' Conduct fuzzy match on a list of data.'''

+    import spacy
+    spacy.prefer_gpu()
+    from spacy.matcher import Matcher
+
     # Load spaCy model
     nlp = load_spacy_model()

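This change moves the spaCy imports from module level into spacy_fuzzy_search itself, so the heavy dependency is only paid for when a fuzzy search actually runs rather than at app start-up; repeat calls stay cheap because Python caches modules in sys.modules. A generic sketch of the same lazy-import pattern, with difflib standing in for spaCy:

import time

def fuzzy_search_stub(query, choices):
    t0 = time.perf_counter()
    import difflib  # imported on first call only; cached afterwards
    print(f"in-function import took {time.perf_counter() - t0:.4f}s")
    return difflib.get_close_matches(query, choices)

print(fuzzy_search_stub("serach", ["search", "sea", "reach"]))  # approximate matches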