Commit: dbad462
Parent(s): 20b4aa0
Included exact/fuzzy phrase matching. Updated packages and added basic logging.
Changed files:
- Dockerfile (+5 -4)
- app.py (+72 -12)
- requirements.txt (+9 -8)
- requirements_aws.txt (+8 -8)
- requirements_gpu.txt (+9 -8)
- requirements_keyword_only.txt (+9 -8)
- search_funcs/aws_functions.py (+44 -1)
- search_funcs/bm25_functions.py (+1 -1)
- search_funcs/custom_csvlogger.py (+171 -0)
- search_funcs/helper_functions.py (+96 -59)
- search_funcs/semantic_functions.py (+5 -4)
- search_funcs/spacy_search_funcs.py (+72 -27)
Dockerfile
CHANGED
@@ -2,7 +2,7 @@
 FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm AS builder
 
 # Optional - install Lambda web adapter in case you want to run with with an AWS Lamba function URL
-# COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.
+# COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.4 /lambda-adapter /opt/extensions/lambda-adapter
 
 # Update apt
 RUN apt-get update && rm -rf /var/lib/apt/lists/*
@@ -14,10 +14,10 @@ WORKDIR /src
 
 COPY requirements_aws.txt .
 
-RUN pip install torch==2.
-    && pip install --no-cache-dir --target=/install sentence-transformers==3.
+RUN pip install torch==2.5.1+cpu --target=/install --index-url https://download.pytorch.org/whl/cpu \
+    && pip install --no-cache-dir --target=/install sentence-transformers==3.3.1 --no-deps \
     && pip install --no-cache-dir --target=/install -r requirements_aws.txt \
-    && pip install --no-cache-dir --target=/install gradio==
+    && pip install --no-cache-dir --target=/install gradio==5.6.0
 
 # Add /install to the PYTHONPATH
 ENV PYTHONPATH="/install:${PYTHONPATH}"
@@ -57,6 +57,7 @@ ENV HOME=/home/user \
     GRADIO_NUM_PORTS=1 \
     GRADIO_SERVER_NAME=0.0.0.0 \
     GRADIO_SERVER_PORT=7860 \
+    GRADIO_ANALYTICS_ENABLED=False \
     GRADIO_THEME=huggingface \
     AWS_STS_REGIONAL_ENDPOINT=regional \
     SYSTEM=spaces
app.py
CHANGED
@@ -3,20 +3,35 @@ import gradio as gr
 import pandas as pd
 import numpy as np
 import os
+import socket
+from datetime import datetime
+
 PandasDataFrame = Type[pd.DataFrame]
 
 from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
 from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
-from search_funcs.semantic_functions import load_embedding_model, docs_to_embed_np_array,
+from search_funcs.semantic_functions import load_embedding_model, docs_to_embed_np_array, semantic_search
-from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_connection_params, output_folder, get_or_create_env_var
+from search_funcs.helper_functions import display_info, get_input_file_names, initial_data_load, put_columns_in_join_df, get_connection_params, output_folder, get_or_create_env_var
 from search_funcs.spacy_search_funcs import spacy_fuzzy_search
-from search_funcs.aws_functions import load_data_from_aws
+from search_funcs.aws_functions import load_data_from_aws, upload_file_to_s3
 from search_funcs.auth import authenticate_user
+from search_funcs.custom_csvlogger import CSVLogger_custom
+
+today_rev = datetime.now().strftime("%Y%m%d")
 
 # Attempt to delete temporary files generated by previous use of the app (as the files can be very big!). Only setup to work for local runs in Windows (not used at the moment).
 # temp_folder_path = get_temp_folder_path()
 # empty_folder(temp_folder_path)
 
+host_name = socket.gethostname()
+
+# Logging state
+log_file_name = 'log.csv'
+
+feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
+access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
+usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
+
 ## Gradio app - BM25 search
 app = gr.Blocks(theme = gr.themes.Base()) # , css="theme.css"
 
@@ -47,6 +62,20 @@ with app:
     join_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(), visible=False) #gr.State(pd.DataFrame())
     output_file_state = gr.State([]) #gr.Dataframe(type="array", visible=False) #gr.State([])
 
+    feedback_logs_state = gr.State(feedback_logs_folder + log_file_name)
+    feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
+    access_logs_state = gr.State(access_logs_folder + log_file_name)
+    access_s3_logs_loc_state = gr.State(access_logs_folder)
+    usage_logs_state = gr.State(usage_logs_folder + log_file_name)
+    usage_s3_logs_loc_state = gr.State(usage_logs_folder)
+
+    data_file_name_textbox = gr.Textbox(label = "data_file_name_textbox", value="", visible=False)
+    doc_full_file_name_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
+    data_file_name_no_extension_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
+    data_file_name_with_extension_textbox = gr.Textbox(label = "data_file_name_with_extension_textbox", value="", visible=False)
+    s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
+    session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
+
     # Informational state objects
     in_k1_info = gr.State("""k1: Constant used for influencing the term frequency saturation. After saturation is reached, additional
 presence for the term adds a significantly less additional score. According to [1]_, experiments suggest
@@ -89,7 +118,7 @@ depends on factors such as the type of documents or queries. Information taken f
             keyword_query = gr.Textbox(label="Enter your search term")
             with gr.Row():
                 keyword_search_button = gr.Button(value="Keyword search", variant="primary", scale=1)
-                fuzzy_search_button = gr.Button(value="Fuzzy search (slow
+                fuzzy_search_button = gr.Button(value="Fuzzy search (slow)", variant="secondary", scale = 0)
             with gr.Row():
                 output_single_text = gr.Textbox(label="Top result")
                 output_file = gr.File(label="File output")
@@ -131,7 +160,7 @@ depends on factors such as the type of documents or queries. Information taken f
             return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
             embeddings_compress = gr.Dropdown(label = "Round embeddings to int8 precision for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
             #save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
-            with gr.Accordion(label="
+            with gr.Accordion(label="BM25 search options", open = False):
                 with gr.Row():
                     in_k1 = gr.Slider(label = "k1 value", value = 1.5, minimum = 0.1, maximum = 5, step = 0.1, scale = 3)
                     in_k1_button = gr.Button(value = "k1 value info", scale = 1)
@@ -147,7 +176,8 @@ depends on factors such as the type of documents or queries. Information taken f
                 with gr.Row():
                     in_search_param_button = gr.Button(value="Load search parameters (Need to click this if you changed anything above)")
             with gr.Accordion(label="Fuzzy search options", open = False):
+                search_whole_phrase_bool = gr.Checkbox(label= "Search for the whole phrase (rather than individual words within also)", value=True)
+                spelling_mistakes_max_num = gr.Slider(label = "Maximum number of spelling mistakes allowed in fuzzy search", value = 1, minimum=1, maximum=4, step=1)
 
             with gr.Accordion(label = "Join on additional dataframes to results", open = False):
                 in_join_file = gr.File(label="Upload your data to join here")
@@ -181,19 +211,20 @@ depends on factors such as the type of documents or queries. Information taken f
 
     ### BM25 SEARCH ###
     # Update dropdowns upon initial file load
-    in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, prepared_keyword_data_state, orig_keyword_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, load_finished_message, current_source, in_bm25_file], api_name="keyword_data_load")
+    in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, prepared_keyword_data_state, orig_keyword_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, load_finished_message, current_source, in_bm25_file], api_name="keyword_data_load").then(fn=get_input_file_names, inputs=[in_bm25_file], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, doc_full_file_name_textbox])
+
     in_join_file.change(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
 
     # Load in BM25 data
     load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, prepared_keyword_data_state, tokenised_prepared_keyword_data_state, in_clean_data, return_intermediate_files], outputs=[tokenised_prepared_keyword_data_state, load_finished_message, prepared_keyword_data_state, output_file, output_file, in_bm25_column], api_name="load_keyword").\
-        then(fn=prepare_bm25, inputs=[tokenised_prepared_keyword_data_state, in_bm25_file, in_bm25_column, bm25_search_index_state, in_clean_data, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file, bm25_search_index_state, tokenised_prepared_keyword_data_state], api_name="prepare_keyword")
+        then(fn=prepare_bm25, inputs=[tokenised_prepared_keyword_data_state, in_bm25_file, in_bm25_column, bm25_search_index_state, in_clean_data, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file, bm25_search_index_state, tokenised_prepared_keyword_data_state], api_name="prepare_keyword")
 
     # BM25 search functions on click or enter
     keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, prepared_keyword_data_state, in_bm25_column, join_data_state, in_clean_data, bm25_search_index_state, tokenised_prepared_keyword_data_state, in_join_column, search_df_join_column, in_k1, in_b, in_alpha], outputs=[output_single_text, output_file], api_name="keyword_search")
     keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, prepared_keyword_data_state, in_bm25_column, join_data_state, in_clean_data, bm25_search_index_state, tokenised_prepared_keyword_data_state, in_join_column, search_df_join_column, in_k1, in_b, in_alpha], outputs=[output_single_text, output_file])
 
     # Fuzzy search functions on click
-    fuzzy_search_button.click(fn=spacy_fuzzy_search, inputs=[keyword_query, tokenised_prepared_keyword_data_state, prepared_keyword_data_state, in_bm25_column, join_data_state, search_df_join_column, in_join_column,
+    fuzzy_search_button.click(fn=spacy_fuzzy_search, inputs=[keyword_query, tokenised_prepared_keyword_data_state, prepared_keyword_data_state, in_bm25_column, join_data_state, search_df_join_column, in_join_column, spelling_mistakes_max_num, search_whole_phrase_bool], outputs=[output_single_text, output_file], api_name="fuzzy_search")
 
     ### SEMANTIC SEARCH ###
 
@@ -205,10 +236,39 @@ depends on factors such as the type of documents or queries. Information taken f
         then(docs_to_embed_np_array, inputs=[semantic_input_document_format, in_semantic_file, output_file_state, in_clean_data, embeddings_state, embeddings_model_name_state, embeddings_model_loc_state, return_intermediate_files, embeddings_compress], outputs=[semantic_load_progress, embeddings_state, semantic_output_file, output_file_state, embeddings_model_state], api_name="embed_documents")
 
     # Semantic search query
-    semantic_submit.click(
-    semantic_query.submit(
+    semantic_submit.click(semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, embeddings_model_state, embeddings_model_name_state, embeddings_compress, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic_search")
+    semantic_query.submit(semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, embeddings_model_state, embeddings_model_name_state, embeddings_compress, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
+
+    ###
+    # APP LOAD AND LOGGING FUNCTIONS
+    ###
 
-    app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])
+    app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
+
+    # Log usernames and times of access to file (to know who is using the app when running on AWS)
+    access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
+    access_callback.setup([session_hash_textbox], access_logs_folder)
+
+    session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
+        then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+
+    # User submitted feedback for pdf redactions
+    # pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
+    # pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, data_file_name_no_extension_textbox], feedback_logs_folder)
+    # pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, data_file_name_no_extension_textbox], None, preprocess=False).\
+    #     then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
+
+    # Log processing time/token usage when making a query
+    usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
+    usage_callback.setup([session_hash_textbox, data_file_name_no_extension_textbox, data_file_name_textbox], usage_logs_folder)
+
+    # If output files are created, write logs to s3 (if possible)
+    output_file.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, data_file_name_no_extension_textbox, data_file_name_textbox], preprocess=False).\
+        then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+
+    semantic_output_file.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, data_file_name_no_extension_textbox, data_file_name_textbox], preprocess=False).\
+        then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 
 COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
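The access-logging chain added above is easier to read in isolation. The following is a minimal sketch of the same pattern rather than the full app: it assumes gradio 5.x plus the CSVLogger_custom and upload_file_to_s3 helpers introduced in this commit, and the folder name is a placeholder.

    import gradio as gr
    from search_funcs.custom_csvlogger import CSVLogger_custom
    from search_funcs.aws_functions import upload_file_to_s3

    log_file_name = 'log.csv'
    access_logs_folder = 'logs/demo-host/'   # placeholder folder

    with gr.Blocks() as demo:
        session_hash_textbox = gr.Textbox(visible=False)
        s3_logs_output_textbox = gr.Textbox(visible=False)
        access_logs_state = gr.State(access_logs_folder + log_file_name)
        access_s3_logs_loc_state = gr.State(access_logs_folder)

        # Each flagged value is appended to the CSV as one row plus a timestamp.
        access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
        access_callback.setup([session_hash_textbox], access_logs_folder)

        # When the hidden session-hash textbox changes (set on app load),
        # write a local log row, then push the CSV to S3.
        session_hash_textbox.change(
            lambda *args: access_callback.flag(list(args)),
            [session_hash_textbox], None, preprocess=False
        ).then(
            fn=upload_file_to_s3,
            inputs=[access_logs_state, access_s3_logs_loc_state],
            outputs=[s3_logs_output_textbox],
        )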
requirements.txt
CHANGED
@@ -1,11 +1,12 @@
-pandas==2.2.
+pandas==2.2.3
 polars==0.20.3
-pyarrow==
+pyarrow==17.0.0
 openpyxl==3.1.3
-torch==2.
+torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu
-spacy
+spacy==3.8.0
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-gradio
+gradio==5.6.0
-sentence_transformers==3.
+sentence_transformers==3.3.1
 lxml==5.2.2
-boto3==1.
+boto3==1.35.71
+python-levenshtein==0.26.1
requirements_aws.txt
CHANGED
@@ -1,13 +1,13 @@
-pandas==2.2.
+pandas==2.2.3
 polars==0.20.3
-pyarrow==
+pyarrow==17.0.0
 openpyxl==3.1.3
-spacy==3.
+spacy==3.8.0
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
 lxml==5.2.2
-boto3==1.
+boto3==1.35.71
-transformers==4.
+transformers==4.46.3
 scikit-learn==1.5.1
-scipy==1.11.4
 tqdm==4.66.5
-numpy==1.26.4
+numpy==1.26.4
+python-levenshtein==0.26.1
requirements_gpu.txt
CHANGED
@@ -1,11 +1,12 @@
-pandas==2.2.
+pandas==2.2.3
 polars==0.20.3
-pyarrow==
+pyarrow==17.0.0
 openpyxl==3.1.3
-torch==2.
+torch==2.5.1 --index-url https://download.pytorch.org/whl/nightly/cu121
-spacy
+spacy==3.8.0
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-gradio
+gradio==5.6.0
-sentence_transformers==3.
+sentence_transformers==3.3.1
 lxml==5.2.2
-boto3==1.
+boto3==1.35.71
+python-levenshtein==0.26.1
requirements_keyword_only.txt
CHANGED
@@ -1,11 +1,12 @@
-pandas==2.2.
+pandas==2.2.3
 polars==0.20.3
-pyarrow==
+pyarrow==17.0.0
 openpyxl==3.1.3
-#torch==2.
+#torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu
-spacy
+spacy==3.8.0
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-gradio
+gradio==5.6.1
-#sentence_transformers==3.
+#sentence_transformers==3.3.1
 lxml==5.2.2
-#boto3==1.
+#boto3==1.35.71
+python-levenshtein==0.26.1
search_funcs/aws_functions.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Type
+from typing import Type, List
 import pandas as pd
 import boto3
 import tempfile
@@ -166,3 +166,46 @@ def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_
 
     return files, out_message
 
+def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=bucket_name):
+    """
+    Uploads a file from local machine to Amazon S3.
+
+    Args:
+    - local_file_path: Local file path(s) of the file(s) to upload.
+    - s3_key: Key (path) to the file in the S3 bucket.
+    - s3_bucket: Name of the S3 bucket.
+
+    Returns:
+    - Message as variable/printed to console
+    """
+    final_out_message = []
+
+    s3_client = boto3.client('s3')
+
+    if isinstance(local_file_paths, str):
+        local_file_paths = [local_file_paths]
+
+    for file in local_file_paths:
+        if s3_client:
+            #print(s3_client)
+            try:
+                # Get file name off file path
+                file_name = os.path.basename(file)
+
+                s3_key_full = s3_key + file_name
+                print("S3 key: ", s3_key_full)
+
+                s3_client.upload_file(file, s3_bucket, s3_key_full)
+                out_message = "File " + file_name + " uploaded successfully!"
+                print(out_message)
+
+            except Exception as e:
+                out_message = f"Error uploading file(s): {e}"
+                print(out_message)
+
+            final_out_message.append(out_message)
+            final_out_message_str = '\n'.join(final_out_message)
+
+        else: final_out_message_str = "Could not connect to AWS."
+
+    return final_out_message_str
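A short usage sketch of the new upload_file_to_s3 helper. The bucket name and paths below are hypothetical, and the call assumes AWS credentials are already available to boto3 (environment variables, instance role, or similar).

    from search_funcs.aws_functions import upload_file_to_s3

    # The helper accepts a single path or a list of paths; the S3 key acts as a
    # folder-style prefix and the local file name is appended to it.
    message = upload_file_to_s3(
        local_file_paths=["logs/demo-host/log.csv"],   # hypothetical local path
        s3_key="logs/demo-host/",                      # hypothetical key prefix
        s3_bucket="my-example-bucket",                 # hypothetical bucket
    )
    print(message)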
search_funcs/bm25_functions.py
CHANGED
@@ -345,7 +345,7 @@ def prepare_bm25_input_data(
 
     progress(0.4, desc = "Tokenising text")
 
-    print("Tokenised state:", tokenised_state)
+    #print("Tokenised state:", tokenised_state)
 
     if tokenised_state:
         prepared_search_text_list = tokenised_state.iloc[:,0].tolist()
search_funcs/custom_csvlogger.py
ADDED
@@ -0,0 +1,171 @@
+from __future__ import annotations
+import contextlib
+import csv
+import datetime
+import os
+import re
+from collections.abc import Sequence
+from multiprocessing import Lock
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from gradio_client import utils as client_utils
+
+import gradio as gr
+from gradio import utils, wasm_utils
+
+if TYPE_CHECKING:
+    from gradio.components import Component
+from gradio.flagging import FlaggingCallback
+from threading import Lock
+
+class CSVLogger_custom(FlaggingCallback):
+    """
+    The default implementation of the FlaggingCallback abstract class in gradio>=5.0. Each flagged
+    sample (both the input and output data) is logged to a CSV file with headers on the machine running
+    the gradio app. Unlike ClassicCSVLogger, this implementation is concurrent-safe and it creates a new
+    dataset file every time the headers of the CSV (derived from the labels of the components) change. It also
+    only creates columns for "username" and "flag" if the flag_option and username are provided, respectively.
+
+    Example:
+        import gradio as gr
+        def image_classifier(inp):
+            return {'cat': 0.3, 'dog': 0.7}
+        demo = gr.Interface(fn=image_classifier, inputs="image", outputs="label",
+                            flagging_callback=CSVLogger())
+    Guides: using-flagging
+    """
+
+    def __init__(
+        self,
+        simplify_file_data: bool = True,
+        verbose: bool = True,
+        dataset_file_name: str | None = None,
+    ):
+        """
+        Parameters:
+            simplify_file_data: If True, the file data will be simplified before being written to the CSV file. If CSVLogger is being used to cache examples, this is set to False to preserve the original FileData class
+            verbose: If True, prints messages to the console about the dataset file creation
+            dataset_file_name: The name of the dataset file to be created (should end in ".csv"). If None, the dataset file will be named "dataset1.csv" or the next available number.
+        """
+        self.simplify_file_data = simplify_file_data
+        self.verbose = verbose
+        self.dataset_file_name = dataset_file_name
+        self.lock = (
+            Lock() if not wasm_utils.IS_WASM else contextlib.nullcontext()
+        )  # The multiprocessing module doesn't work on Lite.
+
+    def setup(
+        self,
+        components: Sequence[Component],
+        flagging_dir: str | Path,
+    ):
+        self.components = components
+        self.flagging_dir = Path(flagging_dir)
+        self.first_time = True
+
+    def _create_dataset_file(self, additional_headers: list[str] | None = None):
+        os.makedirs(self.flagging_dir, exist_ok=True)
+
+        if additional_headers is None:
+            additional_headers = []
+        headers = (
+            [
+                getattr(component, "label", None) or f"component {idx}"
+                for idx, component in enumerate(self.components)
+            ]
+            + additional_headers
+            + [
+                "timestamp",
+            ]
+        )
+        headers = utils.sanitize_list_for_csv(headers)
+        dataset_files = list(Path(self.flagging_dir).glob("dataset*.csv"))
+
+        if self.dataset_file_name:
+            self.dataset_filepath = self.flagging_dir / self.dataset_file_name
+        elif dataset_files:
+            try:
+                latest_file = max(
+                    dataset_files, key=lambda f: int(re.findall(r"\d+", f.stem)[0])
+                )
+                latest_num = int(re.findall(r"\d+", latest_file.stem)[0])
+
+                with open(latest_file, newline="", encoding="utf-8") as csvfile:
+                    reader = csv.reader(csvfile)
+                    existing_headers = next(reader, None)
+
+                if existing_headers != headers:
+                    new_num = latest_num + 1
+                    self.dataset_filepath = self.flagging_dir / f"dataset{new_num}.csv"
+                else:
+                    self.dataset_filepath = latest_file
+            except Exception:
+                self.dataset_filepath = self.flagging_dir / "dataset1.csv"
+        else:
+            self.dataset_filepath = self.flagging_dir / "dataset1.csv"
+
+        if not Path(self.dataset_filepath).exists():
+            with open(
+                self.dataset_filepath, "w", newline="", encoding="utf-8"
+            ) as csvfile:
+                writer = csv.writer(csvfile)
+                writer.writerow(utils.sanitize_list_for_csv(headers))
+            if self.verbose:
+                print("Created dataset file at:", self.dataset_filepath)
+        elif self.verbose:
+            print("Using existing dataset file at:", self.dataset_filepath)
+
+    def flag(
+        self,
+        flag_data: list[Any],
+        flag_option: str | None = None,
+        username: str | None = None,
+    ) -> int:
+        if self.first_time:
+            additional_headers = []
+            if flag_option is not None:
+                additional_headers.append("flag")
+            if username is not None:
+                additional_headers.append("username")
+            self._create_dataset_file(additional_headers=additional_headers)
+            self.first_time = False
+
+        csv_data = []
+        for idx, (component, sample) in enumerate(
+            zip(self.components, flag_data, strict=False)
+        ):
+            save_dir = (
+                self.flagging_dir
+                / client_utils.strip_invalid_filename_characters(
+                    getattr(component, "label", None) or f"component {idx}"
+                )
+            )
+            if utils.is_prop_update(sample):
+                csv_data.append(str(sample))
+            else:
+                data = (
+                    component.flag(sample, flag_dir=save_dir)
+                    if sample is not None
+                    else ""
+                )
+                if self.simplify_file_data:
+                    data = utils.simplify_file_data_in_str(data)
+                csv_data.append(data)
+
+        if flag_option is not None:
+            csv_data.append(flag_option)
+        if username is not None:
+            csv_data.append(username)
+        csv_data.append(str(datetime.datetime.now()))
+
+        with self.lock:
+            with open(
+                self.dataset_filepath, "a", newline="", encoding="utf-8"
+            ) as csvfile:
+                writer = csv.writer(csvfile)
+                writer.writerow(utils.sanitize_list_for_csv(csv_data))
+            with open(self.dataset_filepath, encoding="utf-8") as csvfile:
+                line_count = len(list(csv.reader(csvfile))) - 1
+
+        return line_count
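A minimal sketch of driving CSVLogger_custom directly, outside a Gradio event handler. It assumes gradio 5.x (where FlaggingCallback lives in gradio.flagging) and uses hypothetical component labels and folder names; component labels become the CSV headers and a timestamp column is appended automatically.

    import gradio as gr
    from search_funcs.custom_csvlogger import CSVLogger_custom

    session_box = gr.Textbox(label="session_hash")     # hypothetical components
    file_box = gr.Textbox(label="data_file_name")

    logger = CSVLogger_custom(dataset_file_name="log.csv")
    logger.setup([session_box, file_box], flagging_dir="logs/demo/")  # hypothetical folder

    # The first call creates logs/demo/log.csv with headers; each call appends
    # one row and returns the running row count.
    row_count = logger.flag(["abc123", "my_data.csv"])
    print("Rows logged so far:", row_count)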
search_funcs/helper_functions.py
CHANGED
@@ -4,7 +4,6 @@ import pandas as pd
 import gradio as gr
 import os
 import shutil
-import getpass
 import gzip
 import zipfile
 import pickle
@@ -34,6 +33,14 @@ def get_or_create_env_var(var_name, default_value):
 
     return value
 
+# Retrieving or setting CUSTOM_HEADER
+CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
+print(f'CUSTOM_HEADER found')
+
+# Retrieving or setting CUSTOM_HEADER_VALUE
+CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
+print(f'CUSTOM_HEADER_VALUE found')
+
 # Retrieving or setting output folder
 output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
 print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
@@ -42,8 +49,6 @@ print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
 # running_on_app_runner_var = get_or_create_env_var('RUNNING_ON_APP_RUNNER', '0')
 # print(f'The value of RUNNING_ON_APP_RUNNER is {running_on_app_runner_var}')
 
-
-
 def ensure_output_folder_exists(output_folder):
     """Checks if the output folder exists, creates it if not."""
 
@@ -56,72 +61,100 @@ def ensure_output_folder_exists(output_folder):
     else:
         print(f"The output folder already exists:", folder_name)
 
+def get_input_file_names(file_input):
+    '''
+    Get list of input files to report to logs.
+    '''
+
+    all_relevant_files = []
+    file_name_with_extension = ""
+    full_file_name = ""
+
+    #print("file_input in input file names:", file_input)
+    if isinstance(file_input, dict):
+        file_input = os.path.abspath(file_input["name"])
+
+    if isinstance(file_input, str):
+        file_input_list = [file_input]
+    else:
+        file_input_list = file_input
+
+    for file in file_input_list:
+        if isinstance(file, str):
+            file_path = file
+        else:
+            file_path = file.name
+
+        file_path_without_ext = get_file_path_end(file_path)
+
+        file_extension = os.path.splitext(file_path)[1].lower()
+
+        # Check if the file is an image type
+        if file_extension in ['.xlsx', '.csv', '.parquet']:
+            all_relevant_files.append(file_path_without_ext)
+            file_name_with_extension = file_path_without_ext + file_extension
+            full_file_name = file_path
+
+    all_relevant_files_str = ", ".join(all_relevant_files)
+
+    print("all_relevant_files_str:", all_relevant_files_str)
+
+    return all_relevant_files_str, file_name_with_extension, full_file_name
+
 async def get_connection_params(request: gr.Request):
     base_folder = ""
 
-    # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER_VALUE
-    CUSTOM_CLOUDFRONT_HEADER_VALUE_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER_VALUE', '')
-    print(f'The value of CUSTOM_CLOUDFRONT_HEADER_VALUE_var is {CUSTOM_CLOUDFRONT_HEADER_VALUE_var}')
-
-    if CUSTOM_CLOUDFRONT_HEADER_var and CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
-        if CUSTOM_CLOUDFRONT_HEADER_var in request.headers:
-            supplied_cloudfront_custom_value = request.headers[CUSTOM_CLOUDFRONT_HEADER_var]
-            if supplied_cloudfront_custom_value == CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
-                print("Custom Cloudfront header found:", supplied_cloudfront_custom_value)
+    #print("request user:", request.username)
+
+    #request_data = await request.json() # Parse JSON body
+    #print("All request data:", request_data)
+    #context_value = request_data.get('context')
+    #if 'context' in request_data:
+    #    print("Request context dictionary:", request_data['context'])
+
+    print("Request headers dictionary:", request.headers)
+    print("All host elements", request.client)
+    print("IP address:", request.client.host)
+    print("Query parameters:", dict(request.query_params))
+    # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
+    #print("Request dictionary to object:", request.request.body())
+    print("Session hash:", request.session_hash)
+
+    if CUSTOM_HEADER and CUSTOM_HEADER_VALUE:
+        if CUSTOM_HEADER in request.headers:
+            supplied_custom_header_value = request.headers[CUSTOM_HEADER]
+            if supplied_custom_header_value == CUSTOM_HEADER_VALUE:
+                print("Custom header supplied and matches CUSTOM_HEADER_VALUE")
             else:
+                print("Custom header value does not match expected value.")
+                raise ValueError("Custom header value does not match expected value.")
+        else:
+            print("Custom header value not found.")
+            raise ValueError("Custom header value not found.")
 
-    else:
-        print("No session parameters found.")
-        return "",""
+    # Get output save folder from 1 - username passed in from direct Cognito login, 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.
+
+    if request.username:
+        out_session_hash = request.username
+        base_folder = "user-files/"
+        print("Request username found:", out_session_hash)
+
+    elif 'x-cognito-id' in request.headers:
+        out_session_hash = request.headers['x-cognito-id']
+        base_folder = "user-files/"
+        print("Cognito ID found:", out_session_hash)
+
+    else:
+        out_session_hash = request.session_hash
+        base_folder = "temp-files/"
+        # print("Cognito ID not found. Using session hash as save folder:", out_session_hash)
+
+    output_folder = base_folder + out_session_hash + "/"
+    #if bucket_name:
+    #    print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
+
+    return out_session_hash, output_folder, out_session_hash
 
-# Attempt to delete content of gradio temp folder
-# def get_temp_folder_path():
-#     username = getpass.getuser()
-#     return os.path.join('C:\\Users', username, 'AppData\\Local\\Temp\\gradio')
-
 def empty_folder(directory_path):
     if not os.path.exists(directory_path):
         #print(f"The directory {directory_path} does not exist. No temporary files from previous app use found to delete.")
@@ -495,15 +528,19 @@ def create_highlighted_excel_wb(df: pd.DataFrame, search_text: str, column_to_hi
 
     column_width = 150 # Adjust as needed
     relevant_column_no = (df.columns == column_to_highlight).argmax() + 1
-    print(relevant_column_no)
+    print("Relevant column number is:", relevant_column_no)
     sheet.column_dimensions[sheet.cell(row=1, column=relevant_column_no).column_letter].width = column_width
 
+    print("search_text is:", search_text)
+
     # Find substrings in cells and highlight
     for r_idx, row in enumerate(df.itertuples(), start=2):
         for c_idx, cell_value in enumerate(row[1:], start=1):
            sheet.cell(row=r_idx, column=c_idx, value=cell_value)
            if df.columns[c_idx - 1] == column_to_highlight:
 
+                print("cell value:", cell_value)
+
                 html_text, combined_positions = highlight_found_text(search_text, cell_value)
                 sheet.cell(row=r_idx, column=c_idx).value = create_rich_text_cell_from_positions(cell_value, combined_positions)
                 sheet.cell(row=r_idx, column=c_idx).alignment = Alignment(wrap_text=True)
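The save-folder decision inside get_connection_params reduces to a three-way priority. Below is a condensed, synchronous sketch of just that logic, a hypothetical helper written for illustration and not part of the module.

    def resolve_output_folder(username, headers, session_hash):
        # Prefer the Cognito username, then an 'x-cognito-id' header set by a
        # Lambda authenticator, and fall back to the Gradio session hash.
        if username:
            return username, "user-files/" + username + "/"
        if "x-cognito-id" in headers:
            cognito_id = headers["x-cognito-id"]
            return cognito_id, "user-files/" + cognito_id + "/"
        return session_hash, "temp-files/" + session_hash + "/"

    # With no login and no Cognito header, the session hash is used.
    print(resolve_output_folder(None, {}, "abc123"))  # ('abc123', 'temp-files/abc123/')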
search_funcs/semantic_functions.py
CHANGED
@@ -8,7 +8,7 @@ from search_funcs.helper_functions import get_file_path_end, create_highlighted_
 PandasDataFrame = Type[pd.DataFrame]
 today_rev = datetime.now().strftime("%Y%m%d")
 
-def load_embedding_model(embeddings_name = "
+def load_embedding_model(embeddings_name = "sentence-transformers/all-MiniLM-L6-v2", embedding_loc="minilm/"):
 
     from torch import cuda, backends
     from sentence_transformers import SentenceTransformer
@@ -63,7 +63,7 @@ def docs_to_embed_np_array(
     progress: gr.Progress = gr.Progress(track_tqdm=True)
 ) -> tuple:
     """
-    Process documents to create
+    Process documents to create embeddings and save them as a numpy array.
 
     Parameters:
     - docs_out (list): List of documents to be embedded.
@@ -119,7 +119,8 @@ def docs_to_embed_np_array(
         print("Embedding with MiniLM-L6-v2 model")
 
         if embeddings_compress == "No":
-            print("Embedding with
+            print("Embedding with fp16 precision")
+            embeddings_model.half()
             embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size)
         else:
             print("Embedding with int8 precision")
@@ -235,7 +236,7 @@ def process_data_from_scores_df(
 
     return results_df_out
 
-def
+def semantic_search(
     query_str: str,
     embeddings: np.ndarray,
     documents: list,
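The fp16/int8 branch above maps onto standard sentence-transformers calls. The following is a hedged sketch, assuming sentence-transformers 3.x (whose encode() accepts a precision argument for int8 quantisation); the repository's own int8 code path is not shown in this hunk, and the inputs below are made up.

    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    sentences = ["first document", "second document"]   # hypothetical inputs

    # "No" compression in the app: halve the model weights and encode normally.
    model.half()
    fp16_embeddings = model.encode(sentences, batch_size=32, show_progress_bar=True)

    # "Yes" compression: request int8-quantised embeddings (smaller files, slightly lower accuracy).
    int8_embeddings = model.encode(sentences, batch_size=32, precision="int8")
    print(fp16_embeddings.shape, int8_embeddings.dtype)  # e.g. (2, 384) int8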
search_funcs/spacy_search_funcs.py
CHANGED
@@ -1,54 +1,68 @@
 import numpy as np
 import gradio as gr
 import pandas as pd
+import Levenshtein
 from typing import List, Type
 from datetime import datetime
+import re
+
 from search_funcs.helper_functions import create_highlighted_excel_wb, output_folder, load_spacy_model
+from spacy import prefer_gpu
+from spacy.matcher import Matcher, PhraseMatcher
 
 PandasDataFrame = Type[pd.DataFrame]
 
 today_rev = datetime.now().strftime("%Y%m%d")
 
-def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str,
+def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, spelling_mistakes_max:int = 1, search_whole_phrase:bool=False, progress=gr.Progress(track_tqdm=True)):
     ''' Conduct fuzzy match on a list of data.'''
 
+    if not tokenised_data:
+        out_message = "Prepared data not found. Have you clicked 'Load data' above to prepare a search index?"
+        print(out_message)
+        return out_message, None
+
+    # Lower case query
+    string_query = string_query.lower()
+
+    prefer_gpu()
+
     # Load spaCy model
     nlp = load_spacy_model()
 
     # Convert tokenised data back into a list of strings
     df_list = list(map(" ".join, tokenised_data))
 
-    if len(df_list) >
-        out_message = "Your data has more than
+    if len(df_list) > 100000:
+        out_message = "Your data has more than 100,000 rows and will take more than 30 minutes to do a fuzzy search. Please try keyword or semantic search for data of this size."
         return out_message, None
 
     query = nlp(string_query)
-    tokenised_query = [token.text for token in query]
-    print(tokenised_query)
 
-    if len(tokenised_query) > 1:
-        pattern_lemma = [{"LEMMA": {"IN": tokenised_query}}]
-        pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": tokenised_query}}}]
-    else:
-        pattern_lemma = [{"LEMMA": tokenised_query[0]}]
-        pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: tokenised_query[0]}}]
-
-# %%
+    if search_whole_phrase == False:
+        tokenised_query = [token.text for token in query]
+
+        spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
+
+        if len(tokenised_query) > 1:
+            pattern_lemma = [{"LEMMA": {"IN": tokenised_query}}]
+            pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": tokenised_query}}}]
+        else:
+            pattern_lemma = [{"LEMMA": tokenised_query[0]}]
+            pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: tokenised_query[0]}}]
+
+        matcher = Matcher(nlp.vocab)
+        matcher.add(string_query, [pattern_fuzz])
+        matcher.add(string_query, [pattern_lemma])
+
+    else:
+        # If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
+        tokenised_query = [string_query.lower()]
+        # If you want to match the whole phrase, use phrase matcher
+        matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
+        patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
+        matcher.add("PHRASE", patterns)
 
     batch_size = 256
     docs = nlp.pipe(df_list, batch_size=batch_size)
@@ -59,7 +73,25 @@ def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], origin
     for doc in progress.tqdm(docs, desc = "Searching text", unit = "rows"):
         matches = matcher(doc)
         match_count = len(matches)
+
+        # If considering each sub term individually, append match. If considering together, consider weight of the relevance to that of the whole phrase.
+        if search_whole_phrase==False:
+            all_matches.append(match_count)
+
+        else:
+            for match_id, start, end in matches:
+                span = str(doc[start:end]).strip()
+                query_search = str(query).strip()
+                distance = Levenshtein.distance(query_search, span)
+
+                # Compute a semantic similarity estimate. Defaults to cosine over vectors.
+                if distance > spelling_mistakes_max:
+                    # Calculate Levenshtein distance
+                    match_count = match_count - 1
+
+            all_matches.append(match_count)
+
+    #print("all_matches:", all_matches)
 
     print("Search complete")
 
@@ -76,7 +108,7 @@ def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], origin
         "search_text": df_list,
         "search_score_abs": match_scores})
     results_df['search_score_abs'] = abs(round(results_df['search_score_abs']*100, 2))
-    results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left")
+    results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left").drop(["index_x", "index_y"], axis=1, errors="ignore")
 
     # Keep only results with at least one match
     results_df_out = results_df_out.loc[results_df["search_score_abs"] > 0, :]
@@ -97,7 +129,10 @@ def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], origin
     results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)
 
     # Out file
-    query_str_file =
+    query_str_file = "_".join(tokenised_query).replace(" ", "_") # Replace spaces with underscores
+    query_str_file = re.sub(r'[<>:"/\\|?*]', '', query_str_file) # Remove invalid characters
+    query_str_file = query_str_file[:30] # Limit to 30 characters
+
     results_df_name = output_folder + "fuzzy_keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
 
     print("Saving search file output")
@@ -105,11 +140,21 @@ def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], origin
 
     #results_df_out.to_excel(results_df_name, index= None)
 
+    print("string_query:", string_query)
+    print(results_df_out)
+
     # Highlight found text and save to file
     results_df_out_wb = create_highlighted_excel_wb(results_df_out, string_query, "search_text")
     results_df_out_wb.save(results_df_name)
 
-    results_first_text = results_df_out[text_column].iloc[0]
+    #results_first_text = results_df_out[text_column].iloc[0]
+
+    # Check if the DataFrame is empty or if the column does not exist
+    if results_df_out.empty or text_column not in results_df_out.columns:
+        results_first_text = "" #None # or handle it as needed
+        print("Nothing found.")
+    else:
+        results_first_text = results_df_out[text_column].iloc[0]
 
     print("Returning results")
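To make the two fuzzy-matching modes concrete, here is a self-contained sketch. It assumes spaCy 3.5+ (for the FUZZY operator) and the python-levenshtein package added to the requirements in this commit; the sentence and query are made up for illustration.

    import spacy
    import Levenshtein
    from spacy.matcher import Matcher, PhraseMatcher

    nlp = spacy.blank("en")
    doc = nlp("the transprt team met about local transport links")

    # Per-word mode: FUZZY1 tolerates one spelling mistake per token.
    word_matcher = Matcher(nlp.vocab)
    word_matcher.add("query", [[{"TEXT": {"FUZZY1": {"IN": ["transport", "links"]}}}]])
    print(len(word_matcher(doc)))  # counts hits such as "transprt", "transport", "links"

    # Whole-phrase mode: exact phrase match first, then drop spans whose
    # Levenshtein distance from the query exceeds the allowed number of mistakes.
    phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    phrase_matcher.add("PHRASE", [nlp.make_doc("transport links")])
    kept = [
        doc[start:end].text
        for _, start, end in phrase_matcher(doc)
        if Levenshtein.distance("transport links", doc[start:end].text) <= 1
    ]
    print(kept)  # ['transport links']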