seanpedrickcase committed
Commit dbad462 · 1 Parent(s): 20b4aa0

Included exact/fuzzy phrase matching. Updated packages and added basic logging.
Dockerfile CHANGED
@@ -2,7 +2,7 @@
FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm AS builder

# Optional - install Lambda web adapter in case you want to run with an AWS Lambda function URL
- # COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter
+ # COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.4 /lambda-adapter /opt/extensions/lambda-adapter

# Update apt
RUN apt-get update && rm -rf /var/lib/apt/lists/*
@@ -14,10 +14,10 @@ WORKDIR /src

COPY requirements_aws.txt .

- RUN pip install torch==2.4.0+cpu --target=/install --index-url https://download.pytorch.org/whl/cpu \
- && pip install --no-cache-dir --target=/install sentence-transformers==3.0.1 --no-deps \
+ RUN pip install torch==2.5.1+cpu --target=/install --index-url https://download.pytorch.org/whl/cpu \
+ && pip install --no-cache-dir --target=/install sentence-transformers==3.3.1 --no-deps \
&& pip install --no-cache-dir --target=/install -r requirements_aws.txt \
- && pip install --no-cache-dir --target=/install gradio==4.41.0
+ && pip install --no-cache-dir --target=/install gradio==5.6.0

# Add /install to the PYTHONPATH
ENV PYTHONPATH="/install:${PYTHONPATH}"
@@ -57,6 +57,7 @@ ENV HOME=/home/user \
GRADIO_NUM_PORTS=1 \
GRADIO_SERVER_NAME=0.0.0.0 \
GRADIO_SERVER_PORT=7860 \
+ GRADIO_ANALYTICS_ENABLED=False \
GRADIO_THEME=huggingface \
AWS_STS_REGIONAL_ENDPOINT=regional \
SYSTEM=spaces
app.py CHANGED
@@ -3,20 +3,35 @@ import gradio as gr
import pandas as pd
import numpy as np
import os
+ import socket
+ from datetime import datetime
+
PandasDataFrame = Type[pd.DataFrame]

from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
- from search_funcs.semantic_functions import load_embedding_model, docs_to_embed_np_array, bge_semantic_search
- from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_connection_params, output_folder, get_or_create_env_var # Not currently used: get_temp_folder_path, empty_folder,
+ from search_funcs.semantic_functions import load_embedding_model, docs_to_embed_np_array, semantic_search
+ from search_funcs.helper_functions import display_info, get_input_file_names, initial_data_load, put_columns_in_join_df, get_connection_params, output_folder, get_or_create_env_var
from search_funcs.spacy_search_funcs import spacy_fuzzy_search
- from search_funcs.aws_functions import load_data_from_aws
+ from search_funcs.aws_functions import load_data_from_aws, upload_file_to_s3
from search_funcs.auth import authenticate_user
+ from search_funcs.custom_csvlogger import CSVLogger_custom
+
+ today_rev = datetime.now().strftime("%Y%m%d")

# Attempt to delete temporary files generated by previous use of the app (as the files can be very big!). Only setup to work for local runs in Windows (not used at the moment).
# temp_folder_path = get_temp_folder_path()
# empty_folder(temp_folder_path)

+ host_name = socket.gethostname()
+
+ # Logging state
+ log_file_name = 'log.csv'
+
+ feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
+ access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
+ usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
+
## Gradio app - BM25 search
app = gr.Blocks(theme = gr.themes.Base()) # , css="theme.css"

@@ -47,6 +62,20 @@ with app:
    join_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(), visible=False) #gr.State(pd.DataFrame())
    output_file_state = gr.State([]) #gr.Dataframe(type="array", visible=False) #gr.State([])

+     feedback_logs_state = gr.State(feedback_logs_folder + log_file_name)
+     feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
+     access_logs_state = gr.State(access_logs_folder + log_file_name)
+     access_s3_logs_loc_state = gr.State(access_logs_folder)
+     usage_logs_state = gr.State(usage_logs_folder + log_file_name)
+     usage_s3_logs_loc_state = gr.State(usage_logs_folder)
+
+     data_file_name_textbox = gr.Textbox(label = "data_file_name_textbox", value="", visible=False)
+     doc_full_file_name_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
+     data_file_name_no_extension_textbox = gr.Textbox(label = "data_file_name_no_extension_textbox", value="", visible=False)
+     data_file_name_with_extension_textbox = gr.Textbox(label = "data_file_name_with_extension_textbox", value="", visible=False)
+     s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
+     session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
+
    # Informational state objects
    in_k1_info = gr.State("""k1: Constant used for influencing the term frequency saturation. After saturation is reached, additional
presence for the term adds a significantly less additional score. According to [1]_, experiments suggest
@@ -89,7 +118,7 @@ depends on factors such as the type of documents or queries. Information taken f
    keyword_query = gr.Textbox(label="Enter your search term")
    with gr.Row():
        keyword_search_button = gr.Button(value="Keyword search", variant="primary", scale=1)
-         fuzzy_search_button = gr.Button(value="Fuzzy search (slow, < 10k rows)", variant="secondary", scale = 0)
+         fuzzy_search_button = gr.Button(value="Fuzzy search (slow)", variant="secondary", scale = 0)
    with gr.Row():
        output_single_text = gr.Textbox(label="Top result")
        output_file = gr.File(label="File output")
@@ -131,7 +160,7 @@ depends on factors such as the type of documents or queries. Information taken f
    return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
    embeddings_compress = gr.Dropdown(label = "Round embeddings to int8 precision for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
    #save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
-     with gr.Accordion(label="Keyword search options", open = False):
+     with gr.Accordion(label="BM25 search options", open = False):
        with gr.Row():
            in_k1 = gr.Slider(label = "k1 value", value = 1.5, minimum = 0.1, maximum = 5, step = 0.1, scale = 3)
            in_k1_button = gr.Button(value = "k1 value info", scale = 1)
@@ -147,7 +176,8 @@ depends on factors such as the type of documents or queries. Information taken f
    with gr.Row():
        in_search_param_button = gr.Button(value="Load search parameters (Need to click this if you changed anything above)")
    with gr.Accordion(label="Fuzzy search options", open = False):
-         no_spelling_mistakes = gr.Slider(label = "Number of spelling mistakes allowed in fuzzy search", value = 1, minimum=1, maximum=4, step=1)
+         search_whole_phrase_bool = gr.Checkbox(label= "Search for the whole phrase (rather than individual words within also)", value=True)
+         spelling_mistakes_max_num = gr.Slider(label = "Maximum number of spelling mistakes allowed in fuzzy search", value = 1, minimum=1, maximum=4, step=1)

    with gr.Accordion(label = "Join on additional dataframes to results", open = False):
        in_join_file = gr.File(label="Upload your data to join here")
@@ -181,19 +211,20 @@ depends on factors such as the type of documents or queries. Information taken f

    ### BM25 SEARCH ###
    # Update dropdowns upon initial file load
-     in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, prepared_keyword_data_state, orig_keyword_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, load_finished_message, current_source, in_bm25_file], api_name="keyword_data_load")
+     in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, prepared_keyword_data_state, orig_keyword_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, load_finished_message, current_source, in_bm25_file], api_name="keyword_data_load").then(fn=get_input_file_names, inputs=[in_bm25_file], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, doc_full_file_name_textbox])
+
    in_join_file.change(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])

    # Load in BM25 data
    load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, prepared_keyword_data_state, tokenised_prepared_keyword_data_state, in_clean_data, return_intermediate_files], outputs=[tokenised_prepared_keyword_data_state, load_finished_message, prepared_keyword_data_state, output_file, output_file, in_bm25_column], api_name="load_keyword").\
-         then(fn=prepare_bm25, inputs=[tokenised_prepared_keyword_data_state, in_bm25_file, in_bm25_column, bm25_search_index_state, in_clean_data, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file, bm25_search_index_state, tokenised_prepared_keyword_data_state], api_name="prepare_keyword") # keyword_data_list_state
+         then(fn=prepare_bm25, inputs=[tokenised_prepared_keyword_data_state, in_bm25_file, in_bm25_column, bm25_search_index_state, in_clean_data, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file, bm25_search_index_state, tokenised_prepared_keyword_data_state], api_name="prepare_keyword")

    # BM25 search functions on click or enter
    keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, prepared_keyword_data_state, in_bm25_column, join_data_state, in_clean_data, bm25_search_index_state, tokenised_prepared_keyword_data_state, in_join_column, search_df_join_column, in_k1, in_b, in_alpha], outputs=[output_single_text, output_file], api_name="keyword_search")
    keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, prepared_keyword_data_state, in_bm25_column, join_data_state, in_clean_data, bm25_search_index_state, tokenised_prepared_keyword_data_state, in_join_column, search_df_join_column, in_k1, in_b, in_alpha], outputs=[output_single_text, output_file])

    # Fuzzy search functions on click
-     fuzzy_search_button.click(fn=spacy_fuzzy_search, inputs=[keyword_query, tokenised_prepared_keyword_data_state, prepared_keyword_data_state, in_bm25_column, join_data_state, search_df_join_column, in_join_column, no_spelling_mistakes], outputs=[output_single_text, output_file], api_name="fuzzy_search")
+     fuzzy_search_button.click(fn=spacy_fuzzy_search, inputs=[keyword_query, tokenised_prepared_keyword_data_state, prepared_keyword_data_state, in_bm25_column, join_data_state, search_df_join_column, in_join_column, spelling_mistakes_max_num, search_whole_phrase_bool], outputs=[output_single_text, output_file], api_name="fuzzy_search")

    ### SEMANTIC SEARCH ###

@@ -205,10 +236,39 @@ depends on factors such as the type of documents or queries. Information taken f
        then(docs_to_embed_np_array, inputs=[semantic_input_document_format, in_semantic_file, output_file_state, in_clean_data, embeddings_state, embeddings_model_name_state, embeddings_model_loc_state, return_intermediate_files, embeddings_compress], outputs=[semantic_load_progress, embeddings_state, semantic_output_file, output_file_state, embeddings_model_state], api_name="embed_documents")

    # Semantic search query
-     semantic_submit.click(bge_semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, embeddings_model_state, embeddings_model_name_state, embeddings_compress, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic_search")
-     semantic_query.submit(bge_semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, embeddings_model_state, embeddings_model_name_state, embeddings_compress, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
+     semantic_submit.click(semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, embeddings_model_state, embeddings_model_name_state, embeddings_compress, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic_search")
+     semantic_query.submit(semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, embeddings_model_state, embeddings_model_name_state, embeddings_compress, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
+
+     ###
+     # APP LOAD AND LOGGING FUNCTIONS
+     ###

-     app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])
+     app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
+
+     # Log usernames and times of access to file (to know who is using the app when running on AWS)
+     access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
+     access_callback.setup([session_hash_textbox], access_logs_folder)
+
+     session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
+         then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+
+     # User submitted feedback for pdf redactions
+     # pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
+     # pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, data_file_name_no_extension_textbox], feedback_logs_folder)
+     # pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, data_file_name_no_extension_textbox], None, preprocess=False).\
+     #     then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
+
+     # Log processing time/token usage when making a query
+     usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
+     usage_callback.setup([session_hash_textbox, data_file_name_no_extension_textbox, data_file_name_textbox], usage_logs_folder)
+
+     # If output files are created, write logs to s3 (if possible)
+     output_file.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, data_file_name_no_extension_textbox, data_file_name_textbox], preprocess=False).\
+         then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+
+     semantic_output_file.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, data_file_name_no_extension_textbox, data_file_name_textbox], preprocess=False).\
+         then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])


COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
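
To see the new logging wiring in isolation: CSVLogger_custom writes one CSV row each time a watched component changes, and upload_file_to_s3 then copies that CSV to the bucket. The sketch below reproduces the access-log chain from app.py in a minimal Blocks app; the folder name and the standalone demo app are illustrative assumptions, while CSVLogger_custom and upload_file_to_s3 are the pieces added in this commit.

    # Minimal sketch (assumed setup), not part of the commit itself
    import gradio as gr
    from search_funcs.custom_csvlogger import CSVLogger_custom
    from search_funcs.aws_functions import upload_file_to_s3

    log_file_name = 'log.csv'
    access_logs_folder = 'logs/example-date/example-host/'  # illustrative path

    with gr.Blocks() as demo:
        session_hash_textbox = gr.Textbox(visible=False)    # filled by app.load in the real app
        s3_logs_output_textbox = gr.Textbox(visible=False)  # receives the upload status message
        access_logs_state = gr.State(access_logs_folder + log_file_name)
        access_s3_logs_loc_state = gr.State(access_logs_folder)

        # Append a CSV row locally whenever the session hash changes, then push the CSV to S3
        access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
        access_callback.setup([session_hash_textbox], access_logs_folder)
        session_hash_textbox.change(
            lambda *args: access_callback.flag(list(args)),
            [session_hash_textbox], None, preprocess=False
        ).then(fn=upload_file_to_s3,
               inputs=[access_logs_state, access_s3_logs_loc_state],
               outputs=[s3_logs_output_textbox])
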
requirements.txt CHANGED
@@ -1,11 +1,12 @@
- pandas==2.2.2
+ pandas==2.2.3
polars==0.20.3
- pyarrow==14.0.2
+ pyarrow==17.0.0
openpyxl==3.1.3
- torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu
- spacy
- en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
- gradio
- sentence_transformers==3.0.1
+ torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu
+ spacy==3.8.0
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+ gradio==5.6.0
+ sentence_transformers==3.3.1
lxml==5.2.2
- boto3==1.34.142
+ boto3==1.35.71
+ python-levenshtein==0.26.1
requirements_aws.txt CHANGED
@@ -1,13 +1,13 @@
- pandas==2.2.2
+ pandas==2.2.3
polars==0.20.3
- pyarrow==14.0.2
+ pyarrow==17.0.0
openpyxl==3.1.3
- spacy==3.7.5
- en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
+ spacy==3.8.0
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
lxml==5.2.2
- boto3==1.34.158
- transformers==4.44.0
+ boto3==1.35.71
+ transformers==4.46.3
scikit-learn==1.5.1
- scipy==1.11.4
tqdm==4.66.5
- numpy==1.26.4
+ numpy==1.26.4
+ python-levenshtein==0.26.1
requirements_gpu.txt CHANGED
@@ -1,11 +1,12 @@
- pandas==2.2.2
+ pandas==2.2.3
polars==0.20.3
- pyarrow==14.0.2
+ pyarrow==17.0.0
openpyxl==3.1.3
- torch==2.4.0 --index-url https://download.pytorch.org/whl/nightly/cu121
- spacy
- en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
- gradio
- sentence_transformers==3.0.1
+ torch==2.5.1 --index-url https://download.pytorch.org/whl/nightly/cu121
+ spacy==3.8.0
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+ gradio==5.6.0
+ sentence_transformers==3.3.1
lxml==5.2.2
- boto3==1.34.103
+ boto3==1.35.71
+ python-levenshtein==0.26.1
requirements_keyword_only.txt CHANGED
@@ -1,11 +1,12 @@
- pandas==2.2.2
+ pandas==2.2.3
polars==0.20.3
- pyarrow==14.0.2
+ pyarrow==17.0.0
openpyxl==3.1.3
- #torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu
- spacy
- en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
- gradio
- #sentence_transformers==3.0.1
+ #torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu
+ spacy==3.8.0
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+ gradio==5.6.1
+ #sentence_transformers==3.3.1
lxml==5.2.2
- #boto3==1.34.103
+ #boto3==1.35.71
+ python-levenshtein==0.26.1
search_funcs/aws_functions.py CHANGED
@@ -1,4 +1,4 @@
- from typing import Type
+ from typing import Type, List
import pandas as pd
import boto3
import tempfile
@@ -166,3 +166,46 @@ def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_

    return files, out_message

+ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=bucket_name):
+     """
+     Uploads a file from local machine to Amazon S3.
+
+     Args:
+     - local_file_path: Local file path(s) of the file(s) to upload.
+     - s3_key: Key (path) to the file in the S3 bucket.
+     - s3_bucket: Name of the S3 bucket.
+
+     Returns:
+     - Message as variable/printed to console
+     """
+     final_out_message = []
+
+     s3_client = boto3.client('s3')
+
+     if isinstance(local_file_paths, str):
+         local_file_paths = [local_file_paths]
+
+     for file in local_file_paths:
+         if s3_client:
+             #print(s3_client)
+             try:
+                 # Get file name off file path
+                 file_name = os.path.basename(file)
+
+                 s3_key_full = s3_key + file_name
+                 print("S3 key: ", s3_key_full)
+
+                 s3_client.upload_file(file, s3_bucket, s3_key_full)
+                 out_message = "File " + file_name + " uploaded successfully!"
+                 print(out_message)
+
+             except Exception as e:
+                 out_message = f"Error uploading file(s): {e}"
+                 print(out_message)
+
+             final_out_message.append(out_message)
+             final_out_message_str = '\n'.join(final_out_message)
+
+         else: final_out_message_str = "Could not connect to AWS."
+
+     return final_out_message_str
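
A hedged usage sketch for the new helper: it accepts a single path or a list, appends each base file name to the given key prefix, and returns a status string. The path and prefix below are made up; working AWS credentials and the module's default bucket_name are assumed.

    # Hypothetical call; paths and prefixes are illustrative only
    from search_funcs.aws_functions import upload_file_to_s3

    message = upload_file_to_s3(
        local_file_paths="logs/example-date/example-host/log.csv",  # str or list of str
        s3_key="usage/example-date/example-host/",                  # the file name is appended to this prefix
    )
    print(message)  # e.g. "File log.csv uploaded successfully!" or an error description
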
search_funcs/bm25_functions.py CHANGED
@@ -345,7 +345,7 @@ def prepare_bm25_input_data(

    progress(0.4, desc = "Tokenising text")

-     print("Tokenised state:", tokenised_state)
+     #print("Tokenised state:", tokenised_state)

    if tokenised_state:
        prepared_search_text_list = tokenised_state.iloc[:,0].tolist()
search_funcs/custom_csvlogger.py ADDED
@@ -0,0 +1,171 @@
+ from __future__ import annotations
+ import contextlib
+ import csv
+ import datetime
+ import os
+ import re
+ from collections.abc import Sequence
+ from multiprocessing import Lock
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any
+
+ from gradio_client import utils as client_utils
+
+ import gradio as gr
+ from gradio import utils, wasm_utils
+
+ if TYPE_CHECKING:
+     from gradio.components import Component
+ from gradio.flagging import FlaggingCallback
+ from threading import Lock
+
+ class CSVLogger_custom(FlaggingCallback):
+     """
+     The default implementation of the FlaggingCallback abstract class in gradio>=5.0. Each flagged
+     sample (both the input and output data) is logged to a CSV file with headers on the machine running
+     the gradio app. Unlike ClassicCSVLogger, this implementation is concurrent-safe and it creates a new
+     dataset file every time the headers of the CSV (derived from the labels of the components) change. It also
+     only creates columns for "username" and "flag" if the flag_option and username are provided, respectively.
+
+     Example:
+         import gradio as gr
+         def image_classifier(inp):
+             return {'cat': 0.3, 'dog': 0.7}
+         demo = gr.Interface(fn=image_classifier, inputs="image", outputs="label",
+                             flagging_callback=CSVLogger())
+     Guides: using-flagging
+     """
+
+     def __init__(
+         self,
+         simplify_file_data: bool = True,
+         verbose: bool = True,
+         dataset_file_name: str | None = None,
+     ):
+         """
+         Parameters:
+             simplify_file_data: If True, the file data will be simplified before being written to the CSV file. If CSVLogger is being used to cache examples, this is set to False to preserve the original FileData class
+             verbose: If True, prints messages to the console about the dataset file creation
+             dataset_file_name: The name of the dataset file to be created (should end in ".csv"). If None, the dataset file will be named "dataset1.csv" or the next available number.
+         """
+         self.simplify_file_data = simplify_file_data
+         self.verbose = verbose
+         self.dataset_file_name = dataset_file_name
+         self.lock = (
+             Lock() if not wasm_utils.IS_WASM else contextlib.nullcontext()
+         ) # The multiprocessing module doesn't work on Lite.
+
+     def setup(
+         self,
+         components: Sequence[Component],
+         flagging_dir: str | Path,
+     ):
+         self.components = components
+         self.flagging_dir = Path(flagging_dir)
+         self.first_time = True
+
+     def _create_dataset_file(self, additional_headers: list[str] | None = None):
+         os.makedirs(self.flagging_dir, exist_ok=True)
+
+         if additional_headers is None:
+             additional_headers = []
+         headers = (
+             [
+                 getattr(component, "label", None) or f"component {idx}"
+                 for idx, component in enumerate(self.components)
+             ]
+             + additional_headers
+             + [
+                 "timestamp",
+             ]
+         )
+         headers = utils.sanitize_list_for_csv(headers)
+         dataset_files = list(Path(self.flagging_dir).glob("dataset*.csv"))
+
+         if self.dataset_file_name:
+             self.dataset_filepath = self.flagging_dir / self.dataset_file_name
+         elif dataset_files:
+             try:
+                 latest_file = max(
+                     dataset_files, key=lambda f: int(re.findall(r"\d+", f.stem)[0])
+                 )
+                 latest_num = int(re.findall(r"\d+", latest_file.stem)[0])
+
+                 with open(latest_file, newline="", encoding="utf-8") as csvfile:
+                     reader = csv.reader(csvfile)
+                     existing_headers = next(reader, None)
+
+                 if existing_headers != headers:
+                     new_num = latest_num + 1
+                     self.dataset_filepath = self.flagging_dir / f"dataset{new_num}.csv"
+                 else:
+                     self.dataset_filepath = latest_file
+             except Exception:
+                 self.dataset_filepath = self.flagging_dir / "dataset1.csv"
+         else:
+             self.dataset_filepath = self.flagging_dir / "dataset1.csv"
+
+         if not Path(self.dataset_filepath).exists():
+             with open(
+                 self.dataset_filepath, "w", newline="", encoding="utf-8"
+             ) as csvfile:
+                 writer = csv.writer(csvfile)
+                 writer.writerow(utils.sanitize_list_for_csv(headers))
+             if self.verbose:
+                 print("Created dataset file at:", self.dataset_filepath)
+         elif self.verbose:
+             print("Using existing dataset file at:", self.dataset_filepath)
+
+     def flag(
+         self,
+         flag_data: list[Any],
+         flag_option: str | None = None,
+         username: str | None = None,
+     ) -> int:
+         if self.first_time:
+             additional_headers = []
+             if flag_option is not None:
+                 additional_headers.append("flag")
+             if username is not None:
+                 additional_headers.append("username")
+             self._create_dataset_file(additional_headers=additional_headers)
+             self.first_time = False
+
+         csv_data = []
+         for idx, (component, sample) in enumerate(
+             zip(self.components, flag_data, strict=False)
+         ):
+             save_dir = (
+                 self.flagging_dir
+                 / client_utils.strip_invalid_filename_characters(
+                     getattr(component, "label", None) or f"component {idx}"
+                 )
+             )
+             if utils.is_prop_update(sample):
+                 csv_data.append(str(sample))
+             else:
+                 data = (
+                     component.flag(sample, flag_dir=save_dir)
+                     if sample is not None
+                     else ""
+                 )
+                 if self.simplify_file_data:
+                     data = utils.simplify_file_data_in_str(data)
+                 csv_data.append(data)
+
+         if flag_option is not None:
+             csv_data.append(flag_option)
+         if username is not None:
+             csv_data.append(username)
+         csv_data.append(str(datetime.datetime.now()))
+
+         with self.lock:
+             with open(
+                 self.dataset_filepath, "a", newline="", encoding="utf-8"
+             ) as csvfile:
+                 writer = csv.writer(csvfile)
+                 writer.writerow(utils.sanitize_list_for_csv(csv_data))
+             with open(self.dataset_filepath, encoding="utf-8") as csvfile:
+                 line_count = len(list(csv.reader(csvfile))) - 1
+
+         return line_count
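
Beyond Gradio's built-in flagging flow, the logger can also be driven manually with setup() and flag(), which is how app.py uses it for access and usage logs. A rough sketch with illustrative labels and folder (the component labels become the CSV headers); not taken from the commit itself.

    # Hypothetical standalone use of CSVLogger_custom
    import gradio as gr
    from search_funcs.custom_csvlogger import CSVLogger_custom

    session_box = gr.Textbox(label="session_hash")
    file_box = gr.Textbox(label="data_file_name")

    logger = CSVLogger_custom(dataset_file_name="log.csv")
    logger.setup([session_box, file_box], flagging_dir="logs/example/")

    # Appends one row ("abc123", "my_data.csv", timestamp) to logs/example/log.csv
    row_count = logger.flag(["abc123", "my_data.csv"])
    print(row_count)
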
search_funcs/helper_functions.py CHANGED
@@ -4,7 +4,6 @@ import pandas as pd
import gradio as gr
import os
import shutil
- import getpass
import gzip
import zipfile
import pickle
@@ -34,6 +33,14 @@ def get_or_create_env_var(var_name, default_value):

    return value

+ # Retrieving or setting CUSTOM_HEADER
+ CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
+ print(f'CUSTOM_HEADER found')
+
+ # Retrieving or setting CUSTOM_HEADER_VALUE
+ CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
+ print(f'CUSTOM_HEADER_VALUE found')
+
# Retrieving or setting output folder
output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
@@ -42,8 +49,6 @@ print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
# running_on_app_runner_var = get_or_create_env_var('RUNNING_ON_APP_RUNNER', '0')
# print(f'The value of RUNNING_ON_APP_RUNNER is {running_on_app_runner_var}')

-
-
def ensure_output_folder_exists(output_folder):
    """Checks if the output folder exists, creates it if not."""

@@ -56,72 +61,100 @@ def ensure_output_folder_exists(output_folder):
    else:
        print(f"The output folder already exists:", folder_name)

+ def get_input_file_names(file_input):
+     '''
+     Get list of input files to report to logs.
+     '''
+
+     all_relevant_files = []
+     file_name_with_extension = ""
+     full_file_name = ""
+
+     #print("file_input in input file names:", file_input)
+     if isinstance(file_input, dict):
+         file_input = os.path.abspath(file_input["name"])
+
+     if isinstance(file_input, str):
+         file_input_list = [file_input]
+     else:
+         file_input_list = file_input
+
+     for file in file_input_list:
+         if isinstance(file, str):
+             file_path = file
+         else:
+             file_path = file.name
+
+         file_path_without_ext = get_file_path_end(file_path)
+
+         file_extension = os.path.splitext(file_path)[1].lower()
+
+         # Check if the file is a relevant tabular data type
+         if file_extension in ['.xlsx', '.csv', '.parquet']:
+             all_relevant_files.append(file_path_without_ext)
+             file_name_with_extension = file_path_without_ext + file_extension
+             full_file_name = file_path
+
+     all_relevant_files_str = ", ".join(all_relevant_files)
+
+     print("all_relevant_files_str:", all_relevant_files_str)
+
+     return all_relevant_files_str, file_name_with_extension, full_file_name
+
async def get_connection_params(request: gr.Request):
    base_folder = ""

-     if request:
-         #print("request user:", request.username)
-
-         #request_data = await request.json() # Parse JSON body
-         #print("All request data:", request_data)
-         #context_value = request_data.get('context')
-         #if 'context' in request_data:
-         #    print("Request context dictionary:", request_data['context'])
-
-         # print("Request headers dictionary:", request.headers)
-         # print("All host elements", request.client)
-         # print("IP address:", request.client.host)
-         # print("Query parameters:", dict(request.query_params))
-         # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
-         #print("Request dictionary to object:", request.request.body())
-         print("Session hash:", request.session_hash)
-
-         # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER
-         CUSTOM_CLOUDFRONT_HEADER_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER', '')
-         print(f'The value of CUSTOM_CLOUDFRONT_HEADER is {CUSTOM_CLOUDFRONT_HEADER_var}')
-
-         # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER_VALUE
-         CUSTOM_CLOUDFRONT_HEADER_VALUE_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER_VALUE', '')
-         print(f'The value of CUSTOM_CLOUDFRONT_HEADER_VALUE_var is {CUSTOM_CLOUDFRONT_HEADER_VALUE_var}')
-
-         if CUSTOM_CLOUDFRONT_HEADER_var and CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
-             if CUSTOM_CLOUDFRONT_HEADER_var in request.headers:
-                 supplied_cloudfront_custom_value = request.headers[CUSTOM_CLOUDFRONT_HEADER_var]
-                 if supplied_cloudfront_custom_value == CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
-                     print("Custom Cloudfront header found:", supplied_cloudfront_custom_value)
-                 else:
-                     raise(ValueError, "Custom Cloudfront header value does not match expected value.")
-
-         # Get output save folder from 1 - username passed in from direct Cognito login, 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.
-
-         if request.username:
-             out_session_hash = request.username
-             base_folder = "user-files/"
-
-         elif 'x-cognito-id' in request.headers:
-             out_session_hash = request.headers['x-cognito-id']
-             base_folder = "user-files/"
-             print("Cognito ID found:", out_session_hash)
-
-         else:
-             out_session_hash = request.session_hash
-             base_folder = "temp-files/"
-             # print("Cognito ID not found. Using session hash as save folder:", out_session_hash)
-
-         output_folder = base_folder + out_session_hash + "/"
-         #if bucket_name:
-         #    print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
-
-         return out_session_hash, output_folder
-     else:
-         print("No session parameters found.")
-         return "",""
-
- # Attempt to delete content of gradio temp folder
- # def get_temp_folder_path():
- #     username = getpass.getuser()
- #     return os.path.join('C:\\Users', username, 'AppData\\Local\\Temp\\gradio')
-
+     #print("request user:", request.username)
+
+     #request_data = await request.json() # Parse JSON body
+     #print("All request data:", request_data)
+     #context_value = request_data.get('context')
+     #if 'context' in request_data:
+     #    print("Request context dictionary:", request_data['context'])
+
+     print("Request headers dictionary:", request.headers)
+     print("All host elements", request.client)
+     print("IP address:", request.client.host)
+     print("Query parameters:", dict(request.query_params))
+     # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
+     #print("Request dictionary to object:", request.request.body())
+     print("Session hash:", request.session_hash)
+
+     if CUSTOM_HEADER and CUSTOM_HEADER_VALUE:
+         if CUSTOM_HEADER in request.headers:
+             supplied_custom_header_value = request.headers[CUSTOM_HEADER]
+             if supplied_custom_header_value == CUSTOM_HEADER_VALUE:
+                 print("Custom header supplied and matches CUSTOM_HEADER_VALUE")
+             else:
+                 print("Custom header value does not match expected value.")
+                 raise ValueError("Custom header value does not match expected value.")
+         else:
+             print("Custom header value not found.")
+             raise ValueError("Custom header value not found.")
+
+     # Get output save folder from 1 - username passed in from direct Cognito login, 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.
+
+     if request.username:
+         out_session_hash = request.username
+         base_folder = "user-files/"
+         print("Request username found:", out_session_hash)
+
+     elif 'x-cognito-id' in request.headers:
+         out_session_hash = request.headers['x-cognito-id']
+         base_folder = "user-files/"
+         print("Cognito ID found:", out_session_hash)
+
+     else:
+         out_session_hash = request.session_hash
+         base_folder = "temp-files/"
+         # print("Cognito ID not found. Using session hash as save folder:", out_session_hash)
+
+     output_folder = base_folder + out_session_hash + "/"
+     #if bucket_name:
+     #    print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
+
+     return out_session_hash, output_folder, out_session_hash
+
def empty_folder(directory_path):
    if not os.path.exists(directory_path):
        #print(f"The directory {directory_path} does not exist. No temporary files from previous app use found to delete.")
@@ -495,15 +528,19 @@ def create_highlighted_excel_wb(df: pd.DataFrame, search_text: str, column_to_hi

    column_width = 150 # Adjust as needed
    relevant_column_no = (df.columns == column_to_highlight).argmax() + 1
-     print(relevant_column_no)
+     print("Relevant column number is:", relevant_column_no)
    sheet.column_dimensions[sheet.cell(row=1, column=relevant_column_no).column_letter].width = column_width

+     print("search_text is:", search_text)
+
    # Find substrings in cells and highlight
    for r_idx, row in enumerate(df.itertuples(), start=2):
        for c_idx, cell_value in enumerate(row[1:], start=1):
            sheet.cell(row=r_idx, column=c_idx, value=cell_value)
            if df.columns[c_idx - 1] == column_to_highlight:

+                 print("cell value:", cell_value)
+
                html_text, combined_positions = highlight_found_text(search_text, cell_value)
                sheet.cell(row=r_idx, column=c_idx).value = create_rich_text_cell_from_positions(cell_value, combined_positions)
                sheet.cell(row=r_idx, column=c_idx).alignment = Alignment(wrap_text=True)
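
For reference, get_input_file_names is what feeds the new hidden filename textboxes in app.py: it keeps only .xlsx/.csv/.parquet inputs and returns the name without extension, the name with extension, and the full path. A sketch with made-up paths, assuming get_file_path_end strips the folder and the extension:

    # Hypothetical call; file paths are illustrative
    from search_funcs.helper_functions import get_input_file_names

    names, name_with_ext, full_path = get_input_file_names(
        ["output/consultation_responses.csv", "output/readme.txt"]
    )
    # names         -> "consultation_responses"
    # name_with_ext -> "consultation_responses.csv"
    # full_path     -> "output/consultation_responses.csv"
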
search_funcs/semantic_functions.py CHANGED
@@ -8,7 +8,7 @@ from search_funcs.helper_functions import get_file_path_end, create_highlighted_
PandasDataFrame = Type[pd.DataFrame]
today_rev = datetime.now().strftime("%Y%m%d")

- def load_embedding_model(embeddings_name = "BAAI/bge-small-en-v1.5", embedding_loc="bge/"):
+ def load_embedding_model(embeddings_name = "sentence-transformers/all-MiniLM-L6-v2", embedding_loc="minilm/"):

    from torch import cuda, backends
    from sentence_transformers import SentenceTransformer
@@ -63,7 +63,7 @@ def docs_to_embed_np_array(
    progress: gr.Progress = gr.Progress(track_tqdm=True)
    ) -> tuple:
    """
-     Process documents to create BGE embeddings and save them as a numpy array.
+     Process documents to create embeddings and save them as a numpy array.

    Parameters:
    - docs_out (list): List of documents to be embedded.
@@ -119,7 +119,8 @@ def docs_to_embed_np_array(
        print("Embedding with MiniLM-L6-v2 model")

    if embeddings_compress == "No":
-         print("Embedding with full fp32 precision")
+         print("Embedding with fp16 precision")
+         embeddings_model.half()
        embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size)
    else:
        print("Embedding with int8 precision")
@@ -235,7 +236,7 @@ def process_data_from_scores_df(

    return results_df_out

- def bge_semantic_search(
+ def semantic_search(
    query_str: str,
    embeddings: np.ndarray,
    documents: list,
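
The embeddings_compress option now maps to fp16 model weights ("No") versus int8-compressed output embeddings ("Yes"). The repository's int8 branch is not visible in this hunk, so the second call below is only an assumed illustration of how sentence-transformers produces int8 vectors; the model name matches the new default and the sentences are made up.

    # Hedged sketch of the two precision paths, using sentence-transformers directly
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    sentences = ["example document one", "example document two"]

    # embeddings_compress == "No": halve the model weights, keep float output embeddings
    model.half()
    emb_fp16 = model.encode(sentences, show_progress_bar=True, batch_size=32)

    # embeddings_compress == "Yes": request int8-quantised output vectors (smaller files, some accuracy loss)
    emb_int8 = model.encode(sentences, show_progress_bar=True, batch_size=32, precision="int8")
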
search_funcs/spacy_search_funcs.py CHANGED
@@ -1,54 +1,68 @@
import numpy as np
import gradio as gr
import pandas as pd
+ import Levenshtein
from typing import List, Type
from datetime import datetime
+ import re
+
from search_funcs.helper_functions import create_highlighted_excel_wb, output_folder, load_spacy_model
+ from spacy import prefer_gpu
+ from spacy.matcher import Matcher, PhraseMatcher

PandasDataFrame = Type[pd.DataFrame]

today_rev = datetime.now().strftime("%Y%m%d")

- def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
+ def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, spelling_mistakes_max:int = 1, search_whole_phrase:bool=False, progress=gr.Progress(track_tqdm=True)):
    ''' Conduct fuzzy match on a list of data.'''

-     import spacy
-     spacy.prefer_gpu()
-     from spacy.matcher import Matcher
+     if not tokenised_data:
+         out_message = "Prepared data not found. Have you clicked 'Load data' above to prepare a search index?"
+         print(out_message)
+         return out_message, None
+
+     # Lower case query
+     string_query = string_query.lower()
+
+     prefer_gpu()

    # Load spaCy model
    nlp = load_spacy_model()

    # Convert tokenised data back into a list of strings
    df_list = list(map(" ".join, tokenised_data))

-     if len(df_list) > 10000:
-         out_message = "Your data has more than 10,000 rows and will take more than three minutes to do a fuzzy search. Please try keyword or semantic search for data of this size."
+     if len(df_list) > 100000:
+         out_message = "Your data has more than 100,000 rows and will take more than 30 minutes to do a fuzzy search. Please try keyword or semantic search for data of this size."
        return out_message, None

    query = nlp(string_query)
-     tokenised_query = [token.text for token in query]
-     print(tokenised_query)

-     spelling_mistakes_fuzzy_pattern = "FUZZY" + str(no_spelling_mistakes)
+     if search_whole_phrase == False:
+         tokenised_query = [token.text for token in query]

-     # %%
-     if len(tokenised_query) > 1:
-         pattern_lemma = [{"LEMMA": {"IN": tokenised_query}}]
-         pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": tokenised_query}}}]
-     else:
-         pattern_lemma = [{"LEMMA": tokenised_query[0]}]
-         pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: tokenised_query[0]}}]
-
-     # %%
-     matcher = Matcher(nlp.vocab)
-
-     # %%
-     matcher.add(string_query, [pattern_fuzz])
-     matcher.add(string_query, [pattern_lemma])
+         spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
+
+         if len(tokenised_query) > 1:
+             pattern_lemma = [{"LEMMA": {"IN": tokenised_query}}]
+             pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": tokenised_query}}}]
+         else:
+             pattern_lemma = [{"LEMMA": tokenised_query[0]}]
+             pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: tokenised_query[0]}}]
+
+         matcher = Matcher(nlp.vocab)
+         matcher.add(string_query, [pattern_fuzz])
+         matcher.add(string_query, [pattern_lemma])

-     # %%
+     else:
+         # If matching a whole phrase, use spaCy PhraseMatcher, then filter candidate matches by Levenshtein distance.
+         tokenised_query = [string_query.lower()]
+         # If you want to match the whole phrase, use phrase matcher
+         matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
+         patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
+         matcher.add("PHRASE", patterns)
+
    batch_size = 256
    docs = nlp.pipe(df_list, batch_size=batch_size)

@@ -59,7 +73,25 @@ def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], origin
    for doc in progress.tqdm(docs, desc = "Searching text", unit = "rows"):
        matches = matcher(doc)
        match_count = len(matches)
-         all_matches.append(match_count)
+
+         # If considering each sub term individually, append the match count directly. If matching the whole phrase, weight candidates against the full query.
+         if search_whole_phrase==False:
+             all_matches.append(match_count)
+
+         else:
+             for match_id, start, end in matches:
+                 span = str(doc[start:end]).strip()
+                 query_search = str(query).strip()
+                 distance = Levenshtein.distance(query_search, span)
+
+                 # Discount matches whose Levenshtein distance from the query exceeds the allowed number of spelling mistakes
+                 if distance > spelling_mistakes_max:
+                     match_count = match_count - 1
+
+             all_matches.append(match_count)
+
+     #print("all_matches:", all_matches)

    print("Search complete")

@@ -76,7 +108,7 @@ def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], origin
    "search_text": df_list,
    "search_score_abs": match_scores})
    results_df['search_score_abs'] = abs(round(results_df['search_score_abs']*100, 2))
-     results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left")
+     results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left").drop(["index_x", "index_y"], axis=1, errors="ignore")

    # Keep only results with at least one match
    results_df_out = results_df_out.loc[results_df["search_score_abs"] > 0, :]
@@ -97,7 +129,10 @@ def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], origin
    results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)

    # Out file
-     query_str_file = ("_").join(tokenised_query)
+     query_str_file = "_".join(tokenised_query).replace(" ", "_") # Replace spaces with underscores
+     query_str_file = re.sub(r'[<>:"/\\|?*]', '', query_str_file) # Remove invalid characters
+     query_str_file = query_str_file[:30] # Limit to 30 characters
+
    results_df_name = output_folder + "fuzzy_keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"

    print("Saving search file output")
@@ -105,11 +140,21 @@ def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], origin

    #results_df_out.to_excel(results_df_name, index= None)

+     print("string_query:", string_query)
+     print(results_df_out)
+
    # Highlight found text and save to file
    results_df_out_wb = create_highlighted_excel_wb(results_df_out, string_query, "search_text")
    results_df_out_wb.save(results_df_name)

-     results_first_text = results_df_out[text_column].iloc[0]
+     #results_first_text = results_df_out[text_column].iloc[0]
+
+     # Check if the DataFrame is empty or if the column does not exist
+     if results_df_out.empty or text_column not in results_df_out.columns:
+         results_first_text = "" #None # or handle it as needed
+         print("Nothing found.")
+     else:
+         results_first_text = results_df_out[text_column].iloc[0]

    print("Returning results")
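
The heart of the new whole-phrase fuzzy matching, reduced to a standalone sketch: PhraseMatcher finds candidate spans by exact (case-insensitive) token match, and candidates whose Levenshtein distance from the query exceeds the allowed number of spelling mistakes are discounted. Because PhraseMatcher itself matches exactly, the Levenshtein step only filters candidates rather than adding approximate ones. The texts and threshold below are made up; the app itself loads en_core_web_sm via load_spacy_model().

    # Self-contained illustration of the whole-phrase path added to spacy_fuzzy_search
    import spacy
    import Levenshtein
    from spacy.matcher import PhraseMatcher

    nlp = spacy.blank("en")
    query = "fuzzy search"
    spelling_mistakes_max = 1

    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    matcher.add("PHRASE", [nlp.make_doc(query)])

    texts = ["A note about Fuzzy Search options", "Nothing relevant here"]
    for doc in nlp.pipe(texts):
        matches = matcher(doc)
        match_count = len(matches)
        for match_id, start, end in matches:
            span = str(doc[start:end]).strip()
            # Discount spans further than the allowed number of edits from the query
            if Levenshtein.distance(query, span.lower()) > spelling_mistakes_max:
                match_count -= 1
        print(doc.text, "->", match_count)  # 1 for the first text, 0 for the second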