seanpedrickcase committed
Commit dbad462 · 1 Parent(s): 20b4aa0

Included exact/fuzzy phrase matching. Updated packages and added basic logging.
Dockerfile CHANGED
@@ -2,7 +2,7 @@
FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm AS builder

# Optional - install Lambda web adapter in case you want to run with an AWS Lambda function URL
- # COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter
+ # COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.4 /lambda-adapter /opt/extensions/lambda-adapter

# Update apt
RUN apt-get update && rm -rf /var/lib/apt/lists/*
@@ -14,10 +14,10 @@ WORKDIR /src

COPY requirements_aws.txt .

- RUN pip install torch==2.4.0+cpu --target=/install --index-url https://download.pytorch.org/whl/cpu \
- && pip install --no-cache-dir --target=/install sentence-transformers==3.0.1 --no-deps \
+ RUN pip install torch==2.5.1+cpu --target=/install --index-url https://download.pytorch.org/whl/cpu \
+ && pip install --no-cache-dir --target=/install sentence-transformers==3.3.1 --no-deps \
&& pip install --no-cache-dir --target=/install -r requirements_aws.txt \
- && pip install --no-cache-dir --target=/install gradio==4.41.0
+ && pip install --no-cache-dir --target=/install gradio==5.6.0

# Add /install to the PYTHONPATH
ENV PYTHONPATH="/install:${PYTHONPATH}"
@@ -57,6 +57,7 @@ ENV HOME=/home/user \
GRADIO_NUM_PORTS=1 \
GRADIO_SERVER_NAME=0.0.0.0 \
GRADIO_SERVER_PORT=7860 \
+ GRADIO_ANALYTICS_ENABLED=False \
GRADIO_THEME=huggingface \
AWS_STS_REGIONAL_ENDPOINT=regional \
SYSTEM=spaces
app.py CHANGED
@@ -3,20 +3,35 @@ import gradio as gr
import pandas as pd
import numpy as np
import os
+ import socket
+ from datetime import datetime
+
PandasDataFrame = Type[pd.DataFrame]

from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
- from search_funcs.semantic_functions import load_embedding_model, docs_to_embed_np_array, bge_semantic_search
- from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_connection_params, output_folder, get_or_create_env_var # Not currently used: get_temp_folder_path, empty_folder,
+ from search_funcs.semantic_functions import load_embedding_model, docs_to_embed_np_array, semantic_search
+ from search_funcs.helper_functions import display_info, get_input_file_names, initial_data_load, put_columns_in_join_df, get_connection_params, output_folder, get_or_create_env_var
from search_funcs.spacy_search_funcs import spacy_fuzzy_search
- from search_funcs.aws_functions import load_data_from_aws
+ from search_funcs.aws_functions import load_data_from_aws, upload_file_to_s3
from search_funcs.auth import authenticate_user
+ from search_funcs.custom_csvlogger import CSVLogger_custom
+
+ today_rev = datetime.now().strftime("%Y%m%d")

# Attempt to delete temporary files generated by previous use of the app (as the files can be very big!). Only setup to work for local runs in Windows (not used at the moment).
# temp_folder_path = get_temp_folder_path()
# empty_folder(temp_folder_path)

+ host_name = socket.gethostname()
+
+ # Logging state
+ log_file_name = 'log.csv'
+
+ feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
+ access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
+ usage_logs_folder = 'usage/' + today_rev + '/' + host_name + '/'
+
## Gradio app - BM25 search
app = gr.Blocks(theme = gr.themes.Base()) # , css="theme.css"

@@ -47,6 +62,20 @@ with app:
    join_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(), visible=False) #gr.State(pd.DataFrame())
    output_file_state = gr.State([]) #gr.Dataframe(type="array", visible=False) #gr.State([])

+     feedback_logs_state = gr.State(feedback_logs_folder + log_file_name)
+     feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
+     access_logs_state = gr.State(access_logs_folder + log_file_name)
+     access_s3_logs_loc_state = gr.State(access_logs_folder)
+     usage_logs_state = gr.State(usage_logs_folder + log_file_name)
+     usage_s3_logs_loc_state = gr.State(usage_logs_folder)
+
+     data_file_name_textbox = gr.Textbox(label = "data_file_name_textbox", value="", visible=False)
+     doc_full_file_name_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
+     data_file_name_no_extension_textbox = gr.Textbox(label = "data_file_name_no_extension_textbox", value="", visible=False)
+     data_file_name_with_extension_textbox = gr.Textbox(label = "data_file_name_with_extension_textbox", value="", visible=False)
+     s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
+     session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
+
    # Informational state objects
    in_k1_info = gr.State("""k1: Constant used for influencing the term frequency saturation. After saturation is reached, additional
presence for the term adds a significantly less additional score. According to [1]_, experiments suggest
@@ -89,7 +118,7 @@ depends on factors such as the type of documents or queries. Information taken f
    keyword_query = gr.Textbox(label="Enter your search term")
    with gr.Row():
        keyword_search_button = gr.Button(value="Keyword search", variant="primary", scale=1)
-         fuzzy_search_button = gr.Button(value="Fuzzy search (slow, < 10k rows)", variant="secondary", scale = 0)
+         fuzzy_search_button = gr.Button(value="Fuzzy search (slow)", variant="secondary", scale = 0)
    with gr.Row():
        output_single_text = gr.Textbox(label="Top result")
        output_file = gr.File(label="File output")
@@ -131,7 +160,7 @@ depends on factors such as the type of documents or queries. Information taken f
    return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
    embeddings_compress = gr.Dropdown(label = "Round embeddings to int8 precision for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
    #save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
-     with gr.Accordion(label="Keyword search options", open = False):
+     with gr.Accordion(label="BM25 search options", open = False):
        with gr.Row():
            in_k1 = gr.Slider(label = "k1 value", value = 1.5, minimum = 0.1, maximum = 5, step = 0.1, scale = 3)
            in_k1_button = gr.Button(value = "k1 value info", scale = 1)
@@ -147,7 +176,8 @@ depends on factors such as the type of documents or queries. Information taken f
    with gr.Row():
        in_search_param_button = gr.Button(value="Load search parameters (Need to click this if you changed anything above)")
    with gr.Accordion(label="Fuzzy search options", open = False):
-         no_spelling_mistakes = gr.Slider(label = "Number of spelling mistakes allowed in fuzzy search", value = 1, minimum=1, maximum=4, step=1)
+         search_whole_phrase_bool = gr.Checkbox(label= "Search for the whole phrase (rather than individual words within also)", value=True)
+         spelling_mistakes_max_num = gr.Slider(label = "Maximum number of spelling mistakes allowed in fuzzy search", value = 1, minimum=1, maximum=4, step=1)

    with gr.Accordion(label = "Join on additional dataframes to results", open = False):
        in_join_file = gr.File(label="Upload your data to join here")
@@ -181,19 +211,20 @@ depends on factors such as the type of documents or queries. Information taken f

    ### BM25 SEARCH ###
    # Update dropdowns upon initial file load
-     in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, prepared_keyword_data_state, orig_keyword_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, load_finished_message, current_source, in_bm25_file], api_name="keyword_data_load")
+     in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, prepared_keyword_data_state, orig_keyword_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, load_finished_message, current_source, in_bm25_file], api_name="keyword_data_load").then(fn=get_input_file_names, inputs=[in_bm25_file], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, doc_full_file_name_textbox])
+
    in_join_file.change(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])

    # Load in BM25 data
    load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, prepared_keyword_data_state, tokenised_prepared_keyword_data_state, in_clean_data, return_intermediate_files], outputs=[tokenised_prepared_keyword_data_state, load_finished_message, prepared_keyword_data_state, output_file, output_file, in_bm25_column], api_name="load_keyword").\
-         then(fn=prepare_bm25, inputs=[tokenised_prepared_keyword_data_state, in_bm25_file, in_bm25_column, bm25_search_index_state, in_clean_data, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file, bm25_search_index_state, tokenised_prepared_keyword_data_state], api_name="prepare_keyword") # keyword_data_list_state
+         then(fn=prepare_bm25, inputs=[tokenised_prepared_keyword_data_state, in_bm25_file, in_bm25_column, bm25_search_index_state, in_clean_data, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file, bm25_search_index_state, tokenised_prepared_keyword_data_state], api_name="prepare_keyword")

    # BM25 search functions on click or enter
    keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, prepared_keyword_data_state, in_bm25_column, join_data_state, in_clean_data, bm25_search_index_state, tokenised_prepared_keyword_data_state, in_join_column, search_df_join_column, in_k1, in_b, in_alpha], outputs=[output_single_text, output_file], api_name="keyword_search")
    keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, prepared_keyword_data_state, in_bm25_column, join_data_state, in_clean_data, bm25_search_index_state, tokenised_prepared_keyword_data_state, in_join_column, search_df_join_column, in_k1, in_b, in_alpha], outputs=[output_single_text, output_file])

    # Fuzzy search functions on click
-     fuzzy_search_button.click(fn=spacy_fuzzy_search, inputs=[keyword_query, tokenised_prepared_keyword_data_state, prepared_keyword_data_state, in_bm25_column, join_data_state, search_df_join_column, in_join_column, no_spelling_mistakes], outputs=[output_single_text, output_file], api_name="fuzzy_search")
+     fuzzy_search_button.click(fn=spacy_fuzzy_search, inputs=[keyword_query, tokenised_prepared_keyword_data_state, prepared_keyword_data_state, in_bm25_column, join_data_state, search_df_join_column, in_join_column, spelling_mistakes_max_num, search_whole_phrase_bool], outputs=[output_single_text, output_file], api_name="fuzzy_search")

    ### SEMANTIC SEARCH ###

@@ -205,10 +236,39 @@ depends on factors such as the type of documents or queries. Information taken f
        then(docs_to_embed_np_array, inputs=[semantic_input_document_format, in_semantic_file, output_file_state, in_clean_data, embeddings_state, embeddings_model_name_state, embeddings_model_loc_state, return_intermediate_files, embeddings_compress], outputs=[semantic_load_progress, embeddings_state, semantic_output_file, output_file_state, embeddings_model_state], api_name="embed_documents")

    # Semantic search query
-     semantic_submit.click(bge_semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, embeddings_model_state, embeddings_model_name_state, embeddings_compress, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic_search")
-     semantic_query.submit(bge_semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, embeddings_model_state, embeddings_model_name_state, embeddings_compress, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
+     semantic_submit.click(semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, embeddings_model_state, embeddings_model_name_state, embeddings_compress, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic_search")
+     semantic_query.submit(semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, embeddings_model_state, embeddings_model_name_state, embeddings_compress, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
+
+     ###
+     # APP LOAD AND LOGGING FUNCTIONS
+     ###

-     app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])
+     app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
+
+     # Log usernames and times of access to file (to know who is using the app when running on AWS)
+     access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
+     access_callback.setup([session_hash_textbox], access_logs_folder)
+
+     session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
+         then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+
+     # User submitted feedback for pdf redactions
+     # pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
+     # pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, data_file_name_no_extension_textbox], feedback_logs_folder)
+     # pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, data_file_name_no_extension_textbox], None, preprocess=False).\
+     #     then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
+
+     # Log processing time/token usage when making a query
+     usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
+     usage_callback.setup([session_hash_textbox, data_file_name_no_extension_textbox, data_file_name_textbox], usage_logs_folder)
+
+     # If output files are created, write logs to s3 (if possible)
+     output_file.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, data_file_name_no_extension_textbox, data_file_name_textbox], preprocess=False).\
+         then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+
+     semantic_output_file.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, data_file_name_no_extension_textbox, data_file_name_textbox], preprocess=False).\
+         then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])


COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
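
To see the new logging wiring in isolation: CSVLogger_custom writes one CSV row each time a watched component changes, and upload_file_to_s3 then copies that CSV to the bucket. The sketch below reproduces the access-log chain from app.py in a minimal Blocks app; the folder name and the standalone demo app are illustrative assumptions, while CSVLogger_custom and upload_file_to_s3 are the pieces added in this commit.

    # Minimal sketch (assumed setup), not part of the commit itself
    import gradio as gr
    from search_funcs.custom_csvlogger import CSVLogger_custom
    from search_funcs.aws_functions import upload_file_to_s3

    log_file_name = 'log.csv'
    access_logs_folder = 'logs/example-date/example-host/'  # illustrative path

    with gr.Blocks() as demo:
        session_hash_textbox = gr.Textbox(visible=False)    # filled by app.load in the real app
        s3_logs_output_textbox = gr.Textbox(visible=False)  # receives the upload status message
        access_logs_state = gr.State(access_logs_folder + log_file_name)
        access_s3_logs_loc_state = gr.State(access_logs_folder)

        # Append a CSV row locally whenever the session hash changes, then push the CSV to S3
        access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
        access_callback.setup([session_hash_textbox], access_logs_folder)
        session_hash_textbox.change(
            lambda *args: access_callback.flag(list(args)),
            [session_hash_textbox], None, preprocess=False
        ).then(fn=upload_file_to_s3,
               inputs=[access_logs_state, access_s3_logs_loc_state],
               outputs=[s3_logs_output_textbox])
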
requirements.txt CHANGED
@@ -1,11 +1,12 @@
- pandas==2.2.2
+ pandas==2.2.3
polars==0.20.3
- pyarrow==14.0.2
+ pyarrow==17.0.0
openpyxl==3.1.3
- torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu
- spacy
- en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
- gradio
- sentence_transformers==3.0.1
+ torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu
+ spacy==3.8.0
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+ gradio==5.6.0
+ sentence_transformers==3.3.1
lxml==5.2.2
- boto3==1.34.142
+ boto3==1.35.71
+ python-levenshtein==0.26.1
requirements_aws.txt CHANGED
@@ -1,13 +1,13 @@
- pandas==2.2.2
+ pandas==2.2.3
polars==0.20.3
- pyarrow==14.0.2
+ pyarrow==17.0.0
openpyxl==3.1.3
- spacy==3.7.5
- en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
+ spacy==3.8.0
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
lxml==5.2.2
- boto3==1.34.158
- transformers==4.44.0
+ boto3==1.35.71
+ transformers==4.46.3
scikit-learn==1.5.1
- scipy==1.11.4
tqdm==4.66.5
- numpy==1.26.4
+ numpy==1.26.4
+ python-levenshtein==0.26.1
requirements_gpu.txt CHANGED
@@ -1,11 +1,12 @@
- pandas==2.2.2
+ pandas==2.2.3
polars==0.20.3
- pyarrow==14.0.2
+ pyarrow==17.0.0
openpyxl==3.1.3
- torch==2.4.0 --index-url https://download.pytorch.org/whl/nightly/cu121
- spacy
- en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
- gradio
- sentence_transformers==3.0.1
+ torch==2.5.1 --index-url https://download.pytorch.org/whl/nightly/cu121
+ spacy==3.8.0
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+ gradio==5.6.0
+ sentence_transformers==3.3.1
lxml==5.2.2
- boto3==1.34.103
+ boto3==1.35.71
+ python-levenshtein==0.26.1
requirements_keyword_only.txt CHANGED
@@ -1,11 +1,12 @@
- pandas==2.2.2
+ pandas==2.2.3
polars==0.20.3
- pyarrow==14.0.2
+ pyarrow==17.0.0
openpyxl==3.1.3
- #torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu
- spacy
- en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
- gradio
- #sentence_transformers==3.0.1
+ #torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu
+ spacy==3.8.0
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+ gradio==5.6.1
+ #sentence_transformers==3.3.1
lxml==5.2.2
- #boto3==1.34.103
+ #boto3==1.35.71
+ python-levenshtein==0.26.1
search_funcs/aws_functions.py CHANGED
@@ -1,4 +1,4 @@
- from typing import Type
+ from typing import Type, List
import pandas as pd
import boto3
import tempfile
@@ -166,3 +166,46 @@ def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_

    return files, out_message

+ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=bucket_name):
+     """
+     Uploads a file from local machine to Amazon S3.
+
+     Args:
+     - local_file_path: Local file path(s) of the file(s) to upload.
+     - s3_key: Key (path) to the file in the S3 bucket.
+     - s3_bucket: Name of the S3 bucket.
+
+     Returns:
+     - Message as variable/printed to console
+     """
+     final_out_message = []
+
+     s3_client = boto3.client('s3')
+
+     if isinstance(local_file_paths, str):
+         local_file_paths = [local_file_paths]
+
+     for file in local_file_paths:
+         if s3_client:
+             #print(s3_client)
+             try:
+                 # Get file name off file path
+                 file_name = os.path.basename(file)
+
+                 s3_key_full = s3_key + file_name
+                 print("S3 key: ", s3_key_full)
+
+                 s3_client.upload_file(file, s3_bucket, s3_key_full)
+                 out_message = "File " + file_name + " uploaded successfully!"
+                 print(out_message)
+
+             except Exception as e:
+                 out_message = f"Error uploading file(s): {e}"
+                 print(out_message)
+
+             final_out_message.append(out_message)
+             final_out_message_str = '\n'.join(final_out_message)
+
+         else: final_out_message_str = "Could not connect to AWS."
+
+     return final_out_message_str
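
A hedged usage sketch for the new helper: it accepts a single path or a list, appends each base file name to the given key prefix, and returns a status string. The path and prefix below are made up; working AWS credentials and the module's default bucket_name are assumed.

    # Hypothetical call; paths and prefixes are illustrative only
    from search_funcs.aws_functions import upload_file_to_s3

    message = upload_file_to_s3(
        local_file_paths="logs/example-date/example-host/log.csv",  # str or list of str
        s3_key="usage/example-date/example-host/",                  # the file name is appended to this prefix
    )
    print(message)  # e.g. "File log.csv uploaded successfully!" or an error description
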
search_funcs/bm25_functions.py CHANGED
@@ -345,7 +345,7 @@ def prepare_bm25_input_data(

    progress(0.4, desc = "Tokenising text")

-     print("Tokenised state:", tokenised_state)
+     #print("Tokenised state:", tokenised_state)

    if tokenised_state:
        prepared_search_text_list = tokenised_state.iloc[:,0].tolist()
search_funcs/custom_csvlogger.py ADDED
@@ -0,0 +1,171 @@
+ from __future__ import annotations
+ import contextlib
+ import csv
+ import datetime
+ import os
+ import re
+ from collections.abc import Sequence
+ from multiprocessing import Lock
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any
+
+ from gradio_client import utils as client_utils
+
+ import gradio as gr
+ from gradio import utils, wasm_utils
+
+ if TYPE_CHECKING:
+     from gradio.components import Component
+ from gradio.flagging import FlaggingCallback
+ from threading import Lock
+
+ class CSVLogger_custom(FlaggingCallback):
+     """
+     The default implementation of the FlaggingCallback abstract class in gradio>=5.0. Each flagged
+     sample (both the input and output data) is logged to a CSV file with headers on the machine running
+     the gradio app. Unlike ClassicCSVLogger, this implementation is concurrent-safe and it creates a new
+     dataset file every time the headers of the CSV (derived from the labels of the components) change. It also
+     only creates columns for "username" and "flag" if the flag_option and username are provided, respectively.
+
+     Example:
+         import gradio as gr
+         def image_classifier(inp):
+             return {'cat': 0.3, 'dog': 0.7}
+         demo = gr.Interface(fn=image_classifier, inputs="image", outputs="label",
+                             flagging_callback=CSVLogger())
+     Guides: using-flagging
+     """
+
+     def __init__(
+         self,
+         simplify_file_data: bool = True,
+         verbose: bool = True,
+         dataset_file_name: str | None = None,
+     ):
+         """
+         Parameters:
+             simplify_file_data: If True, the file data will be simplified before being written to the CSV file. If CSVLogger is being used to cache examples, this is set to False to preserve the original FileData class
+             verbose: If True, prints messages to the console about the dataset file creation
+             dataset_file_name: The name of the dataset file to be created (should end in ".csv"). If None, the dataset file will be named "dataset1.csv" or the next available number.
+         """
+         self.simplify_file_data = simplify_file_data
+         self.verbose = verbose
+         self.dataset_file_name = dataset_file_name
+         self.lock = (
+             Lock() if not wasm_utils.IS_WASM else contextlib.nullcontext()
+         ) # The multiprocessing module doesn't work on Lite.
+
+     def setup(
+         self,
+         components: Sequence[Component],
+         flagging_dir: str | Path,
+     ):
+         self.components = components
+         self.flagging_dir = Path(flagging_dir)
+         self.first_time = True
+
+     def _create_dataset_file(self, additional_headers: list[str] | None = None):
+         os.makedirs(self.flagging_dir, exist_ok=True)
+
+         if additional_headers is None:
+             additional_headers = []
+         headers = (
+             [
+                 getattr(component, "label", None) or f"component {idx}"
+                 for idx, component in enumerate(self.components)
+             ]
+             + additional_headers
+             + [
+                 "timestamp",
+             ]
+         )
+         headers = utils.sanitize_list_for_csv(headers)
+         dataset_files = list(Path(self.flagging_dir).glob("dataset*.csv"))
+
+         if self.dataset_file_name:
+             self.dataset_filepath = self.flagging_dir / self.dataset_file_name
+         elif dataset_files:
+             try:
+                 latest_file = max(
+                     dataset_files, key=lambda f: int(re.findall(r"\d+", f.stem)[0])
+                 )
+                 latest_num = int(re.findall(r"\d+", latest_file.stem)[0])
+
+                 with open(latest_file, newline="", encoding="utf-8") as csvfile:
+                     reader = csv.reader(csvfile)
+                     existing_headers = next(reader, None)
+
+                 if existing_headers != headers:
+                     new_num = latest_num + 1
+                     self.dataset_filepath = self.flagging_dir / f"dataset{new_num}.csv"
+                 else:
+                     self.dataset_filepath = latest_file
+             except Exception:
+                 self.dataset_filepath = self.flagging_dir / "dataset1.csv"
+         else:
+             self.dataset_filepath = self.flagging_dir / "dataset1.csv"
+
+         if not Path(self.dataset_filepath).exists():
+             with open(
+                 self.dataset_filepath, "w", newline="", encoding="utf-8"
+             ) as csvfile:
+                 writer = csv.writer(csvfile)
+                 writer.writerow(utils.sanitize_list_for_csv(headers))
+             if self.verbose:
+                 print("Created dataset file at:", self.dataset_filepath)
+         elif self.verbose:
+             print("Using existing dataset file at:", self.dataset_filepath)
+
+     def flag(
+         self,
+         flag_data: list[Any],
+         flag_option: str | None = None,
+         username: str | None = None,
+     ) -> int:
+         if self.first_time:
+             additional_headers = []
+             if flag_option is not None:
+                 additional_headers.append("flag")
+             if username is not None:
+                 additional_headers.append("username")
+             self._create_dataset_file(additional_headers=additional_headers)
+             self.first_time = False
+
+         csv_data = []
+         for idx, (component, sample) in enumerate(
+             zip(self.components, flag_data, strict=False)
+         ):
+             save_dir = (
+                 self.flagging_dir
+                 / client_utils.strip_invalid_filename_characters(
+                     getattr(component, "label", None) or f"component {idx}"
+                 )
+             )
+             if utils.is_prop_update(sample):
+                 csv_data.append(str(sample))
+             else:
+                 data = (
+                     component.flag(sample, flag_dir=save_dir)
+                     if sample is not None
+                     else ""
+                 )
+                 if self.simplify_file_data:
+                     data = utils.simplify_file_data_in_str(data)
+                 csv_data.append(data)
+
+         if flag_option is not None:
+             csv_data.append(flag_option)
+         if username is not None:
+             csv_data.append(username)
+         csv_data.append(str(datetime.datetime.now()))
+
+         with self.lock:
+             with open(
+                 self.dataset_filepath, "a", newline="", encoding="utf-8"
+             ) as csvfile:
+                 writer = csv.writer(csvfile)
+                 writer.writerow(utils.sanitize_list_for_csv(csv_data))
+             with open(self.dataset_filepath, encoding="utf-8") as csvfile:
+                 line_count = len(list(csv.reader(csvfile))) - 1
+
+         return line_count
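
Beyond Gradio's built-in flagging flow, the logger can also be driven manually with setup() and flag(), which is how app.py uses it for access and usage logs. A rough sketch with illustrative labels and folder (the component labels become the CSV headers); not taken from the commit itself.

    # Hypothetical standalone use of CSVLogger_custom
    import gradio as gr
    from search_funcs.custom_csvlogger import CSVLogger_custom

    session_box = gr.Textbox(label="session_hash")
    file_box = gr.Textbox(label="data_file_name")

    logger = CSVLogger_custom(dataset_file_name="log.csv")
    logger.setup([session_box, file_box], flagging_dir="logs/example/")

    # Appends one row ("abc123", "my_data.csv", timestamp) to logs/example/log.csv
    row_count = logger.flag(["abc123", "my_data.csv"])
    print(row_count)
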
search_funcs/helper_functions.py CHANGED
@@ -4,7 +4,6 @@ import pandas as pd
import gradio as gr
import os
import shutil
- import getpass
import gzip
import zipfile
import pickle
@@ -34,6 +33,14 @@ def get_or_create_env_var(var_name, default_value):

    return value

+ # Retrieving or setting CUSTOM_HEADER
+ CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
+ print(f'CUSTOM_HEADER found')
+
+ # Retrieving or setting CUSTOM_HEADER_VALUE
+ CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
+ print(f'CUSTOM_HEADER_VALUE found')
+
# Retrieving or setting output folder
output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
@@ -42,8 +49,6 @@ print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
# running_on_app_runner_var = get_or_create_env_var('RUNNING_ON_APP_RUNNER', '0')
# print(f'The value of RUNNING_ON_APP_RUNNER is {running_on_app_runner_var}')

-
-
def ensure_output_folder_exists(output_folder):
    """Checks if the output folder exists, creates it if not."""

@@ -56,72 +61,100 @@ def ensure_output_folder_exists(output_folder):
    else:
        print(f"The output folder already exists:", folder_name)

+ def get_input_file_names(file_input):
+     '''
+     Get list of input files to report to logs.
+     '''
+
+     all_relevant_files = []
+     file_name_with_extension = ""
+     full_file_name = ""
+
+     #print("file_input in input file names:", file_input)
+     if isinstance(file_input, dict):
+         file_input = os.path.abspath(file_input["name"])
+
+     if isinstance(file_input, str):
+         file_input_list = [file_input]
+     else:
+         file_input_list = file_input
+
+     for file in file_input_list:
+         if isinstance(file, str):
+             file_path = file
+         else:
+             file_path = file.name
+
+         file_path_without_ext = get_file_path_end(file_path)
+
+         file_extension = os.path.splitext(file_path)[1].lower()
+
+         # Check if the file is a relevant tabular data type
+         if file_extension in ['.xlsx', '.csv', '.parquet']:
+             all_relevant_files.append(file_path_without_ext)
+             file_name_with_extension = file_path_without_ext + file_extension
+             full_file_name = file_path
+
+     all_relevant_files_str = ", ".join(all_relevant_files)
+
+     print("all_relevant_files_str:", all_relevant_files_str)
+
+     return all_relevant_files_str, file_name_with_extension, full_file_name
+
async def get_connection_params(request: gr.Request):
    base_folder = ""

-     if request:
-         #print("request user:", request.username)
-
-         #request_data = await request.json() # Parse JSON body
-         #print("All request data:", request_data)
-         #context_value = request_data.get('context')
-         #if 'context' in request_data:
-         #    print("Request context dictionary:", request_data['context'])
-
-         # print("Request headers dictionary:", request.headers)
-         # print("All host elements", request.client)
-         # print("IP address:", request.client.host)
-         # print("Query parameters:", dict(request.query_params))
-         # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
-         #print("Request dictionary to object:", request.request.body())
-         print("Session hash:", request.session_hash)
-
-         # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER
-         CUSTOM_CLOUDFRONT_HEADER_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER', '')
-         print(f'The value of CUSTOM_CLOUDFRONT_HEADER is {CUSTOM_CLOUDFRONT_HEADER_var}')
-
-         # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER_VALUE
-         CUSTOM_CLOUDFRONT_HEADER_VALUE_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER_VALUE', '')
-         print(f'The value of CUSTOM_CLOUDFRONT_HEADER_VALUE_var is {CUSTOM_CLOUDFRONT_HEADER_VALUE_var}')
-
-         if CUSTOM_CLOUDFRONT_HEADER_var and CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
-             if CUSTOM_CLOUDFRONT_HEADER_var in request.headers:
-                 supplied_cloudfront_custom_value = request.headers[CUSTOM_CLOUDFRONT_HEADER_var]
-                 if supplied_cloudfront_custom_value == CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
-                     print("Custom Cloudfront header found:", supplied_cloudfront_custom_value)
-                 else:
-                     raise(ValueError, "Custom Cloudfront header value does not match expected value.")
-
-         # Get output save folder from 1 - username passed in from direct Cognito login, 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.
-
-         if request.username:
-             out_session_hash = request.username
-             base_folder = "user-files/"
-
-         elif 'x-cognito-id' in request.headers:
-             out_session_hash = request.headers['x-cognito-id']
-             base_folder = "user-files/"
-             print("Cognito ID found:", out_session_hash)
-
-         else:
-             out_session_hash = request.session_hash
-             base_folder = "temp-files/"
-             # print("Cognito ID not found. Using session hash as save folder:", out_session_hash)
-
-         output_folder = base_folder + out_session_hash + "/"
-         #if bucket_name:
-         #    print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
-
-         return out_session_hash, output_folder
-     else:
-         print("No session parameters found.")
-         return "",""
-
- # Attempt to delete content of gradio temp folder
- # def get_temp_folder_path():
- #     username = getpass.getuser()
- #     return os.path.join('C:\\Users', username, 'AppData\\Local\\Temp\\gradio')
-
+     #print("request user:", request.username)
+
+     #request_data = await request.json() # Parse JSON body
+     #print("All request data:", request_data)
+     #context_value = request_data.get('context')
+     #if 'context' in request_data:
+     #    print("Request context dictionary:", request_data['context'])
+
+     print("Request headers dictionary:", request.headers)
+     print("All host elements", request.client)
+     print("IP address:", request.client.host)
+     print("Query parameters:", dict(request.query_params))
+     # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
+     #print("Request dictionary to object:", request.request.body())
+     print("Session hash:", request.session_hash)
+
+     if CUSTOM_HEADER and CUSTOM_HEADER_VALUE:
+         if CUSTOM_HEADER in request.headers:
+             supplied_custom_header_value = request.headers[CUSTOM_HEADER]
+             if supplied_custom_header_value == CUSTOM_HEADER_VALUE:
+                 print("Custom header supplied and matches CUSTOM_HEADER_VALUE")
+             else:
+                 print("Custom header value does not match expected value.")
+                 raise ValueError("Custom header value does not match expected value.")
+         else:
+             print("Custom header value not found.")
+             raise ValueError("Custom header value not found.")
+
+     # Get output save folder from 1 - username passed in from direct Cognito login, 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.
+
+     if request.username:
+         out_session_hash = request.username
+         base_folder = "user-files/"
+         print("Request username found:", out_session_hash)
+
+     elif 'x-cognito-id' in request.headers:
+         out_session_hash = request.headers['x-cognito-id']
+         base_folder = "user-files/"
+         print("Cognito ID found:", out_session_hash)
+
+     else:
+         out_session_hash = request.session_hash
+         base_folder = "temp-files/"
+         # print("Cognito ID not found. Using session hash as save folder:", out_session_hash)
+
+     output_folder = base_folder + out_session_hash + "/"
+     #if bucket_name:
+     #    print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
+
+     return out_session_hash, output_folder, out_session_hash
+
def empty_folder(directory_path):
    if not os.path.exists(directory_path):
        #print(f"The directory {directory_path} does not exist. No temporary files from previous app use found to delete.")
@@ -495,15 +528,19 @@ def create_highlighted_excel_wb(df: pd.DataFrame, search_text: str, column_to_hi

    column_width = 150 # Adjust as needed
    relevant_column_no = (df.columns == column_to_highlight).argmax() + 1
-     print(relevant_column_no)
+     print("Relevant column number is:", relevant_column_no)
    sheet.column_dimensions[sheet.cell(row=1, column=relevant_column_no).column_letter].width = column_width

+     print("search_text is:", search_text)
+
    # Find substrings in cells and highlight
    for r_idx, row in enumerate(df.itertuples(), start=2):
        for c_idx, cell_value in enumerate(row[1:], start=1):
            sheet.cell(row=r_idx, column=c_idx, value=cell_value)
            if df.columns[c_idx - 1] == column_to_highlight:

+                 print("cell value:", cell_value)
+
                html_text, combined_positions = highlight_found_text(search_text, cell_value)
                sheet.cell(row=r_idx, column=c_idx).value = create_rich_text_cell_from_positions(cell_value, combined_positions)
                sheet.cell(row=r_idx, column=c_idx).alignment = Alignment(wrap_text=True)
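
For reference, get_input_file_names is what feeds the new hidden filename textboxes in app.py: it keeps only .xlsx/.csv/.parquet inputs and returns the name without extension, the name with extension, and the full path. A sketch with made-up paths, assuming get_file_path_end strips the folder and the extension:

    # Hypothetical call; file paths are illustrative
    from search_funcs.helper_functions import get_input_file_names

    names, name_with_ext, full_path = get_input_file_names(
        ["output/consultation_responses.csv", "output/readme.txt"]
    )
    # names         -> "consultation_responses"
    # name_with_ext -> "consultation_responses.csv"
    # full_path     -> "output/consultation_responses.csv"
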
search_funcs/semantic_functions.py CHANGED
@@ -8,7 +8,7 @@ from search_funcs.helper_functions import get_file_path_end, create_highlighted_
PandasDataFrame = Type[pd.DataFrame]
today_rev = datetime.now().strftime("%Y%m%d")

- def load_embedding_model(embeddings_name = "BAAI/bge-small-en-v1.5", embedding_loc="bge/"):
+ def load_embedding_model(embeddings_name = "sentence-transformers/all-MiniLM-L6-v2", embedding_loc="minilm/"):

    from torch import cuda, backends
    from sentence_transformers import SentenceTransformer
@@ -63,7 +63,7 @@ def docs_to_embed_np_array(
    progress: gr.Progress = gr.Progress(track_tqdm=True)
    ) -> tuple:
    """
-     Process documents to create BGE embeddings and save them as a numpy array.
+     Process documents to create embeddings and save them as a numpy array.

    Parameters:
    - docs_out (list): List of documents to be embedded.
@@ -119,7 +119,8 @@ def docs_to_embed_np_array(
        print("Embedding with MiniLM-L6-v2 model")

    if embeddings_compress == "No":
-         print("Embedding with full fp32 precision")
+         print("Embedding with fp16 precision")
+         embeddings_model.half()
        embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size)
    else:
        print("Embedding with int8 precision")
@@ -235,7 +236,7 @@ def process_data_from_scores_df(

    return results_df_out

- def bge_semantic_search(
+ def semantic_search(
    query_str: str,
    embeddings: np.ndarray,
    documents: list,
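
The embeddings_compress option now maps to fp16 model weights ("No") versus int8-compressed output embeddings ("Yes"). The repository's int8 branch is not visible in this hunk, so the second call below is only an assumed illustration of how sentence-transformers produces int8 vectors; the model name matches the new default and the sentences are made up.

    # Hedged sketch of the two precision paths, using sentence-transformers directly
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    sentences = ["example document one", "example document two"]

    # embeddings_compress == "No": halve the model weights, keep float output embeddings
    model.half()
    emb_fp16 = model.encode(sentences, show_progress_bar=True, batch_size=32)

    # embeddings_compress == "Yes": request int8-quantised output vectors (smaller files, some accuracy loss)
    emb_int8 = model.encode(sentences, show_progress_bar=True, batch_size=32, precision="int8")
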
search_funcs/spacy_search_funcs.py CHANGED
@@ -1,54 +1,68 @@
import numpy as np
import gradio as gr
import pandas as pd
+ import Levenshtein
from typing import List, Type
from datetime import datetime
+ import re
+
from search_funcs.helper_functions import create_highlighted_excel_wb, output_folder, load_spacy_model
+ from spacy import prefer_gpu
+ from spacy.matcher import Matcher, PhraseMatcher

PandasDataFrame = Type[pd.DataFrame]

today_rev = datetime.now().strftime("%Y%m%d")

- def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
+ def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, spelling_mistakes_max:int = 1, search_whole_phrase:bool=False, progress=gr.Progress(track_tqdm=True)):
    ''' Conduct fuzzy match on a list of data.'''

-     import spacy
-     spacy.prefer_gpu()
-     from spacy.matcher import Matcher
+     if not tokenised_data:
+         out_message = "Prepared data not found. Have you clicked 'Load data' above to prepare a search index?"
+         print(out_message)
+         return out_message, None
+
+     # Lower case query
+     string_query = string_query.lower()
+
+     prefer_gpu()

    # Load spaCy model
    nlp = load_spacy_model()

    # Convert tokenised data back into a list of strings
    df_list = list(map(" ".join, tokenised_data))

-     if len(df_list) > 10000:
-         out_message = "Your data has more than 10,000 rows and will take more than three minutes to do a fuzzy search. Please try keyword or semantic search for data of this size."
+     if len(df_list) > 100000:
+         out_message = "Your data has more than 100,000 rows and will take more than 30 minutes to do a fuzzy search. Please try keyword or semantic search for data of this size."
        return out_message, None

    query = nlp(string_query)
-     tokenised_query = [token.text for token in query]
-     print(tokenised_query)

-     spelling_mistakes_fuzzy_pattern = "FUZZY" + str(no_spelling_mistakes)
+     if search_whole_phrase == False:
+         tokenised_query = [token.text for token in query]

-     # %%
-     if len(tokenised_query) > 1:
-         pattern_lemma = [{"LEMMA": {"IN": tokenised_query}}]
-         pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": tokenised_query}}}]
-     else:
-         pattern_lemma = [{"LEMMA": tokenised_query[0]}]
-         pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: tokenised_query[0]}}]
-
-     # %%
-     matcher = Matcher(nlp.vocab)
-
-     # %%
-     matcher.add(string_query, [pattern_fuzz])
-     matcher.add(string_query, [pattern_lemma])
+         spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
+
+         if len(tokenised_query) > 1:
+             pattern_lemma = [{"LEMMA": {"IN": tokenised_query}}]
+             pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": tokenised_query}}}]
+         else:
+             pattern_lemma = [{"LEMMA": tokenised_query[0]}]
+             pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: tokenised_query[0]}}]
+
+         matcher = Matcher(nlp.vocab)
+         matcher.add(string_query, [pattern_fuzz])
+         matcher.add(string_query, [pattern_lemma])

-     # %%
+     else:
+         # If matching a whole phrase, use spaCy PhraseMatcher, then filter candidate matches by Levenshtein distance.
+         tokenised_query = [string_query.lower()]
+         # If you want to match the whole phrase, use phrase matcher
+         matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
+         patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
+         matcher.add("PHRASE", patterns)
+
    batch_size = 256
    docs = nlp.pipe(df_list, batch_size=batch_size)

@@ -59,7 +73,25 @@ def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], origin
    for doc in progress.tqdm(docs, desc = "Searching text", unit = "rows"):
        matches = matcher(doc)
        match_count = len(matches)
-         all_matches.append(match_count)
+
+         # If considering each sub term individually, append the match count directly. If matching the whole phrase, weight candidates against the full query.
+         if search_whole_phrase==False:
+             all_matches.append(match_count)
+
+         else:
+             for match_id, start, end in matches:
+                 span = str(doc[start:end]).strip()
+                 query_search = str(query).strip()
+                 distance = Levenshtein.distance(query_search, span)
+
+                 # Discount matches whose Levenshtein distance from the query exceeds the allowed number of spelling mistakes
+                 if distance > spelling_mistakes_max:
+                     match_count = match_count - 1
+
+             all_matches.append(match_count)
+
+     #print("all_matches:", all_matches)

    print("Search complete")

@@ -76,7 +108,7 @@ def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], origin
    "search_text": df_list,
    "search_score_abs": match_scores})
    results_df['search_score_abs'] = abs(round(results_df['search_score_abs']*100, 2))
-     results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left")
+     results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left").drop(["index_x", "index_y"], axis=1, errors="ignore")

    # Keep only results with at least one match
    results_df_out = results_df_out.loc[results_df["search_score_abs"] > 0, :]
@@ -97,7 +129,10 @@ def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], origin
    results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)

    # Out file
-     query_str_file = ("_").join(tokenised_query)
+     query_str_file = "_".join(tokenised_query).replace(" ", "_") # Replace spaces with underscores
+     query_str_file = re.sub(r'[<>:"/\\|?*]', '', query_str_file) # Remove invalid characters
+     query_str_file = query_str_file[:30] # Limit to 30 characters
+
    results_df_name = output_folder + "fuzzy_keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"

    print("Saving search file output")
@@ -105,11 +140,21 @@ def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], origin

    #results_df_out.to_excel(results_df_name, index= None)

+     print("string_query:", string_query)
+     print(results_df_out)
+
    # Highlight found text and save to file
    results_df_out_wb = create_highlighted_excel_wb(results_df_out, string_query, "search_text")
    results_df_out_wb.save(results_df_name)

-     results_first_text = results_df_out[text_column].iloc[0]
+     #results_first_text = results_df_out[text_column].iloc[0]
+
+     # Check if the DataFrame is empty or if the column does not exist
+     if results_df_out.empty or text_column not in results_df_out.columns:
+         results_first_text = "" #None # or handle it as needed
+         print("Nothing found.")
+     else:
+         results_first_text = results_df_out[text_column].iloc[0]

    print("Returning results")
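
The heart of the new whole-phrase fuzzy matching, reduced to a standalone sketch: PhraseMatcher finds candidate spans by exact (case-insensitive) token match, and candidates whose Levenshtein distance from the query exceeds the allowed number of spelling mistakes are discounted. Because PhraseMatcher itself matches exactly, the Levenshtein step only filters candidates rather than adding approximate ones. The texts and threshold below are made up; the app itself loads en_core_web_sm via load_spacy_model().

    # Self-contained illustration of the whole-phrase path added to spacy_fuzzy_search
    import spacy
    import Levenshtein
    from spacy.matcher import PhraseMatcher

    nlp = spacy.blank("en")
    query = "fuzzy search"
    spelling_mistakes_max = 1

    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    matcher.add("PHRASE", [nlp.make_doc(query)])

    texts = ["A note about Fuzzy Search options", "Nothing relevant here"]
    for doc in nlp.pipe(texts):
        matches = matcher(doc)
        match_count = len(matches)
        for match_id, start, end in matches:
            span = str(doc[start:end]).strip()
            # Discount spans further than the allowed number of edits from the query
            if Levenshtein.distance(query, span.lower()) > spelling_mistakes_max:
                match_count -= 1
        print(doc.text, "->", match_count)  # 1 for the first text, 0 for the second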