Commit: e0fe055
Parent(s): 7e9dd76
Gradio 4.21. Limitations on file size and creating embeddings. Added AWS integration
Files changed:
- .gitignore +1 -0
- README.md +1 -1
- app.py +33 -7
- requirements.txt +3 -2
- search_funcs/aws_functions.py +164 -0
- search_funcs/helper_functions.py +14 -1
.gitignore CHANGED
@@ -16,6 +16,7 @@
 *.pkl
 *.pkl.gz
 *.pem
+*.json.out
 docs/*
 build/*
 dist/*
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🔍
 colorFrom: purple
 colorTo: green
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.21.0
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py CHANGED
@@ -11,9 +11,10 @@ from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
 from search_funcs.semantic_functions import docs_to_bge_embed_np_array, bge_simple_retrieval
 from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder
 from search_funcs.spacy_search_funcs import spacy_fuzzy_search
+from search_funcs.aws_functions import load_data_from_aws
 
-from fastapi import FastAPI
-app = FastAPI()
+#from fastapi import FastAPI
+#app = FastAPI()
 
 # Attempt to delete temporary files generated by previous use of the app (as the files can be very big!)
 temp_folder_path = get_temp_folder_path()
@@ -155,19 +156,34 @@ depends on factors such as the type of documents or queries. Information taken f
 in_join_message = gr.Textbox(label="Join file load progress")
 in_join_column = gr.Dropdown(label="Column to join in new data frame")
 search_df_join_column = gr.Dropdown(label="Column to join in search data frame")
+with gr.Accordion(label = "AWS data access", open = False):
+    with gr.Row():
+        in_aws_keyword_file = gr.Dropdown(label="Choose keyword file to load from AWS (only valid for API Gateway app)", choices=["None", "Bioasq - Biomedical example data - keyword search"])
+        load_aws_keyword_data_button = gr.Button(value="Load keyword data from AWS", variant="secondary")
+    with gr.Row():
+        in_aws_semantic_file = gr.Dropdown(label="Choose semantic file to load from AWS (only valid for API Gateway app)", choices=["None", "Bioasq - Biomedical example data - semantic search"])
+        load_aws_semantic_data_button = gr.Button(value="Load semantic data from AWS", variant="secondary")
+
+    out_aws_data_message = gr.Textbox(label="AWS data load progress")
 
-
-
+# Changing search parameters button
+in_search_param_button.click(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message])
+
 # ---
 in_k1_button.click(display_info, inputs=in_k1_info)
 in_b_button.click(display_info, inputs=in_b_info)
 in_alpha_button.click(display_info, inputs=in_alpha_info)
 in_no_search_results_button.click(display_info, inputs=in_no_search_info)
+
+### Loading AWS data ###
+load_aws_keyword_data_button.click(fn=load_data_from_aws, inputs=[in_aws_keyword_file], outputs=[in_bm25_file, out_aws_data_message])
+load_aws_semantic_data_button.click(fn=load_data_from_aws, inputs=[in_aws_semantic_file], outputs=[in_semantic_file, out_aws_data_message])
+
 
 ### BM25 SEARCH ###
 # Update dropdowns upon initial file load
-in_bm25_file.
-in_join_file.
+in_bm25_file.change(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, keyword_data_state, orig_keyword_data_state, search_index_state, embeddings_state, tokenised_state, load_finished_message, current_source])
+in_join_file.change(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
 
 # Load in BM25 data
 load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, tokenised_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, keyword_data_list_state, in_bm25_column]).\
@@ -184,7 +200,7 @@ depends on factors such as the type of documents or queries. Information taken f
 ### SEMANTIC SEARCH ###
 
 # Load in a csv/excel file for semantic search
-in_semantic_file.
+in_semantic_file.change(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, search_index_state, embeddings_state, tokenised_state, semantic_load_progress, current_source_semantic])
 load_semantic_data_button.click(
     csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress, output_file_state]).\
     then(docs_to_bge_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, output_file_state, in_clean_data, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file, output_file_state])
@@ -196,6 +212,16 @@ depends on factors such as the type of documents or queries. Information taken f
 # Simple run for HF spaces or local on your computer
 #block.queue().launch(debug=True)
 
+#def get_params(request: gr.Request):
+#    if request:
+#        print("Request headers dictionary:", request.headers)
+#        print("IP address:", request.client.host)
+#        print("Query parameters:", dict(request.query_params))
+#        return request.query_params
+
+#request_params = get_params()
+#print(request_params)
+
 # Running on server (e.g. AWS) without specifying port
 block.queue().launch(ssl_verify=False) # root_path="/data-text-search" # server_name="0.0.0.0",
 
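For reference, the AWS wiring above follows the usual Gradio pattern: the clicked function returns a tuple and each element fills one output component in order, so the (files, message) pair from load_data_from_aws lands in a file component and the progress textbox. A minimal, self-contained sketch of that pattern (hypothetical names, not the app's own components) might look like this:

import os
import tempfile
import gradio as gr

def load_example_data(choice):
    # Stand-in for load_data_from_aws: returns (list of local file paths, status message).
    if not choice or choice == "None":
        return None, "No dataset selected"
    # Write a small placeholder file to stand in for downloaded data.
    path = os.path.join(tempfile.mkdtemp(), "example.csv")
    with open(path, "w") as f:
        f.write("id,text\n1,hello\n")
    return [path], f"Loaded 1 file for: {choice}"

with gr.Blocks() as demo:
    dataset_choice = gr.Dropdown(label="Dataset", choices=["None", "Example dataset"])
    load_button = gr.Button("Load data")
    loaded_files = gr.File(label="Loaded files", file_count="multiple")
    status = gr.Textbox(label="Load progress")

    # Each element of the returned tuple fills one output component, in order.
    load_button.click(fn=load_example_data, inputs=[dataset_choice], outputs=[loaded_files, status])

demo.launch()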
requirements.txt CHANGED
@@ -7,6 +7,7 @@ openpyxl==3.1.2
 torch==2.1.2
 spacy==3.7.2
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
-gradio==4.
+gradio==4.21.0
 sentence_transformers==2.3.1
-lxml==5.1.0
+lxml==5.1.0
+boto3==1.34.63
search_funcs/aws_functions.py ADDED
@@ -0,0 +1,164 @@
+from typing import Type
+import pandas as pd
+import boto3
+import tempfile
+import os
+
+PandasDataFrame = Type[pd.DataFrame]
+
+bucket_name = 'data-text-search-data'
+
+# Create a Session with the IAM role ARN
+aws_role = 'arn:aws:iam::460501890304:role/ecsTaskExecutionRole'
+
+try:
+    session = boto3.Session(profile_name="default")
+except Exception as e:
+    print(e)
+
+#sts = session.client("sts")
+# response = sts.assume_role(
+#     RoleArn=aws_role,
+#     RoleSessionName="ecs-test-session"
+# )
+# print(response)
+
+
+def get_assumed_role_info():
+    sts = boto3.client('sts')
+    response = sts.get_caller_identity()
+
+    # Extract ARN of the assumed role
+    assumed_role_arn = response['Arn']
+
+    # Extract the name of the assumed role from the ARN
+    assumed_role_name = assumed_role_arn.split('/')[-1]
+
+    return assumed_role_arn, assumed_role_name
+
+assumed_role_arn, assumed_role_name = get_assumed_role_info()
+
+print("Assumed Role ARN:", assumed_role_arn)
+print("Assumed Role Name:", assumed_role_name)
+
+
+# Download direct from S3 - requires login credentials
+def download_file_from_s3(bucket_name, key, local_file_path):
+
+    s3 = boto3.client('s3')
+    s3.download_file(bucket_name, key, local_file_path)
+    print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")
+
+#download_file_from_s3(bucket_name, object_key, local_file_loc)
+
+def download_folder_from_s3(bucket_name, s3_folder, local_folder):
+    """
+    Download all files from an S3 folder to a local folder.
+    """
+    s3 = boto3.client('s3')
+
+    # List objects in the specified S3 folder
+    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
+
+    # Download each object
+    for obj in response.get('Contents', []):
+        # Extract object key and construct local file path
+        object_key = obj['Key']
+        local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))
+
+        # Create directories if necessary
+        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
+
+        # Download the object
+        try:
+            s3.download_file(bucket_name, object_key, local_file_path)
+            print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
+        except Exception as e:
+            print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
+
+
+def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
+    """
+    Download specific files from an S3 folder to a local folder.
+    """
+    s3 = boto3.client('s3')
+
+    if filenames == '*':
+        # List all objects in the S3 folder
+        response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
+        filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]
+
+    for filename in filenames:
+        object_key = os.path.join(s3_folder, filename)
+        local_file_path = os.path.join(local_folder, filename)
+
+        # Create directories if necessary
+        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
+
+        # Download the object
+        try:
+            s3.download_file(bucket_name, object_key, local_file_path)
+            print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
+        except Exception as e:
+            print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
+
+
+
+def load_data_from_aws(in_aws_keyword_file, bucket_name=bucket_name):
+
+    temp_dir = tempfile.mkdtemp()
+    local_keyword_stub = temp_dir + '/keyword/'
+    local_semantic_stub = temp_dir + '/semantic/'
+
+    files = []
+
+    if "Bioasq - Biomedical example data" in in_aws_keyword_file:
+
+        s3_folder_stub = 'example_data/bioasq/latest/'
+
+        if 'keyword' in in_aws_keyword_file:
+            s3_folder_stub = s3_folder_stub + 'keyword/'
+            local_folder_path = local_keyword_stub
+
+        if 'semantic' in in_aws_keyword_file:
+            s3_folder_stub = s3_folder_stub + 'semantic/'
+            local_folder_path = local_semantic_stub
+
+
+        # Check if folder exists
+        if not os.path.exists(local_folder_path):
+            print(f"Folder {local_folder_path} does not exist! Making folder.")
+
+            os.mkdir(local_folder_path)
+
+        # Check if folder is empty
+        if len(os.listdir(local_folder_path)) == 0:
+            print(f"Folder {local_folder_path} is empty")
+
+            if 'keyword' in in_aws_keyword_file:
+                # Download keyword folder
+                download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames='*')
+
+            if 'semantic' in in_aws_keyword_file:
+                # Download keyword folder
+                download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames=['mini-bioasq-0000_cleaned_bge_embedding_compress.npz', 'mini-bioasq-0000_cleaned_prepared_docs.pkl.gz'])
+
+            print("AWS data downloaded")
+
+        else:
+            print(f"Folder {local_folder_path} is not empty")
+
+        #files = os.listdir(local_folder_stub)
+        #print(files)
+
+        files = [os.path.join(local_folder_path, f) for f in os.listdir(local_folder_path) if os.path.isfile(os.path.join(local_folder_path, f))]
+
+        out_message = "Data successfully loaded from AWS"
+        print(out_message)
+
+    else:
+        out_message = "Data not loaded from AWS"
+        print(out_message)
+
+    return files, out_message
+
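Assuming AWS credentials for the default profile and read access to the hard-coded bucket, the new module can be exercised on its own roughly as below (note that importing it already creates a boto3 session and calls STS get_caller_identity at import time):

from search_funcs.aws_functions import load_data_from_aws

# The dropdown string is matched against "Bioasq - Biomedical example data" and
# against "keyword"/"semantic" to choose the S3 prefix and the local target folder.
files, message = load_data_from_aws("Bioasq - Biomedical example data - keyword search")

print(message)  # "Data successfully loaded from AWS" or "Data not loaded from AWS"
print(files)    # local paths of the downloaded files, under a tempfile.mkdtemp() folder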
search_funcs/helper_functions.py CHANGED
@@ -15,6 +15,10 @@ from openpyxl.cell.text import InlineFont
 from openpyxl.cell.rich_text import TextBlock, CellRichText
 from openpyxl.styles import Font, Alignment
 
+megabyte = 1024 * 1024 # Bytes in a megabyte
+file_size_mb = 500 # Size in megabytes
+file_size_bytes_500mb = megabyte * file_size_mb
+
 # Attempt to delete content of gradio temp folder
 def get_temp_folder_path():
     username = getpass.getuser()
@@ -115,7 +119,7 @@ def initial_data_load(in_file):
     if not data_file_names:
         out_message = "Please load in at least one csv/Excel/parquet data file."
         print(out_message)
-        return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, out_message
+        return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, embed_load, tokenised_load, out_message, None
 
     # This if you have loaded in a documents object for the semantic search
     if "pkl" in data_file_names[0]:
@@ -129,6 +133,15 @@
 
         current_source = current_source + get_file_path_end_with_ext(file) + " "
 
+        # Get the size of the file
+        print("Checking file size")
+        file_size = os.path.getsize(file)
+        if file_size > file_size_bytes_500mb:
+            out_message = "Data file greater than 500mb in size. Please use smaller sizes."
+            print(out_message)
+            return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, embed_load, tokenised_load, out_message, None
+
+
         df_new = read_file(file)
 
         df = pd.concat([df, df_new], ignore_index = True)
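The guard added to initial_data_load is a plain os.path.getsize comparison against the 500 MB constant defined at the top of the module; the same check in isolation (illustrative names, not the app's) is roughly:

import os

MEGABYTE = 1024 * 1024
FILE_SIZE_LIMIT_BYTES = 500 * MEGABYTE  # mirrors file_size_bytes_500mb above

def file_within_limit(path: str) -> bool:
    """Return True if the file can be loaded, False if it exceeds the 500 MB cap."""
    file_size = os.path.getsize(path)
    if file_size > FILE_SIZE_LIMIT_BYTES:
        print(f"{path} is {file_size / MEGABYTE:.0f} MB - over the 500 MB limit.")
        return False
    return True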