Commit: e0fe055
Parent(s): 7e9dd76
Gradio 4.21. Limitations on file size and creating embeddings. Added AWS integration
Files changed:
- .gitignore +1 -0
- README.md +1 -1
- app.py +33 -7
- requirements.txt +3 -2
- search_funcs/aws_functions.py +164 -0
- search_funcs/helper_functions.py +14 -1
.gitignore CHANGED
@@ -16,6 +16,7 @@
 *.pkl
 *.pkl.gz
 *.pem
+*.json.out
 docs/*
 build/*
 dist/*
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🔍
 colorFrom: purple
 colorTo: green
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.21.0
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py CHANGED
@@ -11,9 +11,10 @@ from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
 from search_funcs.semantic_functions import docs_to_bge_embed_np_array, bge_simple_retrieval
 from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder
 from search_funcs.spacy_search_funcs import spacy_fuzzy_search
+from search_funcs.aws_functions import load_data_from_aws
 
-from fastapi import FastAPI
-app = FastAPI()
+#from fastapi import FastAPI
+#app = FastAPI()
 
 # Attempt to delete temporary files generated by previous use of the app (as the files can be very big!)
 temp_folder_path = get_temp_folder_path()
@@ -155,19 +156,34 @@ depends on factors such as the type of documents or queries. Information taken f
 in_join_message = gr.Textbox(label="Join file load progress")
 in_join_column = gr.Dropdown(label="Column to join in new data frame")
 search_df_join_column = gr.Dropdown(label="Column to join in search data frame")
+with gr.Accordion(label = "AWS data access", open = False):
+    with gr.Row():
+        in_aws_keyword_file = gr.Dropdown(label="Choose keyword file to load from AWS (only valid for API Gateway app)", choices=["None", "Bioasq - Biomedical example data - keyword search"])
+        load_aws_keyword_data_button = gr.Button(value="Load keyword data from AWS", variant="secondary")
+    with gr.Row():
+        in_aws_semantic_file = gr.Dropdown(label="Choose semantic file to load from AWS (only valid for API Gateway app)", choices=["None", "Bioasq - Biomedical example data - semantic search"])
+        load_aws_semantic_data_button = gr.Button(value="Load semantic data from AWS", variant="secondary")
+
+    out_aws_data_message = gr.Textbox(label="AWS data load progress")
 
-
-
+# Changing search parameters button
+in_search_param_button.click(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message])
+
 # ---
 in_k1_button.click(display_info, inputs=in_k1_info)
 in_b_button.click(display_info, inputs=in_b_info)
 in_alpha_button.click(display_info, inputs=in_alpha_info)
 in_no_search_results_button.click(display_info, inputs=in_no_search_info)
+
+### Loading AWS data ###
+load_aws_keyword_data_button.click(fn=load_data_from_aws, inputs=[in_aws_keyword_file], outputs=[in_bm25_file, out_aws_data_message])
+load_aws_semantic_data_button.click(fn=load_data_from_aws, inputs=[in_aws_semantic_file], outputs=[in_semantic_file, out_aws_data_message])
+
 
 ### BM25 SEARCH ###
 # Update dropdowns upon initial file load
-in_bm25_file.
-in_join_file.
+in_bm25_file.change(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, keyword_data_state, orig_keyword_data_state, search_index_state, embeddings_state, tokenised_state, load_finished_message, current_source])
+in_join_file.change(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
 
 # Load in BM25 data
 load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, tokenised_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, keyword_data_list_state, in_bm25_column]).\
@@ -184,7 +200,7 @@ depends on factors such as the type of documents or queries. Information taken f
 ### SEMANTIC SEARCH ###
 
 # Load in a csv/excel file for semantic search
-in_semantic_file.
+in_semantic_file.change(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, search_index_state, embeddings_state, tokenised_state, semantic_load_progress, current_source_semantic])
 load_semantic_data_button.click(
     csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress, output_file_state]).\
     then(docs_to_bge_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, output_file_state, in_clean_data, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file, output_file_state])
@@ -196,6 +212,16 @@ depends on factors such as the type of documents or queries. Information taken f
 # Simple run for HF spaces or local on your computer
 #block.queue().launch(debug=True)
 
+#def get_params(request: gr.Request):
+#    if request:
+#        print("Request headers dictionary:", request.headers)
+#        print("IP address:", request.client.host)
+#        print("Query parameters:", dict(request.query_params))
+#        return request.query_params
+
+#request_params = get_params()
+#print(request_params)
+
 # Running on server (e.g. AWS) without specifying port
 block.queue().launch(ssl_verify=False) # root_path="/data-text-search" # server_name="0.0.0.0",
 
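For reference, the AWS wiring above follows the usual Gradio pattern: the clicked function returns a tuple and each element fills one output component in order, so the (files, message) pair from load_data_from_aws lands in a file component and the progress textbox. A minimal, self-contained sketch of that pattern (hypothetical names, not the app's own components) might look like this:

import os
import tempfile
import gradio as gr

def load_example_data(choice):
    # Stand-in for load_data_from_aws: returns (list of local file paths, status message).
    if not choice or choice == "None":
        return None, "No dataset selected"
    # Write a small placeholder file to stand in for downloaded data.
    path = os.path.join(tempfile.mkdtemp(), "example.csv")
    with open(path, "w") as f:
        f.write("id,text\n1,hello\n")
    return [path], f"Loaded 1 file for: {choice}"

with gr.Blocks() as demo:
    dataset_choice = gr.Dropdown(label="Dataset", choices=["None", "Example dataset"])
    load_button = gr.Button("Load data")
    loaded_files = gr.File(label="Loaded files", file_count="multiple")
    status = gr.Textbox(label="Load progress")

    # Each element of the returned tuple fills one output component, in order.
    load_button.click(fn=load_example_data, inputs=[dataset_choice], outputs=[loaded_files, status])

demo.launch()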
requirements.txt CHANGED
@@ -7,6 +7,7 @@ openpyxl==3.1.2
 torch==2.1.2
 spacy==3.7.2
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
-gradio==4.
+gradio==4.21.0
 sentence_transformers==2.3.1
-lxml==5.1.0
+lxml==5.1.0
+boto3==1.34.63
search_funcs/aws_functions.py ADDED
@@ -0,0 +1,164 @@
+from typing import Type
+import pandas as pd
+import boto3
+import tempfile
+import os
+
+PandasDataFrame = Type[pd.DataFrame]
+
+bucket_name = 'data-text-search-data'
+
+# Create a Session with the IAM role ARN
+aws_role = 'arn:aws:iam::460501890304:role/ecsTaskExecutionRole'
+
+try:
+    session = boto3.Session(profile_name="default")
+except Exception as e:
+    print(e)
+
+#sts = session.client("sts")
+# response = sts.assume_role(
+#     RoleArn=aws_role,
+#     RoleSessionName="ecs-test-session"
+# )
+# print(response)
+
+
+def get_assumed_role_info():
+    sts = boto3.client('sts')
+    response = sts.get_caller_identity()
+
+    # Extract ARN of the assumed role
+    assumed_role_arn = response['Arn']
+
+    # Extract the name of the assumed role from the ARN
+    assumed_role_name = assumed_role_arn.split('/')[-1]
+
+    return assumed_role_arn, assumed_role_name
+
+assumed_role_arn, assumed_role_name = get_assumed_role_info()
+
+print("Assumed Role ARN:", assumed_role_arn)
+print("Assumed Role Name:", assumed_role_name)
+
+
+# Download direct from S3 - requires login credentials
+def download_file_from_s3(bucket_name, key, local_file_path):
+
+    s3 = boto3.client('s3')
+    s3.download_file(bucket_name, key, local_file_path)
+    print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")
+
+#download_file_from_s3(bucket_name, object_key, local_file_loc)
+
+def download_folder_from_s3(bucket_name, s3_folder, local_folder):
+    """
+    Download all files from an S3 folder to a local folder.
+    """
+    s3 = boto3.client('s3')
+
+    # List objects in the specified S3 folder
+    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
+
+    # Download each object
+    for obj in response.get('Contents', []):
+        # Extract object key and construct local file path
+        object_key = obj['Key']
+        local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))
+
+        # Create directories if necessary
+        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
+
+        # Download the object
+        try:
+            s3.download_file(bucket_name, object_key, local_file_path)
+            print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
+        except Exception as e:
+            print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
+
+
+def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
+    """
+    Download specific files from an S3 folder to a local folder.
+    """
+    s3 = boto3.client('s3')
+
+    if filenames == '*':
+        # List all objects in the S3 folder
+        response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
+        filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]
+
+    for filename in filenames:
+        object_key = os.path.join(s3_folder, filename)
+        local_file_path = os.path.join(local_folder, filename)
+
+        # Create directories if necessary
+        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
+
+        # Download the object
+        try:
+            s3.download_file(bucket_name, object_key, local_file_path)
+            print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
+        except Exception as e:
+            print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
+
+
+
+def load_data_from_aws(in_aws_keyword_file, bucket_name=bucket_name):
+
+    temp_dir = tempfile.mkdtemp()
+    local_keyword_stub = temp_dir + '/keyword/'
+    local_semantic_stub = temp_dir + '/semantic/'
+
+    files = []
+
+    if "Bioasq - Biomedical example data" in in_aws_keyword_file:
+
+        s3_folder_stub = 'example_data/bioasq/latest/'
+
+        if 'keyword' in in_aws_keyword_file:
+            s3_folder_stub = s3_folder_stub + 'keyword/'
+            local_folder_path = local_keyword_stub
+
+        if 'semantic' in in_aws_keyword_file:
+            s3_folder_stub = s3_folder_stub + 'semantic/'
+            local_folder_path = local_semantic_stub
+
+
+        # Check if folder exists
+        if not os.path.exists(local_folder_path):
+            print(f"Folder {local_folder_path} does not exist! Making folder.")
+
+            os.mkdir(local_folder_path)
+
+        # Check if folder is empty
+        if len(os.listdir(local_folder_path)) == 0:
+            print(f"Folder {local_folder_path} is empty")
+
+            if 'keyword' in in_aws_keyword_file:
+                # Download keyword folder
+                download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames='*')
+
+            if 'semantic' in in_aws_keyword_file:
+                # Download keyword folder
+                download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames=['mini-bioasq-0000_cleaned_bge_embedding_compress.npz', 'mini-bioasq-0000_cleaned_prepared_docs.pkl.gz'])
+
+            print("AWS data downloaded")
+
+        else:
+            print(f"Folder {local_folder_path} is not empty")
+
+        #files = os.listdir(local_folder_stub)
+        #print(files)
+
+        files = [os.path.join(local_folder_path, f) for f in os.listdir(local_folder_path) if os.path.isfile(os.path.join(local_folder_path, f))]
+
+        out_message = "Data successfully loaded from AWS"
+        print(out_message)
+
+    else:
+        out_message = "Data not loaded from AWS"
+        print(out_message)
+
+    return files, out_message
+
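Assuming AWS credentials for the default profile and read access to the hard-coded bucket, the new module can be exercised on its own roughly as below (note that importing it already creates a boto3 session and calls STS get_caller_identity at import time):

from search_funcs.aws_functions import load_data_from_aws

# The dropdown string is matched against "Bioasq - Biomedical example data" and
# against "keyword"/"semantic" to choose the S3 prefix and the local target folder.
files, message = load_data_from_aws("Bioasq - Biomedical example data - keyword search")

print(message)  # "Data successfully loaded from AWS" or "Data not loaded from AWS"
print(files)    # local paths of the downloaded files, under a tempfile.mkdtemp() folder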
search_funcs/helper_functions.py CHANGED
@@ -15,6 +15,10 @@ from openpyxl.cell.text import InlineFont
 from openpyxl.cell.rich_text import TextBlock, CellRichText
 from openpyxl.styles import Font, Alignment
 
+megabyte = 1024 * 1024 # Bytes in a megabyte
+file_size_mb = 500 # Size in megabytes
+file_size_bytes_500mb = megabyte * file_size_mb
+
 # Attempt to delete content of gradio temp folder
 def get_temp_folder_path():
     username = getpass.getuser()
@@ -115,7 +119,7 @@ def initial_data_load(in_file):
     if not data_file_names:
         out_message = "Please load in at least one csv/Excel/parquet data file."
         print(out_message)
-        return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, out_message
+        return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, embed_load, tokenised_load, out_message, None
 
     # This if you have loaded in a documents object for the semantic search
     if "pkl" in data_file_names[0]:
@@ -129,6 +133,15 @@
 
         current_source = current_source + get_file_path_end_with_ext(file) + " "
 
+        # Get the size of the file
+        print("Checking file size")
+        file_size = os.path.getsize(file)
+        if file_size > file_size_bytes_500mb:
+            out_message = "Data file greater than 500mb in size. Please use smaller sizes."
+            print(out_message)
+            return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, embed_load, tokenised_load, out_message, None
+
+
         df_new = read_file(file)
 
         df = pd.concat([df, df_new], ignore_index = True)
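The guard added to initial_data_load is a plain os.path.getsize comparison against the 500 MB constant defined at the top of the module; the same check in isolation (illustrative names, not the app's) is roughly:

import os

MEGABYTE = 1024 * 1024
FILE_SIZE_LIMIT_BYTES = 500 * MEGABYTE  # mirrors file_size_bytes_500mb above

def file_within_limit(path: str) -> bool:
    """Return True if the file can be loaded, False if it exceeds the 500 MB cap."""
    file_size = os.path.getsize(path)
    if file_size > FILE_SIZE_LIMIT_BYTES:
        print(f"{path} is {file_size / MEGABYTE:.0f} MB - over the 500 MB limit.")
        return False
    return True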