akshansh36 committed
Commit 102d995 (verified) · Parent(s): 3d1d0ce

Update search_page.py

Files changed (1)
  1. search_page.py +0 -757
search_page.py CHANGED
@@ -7,15 +7,6 @@ import re
  import json
  import streamlit as st
  from langchain_google_genai import GoogleGenerativeAIEmbeddings
- import osfrom datetime import datetime
- from pymongo import MongoClient
- from langchain_core.prompts import ChatPromptTemplate
- from langchain_google_genai import ChatGoogleGenerativeAI
- from langchain_core.messages import HumanMessage
- import re
- import json
- import streamlit as st
- from langchain_google_genai import GoogleGenerativeAIEmbeddings
  import os
  import pinecone
  from dotenv import load_dotenv
@@ -766,751 +757,3 @@ def search():



-
- import pinecone
- from dotenv import load_dotenv
- from bson import ObjectId
- import google.generativeai as genai
- import requests
- import fitz
- import base64
- from PIL import Image
- from concurrent.futures import ThreadPoolExecutor, as_completed
- import time
-
- load_dotenv()
- MONGO_URI = os.getenv("MONGO_URI")
- DB_NAME = os.getenv("DB_NAME")
- COLLECTION_NAME = os.getenv("COLLECTION_NAME")
- ABOUT_COMPANY_COLLECTION=os.getenv("COMPANY_COLLECTION_NAME")
- FLASH_API = os.getenv("FLASH_API")
- PINECONE_API=os.getenv("PINECONE_API")
- PINECONE_INDEX=os.getenv("PINECONE_INDEX")
-
- mongo_client = MongoClient(MONGO_URI)
- db = mongo_client[DB_NAME]
- collection = db[COLLECTION_NAME]
- collection2=db[ABOUT_COMPANY_COLLECTION]
-
- genai.configure(api_key=FLASH_API)
-
- embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=FLASH_API)
- model = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0, max_tokens=None, google_api_key=FLASH_API)
- model2 = genai.GenerativeModel('models/gemini-1.5-flash')
- pc = pinecone.Pinecone(
-     api_key=PINECONE_API # Your Pinecone API key
- )
-
- index = pc.Index(PINECONE_INDEX)
-
- temp_audio_folder = "temp-audio"
- os.makedirs(temp_audio_folder, exist_ok=True)
-
- about_company_doc=collection2.find_one({"type":"about_company"})
- if about_company_doc:
-     about_company=about_company_doc.get('company_description','')
-
-
- DOWNLOAD_DIR = "downloaded_pdfs"
- IMAGE_DIR = "extracted_images"
-
-
- os.makedirs(DOWNLOAD_DIR, exist_ok=True)
- os.makedirs(IMAGE_DIR, exist_ok=True)
-
-
- if 'images' not in st.session_state:
-     st.session_state.images = []
- if 'pdfs' not in st.session_state:
-     st.session_state.pdfs = []
-
- if 'query_submitted' not in st.session_state:
-     st.session_state.query_submitted = False
-
- if 'audio' not in st.session_state:
-     st.session_state.audio = False
-
- def cleanup_directories():
-     # Cleanup the PDF download directory
-     for filename in os.listdir(DOWNLOAD_DIR):
-         file_path = os.path.join(DOWNLOAD_DIR, filename)
-         try:
-             if os.path.isfile(file_path) or os.path.islink(file_path):
-                 os.unlink(file_path) # Remove the file
-                 print(f"Deleted PDF file: {file_path}")
-         except Exception as e:
-             print(f"Failed to delete {file_path}. Reason: {e}")
-
-     # Cleanup the image extraction directory
-     for filename in os.listdir(IMAGE_DIR):
-         file_path = os.path.join(IMAGE_DIR, filename)
-         try:
-             if os.path.isfile(file_path) or os.path.islink(file_path):
-                 os.unlink(file_path) # Remove the file
-                 print(f"Deleted Image file: {file_path}")
-         except Exception as e:
-             print(f"Failed to delete {file_path}. Reason: {e}")
-
-
- def process_user_query(user_query, about_company=""):
-     try:
-         # No f-string here, so we avoid additional formatting complications
-         prompt_template = ChatPromptTemplate.from_template("""
-         Given is a user query. Your task is to first translate the user query from any other language to English if not already in English.
-         Then you have to extract important keywords from this query. Return the result in the format given below.
-
-         Instructions:
-         1. Give the output in JSON format defined below
-
-         Expected output format:
-         {{"query":"String",
-         "keywords":["String"]
-         }}
-         This query will be related to Ministry of Statistics and Programme Implementation(MOSPI), and the statistics stored by this organisation.
-         "Query":
-         {user_query}
-         """)
-
-         # Chain the prompt with LLM for response generation
-         chain = prompt_template | model
-         result = chain.invoke({
-             "about_company": about_company,
-             "user_query": user_query
-         })
-         print(f"Model response for reformulated query is {result.content}")
-
-         # Use non-greedy regex and handle multiline content
-         match = re.search(r"\{[\s\S]*?\}", result.content.strip())
-         if match:
-             json_data = match.group(0) # Extract JSON-like content as a string
-             json_data = json_data.replace("'", '"')
-             data = json.loads(json_data)
-             enhanced_query = data.get('query', '')
-             keywords = data.get('keywords', [])
-             return enhanced_query, keywords
-         else:
-             print("No JSON data found in the model response.")
-             return None, None
-
-     except Exception as e:
-         print(f"Error occurred while processing query using LLM: {e}")
-         return None, None
-
-
-
-
- def filter_chunks_by_keywords_images(chunks, keywords):
-     keywords_set = set(kw.strip().lower() for kw in keywords)
-     chunks_with_keyword_counts = []
-
-     for chunk in chunks:
-         chunk_text = chunk['metadata'].get('description', '').lower()
-         keyword_count = sum(1 for kw in keywords_set if kw in chunk_text)
-         chunks_with_keyword_counts.append({
-             'chunk': chunk,
-             'keyword_count': keyword_count
-         })
-
-     # Sort chunks based on keyword count and similarity score
-     sorted_chunks = sorted(
-         chunks_with_keyword_counts,
-         key=lambda x: (x['keyword_count'], x['chunk']['score']),
-         reverse=True
-     )
-
-     # Filter chunks that have at least one keyword match
-     chunks_with_keywords = [item for item in sorted_chunks if item['keyword_count'] > 0]
-
-     if len(chunks_with_keywords) >= 3:
-         # If 3 or more chunks have keyword matches, return the top 3 of those
-         return chunks_with_keywords[:3]
-     elif len(chunks_with_keywords) > 0:
-         # If fewer than 3 chunks have keyword matches, return all that have matches
-         return chunks_with_keywords
-     else:
-         # If no chunks have keyword matches, return the top 3 by similarity score alone
-         sorted_by_similarity = sorted(
-             chunks_with_keyword_counts,
-             key=lambda x: x['chunk']['score'],
-             reverse=True
-         )
-         return sorted_by_similarity[:3]
-
- def filter_chunks_by_keywords_pdf(chunks, keywords):
-     keywords_set = set(kw.strip().lower() for kw in keywords)
-     pdf_chunk_map = {}
-
-     # Step 1: Calculate keyword count and similarity for each chunk, grouped by PDF URL
-     for chunk in chunks:
-         chunk_text = chunk['metadata'].get('description', '').lower()
-         pdf_url = chunk['metadata'].get('url') # Unique identifier for each PDF
-         keyword_count = sum(1 for kw in keywords_set if kw in chunk_text)
-
-         # Structure each chunk with its metadata and computed values
-         chunk_data = {
-             'chunk': chunk,
-             'keyword_count': keyword_count,
-             'similarity_score': chunk['score']
-         }
-
-         # Group chunks by PDF URL, keeping only the most relevant chunk per PDF
-         if pdf_url not in pdf_chunk_map:
-             pdf_chunk_map[pdf_url] = chunk_data
-         else:
-             existing_chunk = pdf_chunk_map[pdf_url]
-             # Keep the chunk with higher relevance (more keywords or higher similarity)
-             if (chunk_data['keyword_count'], chunk_data['similarity_score']) > (existing_chunk['keyword_count'], existing_chunk['similarity_score']):
-                 pdf_chunk_map[pdf_url] = chunk_data
-
-     # Step 2: Collect the top chunk from each PDF, sort by keyword count and similarity score
-     sorted_chunks = sorted(
-         pdf_chunk_map.values(),
-         key=lambda x: (x['keyword_count'], x['similarity_score']),
-         reverse=True
-     )
-
-     # Step 3: Select the top 3 chunks from different PDFs
-     top_chunks = sorted_chunks[:3] if len(sorted_chunks) >= 3 else sorted_chunks
-
-     return top_chunks
-
-
- def get_images_from_chunks(chunks):
-     images = []
-     for item in chunks:
-         chunk = item['chunk']
-         mongo_id_str = chunk['metadata'].get('mongo_id')
-         if mongo_id_str:
-
-             mongo_id = ObjectId(mongo_id_str)
-             image = collection.find_one({"_id": mongo_id})
-             if image:
-                 images.append({
-                     'image': image,
-                     'similarity_score': chunk['score']
-                 })
-     return images
-
- def get_pdfs_from_chunks(chunks):
-     pdfs = []
-     for item in chunks:
-         chunk = item['chunk']
-         mongo_id_str = chunk['metadata'].get('mongo_id')
-         page_number=chunk['metadata'].get('page_number')
-         if mongo_id_str:
-
-             mongo_id = ObjectId(mongo_id_str)
-             pdf = collection.find_one({"_id": mongo_id})
-             if pdf:
-                 pdfs.append({
-                     'pdf': pdf,
-                     'similarity_score': chunk['score'],
-                     'page_number': page_number
-
-                 })
-     return pdfs
-
-
- def format_date(timestamp):
-     """Convert timestamp to a readable date format."""
-     return datetime.fromtimestamp(timestamp).strftime("%B %d, %Y")
-
-
-
-
-
- # Dialog function for viewing the PDF page image
- # Dialog function for viewing the PDF page image
- @st.dialog("View PDF Page Image", width="large")
- def show_dialog(image, chunk,page_number):
-     print("entering dialog box")
-     try:
-         # Display the image
-         img = Image.open(image)
-         st.image(img, caption=f"Page Number: {page_number}", use_container_width=True)
-
-         st.markdown("### Relevant Chunk", unsafe_allow_html=True)
-
-         # Display chunk with styling
-         st.markdown(f"""
-         <div style="background-color: #FFEB3B; padding: 10px; border-radius: 10px; color: black;">
-         {chunk}
-         </div>
-         """, unsafe_allow_html=True)
-
-     except Exception as e:
-         print(f"Error occurred in displaying image: {e}")
-
-
- # Display the results of the image and PDFs
- def display_results(images, pdfs,type):
-     # Display Images Section
-     images = sorted(images, key=lambda x: x['similarity_score'], reverse=True)
-     num_images = len(images)
-
-     if num_images > 0:
-         st.write("### Here are the matching images for your query")
-         for start_idx in range(0, num_images, 3):
-             num_cols = min(3, num_images - start_idx)
-             cols = st.columns(num_cols)
-
-             # Display images in the current row
-             for idx in range(num_cols):
-                 img_info = images[start_idx + idx]
-                 col = cols[idx]
-                 with col:
-                     image_data = img_info['image']
-                     similarity_score = img_info['similarity_score']
-
-                     st.markdown(
-                         f"""
-                         <div style='text-align: center;'>
-                         <img src='{image_data['object_url']}' alt='Image' style='width:250px; height:250px; object-fit: cover; border-radius: 8px;' />
-                         <p><strong>Similarity Score:</strong> {similarity_score:.4f}</p>
-                         </div>
-                         """,
-                         unsafe_allow_html=True
-                     )
-
-                     with st.expander("View Image Details"):
-                         st.write(f"**File Name:** {image_data.get('name', 'N/A')}")
-                         st.write(f"**Date Uploaded:** {format_date(image_data.get('upload_date', datetime.now().timestamp()))}")
-                         st.write(f"**Description:** {image_data.get('description', 'No description available')}")
-
-                         tags = ", ".join(image_data.get("tags", []))
-                         st.write(f"**Tags:** {tags if tags else 'No tags'}")
-
-                         categories = ", ".join(image_data.get("categories", []))
-                         st.write(f"**Categories:** {categories if categories else 'No categories'}")
-
-                         st.markdown(
-                             f"<a href='{image_data['object_url']}' class='download-link' download>Download Image</a>",
-                             unsafe_allow_html=True
-                         )
-
-     else:
-         st.write("No images to display.")
-
-     # Display PDFs Section in rows of three columns
-     pdfs = sorted(pdfs, key=lambda x: x['similarity_score'], reverse=True)
-     num_pdfs = len(pdfs)
-
-     if num_pdfs > 0:
-         st.write("### Here are the matching PDFs for your query")
-
-         for start_idx in range(0, num_pdfs, 3):
-             num_cols = min(3, num_pdfs - start_idx)
-             cols = st.columns(num_cols)
-
-             for idx in range(num_cols):
-                 pdf_info = pdfs[start_idx + idx]
-                 col = cols[idx]
-                 with col:
-                     pdf_data = pdf_info['pdf']
-                     similarity_score = pdf_info['similarity_score']
-                     extracted_image_path=pdf_info["image"]
-                     relevant_chunk=pdf_info["relevant_chunk"]
-
-                     print(f"extracted image path is : {extracted_image_path}")
-
-
-
-                     # Expander for each PDF in a column
-                     with st.expander(f"{pdf_data.get('name', 'PDF Document')}"):
-                         st.write(f"**File Name:** {pdf_data.get('name', 'N/A')}")
-                         st.write(f"**Page Number:** {int(pdf_info['page_number'])}")
-                         st.write(
-                             f"**Date Uploaded:** {format_date(pdf_data.get('upload_date', datetime.now().timestamp()))}")
-                         tags = ", ".join(pdf_data.get("tags", []))
-                         st.write(f"**Tags:** {tags if tags else 'No tags'}")
-
-                         categories = ", ".join(pdf_data.get("categories", []))
-                         st.write(f"**Categories:** {categories if categories else 'No categories'}")
-
-                         st.markdown(
-                             f"<a href='{pdf_data['object_url']}' class='download-link' download>Download PDF</a>",
-                             unsafe_allow_html=True
-                         )
-
-                         # Button that will trigger the dialog
-                         if st.button("View chunk", key=f"chunk_{type}_{pdf_data['name']}"):
-                             print(f"button is pressed")
-                             # Call the dialog function when the button is pressed
-                             show_dialog(extracted_image_path,relevant_chunk,int(pdf_info["page_number"]))
-
-
-
-                     st.markdown(
-                         f"""<div style='text-align: center;'>
-                         <p><strong>Similarity Score:</strong> {similarity_score:.4f}</p></div>""",
-                         unsafe_allow_html=True
-                     )
-
-     else:
-         st.write("No PDFs to display.")
-
-
-
- def upload_audio_google(audio_path):
-     try:
-         audio_file = genai.upload_file(path=audio_path, display_name="Query Audio")
-         print(f"Uploaded file '{audio_file.display_name}' as: {audio_file.uri}")
-         return audio_file
-
-     except Exception as e:
-         print(f"error occured while uploading audio to google : {e}")
-         return None
-
- def extract_query_from_audio(audio_file):
-
-     try:
-
-         prompt=f""" Given is a user query related in form of audio, your task is to understand the user query and convert it to text. If the audio is not in english then transalte it to english textual query.Also extract important keywords from the query.
-
-         Expected output format : {{
-         "query":"String",
-         "keywords":["String"]
-         }}
-
-         """
-         response = model2.generate_content(
-             [prompt, audio_file]
-         )
-
-         if response:
-             print(response.text)
-             match = re.search(r"\{[\s\S]*?\}", response.text)
-             if match:
-                 json_data = match.group(0) # Extract JSON-like content as a string
-                 json_data = json_data.replace("'", '"')
-                 data = json.loads(json_data)
-                 enhanced_query = data.get('query', '')
-                 keywords = data.get('keywords', [])
-                 return enhanced_query, keywords
-
-             else:
-                 print("No JSON data found in the model response.")
-                 return None,None
-
-     except Exception as e:
-         print(f"error occured in extracting query from audio {e}")
-         return None,None
-
-
-
-
- def search_pinecone(k,filetype,query_embedding):
-     search_results = index.query(
-         vector=query_embedding,
-         top_k=k,
-         include_metadata=True,
-         filter={"tag": filetype}
-     )
-
-     return search_results
-
-
- def download_pdf(url,filename):
-     try:
-         response = requests.get(url)
-         response.raise_for_status() # Raise an error for bad responses
-         pdf_path = os.path.join(DOWNLOAD_DIR, filename)
-
-         with open(pdf_path, 'wb') as f:
-             f.write(response.content)
-
-         print(f"Downloaded PDF: {pdf_path}")
-         return pdf_path
-     except requests.exceptions.RequestException as e:
-         print(f"Error downloading PDF: {e}")
-         return None
-
-
-
-
- def convert_pdf_page_to_image(pdf_path, page_number):
-     try:
-         doc = fitz.open(pdf_path)
-
-         page_number=int(page_number)
-
-         # Check if the page number is valid
-         if page_number < 1 or page_number > doc.page_count:
-             print(f"Error: Page number {page_number} is out of bounds. Document only has {doc.page_count} pages.")
-             return None
-
-         # Load the page (PyMuPDF uses 0-based indexing, so subtract 1 from the page number)
-         page = doc.load_page(page_number - 1)
-
-         # Render the page as an image (pixmap)
-         pix = page.get_pixmap()
-
-         # Save the image
-         image_path = f"extracted_images/page_{page_number}.png"
-         pix.save(image_path)
-
-         print(f"Converted page {page_number} to image: {image_path}")
-         return image_path
-     except Exception as e:
-         print(f"Error converting PDF to image: {e}")
-         return None
-
-
- def process_pdf(page_number,chunk,doc):
-     object_url=doc.get("object_url")
-     filename=doc.get("name")
-
-     pdf_path=download_pdf(object_url,filename)
-     if pdf_path:
-         image_path = convert_pdf_page_to_image(pdf_path, page_number)
-         return image_path
-
- system_prompt_text=f"""
- Given is the extracted image of a PDF page relevant to a user query. Your task is to see the user query and see the page image and then return the relevant chunk of text from the page, which might be related to user query.
- Follow the instructions given below:
- 1. Do not summarise the text, you have to give the chunk as it is from the page.
- 2. Give the output in a JSON format defined below
-
- Output Format: {{
- "chunk" : "String"
- }}
- """
-
- def process_image_using_llm(image):
-
-     try:
-         # Send the image and system prompt to the LLM
-         message = HumanMessage(
-             content=[
-                 {"type": "text", "text": system_prompt_text},
-                 {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}},
-             ],
-         )
-         response = model.invoke([message])
-         print(f" LLM response for page : {response}")
-
-         # Retry only if JSON data is missing or JSON decode error occurs
-
-         # Check for JSON content in the response
-         if response:
-             result=response.content
-             print(f"llm result for relevant chunk extraction is :{result}")
-             if result:
-                 json_string = result.strip('```json\n').strip('```')
-                 if json_string:
-                     parsed_data = json.loads(json_string)
-                     chunk = parsed_data["chunk"]
-                     if chunk:
-                         return chunk
-
-         else:
-             return None
-
-     except Exception as e:
-         print(f"error occurred in in llm call for chunk extraction: {e}")
-         return None
-
-
-
-
- def get_relevant_chunk_from_image(image_path,enhanced_query):
-     with open(image_path, "rb") as image_file:
-         image_data = base64.b64encode(image_file.read()).decode("utf-8")
-         chunk = process_image_using_llm(image_data)
-         return chunk
-
-
- def process_single_pdf(pdf, enhanced_query):
-     try:
-         page_number = pdf.get("page_number")
-         chunk = pdf.get("chunk")
-         mongo_doc = pdf.get("pdf")
-         image_path = process_pdf(page_number, chunk, mongo_doc)
-
-         if image_path:
-             relevant_chunk = get_relevant_chunk_from_image(image_path, enhanced_query)
-             pdf["relevant_chunk"] = relevant_chunk
-             pdf["image"] = image_path
-             return pdf
-     except Exception as e:
-         print(f"Error processing PDF: {e}")
-         return None
-
-
- def process_pdfs_in_parallel(pdfs, enhanced_query):
-     processed_pdfs = []
-
-     with ThreadPoolExecutor() as executor:
-         # Submit all PDF processing tasks to the thread pool
-         futures = [executor.submit(process_single_pdf, pdf, enhanced_query) for pdf in pdfs]
-
-         # Wait for the results and collect them
-         for future in as_completed(futures):
-             result = future.result()
-             if result:
-                 processed_pdfs.append(result)
-
-     return processed_pdfs
-
-
- def search():
-     if st.button("Back", key="back_button"):
-         st.session_state.images = [] # Reset images
-         st.session_state.pdfs = [] # Reset pdfs
-         st.session_state.query_submitted = False # Reset query state
-         st.session_state.audio = False # Reset audio state
-         st.session_state.page = "home" # Reset page
-         st.rerun() # Reload the app
-
-     st.title("AI Inspired Smart Search Engine")
-     st.subheader("Multilingual text search 🖊️")
-
-     user_query = st.text_input("Enter your search query:")
-
-     # Text query submission
-     if user_query and st.button("Submit Query", key="submit_query"):
-         # Reset previous results before processing new text query
-
-         st.session_state.query_submitted = False # Reset query submitted flag
-         st.session_state.audio = False # Reset audio flag
-
-         with st.spinner("Processing your query, please wait..."):
-             enhanced_query, keywords = process_user_query(user_query, about_company)
-
-             if enhanced_query and keywords:
-                 query_embedding = embeddings.embed_query(enhanced_query)
-                 search_results_image = search_pinecone(5, "Image", query_embedding)
-                 search_result_pdfs = search_pinecone(20, "PDF", query_embedding)
-
-                 matches_pdf = search_result_pdfs['matches']
-                 matches_image = search_results_image['matches']
-                 images = []
-                 pdfs = []
-
-                 if not matches_image and not matches_pdf:
-                     st.write("No matching PDFs and Images found for your query")
-                 else:
-                     if matches_image:
-                         top_chunks_images = filter_chunks_by_keywords_images(matches_image, keywords)
-                         if top_chunks_images:
-                             images = get_images_from_chunks(top_chunks_images)
-
-                     if matches_pdf:
-                         top_chunks_pdf = filter_chunks_by_keywords_pdf(matches_pdf, keywords)
-                         if top_chunks_pdf:
-                             pdfs = get_pdfs_from_chunks(top_chunks_pdf)
-
-                     if pdfs:
-                         # Process PDFs in parallel
-                         processed_pdfs = process_pdfs_in_parallel(pdfs, enhanced_query)
-                         st.session_state.pdfs = processed_pdfs
-
-                     # Store the results in session state
-                     st.session_state.images = images
-                     st.session_state.pdfs = pdfs
-                     st.session_state.query_submitted = True # Mark query as submitted
-                     st.session_state.audio = False # Ensure it's not audio after text query
-
-                     # Display results
-                     display_results(images, pdfs,"text")
-
-     # Display results if query has been submitted and it's a text query
-     elif st.session_state.query_submitted and not st.session_state.audio:
-         display_results(st.session_state.images, st.session_state.pdfs,"text")
-
-     # Audio query section
-     st.markdown("<hr>", unsafe_allow_html=True)
-
-     st.subheader("Multilingual Audio Search 🗣️")
-     audio_value = st.audio_input("Record your query")
-
-     if audio_value and st.button("Submit Audio", key="audio-button"):
-         # Reset previous results before processing new audio query
-         st.session_state.query_submitted = False # Reset query submitted flag
-         st.session_state.audio = True # Mark it as an audio query
-
-         with st.spinner("Processing your query, please wait..."):
-             timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
-             audio_file_path = os.path.join(temp_audio_folder, f"audio_query_{timestamp}.wav")
-
-             # Save the audio input to the file
-             with open(audio_file_path, "wb") as f:
-                 f.write(audio_value.getvalue())
-
-             audio_file = upload_audio_google(audio_file_path)
-             if audio_file:
-                 audio_query, audio_keywords = extract_query_from_audio(audio_file)
-
-                 if audio_query and audio_keywords:
-                     query_embedding = embeddings.embed_query(audio_query)
-                     search_results_image = search_pinecone(5, "Image", query_embedding)
-                     search_result_pdfs = search_pinecone(20, "PDF", query_embedding)
-
-                     matches_pdf = search_result_pdfs['matches']
-                     matches_image = search_results_image['matches']
-                     images = []
-                     pdfs = []
-
-                     if not matches_image and not matches_pdf:
-                         st.write("No matching PDFs and Images found for your query")
-                     else:
-                         if matches_image:
-                             top_chunks_images = filter_chunks_by_keywords_images(matches_image, audio_keywords)
-                             if top_chunks_images:
-                                 images = get_images_from_chunks(top_chunks_images)
-
-                         if matches_pdf:
-                             top_chunks_pdf = filter_chunks_by_keywords_pdf(matches_pdf, audio_keywords)
-                             if top_chunks_pdf:
-                                 pdfs = get_pdfs_from_chunks(top_chunks_pdf)
-
-                         if pdfs:
-                             # Process PDFs in parallel
-                             processed_pdfs = process_pdfs_in_parallel(pdfs, audio_query)
-                             st.session_state.pdfs = processed_pdfs
-
-                         # Store the results in session state
-                         st.session_state.images = images
-                         st.session_state.pdfs = pdfs
-                         st.session_state.query_submitted = True # Mark query as submitted
-                         st.session_state.audio = True # Set it to audio after audio query
-
-                         # Display the results
-                         display_results(images, pdfs,"audio")
-
-             else:
-                 st.error(f"Sorry, could not process your request, please try again later!")
-
-             # Clean up: Delete the audio file from Google and remove temp files
-             try:
-                 genai.delete_file(audio_file.name)
-             except Exception as e:
-                 print(f"Failed to delete audio file from Google storage: {e}")
-
-             # Delete files inside the temp directory
-             for filename in os.listdir(temp_audio_folder):
-                 file_path = os.path.join(temp_audio_folder, filename)
-                 try:
-                     if os.path.isfile(file_path) or os.path.islink(file_path):
-                         os.unlink(file_path)
-                 except Exception as e:
-                     print(f"Failed to delete {file_path}. Reason: {e}")
-
-             cleanup_directories()
-
-     # Display results if audio query has been processed
-     elif st.session_state.query_submitted and st.session_state.audio:
-         display_results(st.session_state.images, st.session_state.pdfs,"audio")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 