Shreyas094 committed on
Commit 201ffe7 · verified · 1 Parent(s): 07bfb82

Update app.py

Files changed (1)
  1. app.py +80 -9
app.py CHANGED
@@ -20,6 +20,7 @@ from langchain_core.runnables import RunnableParallel, RunnablePassthrough
 from langchain_core.documents import Document
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+from datetime import datetime
 
 huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
 
@@ -142,7 +143,7 @@ _useragent_list = [
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
 ]
 
-def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
+def google_search(term, num_results=20, lang="en", timeout=5, safe="active", ssl_verify=None):
     escaped_term = urllib.parse.quote_plus(term)
     start = 0
     all_results = []
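
For orientation while reading the later hunks: the new code consumes each search result as a dict carrying at least "title" and "text" keys (the removed code also read "link"). A minimal usage sketch under that assumption, illustrative only and not part of this commit:

    # Assumes google_search() returns a list of dicts shaped like
    # {"title": ..., "text": ..., "link": ...}, as the rest of app.py expects.
    results = google_search("ECB rate decision", num_results=20)
    for r in results:
        print(r["title"], r["link"], len(r["text"] or ""))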
@@ -221,6 +222,30 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
 
     return all_results
 
+def summarize_content(content, model):
+    summary_prompt = f"""
+    Summarize the following content in a concise manner:
+    {content}
+    Summary:
+    """
+    summary = generate_chunked_response(model, summary_prompt, max_tokens=200)
+    return summary
+
+def rank_search_results(titles, summaries, model):
+    ranking_prompt = f"""
+    Rank the following search results from a financial analyst perspective.
+    Assign a rank from 1 to {len(titles)} based on relevance, with 1 being the most relevant.
+    Return only the numeric ranks in order, separated by commas.
+
+    Titles and summaries:
+    {', '.join([f"{i+1}. Title: {title}\nSummary: {summary}" for i, (title, summary) in enumerate(zip(titles, summaries))])}
+
+    Ranks:
+    """
+    ranks_str = generate_chunked_response(model, ranking_prompt)
+    ranks = [float(rank.strip()) for rank in ranks_str.split(',')]
+    return ranks
+
 def ask_question(question, temperature, top_p, repetition_penalty, web_search):
     global conversation_history
 
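Note on rank_search_results above: it converts the raw model reply straight to floats, so any reply that is not a clean comma-separated list of numbers will raise a ValueError. A more defensive parsing helper is sketched below; it is hypothetical, not part of this commit, and assumes generate_chunked_response returns a string.

    import re

    def parse_ranks(ranks_str, expected_count):
        # Hypothetical helper: pull the first expected_count numbers out of the
        # model reply instead of assuming a clean "1, 2, 3" string, and fall
        # back to the original ordering if the reply is unusable.
        numbers = re.findall(r"\d+(?:\.\d+)?", str(ranks_str))
        ranks = [float(n) for n in numbers[:expected_count]]
        if len(ranks) != expected_count:
            ranks = [float(i + 1) for i in range(expected_count)]
        return ranks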
 
@@ -235,19 +260,19 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search):
         database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
     else:
         database = None
-
+
     if web_search:
         search_results = google_search(question)
-        web_docs = [Document(page_content=result["text"], metadata={"source": result["link"]}) for result in search_results if result["text"]]
+        model = get_model(temperature, top_p, repetition_penalty)
 
-        if database is None:
-            database = FAISS.from_documents(web_docs, embed)
-        else:
-            database.add_documents(web_docs)
+        summaries = [summarize_content(result["text"], model) for result in search_results]
+        titles = [result["title"] for result in search_results]
+        ranks = rank_search_results(titles, summaries, model)
 
-        database.save_local("faiss_database")
+        update_vector_db_with_search_results(search_results, summaries, ranks)
 
-        context_str = "\n".join([doc.page_content for doc in web_docs])
+        context_str = "\n".join([f"Title: {result['title']}\nSummary: {summary}\nRank: {rank}"
+                                 for result, summary, rank in zip(search_results, summaries, ranks)])
 
         prompt_template = """
         Answer the question based on the following web search results:
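
To make the shape of the new context string concrete, here is a self-contained illustration with made-up values (the expression mirrors the added line above):

    search_results = [{"title": "Fed holds rates steady"}, {"title": "Tech earnings preview"}]
    summaries = ["The FOMC left the target range unchanged.", "Analysts expect softer guidance."]
    ranks = [1.0, 2.0]
    context_str = "\n".join([f"Title: {result['title']}\nSummary: {summary}\nRank: {rank}"
                             for result, summary, rank in zip(search_results, summaries, ranks)])
    print(context_str)  # Title/Summary/Rank triplets, one result after another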
@@ -325,6 +350,48 @@ def update_vectors(files, use_recursive_splitter):
 
     return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files."
 
+def update_vector_db_with_search_results(search_results, summaries, ranks):
+    embed = get_embeddings()
+    database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True) if os.path.exists("faiss_database") else FAISS.from_documents([], embed)
+
+    current_date = datetime.now().strftime("%Y-%m-%d")
+
+    for result, summary, rank in zip(search_results, summaries, ranks):
+        doc = Document(
+            page_content=summary,
+            metadata={
+                "search_date": current_date,
+                "search_title": result["title"],
+                "search_content": result["text"],
+                "search_summary": summary,
+                "rank": rank
+            }
+        )
+        database.add_documents([doc])
+
+    database.save_local("faiss_database")
+
+def export_vector_db_to_excel():
+    embed = get_embeddings()
+    database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
+
+    documents = database.docstore._dict.values()
+    data = [{
+        "Search Date": doc.metadata["search_date"],
+        "Search Title": doc.metadata["search_title"],
+        "Search Content": doc.metadata["search_content"],
+        "Search Summary": doc.metadata["search_summary"],
+        "Rank": doc.metadata["rank"]
+    } for doc in documents]
+
+    df = pd.DataFrame(data)
+
+    with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
+        excel_path = tmp.name
+        df.to_excel(excel_path, index=False)
+
+    return excel_path
+
 def extract_db_to_excel():
     embed = get_embeddings()
     database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
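
One caveat on the fresh-database path in update_vector_db_with_search_results: LangChain's FAISS.from_documents typically cannot build an index from an empty list, since there is no embedding to infer a dimension from, so the first run on a machine without an existing faiss_database may fail. A guarded variant is sketched below; it is illustrative only, not part of this commit, and relies on the os, FAISS, and Document imports already present in app.py.

    def load_or_create_database(embed, docs):
        # Hypothetical variant: seed a brand-new index from the first real batch
        # of documents instead of calling FAISS.from_documents([], embed).
        if os.path.exists("faiss_database"):
            db = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
            if docs:
                db.add_documents(docs)
            return db
        return FAISS.from_documents(docs, embed)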
@@ -384,6 +451,10 @@ with gr.Blocks() as demo:
 
     submit_button.click(chat, inputs=[question_input, chatbot, temperature_slider, top_p_slider, repetition_penalty_slider, web_search_checkbox], outputs=[question_input, chatbot])
 
+    export_vector_db_button = gr.Button("Export Vector DB to Excel")
+    vector_db_excel_output = gr.File(label="Download Vector DB Excel File")
+    export_vector_db_button.click(export_vector_db_to_excel, inputs=[], outputs=vector_db_excel_output)
+
     extract_button = gr.Button("Extract Database to Excel")
     excel_output = gr.File(label="Download Excel File")
     extract_button.click(extract_db_to_excel, inputs=[], outputs=excel_output)
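
A quick way to exercise the new export wiring outside the Gradio UI, assuming the vector store has already been populated and that pandas is available as pd in app.py (illustrative only):

    # Smoke test: export the store and confirm the metadata columns round-trip.
    path = export_vector_db_to_excel()
    df = pd.read_excel(path)
    print(df[["Search Date", "Search Title", "Rank"]].head())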
 