Shreyas094
committed on
Update app.py
app.py
CHANGED
@@ -20,6 +20,7 @@ from langchain_core.runnables import RunnableParallel, RunnablePassthrough
 from langchain_core.documents import Document
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+from datetime import datetime
 
 huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
 
@@ -142,7 +143,7 @@ _useragent_list = [
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
 ]
 
-def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
+def google_search(term, num_results=20, lang="en", timeout=5, safe="active", ssl_verify=None):
     escaped_term = urllib.parse.quote_plus(term)
     start = 0
     all_results = []
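For illustration, a minimal usage sketch of the changed default (the query strings are made up; google_search is the function defined in this file, and each result is later accessed as result["title"] and result["text"] in ask_question):

results = google_search("NVIDIA Q2 2024 earnings call")             # new default: up to 20 results
few = google_search("NVIDIA Q2 2024 earnings call", num_results=5)  # old default, now passed explicitly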
@@ -221,6 +222,30 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
 
     return all_results
 
+def summarize_content(content, model):
+    summary_prompt = f"""
+    Summarize the following content in a concise manner:
+    {content}
+    Summary:
+    """
+    summary = generate_chunked_response(model, summary_prompt, max_tokens=200)
+    return summary
+
+def rank_search_results(titles, summaries, model):
+    ranking_prompt = f"""
+    Rank the following search results from a financial analyst perspective.
+    Assign a rank from 1 to {len(titles)} based on relevance, with 1 being the most relevant.
+    Return only the numeric ranks in order, separated by commas.
+
+    Titles and summaries:
+    {', '.join([f"{i+1}. Title: {title}\nSummary: {summary}" for i, (title, summary) in enumerate(zip(titles, summaries))])}
+
+    Ranks:
+    """
+    ranks_str = generate_chunked_response(model, ranking_prompt)
+    ranks = [float(rank.strip()) for rank in ranks_str.split(',')]
+    return ranks
+
 def ask_question(question, temperature, top_p, repetition_penalty, web_search):
     global conversation_history
 
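A worked example of the parsing step at the end of rank_search_results, assuming generate_chunked_response (defined elsewhere in app.py) returns a plain comma-separated string:

ranks_str = "2, 1, 3"  # hypothetical model reply for three results
ranks = [float(rank.strip()) for rank in ranks_str.split(',')]
print(ranks)           # [2.0, 1.0, 3.0]; rank 1 is the most relevant result
# Any surrounding prose in the reply, e.g. "Ranks: 2, 1, 3", would raise a ValueError here.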
@@ -235,19 +260,19 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search):
         database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
     else:
         database = None
-
+
     if web_search:
         search_results = google_search(question)
-
+        model = get_model(temperature, top_p, repetition_penalty)
 
-
-
-
-        database.add_documents(web_docs)
+        summaries = [summarize_content(result["text"], model) for result in search_results]
+        titles = [result["title"] for result in search_results]
+        ranks = rank_search_results(titles, summaries, model)
 
-
+        update_vector_db_with_search_results(search_results, summaries, ranks)
 
-        context_str = "\n".join([
+        context_str = "\n".join([f"Title: {result['title']}\nSummary: {summary}\nRank: {rank}"
+                                 for result, summary, rank in zip(search_results, summaries, ranks)])
 
         prompt_template = """
         Answer the question based on the following web search results:
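A small sketch of the context string this branch now builds, using two made-up search results:

search_results = [{"title": "Fed June minutes", "text": "..."},
                  {"title": "ECB rate outlook", "text": "..."}]
summaries = ["Rates held steady.", "Guidance unchanged."]
ranks = [1.0, 2.0]
context_str = "\n".join([f"Title: {result['title']}\nSummary: {summary}\nRank: {rank}"
                         for result, summary, rank in zip(search_results, summaries, ranks)])
# "Title: Fed June minutes\nSummary: Rates held steady.\nRank: 1.0\nTitle: ECB rate outlook\n..."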
@@ -325,6 +350,48 @@ def update_vectors(files, use_recursive_splitter):
 
     return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files."
 
+def update_vector_db_with_search_results(search_results, summaries, ranks):
+    embed = get_embeddings()
+    database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True) if os.path.exists("faiss_database") else FAISS.from_documents([], embed)
+
+    current_date = datetime.now().strftime("%Y-%m-%d")
+
+    for result, summary, rank in zip(search_results, summaries, ranks):
+        doc = Document(
+            page_content=summary,
+            metadata={
+                "search_date": current_date,
+                "search_title": result["title"],
+                "search_content": result["text"],
+                "search_summary": summary,
+                "rank": rank
+            }
+        )
+        database.add_documents([doc])
+
+    database.save_local("faiss_database")
+
+def export_vector_db_to_excel():
+    embed = get_embeddings()
+    database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
+
+    documents = database.docstore._dict.values()
+    data = [{
+        "Search Date": doc.metadata["search_date"],
+        "Search Title": doc.metadata["search_title"],
+        "Search Content": doc.metadata["search_content"],
+        "Search Summary": doc.metadata["search_summary"],
+        "Rank": doc.metadata["rank"]
+    } for doc in documents]
+
+    df = pd.DataFrame(data)
+
+    with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
+        excel_path = tmp.name
+        df.to_excel(excel_path, index=False)
+
+    return excel_path
+
 def extract_db_to_excel():
     embed = get_embeddings()
     database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
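A minimal sketch of reading back what update_vector_db_with_search_results stores, using only the docstore access and metadata keys that appear above (helper names and the "faiss_database" path are as in this file):

embed = get_embeddings()
database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
for doc in database.docstore._dict.values():
    # page_content holds the generated summary; the raw text lives in metadata["search_content"]
    print(doc.metadata["search_date"], doc.metadata["rank"], doc.metadata["search_title"])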
@@ -384,6 +451,10 @@ with gr.Blocks() as demo:
 
     submit_button.click(chat, inputs=[question_input, chatbot, temperature_slider, top_p_slider, repetition_penalty_slider, web_search_checkbox], outputs=[question_input, chatbot])
 
+    export_vector_db_button = gr.Button("Export Vector DB to Excel")
+    vector_db_excel_output = gr.File(label="Download Vector DB Excel File")
+    export_vector_db_button.click(export_vector_db_to_excel, inputs=[], outputs=vector_db_excel_output)
+
     extract_button = gr.Button("Extract Database to Excel")
     excel_output = gr.File(label="Download Excel File")
     extract_button.click(extract_db_to_excel, inputs=[], outputs=excel_output)
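For reference, the new button simply forwards the temporary .xlsx path returned by export_vector_db_to_excel to the gr.File output; the equivalent call outside the UI would be roughly:

excel_path = export_vector_db_to_excel()  # writes a NamedTemporaryFile and returns its path
print(f"Vector DB exported to {excel_path}")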