Shreyas094 committed (verified)
Commit 8ac8380 · Parent(s): 4234e59

Update app.py

Files changed (1): app.py (+42 -3)
app.py CHANGED
@@ -10,6 +10,8 @@ import urllib.parse
 from tempfile import NamedTemporaryFile
 from typing import List
 from bs4 import BeautifulSoup
+from langchain.prompts import PromptTemplate
+from langchain.chains import LLMChain
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.vectorstores import FAISS
 from langchain_community.document_loaders import PyPDFLoader
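A note on the two new imports: recent LangChain releases deprecate LLMChain and its run method in favor of composing the prompt with the model directly. If this app ever moves off its pinned version, the scoring chain added below could be written roughly as follows. This is a sketch only, assuming model is the same LangChain-compatible LLM object that app.py already passes into process_news:

from langchain_core.prompts import PromptTemplate

# Sketch only: runnable composition replaces LLMChain in LangChain >= 0.2.
# 'model' is assumed to be the LLM object used elsewhere in app.py.
prompt = PromptTemplate.from_template("Summary: {summary}\n\nRelevance Score:")
chain = prompt | model                                  # replaces LLMChain(llm=model, prompt=prompt)
result = chain.invoke({"summary": "Fed raises rates"})  # replaces chain.run(summary=...)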
@@ -22,6 +24,7 @@ from langchain_core.documents import Document
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 
+
 huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
 
 # Memory database to store question-answer pairs
@@ -302,13 +305,16 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
                 clean_content = article["title"]
 
             full_summary, cleaned_summary = summarize_news_content(clean_content, model)
+            relevance_score = calculate_relevance_score(cleaned_summary, model)
+
             processed_article = {
                 "published_date": article["published_date"],
                 "title": article["title"],
                 "url": article["url"],
                 "content": clean_content,
                 "summary": full_summary,
-                "cleaned_summary": cleaned_summary
+                "cleaned_summary": cleaned_summary,
+                "relevance_score": relevance_score
             }
             processed_articles.append(processed_article)
         except Exception as e:
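With every processed article now carrying a numeric relevance_score, callers can rank or prune the list before it reaches the vector store. A minimal sketch; the 0.4 cutoff is an illustrative assumption, not something this commit defines:

# Hypothetical post-processing: keep only financially relevant articles,
# most relevant first. The 0.4 threshold is an assumption for illustration.
relevant_articles = sorted(
    (a for a in processed_articles if a["relevance_score"] > 0.4),
    key=lambda a: a["relevance_score"],
    reverse=True,
)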
@@ -321,7 +327,8 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
     docs = [Document(page_content=article["cleaned_summary"], metadata={
         "source": article["url"],
         "title": article["title"],
-        "published_date": article["published_date"]
+        "published_date": article["published_date"],
+        "relevance_score": article["relevance_score"]
     }) for article in processed_articles]
 
     try:
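Because the score is copied into each Document's metadata, it survives into the FAISS index and can drive retrieval-time filtering. Recent langchain_community releases accept a callable filter that receives the metadata dict; a sketch, assuming database is the FAISS store app.py builds from these docs:

# Hypothetical retrieval-time filter: skip low-relevance articles.
# Assumes a langchain_community version whose FAISS wrapper accepts callable filters.
results = database.similarity_search(
    "interest rate outlook",
    k=5,
    filter=lambda md: md.get("relevance_score", 0.0) >= 0.5,
)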
@@ -341,7 +348,6 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
     except Exception as e:
         return f"Error adding articles to the database: {str(e)}"
 
-
 def fetch_articles_from_page(url):
     response = requests.get(url)
     response.raise_for_status()
@@ -449,12 +455,45 @@ def export_news_to_excel():
     df['summary'] = df['cleaned_summary']
     df = df.drop(columns=['cleaned_summary'])  # Remove the extra column
 
+    # Reorder columns to put relevance_score after summary
+    columns = ['published_date', 'title', 'url', 'content', 'summary', 'relevance_score']
+    df = df[columns]
+
     with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
         excel_path = tmp.name
         df.to_excel(excel_path, index=False)
 
     return excel_path
 
+def calculate_relevance_score(summary, model):
+    prompt_template = PromptTemplate(
+        input_variables=["summary"],
+        template="""You are a financial analyst tasked with providing a relevance score to news summaries.
+The score should be based on the financial significance and impact of the news.
+Use the following scoring guide:
+- 0.00-0.20: Not relevant to finance or economics
+- 0.21-0.40: Slightly relevant, but minimal financial impact
+- 0.41-0.60: Moderately relevant, some financial implications
+- 0.61-0.80: Highly relevant, significant financial impact
+- 0.81-1.00: Extremely relevant, major financial implications
+
+Provide a score between 0.00 and 1.00, where 0.00 is not relevant at all, and 1.00 is extremely relevant from a financial perspective.
+
+Summary: {summary}
+
+Relevance Score:"""
+    )
+
+    chain = LLMChain(llm=model, prompt=prompt_template)
+    response = chain.run(summary=summary)
+
+    try:
+        score = float(response.strip())
+        return min(max(score, 0.00), 1.00)  # Ensure the score is between 0.00 and 1.00
+    except ValueError:
+        print(f"Error parsing relevance score: {response}")
+        return 0.00
+
 def ask_question(question, temperature, top_p, repetition_penalty, web_search, google_news_rss):
     global conversation_history
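Two fragile spots in the new code are worth flagging. First, in export_news_to_excel, df = df[columns] raises KeyError if any listed column is absent (for example, if a future change renames a field). A defensive variant, offered only as a sketch:

# Hypothetical guard: reorder the columns that exist and append any extras,
# instead of raising KeyError on a missing column.
preferred = ['published_date', 'title', 'url', 'content', 'summary', 'relevance_score']
ordered = [c for c in preferred if c in df.columns]
df = df[ordered + [c for c in df.columns if c not in ordered]]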
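Second, calculate_relevance_score parses the reply with float(response.strip()), which falls back to 0.00 whenever the model pads its answer (e.g. "Relevance Score: 0.75"). A more forgiving parser could extract the first number instead; again a sketch, not part of this commit:

import re

def parse_relevance_score(response):
    # Pull the first decimal out of the reply and clamp it to [0.00, 1.00].
    # Falls back to 0.00 when no number appears, mirroring the ValueError branch.
    match = re.search(r"\d+(?:\.\d+)?", response)
    if match is None:
        print(f"Error parsing relevance score: {response}")
        return 0.00
    return min(max(float(match.group()), 0.00), 1.00)

Swapping this into the try/except keeps behavior identical on clean replies while tolerating chatty ones.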