Update app.py
app.py CHANGED
@@ -10,6 +10,8 @@ import urllib.parse
 from tempfile import NamedTemporaryFile
 from typing import List
 from bs4 import BeautifulSoup
+from langchain.prompts import PromptTemplate
+from langchain.chains import LLMChain
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.vectorstores import FAISS
 from langchain_community.document_loaders import PyPDFLoader
@@ -22,6 +24,7 @@ from langchain_core.documents import Document
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 
+
 huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
 
 # Memory database to store question-answer pairs
@@ -302,13 +305,16 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
                 clean_content = article["title"]
 
             full_summary, cleaned_summary = summarize_news_content(clean_content, model)
+            relevance_score = calculate_relevance_score(cleaned_summary, model)
+
             processed_article = {
                 "published_date": article["published_date"],
                 "title": article["title"],
                 "url": article["url"],
                 "content": clean_content,
                 "summary": full_summary,
-                "cleaned_summary": cleaned_summary
+                "cleaned_summary": cleaned_summary,
+                "relevance_score": relevance_score
             }
             processed_articles.append(processed_article)
         except Exception as e:
@@ -321,7 +327,8 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
     docs = [Document(page_content=article["cleaned_summary"], metadata={
         "source": article["url"],
         "title": article["title"],
-        "published_date": article["published_date"]
+        "published_date": article["published_date"],
+        "relevance_score": article["relevance_score"]
     }) for article in processed_articles]
 
     try:
@@ -341,7 +348,6 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
     except Exception as e:
         return f"Error adding articles to the database: {str(e)}"
 
-
 def fetch_articles_from_page(url):
     response = requests.get(url)
     response.raise_for_status()
@@ -449,12 +455,45 @@ def export_news_to_excel():
     df['summary'] = df['cleaned_summary']
     df = df.drop(columns=['cleaned_summary'])  # Remove the extra column
 
+    # Reorder columns to put relevance_score after summary
+    columns = ['published_date', 'title', 'url', 'content', 'summary', 'relevance_score']
+    df = df[columns]
+
     with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
         excel_path = tmp.name
         df.to_excel(excel_path, index=False)
 
     return excel_path
 
+def calculate_relevance_score(summary, model):
+    prompt_template = PromptTemplate(
+        input_variables=["summary"],
+        template="""You are a financial analyst tasked with providing a relevance score to news summaries.
+The score should be based on the financial significance and impact of the news.
+Use the following scoring guide:
+- 0.00-0.20: Not relevant to finance or economics
+- 0.21-0.40: Slightly relevant, but minimal financial impact
+- 0.41-0.60: Moderately relevant, some financial implications
+- 0.61-0.80: Highly relevant, significant financial impact
+- 0.81-1.00: Extremely relevant, major financial implications
+
+Provide a score between 0.00 and 1.00, where 0.00 is not relevant at all, and 1.00 is extremely relevant from a financial perspective.
+
+Summary: {summary}
+
+Relevance Score:"""
+    )
+
+    chain = LLMChain(llm=model, prompt=prompt_template)
+    response = chain.run(summary=summary)
+
+    try:
+        score = float(response.strip())
+        return min(max(score, 0.00), 1.00)  # Ensure the score is between 0.00 and 1.00
+    except ValueError:
+        print(f"Error parsing relevance score: {response}")
+        return 0.00
+
 def ask_question(question, temperature, top_p, repetition_penalty, web_search, google_news_rss):
     global conversation_history
 
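A minimal way to smoke-test the new calculate_relevance_score path without a real endpoint is sketched below. It is illustrative only: it assumes the function from this commit is in scope (for example, copied into a scratch script together with its PromptTemplate/LLMChain imports), and it uses FakeListLLM from langchain_community purely as a stand-in for the app's model. The parse_relevance_score helper at the end is likewise only a suggestion for pulling a score out of wordier completions, since float(response.strip()) falls back to 0.00 whenever the model returns anything other than a bare number.

import re
from langchain_community.llms.fake import FakeListLLM  # stand-in LLM; any LangChain-compatible model works

# Canned completion so the scoring path runs offline; a real deployment would pass the app's model.
fake_model = FakeListLLM(responses=["0.72"])

# calculate_relevance_score is the function added in this commit (assumed to be in scope here).
score = calculate_relevance_score("Central bank raises its policy rate by 50 bps.", fake_model)
print(score)  # -> 0.72

# Optional, more forgiving parser: grabs the first number in a verbose completion
# such as "Relevance Score: 0.65 (significant impact)" instead of discarding it as 0.00.
def parse_relevance_score(response: str) -> float:
    match = re.search(r"\d+(?:\.\d+)?", response)
    return min(max(float(match.group()), 0.0), 1.0) if match else 0.0

print(parse_relevance_score("Relevance Score: 0.65 (significant impact)"))  # -> 0.65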