Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -7,7 +7,6 @@ from datetime import datetime, timedelta
|
|
7 |
import requests
|
8 |
from threading import Thread, Event
|
9 |
import logging
|
10 |
-
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
11 |
from typing import Dict, List
|
12 |
from bs4 import BeautifulSoup
|
13 |
|
@@ -15,7 +14,7 @@ app = Flask(__name__)
|
|
15 |
|
16 |
# Configuration
|
17 |
NEWS_API_KEY = os.environ.get('NEWS_API_KEY', '352f67b35a544f408c58c74c654cfd7e')
|
18 |
-
MAX_NEWS_ARTICLES =
|
19 |
API_CALL_INTERVAL = 10 # seconds
|
20 |
REFRESH_INTERVAL = 7200 # 2 hours (increased to reduce CPU load)
|
21 |
CACHE_EXPIRY_DURATION = 3600 # 60 minutes (increased to reduce API calls)
|
@@ -24,6 +23,7 @@ last_fetch_time = None
|
|
24 |
last_api_call = 0
|
25 |
cached_articles = []
|
26 |
cache_expiry = None
|
|
|
27 |
|
28 |
# List of Indian finance news websites (reduced to avoid HTTP errors)
|
29 |
WEBSITES = [
|
@@ -245,65 +245,12 @@ def calculate_age(published):
|
|
245 |
except ValueError:
|
246 |
return "Unknown time"
|
247 |
|
248 |
-
# Chatbot Models (Initialized on-demand)
|
249 |
-
qa_pipeline = None
|
250 |
-
t5_tokenizer = None
|
251 |
-
t5_model = None
|
252 |
-
qa_loaded = Event()
|
253 |
-
t5_loaded = Event()
|
254 |
-
|
255 |
-
def load_qa_model():
|
256 |
-
global qa_pipeline
|
257 |
-
if not qa_loaded.is_set():
|
258 |
-
logging.info("Loading QA model on-demand...")
|
259 |
-
try:
|
260 |
-
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased-distilled-squad")
|
261 |
-
logging.info("QA model loaded successfully")
|
262 |
-
qa_loaded.set()
|
263 |
-
except Exception as e:
|
264 |
-
logging.error(f"Failed to load QA model: {str(e)}")
|
265 |
-
|
266 |
-
def load_t5_model():
|
267 |
-
global t5_tokenizer, t5_model
|
268 |
-
if not t5_loaded.is_set():
|
269 |
-
logging.info("Loading Flan-T5 model on-demand...")
|
270 |
-
try:
|
271 |
-
t5_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
|
272 |
-
t5_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
|
273 |
-
logging.info("Flan-T5 model loaded successfully")
|
274 |
-
t5_loaded.set()
|
275 |
-
except Exception as e:
|
276 |
-
logging.error(f"Failed to load Flan-T5 model: {str(e)}")
|
277 |
-
|
278 |
-
# Function to generate a 60-80 word description using Flan-T5 (disabled for now)
|
279 |
-
def generate_description(title: str, raw_content: str, category: str, current_date_str: str) -> str:
|
280 |
-
# Disabled to reduce CPU usage
|
281 |
-
return raw_content[:200] + "..."
|
282 |
-
|
283 |
-
# Function to generate response using Flan-T5
|
284 |
-
def generate_t5_response(prompt: str, max_length: int = 80) -> str:
|
285 |
-
load_t5_model() # Load Flan-T5 on-demand
|
286 |
-
if not t5_loaded.is_set():
|
287 |
-
return None
|
288 |
-
|
289 |
-
try:
|
290 |
-
inputs = t5_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
|
291 |
-
outputs = t5_model.generate(
|
292 |
-
inputs.input_ids,
|
293 |
-
max_length=max_length,
|
294 |
-
min_length=30,
|
295 |
-
temperature=0.7,
|
296 |
-
top_p=0.9,
|
297 |
-
do_sample=True
|
298 |
-
)
|
299 |
-
response = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
|
300 |
-
return response
|
301 |
-
except Exception as e:
|
302 |
-
logging.error(f"Error generating response with Flan-T5: {str(e)}")
|
303 |
-
return None
|
304 |
-
|
305 |
# Function to fetch news from websites using BeautifulSoup and requests
|
306 |
def fetch_news_from_websites() -> List[Dict]:
|
|
|
|
|
|
|
|
|
307 |
articles = []
|
308 |
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
|
309 |
used_headlines = set()
|
@@ -319,7 +266,7 @@ def fetch_news_from_websites() -> List[Dict]:
|
|
319 |
# Generic selectors (adjust per site)
|
320 |
news_items = soup.select('h1, h2, h3, .story, .article, .headline, .title')
|
321 |
for item in news_items:
|
322 |
-
if len(articles) >=
|
323 |
break
|
324 |
title = item.get_text(strip=True)[:100]
|
325 |
if title and title not in used_headlines:
|
@@ -350,7 +297,7 @@ def fetch_news_from_websites() -> List[Dict]:
|
|
350 |
})
|
351 |
except Exception as e:
|
352 |
logging.error(f"Failed to fetch from {url}: {str(e)}")
|
353 |
-
if len(articles) >=
|
354 |
break
|
355 |
|
356 |
return articles
|
@@ -437,10 +384,10 @@ def fetch_news(query: str = None) -> List[Dict]:
|
|
437 |
'age': calculate_age(article['publishedAt'])
|
438 |
})
|
439 |
|
440 |
-
# Fetch additional articles from websites if NewsAPI yields fewer than
|
441 |
-
if len(processed) <
|
442 |
web_articles = fetch_news_from_websites()
|
443 |
-
processed.extend(web_articles[:
|
444 |
|
445 |
cached_articles = processed
|
446 |
cache_expiry = current_time + CACHE_EXPIRY_DURATION
|
@@ -466,10 +413,14 @@ def fetch_news(query: str = None) -> List[Dict]:
|
|
466 |
logging.error("Max retry attempts reached for NewsAPI, returning cached articles")
|
467 |
return cached_articles
|
468 |
|
469 |
-
# Background Refresh Thread
|
470 |
stop_refresh = Event()
|
471 |
|
472 |
def refresh_news_periodically():
|
|
|
|
|
|
|
|
|
473 |
while not stop_refresh.is_set():
|
474 |
with app.app_context():
|
475 |
fetch_news()
|
@@ -477,7 +428,8 @@ def refresh_news_periodically():
|
|
477 |
time.sleep(REFRESH_INTERVAL)
|
478 |
|
479 |
refresh_thread = Thread(target=refresh_news_periodically, daemon=True)
|
480 |
-
|
|
|
481 |
|
482 |
# Startup Logic
|
483 |
with app.app_context():
|
@@ -535,15 +487,6 @@ def category_news(category_name):
|
|
535 |
@app.route('/chat', methods=['POST'])
|
536 |
def chat():
|
537 |
logging.info("Received chat request")
|
538 |
-
if not qa_loaded.is_set() and not t5_loaded.is_set():
|
539 |
-
load_qa_model()
|
540 |
-
load_t5_model()
|
541 |
-
if not qa_loaded.is_set() or not t5_loaded.is_set():
|
542 |
-
return jsonify({
|
543 |
-
'response': ['One or more models failed to load. Please try again later.'],
|
544 |
-
'status': 'error'
|
545 |
-
}), 500
|
546 |
-
|
547 |
try:
|
548 |
data = request.get_json()
|
549 |
if not data or 'message' not in data:
|
@@ -628,31 +571,18 @@ def chat():
|
|
628 |
for article in context_articles:
|
629 |
article['description'] = article['summary']
|
630 |
|
631 |
-
# Use
|
|
|
632 |
qa_answer = None
|
633 |
-
if
|
634 |
-
context = " ".join([article['content'] for article in context_articles])
|
635 |
-
# Add static knowledge base to context for better QA
|
636 |
-
if topic_info['primary_category'] in FINANCIAL_KNOWLEDGE_BASE:
|
637 |
-
knowledge = FINANCIAL_KNOWLEDGE_BASE[topic_info['primary_category']]
|
638 |
-
context += " " + " ".join(knowledge.values())
|
639 |
-
try:
|
640 |
-
qa_result = qa_pipeline(question=user_input, context=context, max_answer_len=30)
|
641 |
-
qa_answer = qa_result['answer'] if qa_result['score'] > 0.5 else None
|
642 |
-
except Exception as e:
|
643 |
-
logging.error(f"QA model error: {str(e)}")
|
644 |
-
|
645 |
-
# If QA model fails, use static knowledge base
|
646 |
-
if not qa_answer and topic_info['primary_category'] in FINANCIAL_KNOWLEDGE_BASE:
|
647 |
knowledge = FINANCIAL_KNOWLEDGE_BASE[topic_info['primary_category']]
|
648 |
for key, value in knowledge.items():
|
649 |
if key in user_input:
|
650 |
qa_answer = value
|
|
|
651 |
break
|
652 |
|
653 |
-
|
654 |
-
summary = "No recent news available."
|
655 |
-
if context_articles:
|
656 |
# Deduplicate descriptions and limit to unique content
|
657 |
descriptions = list(dict.fromkeys([article['description'] for article in context_articles]))
|
658 |
summary = " ".join(descriptions[:2]) # Limit to 2 descriptions to avoid repetition
|
@@ -666,29 +596,11 @@ def chat():
|
|
666 |
knowledge = FINANCIAL_KNOWLEDGE_BASE['Stock Market'].get('nifty trend', '')
|
667 |
summary = knowledge + " " + summary
|
668 |
|
669 |
-
prompt = f"""You are a financial analyst providing concise answers as of {datetime.now().strftime('%Y-%m-%d')}.
|
670 |
-
Query: {user_input}
|
671 |
-
Context from recent news: {summary}
|
672 |
-
Factual answer (if available): {qa_answer if qa_answer else 'Not found'}
|
673 |
-
Provide a summary in 2-3 sentences (each under 30 words)."""
|
674 |
-
logging.info(f"Generated prompt: {prompt[:100]}...")
|
675 |
-
|
676 |
-
t5_response = generate_t5_response(prompt, max_length=80)
|
677 |
-
if t5_response:
|
678 |
-
summary_response = t5_response
|
679 |
-
else:
|
680 |
-
if qa_answer:
|
681 |
-
summary_response = f"As of {datetime.now().strftime('%b %d, %Y')}, {qa_answer}"
|
682 |
-
else:
|
683 |
-
summary_response = summary
|
684 |
-
|
685 |
-
# Ensure summary is within 30 words per sentence
|
686 |
-
summary_lines = summary_response.split('\n')
|
687 |
-
summary_lines = [line.strip() for line in summary_lines if line.strip()]
|
688 |
-
summary_lines = [line if len(line.split()) <= 30 else ' '.join(line.split()[:30]) + '.' for line in summary_lines]
|
689 |
-
|
690 |
# Construct the response as a list of lines
|
691 |
response_lines = ["**Summary**"]
|
|
|
|
|
|
|
692 |
response_lines.extend(summary_lines)
|
693 |
response_lines.append("")
|
694 |
response_lines.append("**Investment Recommendations for Indian Investors**")
|
@@ -777,8 +689,6 @@ def health():
|
|
777 |
return jsonify({
|
778 |
"status": "healthy",
|
779 |
"refresh_running": refresh_thread.is_alive(),
|
780 |
-
"qa_loaded": qa_loaded.is_set(),
|
781 |
-
"t5_loaded": t5_loaded.is_set(),
|
782 |
"database": db_status
|
783 |
})
|
784 |
|
|
|
7 |
import requests
|
8 |
from threading import Thread, Event
|
9 |
import logging
|
|
|
10 |
from typing import Dict, List
|
11 |
from bs4 import BeautifulSoup
|
12 |
|
|
|
14 |
|
15 |
# Configuration
|
16 |
NEWS_API_KEY = os.environ.get('NEWS_API_KEY', '352f67b35a544f408c58c74c654cfd7e')
|
17 |
+
MAX_NEWS_ARTICLES = 5 # Reduced to lower CPU usage during build
|
18 |
API_CALL_INTERVAL = 10 # seconds
|
19 |
REFRESH_INTERVAL = 7200 # 2 hours (increased to reduce CPU load)
|
20 |
CACHE_EXPIRY_DURATION = 3600 # 60 minutes (increased to reduce API calls)
|
|
|
23 |
last_api_call = 0
|
24 |
cached_articles = []
|
25 |
cache_expiry = None
|
26 |
+
IS_BUILDING = os.environ.get('IS_BUILDING', 'false').lower() == 'true' # Flag to skip heavy tasks during build
|
27 |
|
28 |
# List of Indian finance news websites (reduced to avoid HTTP errors)
|
29 |
WEBSITES = [
|
|
|
245 |
except ValueError:
|
246 |
return "Unknown time"
|
247 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
248 |
# Function to fetch news from websites using BeautifulSoup and requests
|
249 |
def fetch_news_from_websites() -> List[Dict]:
|
250 |
+
if IS_BUILDING:
|
251 |
+
logging.info("Skipping web scraping during build phase")
|
252 |
+
return []
|
253 |
+
|
254 |
articles = []
|
255 |
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
|
256 |
used_headlines = set()
|
|
|
266 |
# Generic selectors (adjust per site)
|
267 |
news_items = soup.select('h1, h2, h3, .story, .article, .headline, .title')
|
268 |
for item in news_items:
|
269 |
+
if len(articles) >= 5: # Further reduced limit to 5 articles
|
270 |
break
|
271 |
title = item.get_text(strip=True)[:100]
|
272 |
if title and title not in used_headlines:
|
|
|
297 |
})
|
298 |
except Exception as e:
|
299 |
logging.error(f"Failed to fetch from {url}: {str(e)}")
|
300 |
+
if len(articles) >= 5:
|
301 |
break
|
302 |
|
303 |
return articles
|
|
|
384 |
'age': calculate_age(article['publishedAt'])
|
385 |
})
|
386 |
|
387 |
+
# Fetch additional articles from websites if NewsAPI yields fewer than 5 articles
|
388 |
+
if len(processed) < 5 and not IS_BUILDING:
|
389 |
web_articles = fetch_news_from_websites()
|
390 |
+
processed.extend(web_articles[:5 - len(processed)])
|
391 |
|
392 |
cached_articles = processed
|
393 |
cache_expiry = current_time + CACHE_EXPIRY_DURATION
|
|
|
413 |
logging.error("Max retry attempts reached for NewsAPI, returning cached articles")
|
414 |
return cached_articles
|
415 |
|
416 |
+
# Background Refresh Thread (disabled during build)
|
417 |
stop_refresh = Event()
|
418 |
|
419 |
def refresh_news_periodically():
|
420 |
+
if IS_BUILDING:
|
421 |
+
logging.info("Skipping background news refresh during build phase")
|
422 |
+
return
|
423 |
+
|
424 |
while not stop_refresh.is_set():
|
425 |
with app.app_context():
|
426 |
fetch_news()
|
|
|
428 |
time.sleep(REFRESH_INTERVAL)
|
429 |
|
430 |
refresh_thread = Thread(target=refresh_news_periodically, daemon=True)
|
431 |
+
if not IS_BUILDING:
|
432 |
+
refresh_thread.start()
|
433 |
|
434 |
# Startup Logic
|
435 |
with app.app_context():
|
|
|
487 |
@app.route('/chat', methods=['POST'])
|
488 |
def chat():
|
489 |
logging.info("Received chat request")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
490 |
try:
|
491 |
data = request.get_json()
|
492 |
if not data or 'message' not in data:
|
|
|
571 |
for article in context_articles:
|
572 |
article['description'] = article['summary']
|
573 |
|
574 |
+
# Use static knowledge base for summary
|
575 |
+
summary = "No recent news available."
|
576 |
qa_answer = None
|
577 |
+
if topic_info['primary_category'] in FINANCIAL_KNOWLEDGE_BASE:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
578 |
knowledge = FINANCIAL_KNOWLEDGE_BASE[topic_info['primary_category']]
|
579 |
for key, value in knowledge.items():
|
580 |
if key in user_input:
|
581 |
qa_answer = value
|
582 |
+
summary = value
|
583 |
break
|
584 |
|
585 |
+
if context_articles and summary == "No recent news available.":
|
|
|
|
|
586 |
# Deduplicate descriptions and limit to unique content
|
587 |
descriptions = list(dict.fromkeys([article['description'] for article in context_articles]))
|
588 |
summary = " ".join(descriptions[:2]) # Limit to 2 descriptions to avoid repetition
|
|
|
596 |
knowledge = FINANCIAL_KNOWLEDGE_BASE['Stock Market'].get('nifty trend', '')
|
597 |
summary = knowledge + " " + summary
|
598 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
599 |
# Construct the response as a list of lines
|
600 |
response_lines = ["**Summary**"]
|
601 |
+
summary_lines = summary.split('. ')
|
602 |
+
summary_lines = [line.strip() for line in summary_lines if line.strip()]
|
603 |
+
summary_lines = [line if len(line.split()) <= 30 else ' '.join(line.split()[:30]) + '.' for line in summary_lines]
|
604 |
response_lines.extend(summary_lines)
|
605 |
response_lines.append("")
|
606 |
response_lines.append("**Investment Recommendations for Indian Investors**")
|
|
|
689 |
return jsonify({
|
690 |
"status": "healthy",
|
691 |
"refresh_running": refresh_thread.is_alive(),
|
|
|
|
|
692 |
"database": db_status
|
693 |
})
|
694 |
|