ayush2917 committed on
Commit
b1e9722
·
verified ·
1 Parent(s): 1fa9651

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -116
app.py CHANGED
@@ -7,7 +7,6 @@ from datetime import datetime, timedelta
7
  import requests
8
  from threading import Thread, Event
9
  import logging
10
- from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
11
  from typing import Dict, List
12
  from bs4 import BeautifulSoup
13
 
@@ -15,7 +14,7 @@ app = Flask(__name__)
15
 
16
  # Configuration
17
  NEWS_API_KEY = os.environ.get('NEWS_API_KEY', '352f67b35a544f408c58c74c654cfd7e')
18
- MAX_NEWS_ARTICLES = 10
19
  API_CALL_INTERVAL = 10 # seconds
20
  REFRESH_INTERVAL = 7200 # 2 hours (increased to reduce CPU load)
21
  CACHE_EXPIRY_DURATION = 3600 # 60 minutes (increased to reduce API calls)
@@ -24,6 +23,7 @@ last_fetch_time = None
24
  last_api_call = 0
25
  cached_articles = []
26
  cache_expiry = None
 
27
 
28
  # List of Indian finance news websites (reduced to avoid HTTP errors)
29
  WEBSITES = [
@@ -245,65 +245,12 @@ def calculate_age(published):
245
  except ValueError:
246
  return "Unknown time"
247
 
248
- # Chatbot Models (Initialized on-demand)
249
- qa_pipeline = None
250
- t5_tokenizer = None
251
- t5_model = None
252
- qa_loaded = Event()
253
- t5_loaded = Event()
254
-
255
- def load_qa_model():
256
- global qa_pipeline
257
- if not qa_loaded.is_set():
258
- logging.info("Loading QA model on-demand...")
259
- try:
260
- qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased-distilled-squad")
261
- logging.info("QA model loaded successfully")
262
- qa_loaded.set()
263
- except Exception as e:
264
- logging.error(f"Failed to load QA model: {str(e)}")
265
-
266
- def load_t5_model():
267
- global t5_tokenizer, t5_model
268
- if not t5_loaded.is_set():
269
- logging.info("Loading Flan-T5 model on-demand...")
270
- try:
271
- t5_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
272
- t5_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
273
- logging.info("Flan-T5 model loaded successfully")
274
- t5_loaded.set()
275
- except Exception as e:
276
- logging.error(f"Failed to load Flan-T5 model: {str(e)}")
277
-
278
- # Function to generate a 60-80 word description using Flan-T5 (disabled for now)
279
- def generate_description(title: str, raw_content: str, category: str, current_date_str: str) -> str:
280
- # Disabled to reduce CPU usage
281
- return raw_content[:200] + "..."
282
-
283
- # Function to generate response using Flan-T5
284
- def generate_t5_response(prompt: str, max_length: int = 80) -> str:
285
- load_t5_model() # Load Flan-T5 on-demand
286
- if not t5_loaded.is_set():
287
- return None
288
-
289
- try:
290
- inputs = t5_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
291
- outputs = t5_model.generate(
292
- inputs.input_ids,
293
- max_length=max_length,
294
- min_length=30,
295
- temperature=0.7,
296
- top_p=0.9,
297
- do_sample=True
298
- )
299
- response = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
300
- return response
301
- except Exception as e:
302
- logging.error(f"Error generating response with Flan-T5: {str(e)}")
303
- return None
304
-
305
  # Function to fetch news from websites using BeautifulSoup and requests
306
  def fetch_news_from_websites() -> List[Dict]:
 
 
 
 
307
  articles = []
308
  headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
309
  used_headlines = set()
@@ -319,7 +266,7 @@ def fetch_news_from_websites() -> List[Dict]:
319
  # Generic selectors (adjust per site)
320
  news_items = soup.select('h1, h2, h3, .story, .article, .headline, .title')
321
  for item in news_items:
322
- if len(articles) >= 10: # Reduced limit to 10 articles to lower CPU usage
323
  break
324
  title = item.get_text(strip=True)[:100]
325
  if title and title not in used_headlines:
@@ -350,7 +297,7 @@ def fetch_news_from_websites() -> List[Dict]:
350
  })
351
  except Exception as e:
352
  logging.error(f"Failed to fetch from {url}: {str(e)}")
353
- if len(articles) >= 10:
354
  break
355
 
356
  return articles
@@ -437,10 +384,10 @@ def fetch_news(query: str = None) -> List[Dict]:
437
  'age': calculate_age(article['publishedAt'])
438
  })
439
 
440
- # Fetch additional articles from websites if NewsAPI yields fewer than 10 articles
441
- if len(processed) < 10:
442
  web_articles = fetch_news_from_websites()
443
- processed.extend(web_articles[:10 - len(processed)])
444
 
445
  cached_articles = processed
446
  cache_expiry = current_time + CACHE_EXPIRY_DURATION
@@ -466,10 +413,14 @@ def fetch_news(query: str = None) -> List[Dict]:
466
  logging.error("Max retry attempts reached for NewsAPI, returning cached articles")
467
  return cached_articles
468
 
469
- # Background Refresh Thread
470
  stop_refresh = Event()
471
 
472
  def refresh_news_periodically():
 
 
 
 
473
  while not stop_refresh.is_set():
474
  with app.app_context():
475
  fetch_news()
@@ -477,7 +428,8 @@ def refresh_news_periodically():
477
  time.sleep(REFRESH_INTERVAL)
478
 
479
  refresh_thread = Thread(target=refresh_news_periodically, daemon=True)
480
- refresh_thread.start()
 
481
 
482
  # Startup Logic
483
  with app.app_context():
@@ -535,15 +487,6 @@ def category_news(category_name):
535
  @app.route('/chat', methods=['POST'])
536
  def chat():
537
  logging.info("Received chat request")
538
- if not qa_loaded.is_set() and not t5_loaded.is_set():
539
- load_qa_model()
540
- load_t5_model()
541
- if not qa_loaded.is_set() or not t5_loaded.is_set():
542
- return jsonify({
543
- 'response': ['One or more models failed to load. Please try again later.'],
544
- 'status': 'error'
545
- }), 500
546
-
547
  try:
548
  data = request.get_json()
549
  if not data or 'message' not in data:
@@ -628,31 +571,18 @@ def chat():
628
  for article in context_articles:
629
  article['description'] = article['summary']
630
 
631
- # Use QA model to extract a factual answer if possible
 
632
  qa_answer = None
633
- if context_articles and qa_loaded.is_set():
634
- context = " ".join([article['content'] for article in context_articles])
635
- # Add static knowledge base to context for better QA
636
- if topic_info['primary_category'] in FINANCIAL_KNOWLEDGE_BASE:
637
- knowledge = FINANCIAL_KNOWLEDGE_BASE[topic_info['primary_category']]
638
- context += " " + " ".join(knowledge.values())
639
- try:
640
- qa_result = qa_pipeline(question=user_input, context=context, max_answer_len=30)
641
- qa_answer = qa_result['answer'] if qa_result['score'] > 0.5 else None
642
- except Exception as e:
643
- logging.error(f"QA model error: {str(e)}")
644
-
645
- # If QA model fails, use static knowledge base
646
- if not qa_answer and topic_info['primary_category'] in FINANCIAL_KNOWLEDGE_BASE:
647
  knowledge = FINANCIAL_KNOWLEDGE_BASE[topic_info['primary_category']]
648
  for key, value in knowledge.items():
649
  if key in user_input:
650
  qa_answer = value
 
651
  break
652
 
653
- # Use Flan-T5 to generate the summary response
654
- summary = "No recent news available."
655
- if context_articles:
656
  # Deduplicate descriptions and limit to unique content
657
  descriptions = list(dict.fromkeys([article['description'] for article in context_articles]))
658
  summary = " ".join(descriptions[:2]) # Limit to 2 descriptions to avoid repetition
@@ -666,29 +596,11 @@ def chat():
666
  knowledge = FINANCIAL_KNOWLEDGE_BASE['Stock Market'].get('nifty trend', '')
667
  summary = knowledge + " " + summary
668
 
669
- prompt = f"""You are a financial analyst providing concise answers as of {datetime.now().strftime('%Y-%m-%d')}.
670
- Query: {user_input}
671
- Context from recent news: {summary}
672
- Factual answer (if available): {qa_answer if qa_answer else 'Not found'}
673
- Provide a summary in 2-3 sentences (each under 30 words)."""
674
- logging.info(f"Generated prompt: {prompt[:100]}...")
675
-
676
- t5_response = generate_t5_response(prompt, max_length=80)
677
- if t5_response:
678
- summary_response = t5_response
679
- else:
680
- if qa_answer:
681
- summary_response = f"As of {datetime.now().strftime('%b %d, %Y')}, {qa_answer}"
682
- else:
683
- summary_response = summary
684
-
685
- # Ensure summary is within 30 words per sentence
686
- summary_lines = summary_response.split('\n')
687
- summary_lines = [line.strip() for line in summary_lines if line.strip()]
688
- summary_lines = [line if len(line.split()) <= 30 else ' '.join(line.split()[:30]) + '.' for line in summary_lines]
689
-
690
  # Construct the response as a list of lines
691
  response_lines = ["**Summary**"]
 
 
 
692
  response_lines.extend(summary_lines)
693
  response_lines.append("")
694
  response_lines.append("**Investment Recommendations for Indian Investors**")
@@ -777,8 +689,6 @@ def health():
777
  return jsonify({
778
  "status": "healthy",
779
  "refresh_running": refresh_thread.is_alive(),
780
- "qa_loaded": qa_loaded.is_set(),
781
- "t5_loaded": t5_loaded.is_set(),
782
  "database": db_status
783
  })
784
 
 
7
  import requests
8
  from threading import Thread, Event
9
  import logging
 
10
  from typing import Dict, List
11
  from bs4 import BeautifulSoup
12
 
 
14
 
15
  # Configuration
16
  NEWS_API_KEY = os.environ.get('NEWS_API_KEY', '352f67b35a544f408c58c74c654cfd7e')
17
+ MAX_NEWS_ARTICLES = 5 # Reduced to lower CPU usage during build
18
  API_CALL_INTERVAL = 10 # seconds
19
  REFRESH_INTERVAL = 7200 # 2 hours (increased to reduce CPU load)
20
  CACHE_EXPIRY_DURATION = 3600 # 60 minutes (increased to reduce API calls)
 
23
  last_api_call = 0
24
  cached_articles = []
25
  cache_expiry = None
26
+ IS_BUILDING = os.environ.get('IS_BUILDING', 'false').lower() == 'true' # Flag to skip heavy tasks during build
27
 
28
  # List of Indian finance news websites (reduced to avoid HTTP errors)
29
  WEBSITES = [
 
245
  except ValueError:
246
  return "Unknown time"
247
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  # Function to fetch news from websites using BeautifulSoup and requests
249
  def fetch_news_from_websites() -> List[Dict]:
250
+ if IS_BUILDING:
251
+ logging.info("Skipping web scraping during build phase")
252
+ return []
253
+
254
  articles = []
255
  headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
256
  used_headlines = set()
 
266
  # Generic selectors (adjust per site)
267
  news_items = soup.select('h1, h2, h3, .story, .article, .headline, .title')
268
  for item in news_items:
269
+ if len(articles) >= 5: # Further reduced limit to 5 articles
270
  break
271
  title = item.get_text(strip=True)[:100]
272
  if title and title not in used_headlines:
 
297
  })
298
  except Exception as e:
299
  logging.error(f"Failed to fetch from {url}: {str(e)}")
300
+ if len(articles) >= 5:
301
  break
302
 
303
  return articles
 
384
  'age': calculate_age(article['publishedAt'])
385
  })
386
 
387
+ # Fetch additional articles from websites if NewsAPI yields fewer than 5 articles
388
+ if len(processed) < 5 and not IS_BUILDING:
389
  web_articles = fetch_news_from_websites()
390
+ processed.extend(web_articles[:5 - len(processed)])
391
 
392
  cached_articles = processed
393
  cache_expiry = current_time + CACHE_EXPIRY_DURATION
 
413
  logging.error("Max retry attempts reached for NewsAPI, returning cached articles")
414
  return cached_articles
415
 
416
+ # Background Refresh Thread (disabled during build)
417
  stop_refresh = Event()
418
 
419
  def refresh_news_periodically():
420
+ if IS_BUILDING:
421
+ logging.info("Skipping background news refresh during build phase")
422
+ return
423
+
424
  while not stop_refresh.is_set():
425
  with app.app_context():
426
  fetch_news()
 
428
  time.sleep(REFRESH_INTERVAL)
429
 
430
  refresh_thread = Thread(target=refresh_news_periodically, daemon=True)
431
+ if not IS_BUILDING:
432
+ refresh_thread.start()
433
 
434
  # Startup Logic
435
  with app.app_context():
 
487
  @app.route('/chat', methods=['POST'])
488
  def chat():
489
  logging.info("Received chat request")
 
 
 
 
 
 
 
 
 
490
  try:
491
  data = request.get_json()
492
  if not data or 'message' not in data:
 
571
  for article in context_articles:
572
  article['description'] = article['summary']
573
 
574
+ # Use static knowledge base for summary
575
+ summary = "No recent news available."
576
  qa_answer = None
577
+ if topic_info['primary_category'] in FINANCIAL_KNOWLEDGE_BASE:
 
 
 
 
 
 
 
 
 
 
 
 
 
578
  knowledge = FINANCIAL_KNOWLEDGE_BASE[topic_info['primary_category']]
579
  for key, value in knowledge.items():
580
  if key in user_input:
581
  qa_answer = value
582
+ summary = value
583
  break
584
 
585
+ if context_articles and summary == "No recent news available.":
 
 
586
  # Deduplicate descriptions and limit to unique content
587
  descriptions = list(dict.fromkeys([article['description'] for article in context_articles]))
588
  summary = " ".join(descriptions[:2]) # Limit to 2 descriptions to avoid repetition
 
596
  knowledge = FINANCIAL_KNOWLEDGE_BASE['Stock Market'].get('nifty trend', '')
597
  summary = knowledge + " " + summary
598
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
599
  # Construct the response as a list of lines
600
  response_lines = ["**Summary**"]
601
+ summary_lines = summary.split('. ')
602
+ summary_lines = [line.strip() for line in summary_lines if line.strip()]
603
+ summary_lines = [line if len(line.split()) <= 30 else ' '.join(line.split()[:30]) + '.' for line in summary_lines]
604
  response_lines.extend(summary_lines)
605
  response_lines.append("")
606
  response_lines.append("**Investment Recommendations for Indian Investors**")
 
689
  return jsonify({
690
  "status": "healthy",
691
  "refresh_running": refresh_thread.is_alive(),
 
 
692
  "database": db_status
693
  })
694