Spaces:

proKBD
/

news-summarization

Sleeping

App Files Community

proKBD committed on Mar 23

Commit

9652678

verified ·

1 Parent(s): 2ee959e

Update utils.py

Browse files

Files changed (1) hide show

utils.py +23 -5

utils.py CHANGED Viewed

@@ -134,13 +134,22 @@ def get_translator():
 class NewsExtractor:
     def __init__(self):
         self.headers = HEADERS
     def search_news(self, company_name: str) -> List[Dict[str, str]]:
         """Extract news articles about the company ensuring minimum count."""
         all_articles = []
         retries = 2  # Number of retries if we don't get enough articles
-        while retries > 0 and len(all_articles) < MIN_ARTICLES:
             for source, url_template in NEWS_SOURCES.items():
                 try:
                     url = url_template.format(company_name.replace(" ", "+"))
@@ -148,6 +157,10 @@ class NewsExtractor:
                     # Try different page numbers for more articles
                     for page in range(2):  # Try first two pages
                         page_url = url
                         if page > 0:
                             if source == "google":
@@ -199,7 +212,7 @@ class NewsExtractor:
                             print(f"Found {len(source_articles)} articles from {source} page {page+1}")
                         # If we have enough articles, break the page loop
-                        if len(all_articles) >= MIN_ARTICLES:
                             break
                 except Exception as e:
@@ -207,12 +220,16 @@ class NewsExtractor:
                     continue
                 # If we have enough articles, break the source loop
-                if len(all_articles) >= MIN_ARTICLES:
                     break
             retries -= 1
-            if len(all_articles) < MIN_ARTICLES and retries > 0:
                 print(f"\nFound only {len(all_articles)} articles, retrying...")
         # Remove duplicates
         unique_articles = self._remove_duplicates(all_articles)
@@ -220,10 +237,11 @@ class NewsExtractor:
         if len(unique_articles) < MIN_ARTICLES:
             print(f"Warning: Could only find {len(unique_articles)} unique articles, fewer than minimum {MIN_ARTICLES}")
         # Balance articles across sources
         balanced_articles = self._balance_sources(unique_articles)
-        return balanced_articles[:max(MIN_ARTICLES, MAX_ARTICLES)]
     def _balance_sources(self, articles: List[Dict[str, str]]) -> List[Dict[str, str]]:
         """Balance articles across sources while maintaining minimum count."""

 class NewsExtractor:
     def __init__(self):
         self.headers = HEADERS
+        self.start_time = None
+        self.timeout = 30  # 30 seconds timeout
     def search_news(self, company_name: str) -> List[Dict[str, str]]:
         """Extract news articles about the company ensuring minimum count."""
+        self.start_time = time.time()
         all_articles = []
         retries = 2  # Number of retries if we don't get enough articles
+        min_articles = MIN_ARTICLES  # Start with default minimum
+        while retries > 0 and len(all_articles) < min_articles:
+            # Check for timeout
+            if time.time() - self.start_time > self.timeout:
+                print(f"\nTimeout reached after {self.timeout} seconds. Proceeding with available articles.")
+                break
             for source, url_template in NEWS_SOURCES.items():
                 try:
                     url = url_template.format(company_name.replace(" ", "+"))
                     # Try different page numbers for more articles
                     for page in range(2):  # Try first two pages
+                        # Check for timeout again
+                        if time.time() - self.start_time > self.timeout:
+                            break
                         page_url = url
                         if page > 0:
                             if source == "google":
                             print(f"Found {len(source_articles)} articles from {source} page {page+1}")
                         # If we have enough articles, break the page loop
+                        if len(all_articles) >= min_articles:
                             break
                 except Exception as e:
                     continue
                 # If we have enough articles, break the source loop
+                if len(all_articles) >= min_articles:
                     break
             retries -= 1
+            if len(all_articles) < min_articles and retries > 0:
                 print(f"\nFound only {len(all_articles)} articles, retrying...")
+                # Lower the minimum requirement if we're close
+                if len(all_articles) >= 15:  # If we have at least 15 articles
+                    min_articles = len(all_articles)
+                    print(f"Adjusting minimum requirement to {min_articles} articles")
         # Remove duplicates
         unique_articles = self._remove_duplicates(all_articles)
         if len(unique_articles) < MIN_ARTICLES:
             print(f"Warning: Could only find {len(unique_articles)} unique articles, fewer than minimum {MIN_ARTICLES}")
+            print("Proceeding with available articles...")
         # Balance articles across sources
         balanced_articles = self._balance_sources(unique_articles)
+        return balanced_articles[:max(len(unique_articles), MAX_ARTICLES)]
     def _balance_sources(self, articles: List[Dict[str, str]]) -> List[Dict[str, str]]:
         """Balance articles across sources while maintaining minimum count."""