proKBD committed on
Commit
9652678
·
verified ·
1 Parent(s): 2ee959e

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +23 -5
utils.py CHANGED
@@ -134,13 +134,22 @@ def get_translator():
134
  class NewsExtractor:
135
  def __init__(self):
136
  self.headers = HEADERS
 
 
137
 
138
  def search_news(self, company_name: str) -> List[Dict[str, str]]:
139
  """Extract news articles about the company ensuring minimum count."""
 
140
  all_articles = []
141
  retries = 2 # Number of retries if we don't get enough articles
 
142
 
143
- while retries > 0 and len(all_articles) < MIN_ARTICLES:
 
 
 
 
 
144
  for source, url_template in NEWS_SOURCES.items():
145
  try:
146
  url = url_template.format(company_name.replace(" ", "+"))
@@ -148,6 +157,10 @@ class NewsExtractor:
148
 
149
  # Try different page numbers for more articles
150
  for page in range(2): # Try first two pages
 
 
 
 
151
  page_url = url
152
  if page > 0:
153
  if source == "google":
@@ -199,7 +212,7 @@ class NewsExtractor:
199
  print(f"Found {len(source_articles)} articles from {source} page {page+1}")
200
 
201
  # If we have enough articles, break the page loop
202
- if len(all_articles) >= MIN_ARTICLES:
203
  break
204
 
205
  except Exception as e:
@@ -207,12 +220,16 @@ class NewsExtractor:
207
  continue
208
 
209
  # If we have enough articles, break the source loop
210
- if len(all_articles) >= MIN_ARTICLES:
211
  break
212
 
213
  retries -= 1
214
- if len(all_articles) < MIN_ARTICLES and retries > 0:
215
  print(f"\nFound only {len(all_articles)} articles, retrying...")
 
 
 
 
216
 
217
  # Remove duplicates
218
  unique_articles = self._remove_duplicates(all_articles)
@@ -220,10 +237,11 @@ class NewsExtractor:
220
 
221
  if len(unique_articles) < MIN_ARTICLES:
222
  print(f"Warning: Could only find {len(unique_articles)} unique articles, fewer than minimum {MIN_ARTICLES}")
 
223
 
224
  # Balance articles across sources
225
  balanced_articles = self._balance_sources(unique_articles)
226
- return balanced_articles[:max(MIN_ARTICLES, MAX_ARTICLES)]
227
 
228
  def _balance_sources(self, articles: List[Dict[str, str]]) -> List[Dict[str, str]]:
229
  """Balance articles across sources while maintaining minimum count."""
 
134
  class NewsExtractor:
135
  def __init__(self):
136
  self.headers = HEADERS
137
+ self.start_time = None
138
+ self.timeout = 30 # 30 seconds timeout
139
 
140
  def search_news(self, company_name: str) -> List[Dict[str, str]]:
141
  """Extract news articles about the company ensuring minimum count."""
142
+ self.start_time = time.time()
143
  all_articles = []
144
  retries = 2 # Number of retries if we don't get enough articles
145
+ min_articles = MIN_ARTICLES # Start with default minimum
146
 
147
+ while retries > 0 and len(all_articles) < min_articles:
148
+ # Check for timeout
149
+ if time.time() - self.start_time > self.timeout:
150
+ print(f"\nTimeout reached after {self.timeout} seconds. Proceeding with available articles.")
151
+ break
152
+
153
  for source, url_template in NEWS_SOURCES.items():
154
  try:
155
  url = url_template.format(company_name.replace(" ", "+"))
 
157
 
158
  # Try different page numbers for more articles
159
  for page in range(2): # Try first two pages
160
+ # Check for timeout again
161
+ if time.time() - self.start_time > self.timeout:
162
+ break
163
+
164
  page_url = url
165
  if page > 0:
166
  if source == "google":
 
212
  print(f"Found {len(source_articles)} articles from {source} page {page+1}")
213
 
214
  # If we have enough articles, break the page loop
215
+ if len(all_articles) >= min_articles:
216
  break
217
 
218
  except Exception as e:
 
220
  continue
221
 
222
  # If we have enough articles, break the source loop
223
+ if len(all_articles) >= min_articles:
224
  break
225
 
226
  retries -= 1
227
+ if len(all_articles) < min_articles and retries > 0:
228
  print(f"\nFound only {len(all_articles)} articles, retrying...")
229
+ # Lower the minimum requirement if we're close
230
+ if len(all_articles) >= 15: # If we have at least 15 articles
231
+ min_articles = len(all_articles)
232
+ print(f"Adjusting minimum requirement to {min_articles} articles")
233
 
234
  # Remove duplicates
235
  unique_articles = self._remove_duplicates(all_articles)
 
237
 
238
  if len(unique_articles) < MIN_ARTICLES:
239
  print(f"Warning: Could only find {len(unique_articles)} unique articles, fewer than minimum {MIN_ARTICLES}")
240
+ print("Proceeding with available articles...")
241
 
242
  # Balance articles across sources
243
  balanced_articles = self._balance_sources(unique_articles)
244
+ return balanced_articles[:max(len(unique_articles), MAX_ARTICLES)]
245
 
246
  def _balance_sources(self, articles: List[Dict[str, str]]) -> List[Dict[str, str]]:
247
  """Balance articles across sources while maintaining minimum count."""