Spaces:
Sleeping
Sleeping
Update utils.py
Browse files
utils.py
CHANGED
@@ -134,13 +134,22 @@ def get_translator():
|
|
134 |
class NewsExtractor:
|
135 |
def __init__(self):
|
136 |
self.headers = HEADERS
|
|
|
|
|
137 |
|
138 |
def search_news(self, company_name: str) -> List[Dict[str, str]]:
|
139 |
"""Extract news articles about the company ensuring minimum count."""
|
|
|
140 |
all_articles = []
|
141 |
retries = 2 # Number of retries if we don't get enough articles
|
|
|
142 |
|
143 |
-
while retries > 0 and len(all_articles) <
|
|
|
|
|
|
|
|
|
|
|
144 |
for source, url_template in NEWS_SOURCES.items():
|
145 |
try:
|
146 |
url = url_template.format(company_name.replace(" ", "+"))
|
@@ -148,6 +157,10 @@ class NewsExtractor:
|
|
148 |
|
149 |
# Try different page numbers for more articles
|
150 |
for page in range(2): # Try first two pages
|
|
|
|
|
|
|
|
|
151 |
page_url = url
|
152 |
if page > 0:
|
153 |
if source == "google":
|
@@ -199,7 +212,7 @@ class NewsExtractor:
|
|
199 |
print(f"Found {len(source_articles)} articles from {source} page {page+1}")
|
200 |
|
201 |
# If we have enough articles, break the page loop
|
202 |
-
if len(all_articles) >=
|
203 |
break
|
204 |
|
205 |
except Exception as e:
|
@@ -207,12 +220,16 @@ class NewsExtractor:
|
|
207 |
continue
|
208 |
|
209 |
# If we have enough articles, break the source loop
|
210 |
-
if len(all_articles) >=
|
211 |
break
|
212 |
|
213 |
retries -= 1
|
214 |
-
if len(all_articles) <
|
215 |
print(f"\nFound only {len(all_articles)} articles, retrying...")
|
|
|
|
|
|
|
|
|
216 |
|
217 |
# Remove duplicates
|
218 |
unique_articles = self._remove_duplicates(all_articles)
|
@@ -220,10 +237,11 @@ class NewsExtractor:
|
|
220 |
|
221 |
if len(unique_articles) < MIN_ARTICLES:
|
222 |
print(f"Warning: Could only find {len(unique_articles)} unique articles, fewer than minimum {MIN_ARTICLES}")
|
|
|
223 |
|
224 |
# Balance articles across sources
|
225 |
balanced_articles = self._balance_sources(unique_articles)
|
226 |
-
return balanced_articles[:max(
|
227 |
|
228 |
def _balance_sources(self, articles: List[Dict[str, str]]) -> List[Dict[str, str]]:
|
229 |
"""Balance articles across sources while maintaining minimum count."""
|
|
|
134 |
class NewsExtractor:
|
135 |
def __init__(self):
|
136 |
self.headers = HEADERS
|
137 |
+
self.start_time = None
|
138 |
+
self.timeout = 30 # 30 seconds timeout
|
139 |
|
140 |
def search_news(self, company_name: str) -> List[Dict[str, str]]:
|
141 |
"""Extract news articles about the company ensuring minimum count."""
|
142 |
+
self.start_time = time.time()
|
143 |
all_articles = []
|
144 |
retries = 2 # Number of retries if we don't get enough articles
|
145 |
+
min_articles = MIN_ARTICLES # Start with default minimum
|
146 |
|
147 |
+
while retries > 0 and len(all_articles) < min_articles:
|
148 |
+
# Check for timeout
|
149 |
+
if time.time() - self.start_time > self.timeout:
|
150 |
+
print(f"\nTimeout reached after {self.timeout} seconds. Proceeding with available articles.")
|
151 |
+
break
|
152 |
+
|
153 |
for source, url_template in NEWS_SOURCES.items():
|
154 |
try:
|
155 |
url = url_template.format(company_name.replace(" ", "+"))
|
|
|
157 |
|
158 |
# Try different page numbers for more articles
|
159 |
for page in range(2): # Try first two pages
|
160 |
+
# Check for timeout again
|
161 |
+
if time.time() - self.start_time > self.timeout:
|
162 |
+
break
|
163 |
+
|
164 |
page_url = url
|
165 |
if page > 0:
|
166 |
if source == "google":
|
|
|
212 |
print(f"Found {len(source_articles)} articles from {source} page {page+1}")
|
213 |
|
214 |
# If we have enough articles, break the page loop
|
215 |
+
if len(all_articles) >= min_articles:
|
216 |
break
|
217 |
|
218 |
except Exception as e:
|
|
|
220 |
continue
|
221 |
|
222 |
# If we have enough articles, break the source loop
|
223 |
+
if len(all_articles) >= min_articles:
|
224 |
break
|
225 |
|
226 |
retries -= 1
|
227 |
+
if len(all_articles) < min_articles and retries > 0:
|
228 |
print(f"\nFound only {len(all_articles)} articles, retrying...")
|
229 |
+
# Lower the minimum requirement if we're close
|
230 |
+
if len(all_articles) >= 15: # If we have at least 15 articles
|
231 |
+
min_articles = len(all_articles)
|
232 |
+
print(f"Adjusting minimum requirement to {min_articles} articles")
|
233 |
|
234 |
# Remove duplicates
|
235 |
unique_articles = self._remove_duplicates(all_articles)
|
|
|
237 |
|
238 |
if len(unique_articles) < MIN_ARTICLES:
|
239 |
print(f"Warning: Could only find {len(unique_articles)} unique articles, fewer than minimum {MIN_ARTICLES}")
|
240 |
+
print("Proceeding with available articles...")
|
241 |
|
242 |
# Balance articles across sources
|
243 |
balanced_articles = self._balance_sources(unique_articles)
|
244 |
+
return balanced_articles[:max(len(unique_articles), MAX_ARTICLES)]
|
245 |
|
246 |
def _balance_sources(self, articles: List[Dict[str, str]]) -> List[Dict[str, str]]:
|
247 |
"""Balance articles across sources while maintaining minimum count."""
|