Update app.py
app.py
CHANGED
@@ -263,13 +263,18 @@ def summarize_news_content(content, model):
     full_response = generate_chunked_response(model, formatted_prompt, max_tokens=200)
 
     # Extract only the summary part
-    summary_parts = full_response.split("
+    summary_parts = full_response.split("Summary:")
     if len(summary_parts) > 1:
         summary = summary_parts[-1].strip()
     else:
         summary = full_response.strip()
 
-
+    # Create a cleaned version of the summary
+    lines = summary.split('\n')
+    cleaned_lines = [line for line in lines if not line.strip().startswith(("Human:", "Assistant:", "Summary:"))]
+    cleaned_summary = ' '.join(cleaned_lines).strip()
+
+    return summary, cleaned_summary
 
 def process_google_news_rss(query, temperature, top_p, repetition_penalty):
     model = get_model(temperature, top_p, repetition_penalty)
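The post-processing added here is easy to exercise in isolation. Below is a minimal sketch of just the new extraction and cleaning logic, with the model call replaced by a hard-coded response, since generate_chunked_response is not part of this diff; the sample text is made up.

# Sketch of the summary post-processing introduced above; the model output
# is a canned string because generate_chunked_response is not shown here.
def extract_and_clean_summary(full_response):
    # Keep only what follows the last "Summary:" marker, if any
    summary_parts = full_response.split("Summary:")
    if len(summary_parts) > 1:
        summary = summary_parts[-1].strip()
    else:
        summary = full_response.strip()

    # Drop chat-template lines that leaked into the generation
    lines = summary.split('\n')
    cleaned_lines = [line for line in lines
                     if not line.strip().startswith(("Human:", "Assistant:", "Summary:"))]
    cleaned_summary = ' '.join(cleaned_lines).strip()
    return summary, cleaned_summary

raw = "Human: summarize\nAssistant: Summary: Markets closed higher on Tuesday."
print(extract_and_clean_summary(raw)[1])  # Markets closed higher on Tuesday.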
@@ -285,22 +290,29 @@ def process_google_news_rss(query, temperature, top_p, repetition_penalty):
         try:
             # Remove HTML tags from content
             clean_content = BeautifulSoup(article["content"], "html.parser").get_text()
-
+
+            # If content is very short, use the title as content
+            if len(clean_content) < 50:
+                clean_content = article["title"]
+
+            full_summary, cleaned_summary = summarize_news_content(clean_content, model)
             processed_article = {
                 "published_date": article["published_date"],
                 "title": article["title"],
                 "url": article["url"],
                 "content": clean_content,
-                "summary":
+                "summary": full_summary,
+                "cleaned_summary": cleaned_summary
             }
             processed_articles.append(processed_article)
         except Exception as e:
-            print(f"Error processing article: {str(e)}")
+            print(f"Error processing article: {str(e)}")
+
     if not processed_articles:
         return "Failed to process any news articles. Please try a different query or check the summarization process."
 
     # Add processed articles to the database
-    docs = [Document(page_content=article["
+    docs = [Document(page_content=article["cleaned_summary"], metadata={
         "source": article["url"],
         "title": article["title"],
         "published_date": article["published_date"]
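The per-article flow above reads as: strip HTML, fall back to the title for near-empty bodies, summarize, then index the cleaned summary. A standalone sketch of that flow follows, with the summarizer stubbed out and the LangChain Document import path assumed (the diff only shows the class name).

from bs4 import BeautifulSoup
from langchain.docstore.document import Document  # assumed import path

def summarize_stub(content, model=None):
    # Stand-in for summarize_news_content from the hunk above
    return content[:200], content[:200]

def article_to_document(article, model=None):
    clean_content = BeautifulSoup(article["content"], "html.parser").get_text()
    # Fall back to the title when the feed body is nearly empty
    if len(clean_content) < 50:
        clean_content = article["title"]
    full_summary, cleaned_summary = summarize_stub(clean_content, model)
    # Index the cleaned summary, keeping provenance in metadata
    return Document(page_content=cleaned_summary, metadata={
        "source": article["url"],
        "title": article["title"],
        "published_date": article["published_date"],
    })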
@@ -327,6 +339,10 @@ def export_news_to_excel():
     global news_database
     df = pd.DataFrame(news_database)
 
+    # Use the cleaned summary for the Excel export
+    df['summary'] = df['cleaned_summary']
+    df = df.drop(columns=['cleaned_summary'])  # Remove the extra column
+
     with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
         excel_path = tmp.name
         df.to_excel(excel_path, index=False)
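The export step is small enough to test with a toy news_database. The sketch below uses made-up rows and assumes an Excel engine such as openpyxl is installed for to_excel.

import pandas as pd
from tempfile import NamedTemporaryFile

# Toy rows standing in for the real news_database
news_database = [{
    "published_date": "2024-01-01",
    "title": "Example headline",
    "url": "https://example.com/a",
    "content": "Full article text...",
    "summary": "Summary: Markets closed higher.",
    "cleaned_summary": "Markets closed higher.",
}]

df = pd.DataFrame(news_database)
# Ship the cleaned summary in the 'summary' column, then drop the helper column
df['summary'] = df['cleaned_summary']
df = df.drop(columns=['cleaned_summary'])

with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
    excel_path = tmp.name
    df.to_excel(excel_path, index=False)  # needs openpyxl (or another Excel writer)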