Shreyas094 committed (verified)
Commit a89fe32 · Parent(s): f630f04

Update app.py

Files changed (1):
  1. app.py +22 -6
app.py CHANGED
@@ -263,13 +263,18 @@ def summarize_news_content(content, model):
     full_response = generate_chunked_response(model, formatted_prompt, max_tokens=200)
 
     # Extract only the summary part
-    summary_parts = full_response.split("Assistant:")
+    summary_parts = full_response.split("Summary:")
     if len(summary_parts) > 1:
         summary = summary_parts[-1].strip()
     else:
         summary = full_response.strip()
 
-    return summary
+    # Create a cleaned version of the summary
+    lines = summary.split('\n')
+    cleaned_lines = [line for line in lines if not line.strip().startswith(("Human:", "Assistant:", "Summary:"))]
+    cleaned_summary = ' '.join(cleaned_lines).strip()
+
+    return summary, cleaned_summary
 
 def process_google_news_rss(query, temperature, top_p, repetition_penalty):
     model = get_model(temperature, top_p, repetition_penalty)
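
For reference, the post-processing this hunk adds can be exercised standalone. A minimal sketch; the function name clean_model_summary and the sample string are hypothetical stand-ins for the inline logic above:

# Minimal standalone sketch of the post-processing this hunk adds to
# summarize_news_content. The function name and sample text are
# hypothetical; in app.py the logic runs inline on the model output.
def clean_model_summary(full_response):
    # Keep only the text after the last "Summary:" marker, if any
    summary_parts = full_response.split("Summary:")
    if len(summary_parts) > 1:
        summary = summary_parts[-1].strip()
    else:
        summary = full_response.strip()

    # Drop leftover prompt/role lines, then rejoin into one string
    lines = summary.split('\n')
    cleaned_lines = [line for line in lines
                     if not line.strip().startswith(("Human:", "Assistant:", "Summary:"))]
    cleaned_summary = ' '.join(cleaned_lines).strip()
    return summary, cleaned_summary

raw = "Summary:\nMarkets rallied on Friday.\nHuman: thanks"
full, cleaned = clean_model_summary(raw)
print(cleaned)  # Markets rallied on Friday.

Returning both values lets the app keep the raw model output for display while indexing and exporting only the cleaned text.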
@@ -285,22 +290,29 @@ def process_google_news_rss(query, temperature, top_p, repetition_penalty):
         try:
             # Remove HTML tags from content
             clean_content = BeautifulSoup(article["content"], "html.parser").get_text()
-            summary = summarize_news_content(clean_content, model)
+
+            # If content is very short, use the title as content
+            if len(clean_content) < 50:
+                clean_content = article["title"]
+
+            full_summary, cleaned_summary = summarize_news_content(clean_content, model)
             processed_article = {
                 "published_date": article["published_date"],
                 "title": article["title"],
                 "url": article["url"],
                 "content": clean_content,
-                "summary": summary
+                "summary": full_summary,
+                "cleaned_summary": cleaned_summary
             }
             processed_articles.append(processed_article)
         except Exception as e:
-            print(f"Error processing article: {str(e)}")
+            print(f"Error processing article: {str(e)}")
+
     if not processed_articles:
         return "Failed to process any news articles. Please try a different query or check the summarization process."
 
     # Add processed articles to the database
-    docs = [Document(page_content=article["summary"], metadata={
+    docs = [Document(page_content=article["cleaned_summary"], metadata={
         "source": article["url"],
         "title": article["title"],
         "published_date": article["published_date"]
@@ -327,6 +339,10 @@ def export_news_to_excel():
     global news_database
     df = pd.DataFrame(news_database)
 
+    # Use the cleaned summary for the Excel export
+    df['summary'] = df['cleaned_summary']
+    df = df.drop(columns=['cleaned_summary'])  # Remove the extra column
+
     with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
         excel_path = tmp.name
         df.to_excel(excel_path, index=False)
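
The export change amounts to a column swap before writing. A minimal pandas sketch with made-up rows; the openpyxl engine is assumed for .xlsx output:

# Sketch of the Excel-export change above: overwrite the raw summary
# with the cleaned one and drop the extra column before writing.
import pandas as pd
from tempfile import NamedTemporaryFile

df = pd.DataFrame([{
    "title": "Fed holds rates steady",
    "summary": "Summary:\nFed holds rates steady at its June meeting.",
    "cleaned_summary": "Fed holds rates steady at its June meeting.",
}])

df['summary'] = df['cleaned_summary']
df = df.drop(columns=['cleaned_summary'])  # Remove the extra column

with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
    df.to_excel(tmp.name, index=False)  # needs the openpyxl engine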
 