Update app.py
app.py
CHANGED
@@ -263,13 +263,18 @@ def summarize_news_content(content, model):
     full_response = generate_chunked_response(model, formatted_prompt, max_tokens=200)
 
     # Extract only the summary part
-    summary_parts = full_response.split("
+    summary_parts = full_response.split("Summary:")
     if len(summary_parts) > 1:
         summary = summary_parts[-1].strip()
     else:
         summary = full_response.strip()
 
-
+    # Create a cleaned version of the summary
+    lines = summary.split('\n')
+    cleaned_lines = [line for line in lines if not line.strip().startswith(("Human:", "Assistant:", "Summary:"))]
+    cleaned_summary = ' '.join(cleaned_lines).strip()
+
+    return summary, cleaned_summary
 
 def process_google_news_rss(query, temperature, top_p, repetition_penalty):
     model = get_model(temperature, top_p, repetition_penalty)
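The post-processing added here is easy to exercise in isolation. Below is a minimal sketch of just the new extraction and cleaning logic, with the model call replaced by a hard-coded response, since generate_chunked_response is not part of this diff; the sample text is made up.

# Sketch of the summary post-processing introduced above; the model output
# is a canned string because generate_chunked_response is not shown here.
def extract_and_clean_summary(full_response):
    # Keep only what follows the last "Summary:" marker, if any
    summary_parts = full_response.split("Summary:")
    if len(summary_parts) > 1:
        summary = summary_parts[-1].strip()
    else:
        summary = full_response.strip()

    # Drop chat-template lines that leaked into the generation
    lines = summary.split('\n')
    cleaned_lines = [line for line in lines
                     if not line.strip().startswith(("Human:", "Assistant:", "Summary:"))]
    cleaned_summary = ' '.join(cleaned_lines).strip()
    return summary, cleaned_summary

raw = "Human: summarize\nAssistant: Summary: Markets closed higher on Tuesday."
print(extract_and_clean_summary(raw)[1])  # Markets closed higher on Tuesday.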
@@ -285,22 +290,29 @@ def process_google_news_rss(query, temperature, top_p, repetition_penalty):
         try:
             # Remove HTML tags from content
             clean_content = BeautifulSoup(article["content"], "html.parser").get_text()
-
+
+            # If content is very short, use the title as content
+            if len(clean_content) < 50:
+                clean_content = article["title"]
+
+            full_summary, cleaned_summary = summarize_news_content(clean_content, model)
             processed_article = {
                 "published_date": article["published_date"],
                 "title": article["title"],
                 "url": article["url"],
                 "content": clean_content,
-                "summary":
+                "summary": full_summary,
+                "cleaned_summary": cleaned_summary
             }
             processed_articles.append(processed_article)
         except Exception as e:
-            print(f"Error processing article: {str(e)}")
+            print(f"Error processing article: {str(e)}")
+
     if not processed_articles:
         return "Failed to process any news articles. Please try a different query or check the summarization process."
 
     # Add processed articles to the database
-    docs = [Document(page_content=article["
+    docs = [Document(page_content=article["cleaned_summary"], metadata={
         "source": article["url"],
         "title": article["title"],
         "published_date": article["published_date"]
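The per-article flow above reads as: strip HTML, fall back to the title for near-empty bodies, summarize, then index the cleaned summary. A standalone sketch of that flow follows, with the summarizer stubbed out and the LangChain Document import path assumed (the diff only shows the class name).

from bs4 import BeautifulSoup
from langchain.docstore.document import Document  # assumed import path

def summarize_stub(content, model=None):
    # Stand-in for summarize_news_content from the hunk above
    return content[:200], content[:200]

def article_to_document(article, model=None):
    clean_content = BeautifulSoup(article["content"], "html.parser").get_text()
    # Fall back to the title when the feed body is nearly empty
    if len(clean_content) < 50:
        clean_content = article["title"]
    full_summary, cleaned_summary = summarize_stub(clean_content, model)
    # Index the cleaned summary, keeping provenance in metadata
    return Document(page_content=cleaned_summary, metadata={
        "source": article["url"],
        "title": article["title"],
        "published_date": article["published_date"],
    })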
@@ -327,6 +339,10 @@ def export_news_to_excel():
     global news_database
     df = pd.DataFrame(news_database)
 
+    # Use the cleaned summary for the Excel export
+    df['summary'] = df['cleaned_summary']
+    df = df.drop(columns=['cleaned_summary'])  # Remove the extra column
+
     with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
         excel_path = tmp.name
         df.to_excel(excel_path, index=False)
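The export step is small enough to test with a toy news_database. The sketch below uses made-up rows and assumes an Excel engine such as openpyxl is installed for to_excel.

import pandas as pd
from tempfile import NamedTemporaryFile

# Toy rows standing in for the real news_database
news_database = [{
    "published_date": "2024-01-01",
    "title": "Example headline",
    "url": "https://example.com/a",
    "content": "Full article text...",
    "summary": "Summary: Markets closed higher.",
    "cleaned_summary": "Markets closed higher.",
}]

df = pd.DataFrame(news_database)
# Ship the cleaned summary in the 'summary' column, then drop the helper column
df['summary'] = df['cleaned_summary']
df = df.drop(columns=['cleaned_summary'])

with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
    excel_path = tmp.name
    df.to_excel(excel_path, index=False)  # needs openpyxl (or another Excel writer)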