Spaces:

Shreyas94
/

World_News

Sleeping

App Files Files Community

Shreyas94 committed on Jun 15, 2024

Commit

f3a27fd

verified ·

1 Parent(s): d530acf

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -69

app.py CHANGED Viewed

@@ -1,90 +1,82 @@
-import gradio as gr
-from transformers import pipeline
 from bs4 import BeautifulSoup
 import requests
-from googlesearch import search
-import logging
 # Configure logging: the root logger emits DEBUG-level messages and above
 logging.basicConfig(level=logging.DEBUG)
 def summarize_news(query, num_results=3):
-    """Search the web for *query* and return summaries of the top results.
-
-    Each URL yielded by ``search`` is downloaded, the text of its ``<h1>`` and
-    ``<p>`` elements is split into chunks, every chunk is summarized, and the
-    chunk summaries are summarized once more into one summary per article.
-    A URL whose processing raises is logged and skipped.
-
-    Args:
-        query: Search string passed to ``search``.
-        num_results: Number of results requested from ``search``.
-
-    Returns:
-        The string built by ``format_output`` from the (url, summary) pairs.
-    """
     logging.debug(f"Query received: {query}")
     logging.debug(f"Number of results requested: {num_results}")
-    # Initialize summarization pipeline with a specific model
-    # NOTE: the pipeline is constructed on every call to this function.
-    logging.debug("Initializing summarization pipeline...")
-    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
     # Search for news articles
     logging.debug("Searching for news articles...")
-    search_results = search(query, num_results=num_results)
-    articles = []
-    logging.debug(f"Search results: {search_results}")
-    for url in search_results:
-        try:
             logging.debug(f"Fetching content from URL: {url}")
-            # Fetch the content of the news article
-            r = requests.get(url)
-            soup = BeautifulSoup(r.text, 'html.parser')
-            results = soup.find_all(['h1', 'p'])
-            text = [result.text for result in results]
-            ARTICLE = ' '.join(text)
-            # Chunk the article text
-            logging.debug("Chunking the article text...")
-            # max_chunk is compared against the character length of the joined chunk
-            max_chunk = 350
-            # Tag every sentence terminator with <eos> so the text can be split on it
-            ARTICLE = ARTICLE.replace('.', '.<eos>')
-            ARTICLE = ARTICLE.replace('?', '?<eos>')
-            ARTICLE = ARTICLE.replace('!', '!<eos>')
-            sentences = ARTICLE.split('<eos>')
-            chunks = []
-            current_chunk = []
-            for sentence in sentences:
-                # Keep adding words while the joined chunk stays within max_chunk characters
-                if len(' '.join(current_chunk + sentence.split())) <= max_chunk:
-                    current_chunk.extend(sentence.split())
-                else:
-                    # Chunk is full: store it and start a new one with this sentence
-                    chunks.append(' '.join(current_chunk))
-                    current_chunk = sentence.split()
-            # Store the last, still-open chunk
-            chunks.append(' '.join(current_chunk))
-            logging.debug(f"Chunks created: {chunks}")
-            # Summarize the chunks
-            logging.debug("Summarizing the chunks...")
-            summaries = []
-            for chunk in chunks:
-                summaries.append(summarizer(chunk, max_length=150, min_length=30, do_sample=False)[0]['summary_text'])
-            # Concatenate summaries and summarize again for cohesion
-            combined_summary = ' '.join(summaries)
-            final_summary = summarizer(combined_summary, max_length=300, min_length=80, do_sample=False)[0]['summary_text']
-            articles.append((url, final_summary))
-            logging.debug(f"Final summary for URL {url}: {final_summary}")
-        except Exception as e:
-            # Best effort: a failing URL is logged and the loop moves on to the next one
-            logging.error(f"Error processing URL {url}: {e}")
-            continue
-    logging.debug(f"Final summarized articles: {articles}")
-    return format_output(articles)
-def format_output(articles):
-    """Render (url, summary) pairs as plain text.
-
-    Args:
-        articles: Iterable of ``(url, summary)`` pairs.
-
-    Returns:
-        One "URL: ...\\nSummary: ..." entry per pair, each followed by a blank
-        line; the empty string when *articles* is empty.
-    """
-    formatted_text = ""
-    for url, summary in articles:
-        formatted_text += f"URL: {url}\nSummary: {summary}\n\n"
-    return formatted_text
 # Gradio UI wrapping summarize_news; the output is shown in a textbox
 iface = gr.Interface(
     fn=summarize_news,
-    # Component shortcuts, handed to fn in order: textbox -> query, slider -> num_results
-    inputs=["textbox", "slider"],
     outputs="textbox",
     title="News Summarizer",
-    description="Enter a query to get summarized versions of the top news articles."
 )
 if __name__ == "__main__":

+import logging
 from bs4 import BeautifulSoup
 import requests
+import nltk
+from transformers import pipeline
+import gradio as gr
 # Configure logging
 logging.basicConfig(level=logging.DEBUG)
+# Initialize the summarization pipeline once at import time so every request
+# reuses it. The checkpoint is pinned explicitly (it is the model the
+# summarization pipeline falls back to when none is given) so results stay
+# reproducible across transformers releases and the "no model was supplied"
+# warning is avoided.
+summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
+# Fetch the data used by nltk.sent_tokenize. Newer NLTK releases look up
+# "punkt_tab" instead of "punkt", so request both; by default nltk.download
+# reports an unknown resource id and returns False rather than raising.
+nltk.download('punkt')
+nltk.download('punkt_tab')
+# Function to fetch content from a given URL
+def fetch_article_content(url, timeout=10):
+    """Download *url* and return the text of its ``<h1>`` and ``<p>`` elements.
+
+    Args:
+        url: Address of the article page.
+        timeout: Seconds to wait for the server before giving up.
+
+    Returns:
+        The extracted text joined with single spaces, or ``""`` when the page
+        cannot be fetched or parsed (the error is logged, never raised).
+    """
+    try:
+        # Without a timeout requests can block forever on an unresponsive host.
+        r = requests.get(url, timeout=timeout)
+        # Treat 4xx/5xx responses as failures so error pages are not summarized.
+        r.raise_for_status()
+        soup = BeautifulSoup(r.text, 'html.parser')
+        results = soup.find_all(['h1', 'p'])
+        return ' '.join(result.text for result in results)
+    except Exception as e:
+        # Deliberately broad: one bad article must not abort the whole query.
+        logging.error(f"Error fetching content from {url}: {e}")
+        return ""
+# Function to summarize news articles based on a query
 def summarize_news(query, num_results=3):
+    """Fetch news articles matching *query* and return one consolidated summary.
+
+    Args:
+        query: Search string sent to the news API.
+        num_results: Number of articles to request; floats (as delivered by a
+            Gradio slider) are truncated to an integer, with a minimum of 1.
+
+    Returns:
+        The chunk summaries joined with spaces, or a short message when no
+        article text could be retrieved or summarization failed.
+    """
     logging.debug(f"Query received: {query}")
     logging.debug(f"Number of results requested: {num_results}")
+    # The API page size must be a whole number; sliders hand over floats.
+    page_size = max(1, int(num_results))
     # Search for news articles
     logging.debug("Searching for news articles...")
+    contents = []
+    try:
+        # NOTE(review): `newsapi` is neither defined nor imported in the
+        # visible part of this file. Presumably it should be a configured
+        # NewsAPI client created at module level - confirm. Until then the
+        # NameError is caught below and no article is ever fetched.
+        news_results = newsapi.get_everything(q=query, language='en', page_size=page_size)
+        logging.debug(f"Search results: {news_results}")
+        for article in news_results['articles']:
+            url = article['url']
             logging.debug(f"Fetching content from URL: {url}")
+            content = fetch_article_content(url)
+            if content:
+                contents.append(content)
+    except Exception as e:
+        logging.error(f"Error fetching news articles: {e}")
+    aggregated_content = " ".join(contents)
+    if not aggregated_content.strip():
+        # Nothing to summarize: say so instead of returning a blank textbox.
+        return "No article content could be retrieved for this query."
+    # Summarize the aggregated content
+    try:
+        sentences = nltk.sent_tokenize(aggregated_content)
+        # Group whole sentences into chunks of bounded size. Summarizing one
+        # sentence at a time forces at least min_length tokens of output per
+        # sentence, which pads rather than condenses the text.
+        max_chunk_chars = 1500
+        chunks = []
+        current = []
+        current_len = 0
+        for sentence in sentences:
+            if current and current_len + len(sentence) + 1 > max_chunk_chars:
+                chunks.append(" ".join(current))
+                current = []
+                current_len = 0
+            current.append(sentence)
+            current_len += len(sentence) + 1
+        if current:
+            chunks.append(" ".join(current))
+        # truncation=True guards against a single over-long sentence exceeding
+        # the model's maximum input length.
+        summaries = [
+            summarizer(chunk, max_length=120, min_length=30, do_sample=False, truncation=True)[0]['summary_text']
+            for chunk in chunks
+        ]
+        # Join all summaries to form final output
+        final_summary = " ".join(summaries)
+        logging.debug(f"Final summarized text: {final_summary}")
+        return final_summary
+    except Exception as e:
+        logging.error(f"Error during summarization: {e}")
+        return "An error occurred during summarization."
+# Setting up Gradio interface
 iface = gr.Interface(
     fn=summarize_news,
+    # Slider takes its initial position through `value`; `default` is the
+    # pre-3.0 keyword. step=1 keeps the selected number of results whole.
+    inputs=[gr.Textbox(label="Query"), gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Results")],
     outputs="textbox",
     title="News Summarizer",
+    description="Enter a query to get a consolidated summary of the top news articles."
 )
 if __name__ == "__main__":