Shreyas94 committed on
Commit
f3a27fd
1 Parent(s): d530acf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -69
app.py CHANGED
@@ -1,90 +1,82 @@
1
- import gradio as gr
2
- from transformers import pipeline
3
  from bs4 import BeautifulSoup
4
  import requests
5
- from googlesearch import search
6
- import logging
 
7
 
8
  # Configure logging
9
  logging.basicConfig(level=logging.DEBUG)
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
_SUMMARIZER = None  # created on first use and then shared by every request


def _get_summarizer():
    """Return the shared summarization pipeline, building it on first use."""
    global _SUMMARIZER
    if _SUMMARIZER is None:
        logging.debug("Initializing summarization pipeline...")
        _SUMMARIZER = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
    return _SUMMARIZER


def _chunk_text(text, max_chunk=350):
    """Split text at sentence ends and pack the sentences into chunks.

    A chunk grows until adding the next sentence would push its joined
    length past ``max_chunk`` characters. A single sentence longer than
    ``max_chunk`` becomes a chunk of its own. Empty chunks are never emitted.
    """
    for mark in ('.', '?', '!'):
        text = text.replace(mark, mark + '<eos>')

    chunks = []
    current_words = []
    for sentence in text.split('<eos>'):
        words = sentence.split()
        if len(' '.join(current_words + words)) <= max_chunk:
            current_words.extend(words)
        else:
            if current_words:
                chunks.append(' '.join(current_words))
            current_words = words
    if current_words:
        chunks.append(' '.join(current_words))
    return chunks


def summarize_news(query, num_results=3):
    """Search for articles matching ``query`` and summarize each one.

    Args:
        query: Search text passed to the web search.
        num_results: How many search results to request. Converted to
            ``int`` because a UI slider may deliver a float.

    Returns:
        The text produced by ``format_output`` for the ``(url, summary)``
        pairs of every article that could be fetched and summarized.
        Articles that fail are logged and skipped.
    """
    logging.debug(f"Query received: {query}")
    logging.debug(f"Number of results requested: {num_results}")

    # Reuse one pipeline instead of reloading the model on every request
    summarizer = _get_summarizer()

    # Search for news articles
    logging.debug("Searching for news articles...")
    # Materialize the results so the log line shows the URLs themselves
    search_results = list(search(query, num_results=int(num_results)))
    articles = []

    logging.debug(f"Search results: {search_results}")

    for url in search_results:
        try:
            logging.debug(f"Fetching content from URL: {url}")
            # Fetch the content of the news article; the timeout keeps one
            # unresponsive site from stalling the whole request
            r = requests.get(url, timeout=10)
            soup = BeautifulSoup(r.text, 'html.parser')
            article_text = ' '.join(node.text for node in soup.find_all(['h1', 'p']))

            # Chunk the article text
            logging.debug("Chunking the article text...")
            chunks = _chunk_text(article_text)
            logging.debug(f"Chunks created: {chunks}")
            if not chunks:
                # Nothing readable on the page, so there is nothing to summarize
                continue

            # Summarize the chunks
            logging.debug("Summarizing the chunks...")
            summaries = [
                summarizer(chunk, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
                for chunk in chunks
            ]

            # Concatenate summaries and summarize again for cohesion
            combined_summary = ' '.join(summaries)
            final_summary = summarizer(combined_summary, max_length=300, min_length=80, do_sample=False)[0]['summary_text']
            articles.append((url, final_summary))

            logging.debug(f"Final summary for URL {url}: {final_summary}")
        except Exception as e:
            # Best effort: one bad article must not abort the others
            logging.error(f"Error processing URL {url}: {e}")
            continue

    logging.debug(f"Final summarized articles: {articles}")
    return format_output(articles)
75
-
76
def format_output(articles):
    """Render summarized articles as display text.

    Args:
        articles: Iterable of ``(url, summary)`` pairs.

    Returns:
        One "URL: ..." line and one "Summary: ..." line per article, each
        entry followed by a blank line. Empty string when there are no
        articles.
    """
    return "".join(
        f"URL: {url}\nSummary: {summary}\n\n" for url, summary in articles
    )
81
-
# Gradio UI: a free-text query plus a bounded, whole-number result count.
# NOTE(review): the bare "slider" shortcut uses the component's own default
# range and start value, not summarize_news's num_results=3 — confirm against
# the installed Gradio version.
iface = gr.Interface(
    fn=summarize_news,
    inputs=[
        "textbox",
        gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Results"),
    ],
    outputs="textbox",
    title="News Summarizer",
    description="Enter a query to get summarized versions of the top news articles."
)
89
 
90
  if __name__ == "__main__":
 
1
+ import logging
 
2
  from bs4 import BeautifulSoup
3
  import requests
4
+ import nltk
5
+ from transformers import pipeline
6
+ import gradio as gr
7
 
8
  # Configure logging
9
  logging.basicConfig(level=logging.DEBUG)
10
 
11
# Initialize the summarization pipeline from Hugging Face Transformers.
# The model is named explicitly so the app does not depend on whatever the
# library's implicit default happens to be.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

# Initialize the NLTK sentence tokenizer data.
# NOTE(review): newer NLTK releases load "punkt_tab" instead of "punkt" for
# sent_tokenize — both are requested here; confirm against the pinned version.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource, quiet=True)
16
+
17
# Function to fetch content from a given URL
def fetch_article_content(url, timeout=10):
    """Download a page and return the text of its headline and paragraphs.

    Args:
        url: Address of the article to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of all ``<h1>`` and ``<p>`` elements joined by single
        spaces, or an empty string if the page could not be fetched or
        parsed (the error is logged).
    """
    try:
        # Without a timeout a single unresponsive site blocks the request forever
        r = requests.get(url, timeout=timeout)
        # Treat HTTP errors as failures rather than summarizing an error page
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        return ' '.join(node.text for node in soup.find_all(['h1', 'p']))
    except Exception as e:
        # Best effort: callers treat "" as "no content for this URL"
        logging.error(f"Error fetching content from {url}: {e}")
        return ""
28
+
29
# Function to summarize news articles based on a query
def summarize_news(query, num_results=3):
    """Fetch news articles for ``query`` and return one combined summary.

    Args:
        query: Search text for the news lookup.
        num_results: Number of articles to request. Converted to a whole
            number of at least 1 because a UI slider may deliver a float.

    Returns:
        The per-sentence summaries of all fetched article text joined by
        spaces; a short notice when no article text could be retrieved; or
        "An error occurred during summarization." if summarizing fails.
    """
    logging.debug(f"Query received: {query}")
    logging.debug(f"Number of results requested: {num_results}")

    num_results = max(1, int(num_results))

    # Search for news articles
    logging.debug("Searching for news articles...")

    contents = []
    try:
        # NOTE(review): `newsapi` is not imported or created anywhere in the
        # visible part of this file; unless it is defined elsewhere, this call
        # raises NameError and the handler below logs it.
        news_results = newsapi.get_everything(q=query, language='en', page_size=num_results)
        logging.debug(f"Search results: {news_results}")

        for article in news_results['articles']:
            url = article['url']
            logging.debug(f"Fetching content from URL: {url}")
            content = fetch_article_content(url)
            if content:
                contents.append(content)
    except Exception as e:
        # Keep whatever was fetched before the failure
        logging.error(f"Error fetching news articles: {e}")

    aggregated_content = " ".join(contents)
    if not aggregated_content.strip():
        # Say so explicitly instead of handing the UI an empty box
        return "No article content could be retrieved for this query."

    # Summarize the aggregated content
    try:
        sentences = nltk.sent_tokenize(aggregated_content)

        # Summarize each sentence individually if it's meaningful.
        # truncation=True keeps an over-long "sentence" (e.g. text without
        # punctuation) from exceeding the model's input limit.
        summaries = [
            summarizer(
                sentence,
                max_length=120,
                min_length=30,
                do_sample=False,
                truncation=True,
            )[0]['summary_text']
            for sentence in sentences
            if len(sentence) > 10  # Adjust minimum length as needed
        ]

        # Join all summaries to form final output
        final_summary = " ".join(summaries)

        logging.debug(f"Final summarized text: {final_summary}")
        return final_summary

    except Exception as e:
        logging.error(f"Error during summarization: {e}")
        return "An error occurred during summarization."
72
+
73
# Setting up Gradio interface
iface = gr.Interface(
    fn=summarize_news,
    inputs=[
        gr.Textbox(label="Query"),
        # `value` is the parameter that sets the slider's starting position;
        # step=1 keeps the result count a whole number.
        gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Results"),
    ],
    outputs="textbox",
    title="News Summarizer",
    description="Enter a query to get a consolidated summary of the top news articles."
)
81
 
82
  if __name__ == "__main__":