Shreyas94 committed on
Commit
d530acf
·
verified ·
1 Parent(s): fe87932

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -70
app.py CHANGED
@@ -2,26 +2,12 @@ import gradio as gr
2
  from transformers import pipeline
3
  from bs4 import BeautifulSoup
4
  import requests
 
5
  import logging
6
- from newsapi import NewsApiClient
7
 
8
  # Configure logging
9
  logging.basicConfig(level=logging.DEBUG)
10
 
11
- # Initialize the News API client
12
- newsapi = NewsApiClient(api_key='5ab7bb1aaceb41b8993db03477098aad')
13
-
14
def fetch_article_content(url):
    """Fetch *url* and return its visible headline/paragraph text.

    Downloads the page, pulls every ``<h1>`` and ``<p>`` element, and
    joins their text with single spaces. Any failure (network error,
    parse error) is logged and an empty string is returned so callers
    can aggregate best-effort.
    """
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        elements = soup.find_all(['h1', 'p'])
        return ' '.join(element.text for element in elements)
    except Exception as e:
        # Best-effort scraping: swallow the error but leave a trace.
        logging.error(f"Error fetching content from {url}: {e}")
        return ""
24
-
25
  def summarize_news(query, num_results=3):
26
  logging.debug(f"Query received: {query}")
27
  logging.debug(f"Number of results requested: {num_results}")
@@ -32,69 +18,75 @@ def summarize_news(query, num_results=3):
32
 
33
  # Search for news articles
34
  logging.debug("Searching for news articles...")
 
35
  articles = []
36
- aggregated_content = ""
37
- try:
38
- news_results = newsapi.get_everything(q=query, language='en', page_size=num_results)
39
- logging.debug(f"Search results: {news_results}")
40
-
41
- for article in news_results['articles']:
42
- url = article['url']
43
  logging.debug(f"Fetching content from URL: {url}")
44
- content = fetch_article_content(url)
45
- aggregated_content += content + " "
46
- except Exception as e:
47
- logging.error(f"Error fetching news articles: {e}")
48
-
49
- # Chunk the aggregated content
50
- logging.debug("Chunking the aggregated content...")
51
- max_chunk = 500
52
- aggregated_content = aggregated_content.replace('.', '.<eos>')
53
- aggregated_content = aggregated_content.replace('?', '?<eos>')
54
- aggregated_content = aggregated_content.replace('!', '!<eos>')
55
-
56
- sentences = aggregated_content.split('<eos>')
57
- current_chunk = 0
58
- chunks = []
59
- for sentence in sentences:
60
- if len(chunks) == current_chunk + 1:
61
- if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
62
- chunks[current_chunk].extend(sentence.split(' '))
63
- else:
64
- current_chunk += 1
65
- chunks.append(sentence.split(' '))
66
- else:
67
- chunks.append(sentence.split(' '))
68
-
69
- for chunk_id in range(len(chunks)):
70
- chunks[chunk_id] = ' '.join(chunks[chunk_id])
71
-
72
- logging.debug(f"Chunks created: {chunks}")
73
-
74
- # Summarize the chunks
75
- logging.debug("Summarizing the chunks...")
76
- try:
77
- summaries = summarizer(chunks, max_length=120, min_length=30, do_sample=False)
78
- summary_text = " ".join([summary['summary_text'] for summary in summaries])
79
-
80
- # Reprocess the generated summary
81
- logging.debug("Reprocessing the summary for cohesiveness and elaboration...")
82
- final_summary = summarizer(summary_text, max_length=150, min_length=60, do_sample=False)[0]['summary_text']
83
- except Exception as e:
84
- logging.error(f"Error during summarization: {e}")
85
- final_summary = "An error occurred during summarization."
86
-
87
- logging.debug(f"Final summarized text: {final_summary}")
88
- return final_summary
 
 
 
 
 
 
 
89
 
90
  iface = gr.Interface(
91
  fn=summarize_news,
92
- inputs=[gr.Textbox(label="Query"), gr.Slider(minimum=1, maximum=10, value=3, label="Number of Results")],
93
  outputs="textbox",
94
  title="News Summarizer",
95
- description="Enter a query to get a consolidated summary of the top news articles."
96
  )
97
 
98
  if __name__ == "__main__":
99
  logging.debug("Launching Gradio interface...")
100
- iface.launch()
 
2
  from transformers import pipeline
3
  from bs4 import BeautifulSoup
4
  import requests
5
+ from googlesearch import search
6
  import logging
 
7
 
8
  # Configure logging
9
  logging.basicConfig(level=logging.DEBUG)
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def summarize_news(query, num_results=3):
12
  logging.debug(f"Query received: {query}")
13
  logging.debug(f"Number of results requested: {num_results}")
 
18
 
19
  # Search for news articles
20
  logging.debug("Searching for news articles...")
21
+ search_results = search(query, num_results=num_results)
22
  articles = []
23
+
24
+ logging.debug(f"Search results: {search_results}")
25
+
26
+ for url in search_results:
27
+ try:
 
 
28
  logging.debug(f"Fetching content from URL: {url}")
29
+ # Fetch the content of the news article
30
+ r = requests.get(url)
31
+ soup = BeautifulSoup(r.text, 'html.parser')
32
+ results = soup.find_all(['h1', 'p'])
33
+ text = [result.text for result in results]
34
+ ARTICLE = ' '.join(text)
35
+
36
+ # Chunk the article text
37
+ logging.debug("Chunking the article text...")
38
+ max_chunk = 350
39
+ ARTICLE = ARTICLE.replace('.', '.<eos>')
40
+ ARTICLE = ARTICLE.replace('?', '?<eos>')
41
+ ARTICLE = ARTICLE.replace('!', '!<eos>')
42
+
43
+ sentences = ARTICLE.split('<eos>')
44
+ chunks = []
45
+ current_chunk = []
46
+
47
+ for sentence in sentences:
48
+ if len(' '.join(current_chunk + sentence.split())) <= max_chunk:
49
+ current_chunk.extend(sentence.split())
50
+ else:
51
+ chunks.append(' '.join(current_chunk))
52
+ current_chunk = sentence.split()
53
+ chunks.append(' '.join(current_chunk))
54
+
55
+ logging.debug(f"Chunks created: {chunks}")
56
+
57
+ # Summarize the chunks
58
+ logging.debug("Summarizing the chunks...")
59
+ summaries = []
60
+ for chunk in chunks:
61
+ summaries.append(summarizer(chunk, max_length=150, min_length=30, do_sample=False)[0]['summary_text'])
62
+
63
+ # Concatenate summaries and summarize again for cohesion
64
+ combined_summary = ' '.join(summaries)
65
+ final_summary = summarizer(combined_summary, max_length=300, min_length=80, do_sample=False)[0]['summary_text']
66
+ articles.append((url, final_summary))
67
+
68
+ logging.debug(f"Final summary for URL {url}: {final_summary}")
69
+ except Exception as e:
70
+ logging.error(f"Error processing URL {url}: {e}")
71
+ continue
72
+
73
+ logging.debug(f"Final summarized articles: {articles}")
74
+ return format_output(articles)
75
+
76
def format_output(articles):
    """Render ``(url, summary)`` pairs as a human-readable text block.

    Each article becomes a "URL: ...\\nSummary: ...\\n\\n" entry; entries
    are concatenated in input order. An empty list yields "".
    """
    entries = [f"URL: {url}\nSummary: {summary}\n\n" for url, summary in articles]
    return ''.join(entries)
81
 
82
# Gradio UI wiring.
# NOTE(review): bare string inputs ["textbox", "slider"] lose the field
# labels and give the slider Gradio's default 0-100 float range, so a
# user could request 0 (or 87.5) articles. Explicit components restore
# the labels and constrain the count to a sensible 1-10 integer range
# with the documented default of 3. fn/outputs/title/description are
# unchanged, so callers and the app's behavior are otherwise identical.
iface = gr.Interface(
    fn=summarize_news,
    inputs=[
        gr.Textbox(label="Query"),
        gr.Slider(minimum=1, maximum=10, step=1, value=3, label="Number of Results"),
    ],
    outputs="textbox",
    title="News Summarizer",
    description="Enter a query to get summarized versions of the top news articles."
)

if __name__ == "__main__":
    # Only launch the web app when run as a script, not on import.
    logging.debug("Launching Gradio interface...")
    iface.launch()