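"""News Summarizer Space.

Searches the web for a user query, scrapes the headline and paragraph text of
each result, and summarizes it with the sshleifer/distilbart-cnn-12-6 model,
serving the results through a Gradio interface.
"""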
import gradio as gr
from transformers import pipeline
from bs4 import BeautifulSoup
import requests
from googlesearch import search
import logging
# Configure logging
logging.basicConfig(level=logging.DEBUG)
def summarize_news(query, num_results=3):
    logging.debug(f"Query received: {query}")
    logging.debug(f"Number of results requested: {num_results}")

    # Initialize summarization pipeline with a specific model
    logging.debug("Initializing summarization pipeline...")
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

    # Search for news articles
    logging.debug("Searching for news articles...")
    search_results = list(search(query, num_results=num_results))
    articles = []
    logging.debug(f"Search results: {search_results}")

    for url in search_results:
        try:
            logging.debug(f"Fetching content from URL: {url}")
            # Fetch the content of the news article
            r = requests.get(url, timeout=10)
            soup = BeautifulSoup(r.text, 'html.parser')
            results = soup.find_all(['h1', 'p'])
            text = [result.text for result in results]
            ARTICLE = ' '.join(text)

            # Chunk the article text into pieces of at most max_chunk words,
            # splitting on sentence boundaries
            logging.debug("Chunking the article text...")
            max_chunk = 500
            ARTICLE = ARTICLE.replace('.', '.<eos>')
            ARTICLE = ARTICLE.replace('?', '?<eos>')
            ARTICLE = ARTICLE.replace('!', '!<eos>')

            sentences = ARTICLE.split('<eos>')
            current_chunk = 0
            chunks = []
            for sentence in sentences:
                if len(chunks) == current_chunk + 1:
                    if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                        chunks[current_chunk].extend(sentence.split(' '))
                    else:
                        current_chunk += 1
                        chunks.append(sentence.split(' '))
                else:
                    chunks.append(sentence.split(' '))

            for chunk_id in range(len(chunks)):
                chunks[chunk_id] = ' '.join(chunks[chunk_id])
            logging.debug(f"Chunks created: {chunks}")

            # Summarize the chunks; truncation guards against chunks that
            # exceed the model's input length
            logging.debug("Summarizing the chunks...")
            summaries = summarizer(chunks, max_length=120, min_length=30, do_sample=False, truncation=True)
            summary_text = " ".join([summary['summary_text'] for summary in summaries])
            articles.append((url, summary_text))
            logging.debug(f"Summary for URL {url}: {summary_text}")
        except Exception as e:
            logging.error(f"Error processing URL {url}: {e}")
            continue

    logging.debug(f"Final summarized articles: {articles}")
    return articles
def format_output(articles):
    formatted_text = ""
    for url, summary in articles:
        formatted_text += f"URL: {url}\nSummary: {summary}\n\n"
    return formatted_text
iface = gr.Interface(
    fn=lambda query, num_results: format_output(summarize_news(query, int(num_results))),
    inputs=[
        gr.Textbox(label="Query"),
        gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Results"),
    ],
    outputs="text",
    title="News Summarizer",
    description="Enter a query to get summarized versions of the top news articles.",
)
if __name__ == "__main__":
    logging.debug("Launching Gradio interface...")
    iface.launch()
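# Running locally (a sketch; assumes this file is saved as app.py and that the
# `search` import above comes from the googlesearch-python package):
#   pip install gradio transformers torch beautifulsoup4 requests googlesearch-python
#   python app.py
# Gradio then serves the interface at http://127.0.0.1:7860 by default.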