import logging
from bs4 import BeautifulSoup
import requests
import nltk
from transformers import pipeline
import gradio as gr
from newsapi import NewsApiClient
import asyncio
# Configure logging
logging.basicConfig(level=logging.DEBUG)
# Initialize the summarization pipeline from Hugging Face Transformers
summarizer = pipeline("summarization")
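# Note: with no model name given, the pipeline falls back to a default summarization
# checkpoint (currently a DistilBART CNN model) whose input is limited to roughly
# 1024 tokens; pass an explicit model name if you need to pin this behaviour.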
# Initialize the NLTK sentence tokenizer
nltk.download('punkt')
# Initialize the News API client with your API key
newsapi = NewsApiClient(api_key='5ab7bb1aaceb41b8993db03477098aad')
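# Note: hard-coding an API key is fine for a quick demo, but for anything shared it is
# safer to read it from an environment variable, e.g.
# NewsApiClient(api_key=os.environ["NEWSAPI_KEY"]) together with `import os`
# (the variable name here is just an example).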
# Function to fetch content from a given URL
def fetch_article_content(url):
    try:
        # Timeout added so a slow or unresponsive site cannot hang the request indefinitely
        r = requests.get(url, timeout=10)
        soup = BeautifulSoup(r.text, 'html.parser')
        # Collect headline and paragraph text from the page
        results = soup.find_all(['h1', 'p'])
        text = [result.text for result in results]
        return ' '.join(text)
    except Exception as e:
        logging.error(f"Error fetching content from {url}: {e}")
        return ""
# Function to summarize news articles based on a query
async def summarize_news(query, num_results=3):
logging.debug(f"Query received: {query}")
logging.debug(f"Number of results requested: {num_results}")
# Search for news articles
logging.debug("Searching for news articles...")
articles = []
aggregated_content = ""
try:
news_results = newsapi.get_everything(q=query, language='en', page_size=num_results)
logging.debug(f"Search results: {news_results}")
for article in news_results['articles']:
url = article['url']
logging.debug(f"Fetching content from URL: {url}")
content = fetch_article_content(url)
aggregated_content += content + " "
except Exception as e:
logging.error(f"Error fetching news articles: {e}")
# Summarize the aggregated content
try:
# Chunk the aggregated content into chunks
sentences = nltk.sent_tokenize(aggregated_content)
chunk_size = 500 # Adjust chunk size as needed
chunks = [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size)]
# Summarize each chunk separately
summaries = []
for chunk in chunks:
chunk_text = ' '.join(chunk)
summary = summarizer(chunk_text, max_length=120, min_length=30, do_sample=False)
summaries.append(summary[0]['summary_text'])
# Combine all summaries
final_summary = ' '.join(summaries)
logging.debug(f"Final summarized text: {final_summary}")
return final_summary
except Exception as e:
logging.error(f"Error during summarization: {e}")
return "An error occurred during summarization."
# Setting up Gradio interface
iface = gr.Interface(
    fn=summarize_news,
    inputs=[
        gr.Textbox(label="Query"),
        gr.Slider(minimum=1, maximum=10, step=1, value=3, label="Number of Results")
    ],
    outputs="textbox",
    title="News Summarizer",
    description="Enter a query to get a consolidated summary of the top news articles."
)
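# Note: recent Gradio versions can call an async function like summarize_news
# directly, so no manual asyncio event-loop handling is needed here.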
if __name__ == "__main__":
    iface.launch()
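# Rough dependency list for this script (inferred from the imports above):
# gradio, transformers (with torch), nltk, beautifulsoup4, requests, newsapi-python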