"""Gradio app: search Google for news on a query and summarize the top articles."""

import gradio as gr
import requests
from bs4 import BeautifulSoup
from googlesearch import search
from transformers import pipeline

# Sentinel appended after sentence-ending punctuation so the article can be
# split on sentence boundaries without discarding the punctuation itself.
# (The original code replaced '.' with '.' — a no-op — and then called
# ARTICLE.split(''), which raises ValueError: empty separator.)
_EOS = "<eos>"

# Maximum number of words per chunk handed to the summarization model,
# keeping each chunk inside the model's input-length limit.
_MAX_CHUNK_WORDS = 500


def _extract_text(url):
    """Fetch *url* and return the concatenated text of its <h1> and <p> tags.

    Raises requests.RequestException on network failure or non-2xx status.
    """
    # timeout so one slow site cannot hang the whole request handler;
    # raise_for_status so error pages are not "summarized" as articles.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return " ".join(tag.get_text() for tag in soup.find_all(["h1", "p"]))


def _chunk_text(article, max_chunk=_MAX_CHUNK_WORDS):
    """Split *article* into word chunks of at most *max_chunk* words.

    Splitting happens on sentence boundaries ('.', '?', '!') so no sentence
    is cut in half. Returns a list of strings (possibly empty).
    """
    for punct in (".", "?", "!"):
        article = article.replace(punct, punct + _EOS)
    chunks = []
    current = []  # words accumulated for the chunk being built
    for sentence in article.split(_EOS):
        words = sentence.split()
        # Start a new chunk when adding this sentence would exceed the limit.
        if current and len(current) + len(words) > max_chunk:
            chunks.append(" ".join(current))
            current = []
        current.extend(words)
    if current:
        chunks.append(" ".join(current))
    return chunks


def summarize_news(query):
    """Search for *query* and return [(url, summary_text)] for the top 3 hits.

    Articles that fail to download, parse, or summarize are skipped
    (best-effort behavior, as in the original).
    """
    summarizer = pipeline("summarization")
    articles = []
    for url in search(query, num_results=3):
        try:
            chunks = _chunk_text(_extract_text(url))
            if not chunks:
                continue  # nothing scraped; don't call the model on []
            summaries = summarizer(
                chunks, max_length=120, min_length=30, do_sample=False
            )
            summary_text = " ".join(s["summary_text"] for s in summaries)
            articles.append((url, summary_text))
        except Exception:
            # Best-effort: skip this article and keep going with the rest.
            continue
    return articles


def format_output(articles):
    """Render [(url, summary)] pairs as a plain-text report."""
    return "".join(
        f"URL: {url}\nSummary: {summary}\n\n" for url, summary in articles
    )


iface = gr.Interface(
    fn=lambda query: format_output(summarize_news(query)),
    inputs="text",
    outputs="text",
    title="News Summarizer",
    description="Enter a query to get summarized versions of the top news articles.",
)
iface.launch()