import gradio as gr
from transformers import pipeline
from bs4 import BeautifulSoup
import requests
from googlesearch import search
import logging
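# Assumption (not stated in the original file): the `googlesearch` import is the
# googlesearch-python package, whose search() accepts num_results=. The older
# "google"/googlesearch package exposes a different signature (num=, stop=) and
# would require adjusting the search call below.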

# Configure logging
logging.basicConfig(level=logging.DEBUG)

def summarize_news(query, num_results=3):
    # Gradio sliders pass floats, so normalize the result count to an int
    num_results = int(num_results)
    logging.debug(f"Query received: {query}")
    logging.debug(f"Number of results requested: {num_results}")

    # Initialize the summarization pipeline with a specific model
    # (note: this loads the model on every call)
    logging.debug("Initializing summarization pipeline...")
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

    # Search for news articles (materialize the generator so it can be logged and iterated)
    logging.debug("Searching for news articles...")
    search_results = list(search(query, num_results=num_results))
    articles = []

    logging.debug(f"Search results: {search_results}")

    for url in search_results:
        try:
            logging.debug(f"Fetching content from URL: {url}")
            # Fetch the content of the news article; time out so one slow site can't hang the request
            r = requests.get(url, timeout=10)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, 'html.parser')
            results = soup.find_all(['h1', 'p'])
            text = [result.text for result in results]
            ARTICLE = ' '.join(text)

            # Chunk the article text: mark sentence boundaries, then group
            # sentences into chunks of at most max_chunk words so each piece
            # stays within the summarizer's input limit
            logging.debug("Chunking the article text...")
            max_chunk = 500
            ARTICLE = ARTICLE.replace('.', '.<eos>')
            ARTICLE = ARTICLE.replace('?', '?<eos>')
            ARTICLE = ARTICLE.replace('!', '!<eos>')

            sentences = ARTICLE.split('<eos>')
            current_chunk = 0
            chunks = []
            for sentence in sentences:
                if len(chunks) == current_chunk + 1:
                    if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                        chunks[current_chunk].extend(sentence.split(' '))
                    else:
                        current_chunk += 1
                        chunks.append(sentence.split(' '))
                else:
                    chunks.append(sentence.split(' '))

            # Re-join each word-list chunk into a single string for the summarizer
            for chunk_id in range(len(chunks)):
                chunks[chunk_id] = ' '.join(chunks[chunk_id])

            logging.debug(f"Chunks created: {chunks}")

            # Summarize the chunks (skip pages where no text could be extracted)
            if not chunks:
                logging.warning(f"No text extracted from URL {url}; skipping.")
                continue
            logging.debug("Summarizing the chunks...")
            summaries = summarizer(chunks, max_length=120, min_length=30, do_sample=False)
            summary_text = " ".join([summary['summary_text'] for summary in summaries])
            articles.append((url, summary_text))

            logging.debug(f"Summary for URL {url}: {summary_text}")
        except Exception as e:
            logging.error(f"Error processing URL {url}: {e}")
            continue

    logging.debug(f"Final summarized articles: {articles}")
    return articles

def format_output(articles):
    formatted_text = ""
    for url, summary in articles:
        formatted_text += f"URL: {url}\nSummary: {summary}\n\n"
    return formatted_text

iface = gr.Interface(
    fn=lambda query, num_results: format_output(summarize_news(query, num_results)),
    inputs=[
        gr.Textbox(label="Query"),
        gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Results"),
    ],
    outputs="text",
    title="News Summarizer",
    description="Enter a query to get summarized versions of the top news articles."
)

if __name__ == "__main__":
    logging.debug("Launching Gradio interface...")
    iface.launch()
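# Note (a sketch, not part of the original file): summarize_news() rebuilds the
# transformers pipeline on every request, which re-loads the model each time.
# Assuming the same checkpoint, one option is to create the pipeline once at
# module import and reuse it, e.g.:
#
#   summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
#
#   def summarize_chunks(chunks):
#       summaries = summarizer(chunks, max_length=120, min_length=30, do_sample=False)
#       return " ".join(s["summary_text"] for s in summaries)
#
# summarize_news() could then call summarize_chunks() instead of owning the
# pipeline itself; it is left as a comment here so the original control flow is unchanged.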