Spaces:
Sleeping
Sleeping
import gradio as gr | |
from transformers import pipeline | |
from bs4 import BeautifulSoup | |
import requests | |
_MAX_CHUNK_WORDS = 500  # word budget for each chunk handed to the summarizer
_summarizer = None  # summarization pipeline, created on first use and then reused


def _get_summarizer():
    """Return the shared summarization pipeline, creating it on first use."""
    global _summarizer
    if _summarizer is None:
        # Building the pipeline is expensive, so do it once instead of on
        # every request.
        _summarizer = pipeline("summarization")
    return _summarizer


def _chunk_text(article, max_chunk=_MAX_CHUNK_WORDS):
    """Split *article* into space-joined chunks of at most *max_chunk* words.

    Sentences are kept whole: a chunk is closed as soon as the next sentence
    would push it past the budget. A single sentence longer than *max_chunk*
    words still becomes one (oversized) chunk of its own.

    Args:
        article: Plain text to split.
        max_chunk: Maximum number of whitespace-separated words per chunk.

    Returns:
        A list of non-empty strings; empty when *article* contains no words.
    """
    import re  # function-scope import keeps this block self-contained

    chunks = []
    current_words = []
    # Break only where sentence punctuation is followed by whitespace, so
    # decimals ("3.14"), abbreviations and domain names stay in one piece.
    for sentence in re.split(r"(?<=[.!?])\s+", article):
        words = sentence.split()
        if not words:
            continue
        if current_words and len(current_words) + len(words) > max_chunk:
            chunks.append(" ".join(current_words))
            current_words = []
        current_words.extend(words)
    if current_words:
        chunks.append(" ".join(current_words))
    return chunks


def summarize_blog_post(url):
    """Fetch the page at *url* and return a summary of its text.

    The text of every ``<h1>`` and ``<p>`` element is collected, split into
    chunks of whole sentences, summarized chunk by chunk, and the partial
    summaries are joined with single spaces.

    Args:
        url: Address of the blog post to summarize.

    Returns:
        The combined summary, or an empty string when the page has no
        ``<h1>``/``<p>`` text.

    Raises:
        ValueError: If *url* is empty or blank.
        requests.RequestException: If the page cannot be fetched or the
            server answers with an HTTP error status.
    """
    url = (url or "").strip()
    if not url:
        raise ValueError("Please enter the URL of a blog post.")

    # NOTE(review): the URL comes straight from the user and is fetched
    # server-side; restrict the allowed hosts if this is deployed somewhere
    # with access to internal services.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of summarizing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    article = ' '.join(node.text for node in soup.find_all(['h1', 'p']))

    chunks = _chunk_text(article)
    if not chunks:
        return ""

    summaries = _get_summarizer()(
        chunks,
        max_length=120,
        min_length=30,
        do_sample=False,
        # A chunk may still tokenize to more than the model accepts (for
        # example one very long sentence); cut it rather than crash.
        truncation=True,
    )
    return " ".join(summary['summary_text'] for summary in summaries)
# Expose the summarizer through a single text-in / text-out Gradio form
# and start serving it.
_INTERFACE_OPTIONS = {
    "title": "Medium Blog Post Summarizer",
    "description": "Enter the URL of a Medium blog post to get a summarized version of the content.",
    "fn": summarize_blog_post,
    "inputs": "text",
    "outputs": "text",
}
iface = gr.Interface(**_INTERFACE_OPTIONS)
iface.launch()