|
import time
from functools import lru_cache

import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline, AutoTokenizer
|
|
|
|
|
# Fetch the sentence-tokenizer data that sent_tokenize needs at runtime.
# "punkt" serves older NLTK releases; NLTK 3.9+ looks up "punkt_tab" instead,
# so both are requested to keep the app working across versions.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource)
|
|
|
|
|
# Maps the language label offered in the UI to the model id that
# get_translator loads for English -> that language.
# NOTE(review): confirm every repo id below actually exists on the model hub.
_OPUS_MT_EN_PREFIX = "Helsinki-NLP/opus-mt-en-"
translation_models = {
    label: _OPUS_MT_EN_PREFIX + suffix
    for label, suffix in (
        ("Vietnamese", "vi"),
        ("Japanese", "jap"),
        ("Thai", "tha"),
        ("Spanish", "es"),
    )
}
|
|
|
|
|
# Maps the summarization style label offered in the UI to a model id.
summarization_models = dict(
    Scientific="facebook/bart-large-cnn",
    Literature="google/pegasus-xsum",
)
|
|
|
|
|
# Tokenizer that split_text uses to measure sentence lengths in tokens.
_CHUNK_TOKENIZER_ID = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(_CHUNK_TOKENIZER_ID)
|
|
|
|
|
@lru_cache(maxsize=4)
def _load_summarizer(model_id):
    """Build the summarization pipeline for *model_id*, reusing earlier loads."""
    return pipeline("summarization", model=model_id)


def get_summarizer(model_name):
    """Return a summarization pipeline for *model_name*.

    Args:
        model_name: Either a key of ``summarization_models`` (the label
            selected in the UI, e.g. ``"Scientific"``) or a model id that
            is handed to ``pipeline`` unchanged.

    Returns:
        The summarization pipeline for the resolved model id.
    """
    # The UI passes the label, not the model id, so translate it through the
    # lookup table; unknown names fall through unchanged so callers that
    # already pass a model id keep working.
    model_id = summarization_models.get(model_name, model_name)
    # Loading a model is expensive; go through the cached loader so repeated
    # requests for the same model reuse one pipeline.
    return _load_summarizer(model_id)
|
|
|
|
|
@lru_cache(maxsize=8)
def _load_translator(model_id):
    """Build the translation pipeline for *model_id*, reusing earlier loads."""
    return pipeline("translation", model=model_id)


def get_translator(language):
    """Return a translation pipeline for *language*, or ``None``.

    Args:
        language: A key of ``translation_models`` (e.g. ``"Spanish"``).

    Returns:
        The translation pipeline registered for that language, or ``None``
        when the language has no entry in ``translation_models``.
    """
    model_id = translation_models.get(language)
    if not model_id:
        return None
    # Cached so the model is not reloaded on every request.
    return _load_translator(model_id)
|
|
|
|
|
def split_text(text, max_tokens=1024):
    """Split *text* into sentence-aligned chunks that fit a token limit.

    Sentences (from ``sent_tokenize``) are packed greedily, in order, into
    chunks; sentences inside a chunk are joined with a single space. Lengths
    are measured with the module-level ``tokenizer``.

    Args:
        text: The text to split.
        max_tokens: Upper bound on the tokens per chunk, including the
            special tokens the tokenizer adds when it encodes a chunk.

    Returns:
        A list of non-empty chunk strings. A single sentence that is longer
        than the limit is not split further; it is returned as its own
        (oversized) chunk.
    """
    # tokenizer.tokenize() counts content tokens only; encoding a chunk for
    # the model adds special tokens on top, so reserve room for them.
    budget = max_tokens - tokenizer.num_special_tokens_to_add()

    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sent_tokenize(text):
        sentence_length = len(tokenizer.tokenize(sentence))
        # Flush only when there is something to flush; otherwise an oversized
        # first sentence would produce an empty chunk.
        if current_chunk and current_length + sentence_length > budget:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
|
|
|
|
|
def summarize_text(text, model_name):
    """Summarize *text* chunk by chunk and join the partial summaries.

    Args:
        text: The text to summarize.
        model_name: Model selector forwarded to ``get_summarizer``.

    Returns:
        The chunk summaries joined with single spaces. Returns ``""`` when
        *text* is missing or shorter than 200 characters, or when every
        chunk failed to summarize.
    """
    if not text or len(text) < 200:
        print("Input text is too short for summarization. Please provide longer text.")
        return ""

    summarizer = get_summarizer(model_name)

    # Keep chunks within the selected model's own input limit when it is
    # smaller than the default 1024 (tokenizers without a configured limit
    # report a very large number, hence the min()).
    # NOTE(review): split_text measures length with the module-level
    # tokenizer, which may count differently from the summarizer's tokenizer;
    # truncation=True below covers any remaining overshoot.
    chunk_limit = min(1024, summarizer.tokenizer.model_max_length)
    chunks = split_text(text, max_tokens=chunk_limit)

    summaries = []
    for chunk in chunks:
        try:
            result = summarizer(
                chunk,
                max_length=150,
                min_length=20,
                do_sample=False,
                # Clip over-length input instead of letting the model raise.
                truncation=True,
            )
            summaries.append(result[0]['summary_text'])
        except Exception as e:
            # Best effort: report the failing chunk and keep going so one
            # bad chunk does not discard the rest of the summary.
            print(f"Error summarizing chunk: {chunk}\nError: {e}")

    return " ".join(summaries)
|
|
|
|
|
def translate_text(text, language):
    """Translate *text* into *language*, preserving its line structure.

    Each non-blank line is translated on its own so multi-line input (such
    as a bullet list) keeps one output line per input line.

    Args:
        text: The text to translate.
        language: Language label forwarded to ``get_translator``.

    Returns:
        The translated text. The original *text* is returned unchanged when
        it is empty/blank, when no translator exists for *language*, or when
        translation raises.
    """
    # Nothing to translate: skip loading a model altogether.
    if not text or not text.strip():
        return text

    translator = get_translator(language)
    if not translator:
        return text

    lines = text.split("\n")
    # Only lines with content are sent to the model; blank lines are kept
    # in place as-is.
    positions = [index for index, line in enumerate(lines) if line.strip()]

    try:
        results = translator([lines[index] for index in positions])
        translated = [result['translation_text'] for result in results]
    except Exception as e:
        # Best effort: fall back to the untranslated text.
        print(f"Error translating text: {text}\nError: {e}")
        return text

    for index, translated_line in zip(positions, translated):
        lines[index] = translated_line
    return "\n".join(lines)
|
|
|
def process_text(input_text, model, language):
    """Run the summarize -> bullet points -> translate flow for the UI.

    Args:
        input_text: Text to process.
        model: Summarization model selector passed to ``summarize_text``.
        language: Target language passed to ``translate_text``.

    Returns:
        A ``(bullet_points, translated_text)`` pair, or ``("", "")`` when
        summarization or bullet-point generation yields nothing.
    """
    started_at = time.time()
    preview = input_text[:500]
    print(f"Input text: {preview}...")

    summary = summarize_text(input_text, model)
    if summary:
        print(f"Summary: {summary[:500]}...")
    else:
        print("Summarization failed. Please provide longer text or try a different model.")
        return "", ""

    bullet_points = generate_bullet_points(summary)
    if bullet_points:
        print(f"Bullet Points: {bullet_points}")
    else:
        print("Bullet points generation failed.")
        return "", ""

    translated_text = translate_text(bullet_points, language)
    print(f"Translated Text: {translated_text}")

    elapsed = time.time() - started_at
    print(f"Processing time: {elapsed} seconds")

    return bullet_points, translated_text
|
|
|
def generate_bullet_points(summary, max_points=3):
    """Turn the leading sentences of *summary* into a dash-bulleted list.

    Args:
        summary: The summary text to convert.
        max_points: Maximum number of bullet points to produce; values
            below zero are treated as zero. Defaults to 3.

    Returns:
        One ``"- <sentence>"`` line per selected sentence, joined with
        newlines, or ``""`` when *summary* contains no sentences.
    """
    print("Summary Text:", summary)

    sentences = sent_tokenize(summary)
    if not sentences:
        return ""

    # Clamp so a negative limit cannot turn into a from-the-end slice.
    key_sentences = sentences[:max(0, max_points)]

    bullet_points = "\n".join(f"- {sentence}" for sentence in key_sentences)
    print("Bullet Points:", bullet_points)

    return bullet_points
|
|
|
|
|
# Gradio UI: text in, bullet points and their translation out.
iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(label="Input Text", placeholder="Paste your text here...", lines=10),
        # Choices come from the lookup table so the UI cannot drift from it;
        # a default is preselected so process_text never receives None.
        gr.Radio(
            choices=list(summarization_models),
            label="Summarization Model",
            value="Scientific",
        ),
        gr.Dropdown(
            choices=list(translation_models),
            label="Translate to",
            value="Vietnamese",
        ),
    ],
    outputs=[
        gr.Textbox(label="Bullet Points", lines=10),
        gr.Textbox(label="Translated Bullet Points", lines=10),
    ],
    title="Text to Bullet Points and Translation",
    description="Paste any text, choose the summarization model, and optionally translate the bullet points into Vietnamese, Japanese, Thai, or Spanish.",
)

iface.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|