Spaces:
Sleeping
Sleeping
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
import torch | |
import re | |
import gradio as gr | |
import os | |
import docx2txt | |
# Tokenizer is taken from the "allenai/led-base-16384" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")

# Seq2seq weights are loaded from "checkpoint-64840" and then placed on the CPU.
model = AutoModelForSeq2SeqLM.from_pretrained("checkpoint-64840")
model = model.to("cpu")
def summarize(text_file):
    """Extract the text of an uploaded .txt or .docx file and summarize it.

    Args:
        text_file: The uploaded file, given either as an object exposing its
            path on disk as ``.name`` or as a plain path string.

    Returns:
        A ``(text, summary)`` tuple: the text read from the file and the
        summary decoded from the model output (special tokens removed).

    Raises:
        ValueError: If the file extension is neither ``.txt`` nor ``.docx``.
    """
    # Accept both a file-like wrapper carrying ``.name`` and a bare path string.
    path = getattr(text_file, "name", text_file)
    file_extension = os.path.splitext(path)[1]
    # Compare case-insensitively so "NOTES.TXT" or "Paper.DOCX" are accepted;
    # the original spelling is kept for the error message.
    extension = file_extension.lower()

    if extension == ".txt":
        # Load text from a txt file
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
    elif extension == ".docx":
        # Load text from a Word file
        text = docx2txt.process(path)
    else:
        raise ValueError(f"Unsupported file type: {file_extension}")

    # Truncate to the tokenizer's configured maximum length instead of
    # handing an over-long document to the model unmodified.
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    input_ids = encoded.input_ids.to(model.device)
    attention_mask = encoded.attention_mask.to(model.device)

    # set global_attention_mask on first token
    global_attention_mask = torch.zeros_like(input_ids)
    global_attention_mask[:, 0] = 1

    # Request the dict-style output explicitly: ``.sequences`` only exists on
    # that output type, and relying on the checkpoint's config to enable it
    # fails with AttributeError when generate() returns a plain tensor.
    generated = model.generate(
        input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask,
        return_dict_in_generate=True,
    )

    # Drop <s>, </s> and padding markers so only readable text is shown.
    summary = tokenizer.batch_decode(
        generated.sequences, skip_special_tokens=True
    )[0]
    return text, summary
# Gradio UI: a single file upload in; the extracted text and its summary out.
iface = gr.Interface(
    fn=summarize,
    # gr.File / gr.Textbox are the component classes that replace the
    # deprecated gr.inputs / gr.outputs namespaces.
    inputs=gr.File(label="Upload a txt file or a Word file for the input text"),
    outputs=[gr.Textbox(label="Original text"), gr.Textbox(label="Summary")],
    title="Academic Paper Summarization Demo",
    # The app loads an LED tokenizer and passes LED's global attention mask,
    # so the description names LED rather than T5.
    description=(
        "Upload a txt file or a Word file for the input text. "
        "Get a summary generated by an LED (Longformer Encoder-Decoder) model "
        "from Hugging Face."
    ),
)
# Launch the app as soon as the module is executed.
iface.launch()