import nltk
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# 'punkt' provides the pretrained sentence tokenizer used by sent_tokenize below.
nltk.download('punkt')
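# Note (depends on your NLTK version): NLTK 3.9+ moved the sentence tokenizer
# data to a separate resource; if sent_tokenize raises a LookupError, also run:
#   nltk.download('punkt_tab')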


def fragment_text(text, tokenizer):
    """Split text into sentence-aligned chunks that fit the model's input limit."""
    sentences = nltk.tokenize.sent_tokenize(text)
    max_len = tokenizer.max_len_single_sentence

    chunks = []
    chunk = ""

    for sentence in sentences:
        # Compare token counts (not character counts) against the token limit.
        combined_length = len(tokenizer.tokenize(sentence)) + len(tokenizer.tokenize(chunk))
        if combined_length <= max_len:
            chunk += sentence + " "
        else:
            # Current chunk is full; flush it and start a new one with this sentence.
            chunks.append(chunk.strip())
            chunk = sentence + " "

    if chunk != "":
        chunks.append(chunk.strip())

    return chunks
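
# Illustrative sketch (not executed): with a BART tokenizer, whose
# max_len_single_sentence is typically 1022 tokens, a long article comes back
# as a list of sentence-aligned pieces, each within the limit, e.g.:
#   fragment_text(long_text, tokenizer)
#   -> ['First sentences of the text ...', 'Next sentences ...', ...]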


def summarize_text(text):
    # Chunk the input so each piece fits the model, then summarize piece by piece.
    chunks = fragment_text(text, tokenizer)

    summaries = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors='pt')  # renamed from `input` to avoid shadowing the built-in
        output = model.generate(**inputs)
        # Decode the first (and only) generated sequence.
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)

    # Join the per-chunk summaries into one text.
    final_summary = " ".join(summaries)
    return final_summary
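
# Note: model.generate() above runs with the checkpoint's default generation
# settings; parameters such as max_new_tokens or num_beams could be passed to
# control summary length and beam search, e.g.:
#   output = model.generate(**inputs, max_new_tokens=120, num_beams=4)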


checkpoint = "tclopess/bart_samsum"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
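
# Optional sketch (assumes Streamlit >= 1.18, where st.cache_resource exists):
# caching the load keeps Streamlit from reloading the checkpoint on every rerun:
#   @st.cache_resource
#   def load_model(name):
#       return AutoTokenizer.from_pretrained(name), AutoModelForSeq2SeqLM.from_pretrained(name)
#   tokenizer, model = load_model(checkpoint)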


text_input = st.text_area("Paste or type the text to be summarized:")

button = st.button("Summarize")

if button:
    summary = summarize_text(text_input)
    st.write("Summary:")
    st.write(summary)
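
# To launch the app (assuming this file is saved as app.py):
#   streamlit run app.py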