Spaces:
import torch
from nltk.tokenize import sent_tokenize
from transformers import T5Tokenizer


def abstractive_summarizer(tokenizer, model, text):
    device = torch.device("cpu")
    # normalize whitespace and prepend the T5 summarization task prefix
    preprocess_text = text.strip().replace("\n", " ")
    t5_prepared_text = "summarize: " + preprocess_text
    tokenized_text = tokenizer.encode(
        t5_prepared_text, return_tensors="pt", truncation=True
    ).to(device)

    # summarize with beam search, blocking repeated bigrams
    summary_ids = model.generate(
        tokenized_text,
        num_beams=4,
        no_repeat_ngram_size=2,
        min_length=30,
        max_length=300,
        early_stopping=True,
    )
    abs_summarized_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return abs_summarized_text
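The function above takes the tokenizer and model as arguments rather than constructing them itself. A minimal calling sketch, assuming the t5-small checkpoint (any T5-family seq2seq model works the same way; long_article_text is a hypothetical placeholder for your input):

from transformers import T5ForConditionalGeneration, T5Tokenizer

# assumed checkpoint; swap in any T5-family model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

summary = abstractive_summarizer(tokenizer, model, long_article_text)
print(summary)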
def preprocess_text_for_abstractive_summarization(tokenizer, text):
    sentences = sent_tokenize(text)

    # initialize
    length = 0
    chunk = ""
    chunks = []
    count = -1
    for sentence in sentences:
        count += 1
        # add the sentence's token count to the running length
        combined_length = len(tokenizer.tokenize(sentence)) + length

        if combined_length <= tokenizer.max_len_single_sentence:  # fits in the current chunk
            chunk += sentence + " "  # add the sentence to the chunk
            length = combined_length  # update the length counter

            # if it is the last sentence, save the chunk
            if count == len(sentences) - 1:
                chunks.append(chunk.strip())
        else:
            chunks.append(chunk.strip())  # save the full chunk

            # reset and start a new chunk with the overflow sentence
            chunk = sentence + " "
            length = len(tokenizer.tokenize(sentence))

            # if the overflow sentence is also the last one, save it too
            # (the original dropped it)
            if count == len(sentences) - 1:
                chunks.append(chunk.strip())

    return chunks
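Putting the two functions together: a sketch (under the same t5-small assumption as above, not necessarily the app's exact flow) that chunks a long document so each piece fits the model's context window, summarizes each chunk, and joins the partial summaries. Note that sent_tokenize needs the NLTK punkt data downloaded once:

import nltk

nltk.download("punkt")  # one-time download of the sentence tokenizer data

# reusing the tokenizer and model loaded in the previous sketch;
# text is a hypothetical long input string
chunks = preprocess_text_for_abstractive_summarization(tokenizer, text)
summaries = [abstractive_summarizer(tokenizer, model, chunk) for chunk in chunks]
print(" ".join(summaries))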