Spaces:
import torch
from nltk.tokenize import sent_tokenize
from transformers import T5Tokenizer


def abstractive_summarizer(tokenizer, model, text):
    device = torch.device("cpu")
    # normalize whitespace and prepend the T5 summarization task prefix
    preprocess_text = text.strip().replace("\n", " ")
    t5_prepared_text = "summarize: " + preprocess_text
    tokenized_text = tokenizer.encode(
        t5_prepared_text, return_tensors="pt", truncation=True
    ).to(device)

    # summarize with beam search, blocking repeated bigrams
    summary_ids = model.generate(
        tokenized_text,
        num_beams=4,
        no_repeat_ngram_size=2,
        min_length=30,
        max_length=300,
        early_stopping=True,
    )
    abs_summarized_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return abs_summarized_text
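The function above takes the tokenizer and model as arguments rather than constructing them itself. A minimal calling sketch, assuming the t5-small checkpoint (any T5-family seq2seq model works the same way; long_article_text is a hypothetical placeholder for your input):

from transformers import T5ForConditionalGeneration, T5Tokenizer

# assumed checkpoint; swap in any T5-family model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

summary = abstractive_summarizer(tokenizer, model, long_article_text)
print(summary)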
def preprocess_text_for_abstractive_summarization(tokenizer, text):
    sentences = sent_tokenize(text)

    # initialize
    length = 0
    chunk = ""
    chunks = []
    count = -1
    for sentence in sentences:
        count += 1
        # add the sentence's token count to the running length
        combined_length = len(tokenizer.tokenize(sentence)) + length

        if combined_length <= tokenizer.max_len_single_sentence:  # fits in the current chunk
            chunk += sentence + " "  # add the sentence to the chunk
            length = combined_length  # update the length counter

            # if it is the last sentence, save the chunk
            if count == len(sentences) - 1:
                chunks.append(chunk.strip())
        else:
            chunks.append(chunk.strip())  # save the full chunk

            # reset and start a new chunk with the overflow sentence
            chunk = sentence + " "
            length = len(tokenizer.tokenize(sentence))

            # if the overflow sentence is also the last one, save it too
            # (the original dropped it)
            if count == len(sentences) - 1:
                chunks.append(chunk.strip())

    return chunks
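Putting the two functions together: a sketch (under the same t5-small assumption as above, not necessarily the app's exact flow) that chunks a long document so each piece fits the model's context window, summarizes each chunk, and joins the partial summaries. Note that sent_tokenize needs the NLTK punkt data downloaded once:

import nltk

nltk.download("punkt")  # one-time download of the sentence tokenizer data

# reusing the tokenizer and model loaded in the previous sketch;
# text is a hypothetical long input string
chunks = preprocess_text_for_abstractive_summarization(tokenizer, text)
summaries = [abstractive_summarizer(tokenizer, model, chunk) for chunk in chunks]
print(" ".join(summaries))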