import re
from io import StringIO

import docx2txt
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from PyPDF2 import PdfReader


# Pattern matching the common emoji and pictograph Unicode ranges.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # enclosed characters
    "]+",
    flags=re.UNICODE,
)
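# Quick illustration of the pattern; the sample string is ours, not from the
# original source:
#
#   >>> emoji_pattern.sub("", "Launch day 🚀🎉")
#   'Launch day '

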
def clean_text(x):
    """Strip URLs, mentions, hashtags, emoji, and stray symbols from text."""
    # Drop non-ASCII characters (this also removes most emoji up front).
    x = x.encode("ascii", "ignore").decode()
    # Remove URLs, @mentions, and #hashtags.
    x = re.sub(r"https*\S+", " ", x)
    x = re.sub(r"@\S+", " ", x)
    x = re.sub(r"#\S+", " ", x)
    # Collapse runs of whitespace left behind by the removals.
    x = re.sub(r"\s{2,}", " ", x)
    # Remove any emoji that survived the ASCII round-trip.
    x = emoji_pattern.sub("", x)
    # Keep only letters, digits, and basic punctuation.
    x = re.sub(r"[^.,!?A-Za-z0-9]+", " ", x)
    return x
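# Example call (the input is illustrative, not from the original app):
#
#   >>> clean_text("Check this out https://example.com @user #topic 🎉!!")
#   'Check this out !!'

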
def fetch_article_text(url: str):
    """Download an article and split it into chunks of at most 500 words."""
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, "html.parser")
    # Keep only headings and paragraphs; everything else is page chrome.
    results = soup.find_all(["h1", "p"])
    text = [result.text for result in results]
    ARTICLE = " ".join(text)

    # Mark sentence boundaries so the text can be split without losing the
    # terminating punctuation.
    ARTICLE = ARTICLE.replace(".", ".<eos>")
    ARTICLE = ARTICLE.replace("!", "!<eos>")
    ARTICLE = ARTICLE.replace("?", "?<eos>")
    sentences = ARTICLE.split("<eos>")

    # Greedily pack whole sentences into chunks of at most 500 words.
    current_chunk = 0
    chunks = []
    for sentence in sentences:
        if len(chunks) == current_chunk + 1:
            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 500:
                chunks[current_chunk].extend(sentence.split(" "))
            else:
                current_chunk += 1
                chunks.append(sentence.split(" "))
        else:
            chunks.append(sentence.split(" "))

    for chunk_id in range(len(chunks)):
        chunks[chunk_id] = " ".join(chunks[chunk_id])

    return ARTICLE, chunks
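# Example call (the URL is a placeholder):
#
#   article, chunks = fetch_article_text("https://example.com/some-article")
#
# `article` is the full text with <eos> markers still in place; each entry in
# `chunks` holds at most ~500 whitespace-separated words, so it can be fed to
# a summarization model in one pass.

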
def preprocess_text_for_abstractive_summarization(tokenizer, text):
    """Split `text` into chunks that each fit within the tokenizer's limit."""
    # NLTK sentence splitting (requires the "punkt" data package).
    sentences = sent_tokenize(text)

    length = 0
    chunk = ""
    chunks = []
    for sentence in sentences:
        combined_length = len(tokenizer.tokenize(sentence)) + length

        if combined_length <= tokenizer.max_len_single_sentence:
            # The sentence still fits: append it to the current chunk.
            chunk += sentence + " "
            length = combined_length
        else:
            # The chunk is full: flush it and start a new one with this
            # sentence.
            if chunk:
                chunks.append(chunk.strip())
            chunk = sentence + " "
            length = len(tokenizer.tokenize(sentence))

    # Flush the final chunk, which the loop above never appends on its own.
    if chunk:
        chunks.append(chunk.strip())

    return chunks
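# Example call, assuming a Hugging Face `transformers` tokenizer (our
# assumption; any object exposing `.tokenize()` and
# `.max_len_single_sentence` will work):
#
#   from transformers import AutoTokenizer
#   tok = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
#   chunks = preprocess_text_for_abstractive_summarization(tok, long_text)

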
def read_pdf(file):
    """Concatenate the extracted text of every page in a PDF."""
    pdf_reader = PdfReader(file)
    all_page_text = ""
    for page in pdf_reader.pages:
        all_page_text += page.extract_text()
    return all_page_text
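# Example call ("report.pdf" is a placeholder path; PdfReader accepts both
# paths and file-like objects):
#
#   text = read_pdf("report.pdf")

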
def read_text_from_file(file):
    """Read an uploaded file (plain text, PDF, or DOCX) into a string.

    `file` is expected to expose `.type` and `.getvalue()`, as e.g.
    Streamlit's UploadedFile does.
    """
    if file.type == "text/plain":
        stringio = StringIO(file.getvalue().decode("utf-8"))
        file_content = stringio.read()
    elif file.type == "application/pdf":
        file_content = read_pdf(file)
    elif (
        file.type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        file_content = docx2txt.process(file)
    else:
        # Fail loudly instead of raising an UnboundLocalError below.
        raise ValueError(f"Unsupported file type: {file.type}")

    return file_content
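

if __name__ == "__main__":
    # Minimal smoke test for the cleaning helper; the sample string is
    # illustrative and not part of the original app.
    sample = "Breaking 🚀: full story at https://example.com @newsbot #update!!"
    print(clean_text(sample))  # -> 'Breaking full story at '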