Spaces:

pszemraj
/

document-summarization

Running on CPU Upgrade

App Files Files Community

document-summarization / utils.py

pszemraj

🎨

7452863 about 2 years ago

raw

history blame

2.01 kB

	"""
	utils.py - Utility functions for the project.
	"""

	import re
	import subprocess
	from pathlib import Path
	from typing import Union

	from natsort import natsorted


	def truncate_word_count(text, max_words=512):
	    """
	    truncate_word_count - a helper function for the gradio module

	    A word is a maximal run of non-whitespace characters, so leading,
	    trailing or repeated whitespace is never counted as a word.

	    Parameters
	    ----------
	    text : str, required, the text to be processed
	    max_words : int, optional, the maximum number of words, default=512

	    Returns
	    -------
	    dict, the text and whether it was truncated:
	        "was_truncated" : bool, True when text has more than max_words words
	        "truncated_text" : str, the first max_words words joined by single
	            spaces when truncated, otherwise the input text unchanged
	    """
	    # match \S+ instead of splitting on \s+: a split yields empty strings for
	    # leading/trailing whitespace, which would be miscounted as words
	    words = re.findall(r"\S+", text)
	    if len(words) > max_words:
	        return {
	            "was_truncated": True,
	            "truncated_text": " ".join(words[:max_words]),
	        }
	    return {"was_truncated": False, "truncated_text": text}


	def load_examples(src, filetypes=(".txt", ".pdf")):
	    """
	    load_examples - a helper function for the gradio module to load examples

	    Creates src if it is missing, fetches the sample PDF into it with wget
	    (only when the file is not already present), then reads every regular
	    file in src whose suffix is listed in filetypes.

	    Parameters
	    ----------
	    src : str or Path, directory holding the example files
	    filetypes : iterable of str, optional, file suffixes (leading dot
	        included) to load, default=(".txt", ".pdf")

	    Returns
	    -------
	    list of list, one row per example file in natural sort order; each row
	    is the file's text followed by the fixed values
	    "base", 2, 1024, 0.7, 3.5, 3 (presumably defaults for the gradio
	    inputs - confirm against the app)
	    """
	    src = Path(src)
	    src.mkdir(exist_ok=True)

	    pdf_url = (
	        "https://www.dropbox.com/s/y92xy7o5qb88yij/all_you_need_is_attention.pdf?dl=1"
	    )
	    pdf_path = src / "all_you_need_is_attention.pdf"
	    if not pdf_path.is_file():
	        # best-effort download: a non-zero exit status is tolerated
	        result = subprocess.run(["wget", pdf_url, "-O", str(pdf_path)])
	        if result.returncode != 0 and pdf_path.is_file():
	            # remove whatever a failed wget left behind so the next call retries
	            pdf_path.unlink()

	    examples = natsorted(
	        [f for f in src.iterdir() if f.is_file() and f.suffix in filetypes]
	    )

	    # load the examples into a list
	    text_examples = []
	    for example in examples:
	        # errors="ignore" keeps binary files (e.g. the PDF) from raising
	        # UnicodeDecodeError; NOTE: such files are only decoded leniently,
	        # not parsed - real PDF text extraction needs a PDF library
	        with open(example, "r", encoding="utf-8", errors="ignore") as f:
	            text = f.read()
	        text_examples.append([text, "base", 2, 1024, 0.7, 3.5, 3])

	    return text_examples


	def load_example_filenames(example_path: Union[str, Path]):
	    """
	    load_example_filenames - a helper function for the gradio module to load examples

	    Parameters
	    ----------
	    example_path : str or Path, directory searched (non-recursively) for
	        *.txt files

	    Returns
	    -------
	    dict, the examples (filename: path of the file under example_path),
	    in sorted path order; empty when no *.txt file is found
	    """
	    example_path = Path(example_path)
	    # glob yields entries in arbitrary filesystem order; sorting makes the
	    # resulting mapping deterministic across runs and machines
	    return {f.name: f for f in sorted(example_path.glob("*.txt"))}