Spaces:

ankur-bohra
/

demo-automatic-reimbursement-tool

Build error

App Files Files Community

demo-automatic-reimbursement-tool / main.py

ankur-bohra

Add basic structure

0d99179 over 1 year ago

raw

history blame contribute delete

2.02 kB

	from pathlib import Path

	import categories
	import processing
	import extract
	from PIL import Image
	from pydantic import BaseModel
	from io import BytesIO

	def categorize_and_parse_text(text: str) -> BaseModel:
	    """Classify a piece of text, then parse it with the matching category chain.

	    Args:
	        text(str): The text to categorize and parse information from.

	    Returns: Whatever ``categories.run_category_chain`` produces for the
	        category detected by ``categories.categorize_text`` and the given text.
	    """
	    detected_category = categories.categorize_text(text)
	    return categories.run_category_chain(detected_category, text)

	def process_pdf(filename: Path, extract_only=False) -> BaseModel:
	    """Read a PDF from disk, pull its text out and optionally parse it.

	    Args:
	        filename(Path): The PDF file to process.
	        extract_only: When truthy, return the extracted text itself instead
	            of categorizing and parsing it.

	    Returns: The extracted text if ``extract_only`` is truthy, otherwise the
	        result of ``categorize_and_parse_text`` on that text.
	    """
	    with open(filename, "rb") as pdf_file:
	        raw_pdf = pdf_file.read()

	    embedded_text = extract.extract_text_from_pdf_pypdf(BytesIO(raw_pdf))

	    # Fewer than 20 characters of embedded text: a pdf scanner probably added
	    # just a watermark, so fall back to running OCR on the images produced
	    # from the same PDF bytes.
	    if len(embedded_text) < 20:
	        page_images = processing.preprocess_pdf_pdf2image(raw_pdf)
	        text = extract.extract_text_from_images_pyocr_tesseract(page_images)
	    else:
	        text = embedded_text

	    return text if extract_only else categorize_and_parse_text(text)

	def process_image(filename: Path, extract_only=False) -> BaseModel:
	    """Processes the given image file and extracts information from it.

	    The image is opened, passed through ``processing.preprocess_image`` and
	    then OCR'd with ``extract.extract_text_from_image_pyocr_tesseract``.
	    Both the opened image and the preprocessed image are closed before this
	    function returns or raises.

	    Args:
	        filename(Path): The image file to process.
	        extract_only: When truthy, return the extracted text itself instead
	            of categorizing and parsing it.

	    Returns: The extracted text if ``extract_only`` is truthy, otherwise the
	        result of ``categorize_and_parse_text`` on that text.
	    """
	    source_image = Image.open(filename)
	    # Until preprocessing succeeds, the only image to clean up is the source.
	    image = source_image
	    try:
	        image = processing.preprocess_image(source_image)
	        text = extract.extract_text_from_image_pyocr_tesseract(image)
	    finally:
	        # Always release the image that was OCR'd (or the source, if
	        # preprocessing failed), even when an exception is propagating.
	        image.close()
	        # Preprocessing may hand back a different object; in that case the
	        # handle opened from disk still needs closing as well.
	        if image is not source_image:
	            source_image.close()

	    if extract_only:
	        return text
	    return categorize_and_parse_text(text)

	if __name__ == "__main__":
	    # Demo run: process examples/example1.pdf and print the result's
	    # ``.json(indent=4)`` output.
	    example_pdf = Path("examples/example1.pdf")
	    parsed = process_pdf(example_pdf)
	    print(parsed.json(indent=4))