aslessor
/

layoutlm-invoices

Document Question Answering

endpoints-template

Inference Endpoints

Model card Files Files and versions Community

layoutlm-invoices / handler.py

Alexander Slessor

completed initial handler.py

1d2c57b 9 months ago

history blame contribute delete

No virus

2.03 kB

	from typing import Any
	from subprocess import run
	from docquery import document, pipeline
	import tempfile
	import os
	# from transformers import AutoConfig, AutoTokenizer, LayoutLMForQuestionAnswering

	# Install the tesseract-ocr system package when this module is loaded.
	# check=True makes a failed install raise subprocess.CalledProcessError
	# instead of continuing without the package.
	# NOTE(review): this command does not install pytesseract; presumably the
	# Python package comes from the environment's requirements — confirm.
	run("apt install -y tesseract-ocr", shell=True, check=True)

	class EndpointHandler:
	    """Callable handler that asks a fixed set of invoice questions about a PDF.

	    The handler wraps a ``document-question-answering`` pipeline. Each call
	    writes the received PDF bytes to a temporary file, loads it with
	    ``document.load_document`` and runs every question in ``QUESTIONS``
	    against the loaded document.
	    """

	    # Questions asked of every document; predictions are returned in this order.
	    QUESTIONS = (
	        "What is the invoice number?",
	        "What is the invoice total?",
	    )

	    def __init__(self, path=""):
	        """Create the question-answering pipeline.

	        Args:
	            path: Value forwarded as ``model=`` to ``pipeline``.
	                NOTE(review): presumably a local model directory or a hub
	                id such as 'impira/layoutlm-invoices' — confirm.
	        """
	        self.pipeline = pipeline('document-question-answering', model=path)

	    def __call__(self, data: dict[str, bytes]) -> dict[str, list[Any]]:
	        """Answer the configured questions for one PDF.

	        Args:
	            data: Request payload; the PDF content is read from the
	                ``"inputs"`` key as a bytes-like object. The dict is not
	                modified.

	        Returns:
	            ``{"predictions": [...]}`` with one pipeline result per entry
	            of ``QUESTIONS``, in the same order.

	        Raises:
	            TypeError: If the payload is not bytes-like (including when the
	                ``"inputs"`` key is missing).
	        """
	        # ``get`` rather than ``pop`` so the caller's dict is left intact.
	        f_bytes = data.get("inputs", data)
	        # Reject bad payloads before touching the filesystem; TypeError is
	        # what writing a non-bytes object to a binary file would raise.
	        if not isinstance(f_bytes, (bytes, bytearray, memoryview)):
	            raise TypeError(
	                "expected PDF bytes under 'inputs', "
	                f"got {type(f_bytes).__name__}"
	            )

	        # A private temporary directory gives a collision-free path and is
	        # removed on exit from the ``with`` block, even if loading or
	        # inference raises.
	        with tempfile.TemporaryDirectory() as temp_dir:
	            # Keep the '.pdf' extension on the file handed to the loader.
	            temp_file_path = os.path.join(temp_dir, 'input.pdf')
	            with open(temp_file_path, 'wb') as temp_file:
	                temp_file.write(f_bytes)

	            doc = document.load_document(temp_file_path)
	            results = [
	                self.pipeline(question=q, **doc.context)
	                for q in self.QUESTIONS
	            ]

	        return {"predictions": results}