layoutlm-invoices / handler.py
Alexander Slessor
completed initial handler.py
1d2c57b
raw
history blame contribute delete
No virus
2.03 kB
from typing import Any
from subprocess import run
from docquery import document, pipeline
import tempfile
import os
# from transformers import AutoConfig, AutoTokenizer, LayoutLMForQuestionAnswering
# Install the tesseract-ocr system package when this module is imported.
# Only the apt package is installed here; no Python package (e.g. pytesseract)
# is installed by this command.
# An argv list is used instead of a shell string because no shell features are
# needed. check=True makes the import fail loudly if the installation fails.
run(["apt", "install", "-y", "tesseract-ocr"], check=True)
class EndpointHandler:
    """Endpoint handler that asks a fixed set of invoice questions about a PDF.

    The handler is constructed once with a model path and is then called with
    a request payload holding the PDF bytes.
    """

    # Questions asked of every document; answers are returned in this order.
    QUESTIONS = ("What is the invoice number?", "What is the invoice total?")

    def __init__(self, path=""):
        """Create the document-question-answering pipeline.

        Args:
            path: Passed to ``pipeline`` as the ``model`` argument.
        """
        self.pipeline = pipeline('document-question-answering', model=path)

    def __call__(self, data: dict[str, bytes]) -> dict[str, list[Any]]:
        """Answer every question in ``QUESTIONS`` for the PDF in ``data``.

        Args:
            data: Request payload. The PDF bytes are taken (popped) from the
                ``"inputs"`` key; when that key is absent the payload itself
                is used, which is then rejected unless it is bytes-like.

        Returns:
            ``{"predictions": [...]}`` holding one pipeline result per
            question, in the order of ``QUESTIONS``.

        Raises:
            TypeError: If the input is not ``bytes``, ``bytearray`` or
                ``memoryview``.
        """
        f_bytes = data.pop("inputs", data)
        # Fail before touching the filesystem, with a message that names the
        # actual problem instead of the opaque error file.write() would give.
        if not isinstance(f_bytes, (bytes, bytearray, memoryview)):
            raise TypeError(
                "expected PDF bytes under the 'inputs' key, "
                f"got {type(f_bytes).__name__}"
            )

        # The document loader is given a file path, so the bytes are spooled
        # to disk. NamedTemporaryFile creates a unique file atomically in the
        # configured temp directory; delete=False keeps it on disk after it is
        # closed so it can be re-opened by path.
        temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
        temp_file_path = temp_file.name
        try:
            with temp_file:
                temp_file.write(f_bytes)
            doc = document.load_document(temp_file_path)
            # The loaded document's context is forwarded to the pipeline as
            # keyword arguments alongside each question.
            results = [
                self.pipeline(question=q, **doc.context)
                for q in self.QUESTIONS
            ]
        finally:
            # Best-effort cleanup on both the success and the error path.
            try:
                os.remove(temp_file_path)
            except FileNotFoundError as e:
                print(e)
        return {"predictions": results}