Alexander Slessor
committed on
Commit
·
1d2c57b
1
Parent(s):
475995c
completed initial handler.py
Browse files- .gitignore +13 -0
- README.md +5 -0
- handler.py +56 -0
.gitignore
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__
|
2 |
+
*.ipynb
|
3 |
+
*.pdf
|
4 |
+
|
5 |
+
test_handler_local.py
|
6 |
+
test_handler.py
|
7 |
+
test_endpoint.py
|
8 |
+
|
9 |
+
setup
|
10 |
+
upload_to_hf
|
11 |
+
requirements.txt
|
12 |
+
hf_token.py
|
13 |
+
|
README.md
CHANGED
@@ -7,11 +7,16 @@ tags:
|
|
7 |
- document-question-answering
|
8 |
- pdf
|
9 |
- invoices
|
|
|
10 |
widget:
|
11 |
- text: "What is the invoice number?"
|
12 |
src: "https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png"
|
13 |
- text: "What is the purchase amount?"
|
14 |
src: "https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/contract.jpeg"
|
|
|
|
|
|
|
|
|
15 |
---
|
16 |
|
17 |
# LayoutLM for Invoices
|
|
|
7 |
- document-question-answering
|
8 |
- pdf
|
9 |
- invoices
|
10 |
+
- endpoints-template
|
11 |
widget:
|
12 |
- text: "What is the invoice number?"
|
13 |
src: "https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png"
|
14 |
- text: "What is the purchase amount?"
|
15 |
src: "https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/contract.jpeg"
|
16 |
+
library_name: generic
|
17 |
+
model-index:
|
18 |
+
- name: layoutlm-invoices
|
19 |
+
results: []
|
20 |
---
|
21 |
|
22 |
# LayoutLM for Invoices
|
handler.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any
|
2 |
+
from subprocess import run
|
3 |
+
from docquery import document, pipeline
|
4 |
+
import tempfile
|
5 |
+
import os
|
6 |
+
# from transformers import AutoConfig, AutoTokenizer, LayoutLMForQuestionAnswering
|
7 |
+
|
8 |
+
# install the tesseract-ocr system package (the command below does not install pytesseract)
|
9 |
+
# Argument-list form: no shell is spawned, so the command is not subject to
# shell parsing. check=True still aborts the import if the install fails.
run(["apt", "install", "-y", "tesseract-ocr"], check=True)
|
10 |
+
|
11 |
+
class EndpointHandler:
    """Endpoint handler that asks a fixed set of invoice questions about a PDF.

    The handler wraps a ``document-question-answering`` pipeline and, for each
    request, writes the received bytes to a temporary ``.pdf`` file, loads it
    with ``document.load_document`` and runs every question in ``QUESTIONS``
    against it.
    """

    # Questions asked of every document; answers are returned in this order.
    QUESTIONS = ("What is the invoice number?", "What is the invoice total?")

    def __init__(self, path=""):
        """Create the question-answering pipeline.

        Args:
            path: Value passed to the pipeline as its ``model`` argument.
        """
        self.pipeline = pipeline('document-question-answering', model=path)

    def __call__(self, data: dict[str, bytes]) -> dict[str, list[Any]]:
        """Answer the fixed questions for the PDF contained in *data*.

        Args:
            data: Request payload. The PDF bytes are taken from (and removed
                from) the ``"inputs"`` key; when that key is absent, *data*
                itself is used as the payload.

        Returns:
            ``{"predictions": [...]}`` with one pipeline result per entry of
            ``QUESTIONS``, in the same order.

        Raises:
            TypeError: If the payload is not a bytes-like object.
        """
        f_bytes = data.pop("inputs", data)

        # Reject bad payloads before touching the filesystem.
        if not isinstance(f_bytes, (bytes, bytearray, memoryview)):
            raise TypeError(
                f"a bytes-like object is required, not '{type(f_bytes).__name__}'"
            )

        # mkstemp atomically creates a uniquely named file, avoiding both the
        # private tempfile API and a predictable path in a shared directory.
        fd, temp_file_path = tempfile.mkstemp(suffix='.pdf')
        try:
            with os.fdopen(fd, 'wb') as temp_file:
                temp_file.write(f_bytes)

            doc = document.load_document(temp_file_path)
            results = [
                self.pipeline(question=q, **doc.context) for q in self.QUESTIONS
            ]
        finally:
            # Always clean up; the path is guaranteed to be bound here.
            try:
                os.remove(temp_file_path)
            except FileNotFoundError as e:
                print(e)

        return {"predictions": results}
|
53 |
+
|
54 |
+
|
55 |
+
|
56 |
+
|