tesseract / handler.py
jonahkaye's picture
same psm
6dcd7d1
raw
history blame contribute delete
No virus
968 Bytes
from typing import Dict, List, Any
from transformers import LayoutLMForTokenClassification, LayoutLMv2Processor
import torch
from subprocess import run
import pytesseract
from pytesseract import Output
# Install the Tesseract OCR system package (the binary pytesseract invokes).
# The command is given as an argument list so it is executed directly rather
# than through a shell; check=True raises CalledProcessError if apt exits
# with a non-zero status, which aborts the import of this module.
run(["apt", "install", "-y", "tesseract-ocr"], check=True)

# Select the torch device: CUDA when torch reports it available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class EndpointHandler:
    """Callable request handler that runs Tesseract OCR on an input image."""

    def __init__(self, path=""):
        """Probe whether ``pytesseract`` can be imported and record the result.

        Args:
            path: Not used by this handler; accepted so callers that pass it
                keep working.
        """
        # Consulted by __call__ before any OCR is attempted.
        self.pytesseract_installed = False
        try:
            import pytesseract  # noqa: F401 -- availability probe only
        except ImportError:
            print("Pytesseract not installed, will not use OCR")
        else:
            self.pytesseract_installed = True

    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
        """Run OCR on the request image and return the recognized text.

        Args:
            data: Request payload. The image is read from the ``"inputs"``
                key; when that key is absent, ``data`` itself is handed to
                pytesseract. The payload is not modified.
                NOTE(review): the image is presumably a deserialized
                ``PIL.Image`` -- confirm against the caller.

        Returns:
            A dict with a single ``"predictions"`` key holding the text
            returned by ``pytesseract.image_to_string``.

        Raises:
            RuntimeError: If ``pytesseract`` was not importable when the
                handler was constructed.
        """
        if not self.pytesseract_installed:
            raise RuntimeError("pytesseract is not installed; cannot run OCR")

        # .get() rather than .pop() so the caller's dict is left intact.
        image = data.get("inputs", data)

        # "--psm 3" is Tesseract's fully automatic page segmentation mode
        # (without orientation/script detection).
        text = pytesseract.image_to_string(
            image,
            config="--psm 3",
            output_type=Output.STRING,
        )
        return {"predictions": text}