# tesseract-ocr / app.py
# Last commit by Witold Wydmański: "fix: create `flagged` dir in dockerfile" (c914e02)
# File size: 1.43 kB
import gradio as gr
import tempfile
import pytesseract
import os
import fitz # PyMuPDF, imported as fitz for backward compatibility reasons
from PIL import Image
def pdf_to_image(pdf_file, path, progress):
    """Render every page of a PDF to its own PNG file using PyMuPDF.

    Args:
        pdf_file: File-like object for the uploaded PDF; only its ``name``
            attribute (the path on disk) is read.
        path: Directory in which the page images are written.
        progress: Object exposing ``tqdm(iterable, desc=...)``, used to
            report per-page progress.

    Returns:
        List of PNG file paths, one per page, in page order.
    """
    fnames = []
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdf_file.name) as doc:
        pages = progress.tqdm(doc, desc="Converting PDF to image")
        for index, page in enumerate(pages):
            pix = page.get_pixmap()
            # Each page needs a distinct file name; a shared name would make
            # every page overwrite the previous one on disk.
            output = f"{path}/page_{index}.png"
            pix.save(output)
            fnames.append(output)
    return fnames
def tesseract_ocr(image, progress=gr.Progress()):
    """Run Tesseract OCR over an uploaded PDF and return the recognized text.

    The PDF is rendered to images in a temporary directory via
    ``pdf_to_image``; each image is passed to ``pytesseract.image_to_string``.

    Args:
        image: Uploaded file object for the PDF (forwarded unchanged to
            ``pdf_to_image``).
        progress: Progress tracker; the ``gr.Progress()`` default is kept in
            the signature as in the original.

    Returns:
        The OCR text of all processed images joined with newlines, or an
        empty string when there are no images.
    """
    text_res = []
    with tempfile.TemporaryDirectory() as path:
        images = pdf_to_image(image, path, progress)
        for img_path in progress.tqdm(images, desc="Running OCR"):
            with open(img_path, 'rb') as f:
                img = Image.open(f)
                # Force the pixel data to be read while the file is open.
                img.load()
            text_res.append(pytesseract.image_to_string(img))
    # Return every page's text, not just the last one processed.
    return "\n".join(text_res)
if __name__ == "__main__":
    # Run from /code; presumably so that Gradio's `flagged/` directory
    # resolves to the one prepared in the Dockerfile -- TODO confirm.
    os.chdir("/code")

    # Build the UI: one PDF upload in, one text box out.
    pdf_input = gr.File(label="PDF file")
    text_output = gr.Textbox(label="Text")
    iface = gr.Interface(
        fn=tesseract_ocr,
        inputs=[pdf_input],
        outputs=text_output,
        title="PDF to Text Converter",
        description="Converts a PDF file to text using Tesseract OCR.",
    )

    # Enable the request queue with the same concurrency setting as before.
    iface = iface.queue(concurrency_count=10)
    iface.launch()