Spaces:

wwydmanski
/

tesseract-ocr

Runtime error

Witold Wydmański committed on Mar 12, 2023

Commit

9d34bba

•

0 Parent(s):

init

Files changed (3) hide show

app.py ADDED Viewed

+import gradio as gr
+import tempfile
+import pytesseract
+import fitz  # PyMuPDF, imported as fitz for backward compatibility reasons
+from PIL import Image
+def pdf_to_image(pdf_file, path, progress):
+    # Convert the PDF to a PNG image using pdf2image
+    doc = fitz.open(pdf_file.name)  # open document
+    fnames = []
+    for page in progress.tqdm(doc, desc="Converting PDF to image"):
+        pix = page.get_pixmap()
+        output = f"{path}/page.png"
+        pix.save(output)
+        fnames.append(output)
+    return fnames
+def tesseract_ocr(image, progress=gr.Progress()):
+    # Run OCR on the image using Tesseract
+    with tempfile.TemporaryDirectory() as path:
+        images = pdf_to_image(image, path, progress)
+        text_res = []
+        for img in progress.tqdm(images, desc="Running OCR"):
+            with open(img, 'rb') as f:
+                img = Image.open(f)
+                img.load()
+                text = pytesseract.image_to_string(img)
+                text_res.append(text)
+    return text
+if __name__=="__main__":
+    iface = gr.Interface(
+        fn=tesseract_ocr,
+        inputs=[gr.File(label="PDF file")],
+        outputs=gr.Textbox(label="Text"),
+        title="PDF to Text Converter",
+        description="Converts a PDF file to text using Tesseract OCR.",
+    ).queue(concurrency_count=10)
+    iface.launch()

dockerfile ADDED Viewed

+FROM python:3.10-slim
+WORKDIR /code
+# Install tesseract; drop the apt package lists afterwards to keep the image small
+RUN apt-get update && apt-get install -y tesseract-ocr \
+    && rm -rf /var/lib/apt/lists/*
+# Install python dependencies (no pip cache, it is never reused inside the image)
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the source code
+COPY app.py .
+# Make Gradio listen on all interfaces so the app is reachable from outside
+# the container, and document the port it serves on by default
+ENV GRADIO_SERVER_NAME=0.0.0.0
+EXPOSE 7860
+# Run the app
+CMD ["python", "app.py"]

requirements.txt ADDED Viewed

+pytesseract
+pymupdf
+gradio
+pillow
+tqdm