Spaces:
Runtime error
Runtime error
import gradio as gr | |
import tempfile | |
import pytesseract | |
import os | |
import fitz # PyMuPDF, imported as fitz for backward compatibility reasons | |
from PIL import Image | |
def pdf_to_image(pdf_file, path, progress):
    """Render each page of a PDF to its own PNG file using PyMuPDF.

    Args:
        pdf_file: Uploaded file object; only its ``name`` attribute (the
            path of the PDF on disk) is read.
        path: Directory the PNG files are written into. It must already
            exist.
        progress: Object exposing ``tqdm(iterable, desc=...)``, used to
            report conversion progress.

    Returns:
        A list of PNG file paths, one per page, in page order.
    """
    fnames = []
    # The context manager releases the document handle even if rendering
    # or saving a page raises.
    with fitz.open(pdf_file.name) as doc:
        pages = progress.tqdm(doc, desc="Converting PDF to image")
        for index, page in enumerate(pages):
            pix = page.get_pixmap()
            # Each page needs a distinct filename; a shared name would make
            # every page overwrite the previous one.
            output = f"{path}/page_{index}.png"
            pix.save(output)
            fnames.append(output)
    return fnames
def tesseract_ocr(image, progress=gr.Progress()):
    """Run Tesseract OCR over every page of an uploaded PDF.

    Args:
        image: Uploaded PDF file object (despite the parameter name); it is
            passed straight to ``pdf_to_image``.
        progress: Progress reporter exposing ``tqdm(iterable, desc=...)``.

    Returns:
        The recognized text of all pages, in page order, joined with
        newlines. An empty string is returned when there are no pages.
    """
    text_res = []
    # The page images live in a temporary directory that is removed on exit,
    # so all OCR work has to finish inside this ``with`` block.
    with tempfile.TemporaryDirectory() as path:
        images = pdf_to_image(image, path, progress)
        for img_path in progress.tqdm(images, desc="Running OCR"):
            # Context manager closes the image file once OCR is done with it.
            with Image.open(img_path) as img:
                text_res.append(pytesseract.image_to_string(img))
    # Return every page's text, not only the last page processed.
    return "\n".join(text_res)
if __name__ == "__main__":
    # Switch the working directory before building the app; per the original
    # note this is so the flagged/ directory gets created.
    # NOTE(review): "/code" is hard-coded and presumably the container's
    # application directory -- confirm it exists in the deployment.
    os.chdir("/code")

    pdf_input = gr.File(label="PDF file")
    text_output = gr.Textbox(label="Text")
    interface = gr.Interface(
        fn=tesseract_ocr,
        inputs=[pdf_input],
        outputs=text_output,
        title="PDF to Text Converter",
        description="Converts a PDF file to text using Tesseract OCR.",
    )
    # NOTE(review): verify that the installed Gradio version still accepts
    # the ``concurrency_count`` argument of ``queue()``.
    iface = interface.queue(concurrency_count=10)
    iface.launch()