import gradio as gr
import tempfile
import pytesseract
import os
import fitz  # PyMuPDF, imported as fitz for backward compatibility reasons
from PIL import Image
import logging

logging.basicConfig(level=logging.INFO)


def pdf_to_image(pdf_file, path, progress):
    """Render every page of a PDF to its own PNG file using PyMuPDF.

    Args:
        pdf_file: Object whose ``name`` attribute is the path of the PDF to open.
        path: Existing directory in which the page images are written.
        progress: Object providing a ``tqdm(iterable, desc=...)`` method used
            to report progress while iterating over the pages.

    Returns:
        List of the PNG file paths that were written, in page order.
    """
    doc = fitz.open(pdf_file.name)  # open document
    fnames = []
    try:
        pages = progress.tqdm(doc, desc="Converting PDF to image")
        for index, page in enumerate(pages):
            pix = page.get_pixmap()
            # Each page needs a distinct file name; a shared name would make
            # every page overwrite the previous one, so OCR would only ever
            # see the last page.
            output = os.path.join(path, f"page_{index:04d}.png")
            pix.save(output)
            fnames.append(output)
    finally:
        # Release the document handle even if rendering a page fails.
        doc.close()
    return fnames


def tesseract_ocr(image, progress=gr.Progress()):
    """Run Tesseract OCR over every page of a PDF and save the text to a file.

    Args:
        image: The uploaded PDF; it is passed to ``pdf_to_image``, which reads
            its ``name`` attribute as the path of the PDF.
        progress: Progress tracker used for both the conversion and OCR loops.

    Returns:
        Path of a ``.txt`` file containing the recognised text of all pages,
        joined with newlines. The file is created with ``delete=False``, so it
        is not removed automatically.
    """
    with tempfile.TemporaryDirectory() as path:
        images = pdf_to_image(image, path, progress)
        text_res = []
        for img_path in progress.tqdm(images, desc="Running OCR"):
            # The context manager closes the image file before the temporary
            # directory is cleaned up.
            with Image.open(img_path) as page_image:
                text_res.append(pytesseract.image_to_string(page_image))

    # OCR output frequently contains non-ASCII characters, so write UTF-8
    # explicitly instead of relying on the platform's default encoding.
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".txt", encoding="utf-8"
    ) as file:
        file.write("\n".join(text_res))
    return file.name


if __name__ == "__main__":
    logging.info("Starting Tesseract OCR")
    iface = gr.Interface(
        fn=tesseract_ocr,
        inputs=[gr.File(label="PDF file")],
        outputs=gr.File(label="Text file", type="file"),
        title="PDF to Text Converter",
        description="Converts a PDF file to text using Tesseract OCR.",
    ).queue(concurrency_count=10)
    iface.launch(server_port=7860, server_name="0.0.0.0")