# Spaces: GAS17 / pdfextract
# Runtime error
# File size: 779 Bytes
# f3320bb

import gradio as gr
import pytesseract
from pdf2image import convert_from_path
import tempfile
import os

def ocr_pdf(pdf_file):
    """Extract the text of every page of a PDF via Tesseract OCR.

    Args:
        pdf_file: The uploaded PDF. Accepted forms:
            * ``bytes`` / ``bytearray`` / ``memoryview`` holding the raw PDF
              (what ``gr.File(type="binary")`` hands to the callback),
            * a filesystem path (``str`` or ``os.PathLike``),
            * any object exposing ``read()`` that returns the PDF bytes,
            * ``None`` when nothing was uploaded.

    Returns:
        The OCR text of all pages concatenated in page order, or an empty
        string when ``pdf_file`` is ``None``.

    Errors raised while rendering the PDF or running OCR are not caught here
    and propagate to the caller.
    """
    # Submitting the form without a file passes None; there is nothing to OCR.
    if pdf_file is None:
        return ""

    # Normalise the different upload representations to raw bytes. Calling
    # .read() unconditionally fails on the bytes that type="binary" delivers.
    if isinstance(pdf_file, (bytes, bytearray, memoryview)):
        pdf_bytes = bytes(pdf_file)
    elif isinstance(pdf_file, (str, os.PathLike)):
        with open(pdf_file, "rb") as src:
            pdf_bytes = src.read()
    else:
        pdf_bytes = pdf_file.read()

    with tempfile.TemporaryDirectory() as path:
        # convert_from_path needs a file on disk, so spill the bytes into a
        # scratch directory that is removed when the block exits.
        pdf_path = os.path.join(path, "temp.pdf")
        with open(pdf_path, "wb") as f:
            f.write(pdf_bytes)

        images = convert_from_path(pdf_path)
        # Join once instead of growing a string with += per page.
        return "".join(pytesseract.image_to_string(image) for image in images)

# UI components: one PDF upload control and one text box for the OCR result.
_pdf_upload = gr.File(label="Upload PDF", type="binary")
_ocr_result = gr.Textbox(label="Extracted Text")

# Wire the components to ocr_pdf in a single-function Gradio interface.
iface = gr.Interface(
    fn=ocr_pdf,
    inputs=_pdf_upload,
    outputs=_ocr_result,
    title="PDF OCR with PyTesseract",
    description="Upload a PDF file to extract its text using PyTesseract.",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()