# File size: 779 Bytes
# f3320bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import gradio as gr
import pytesseract
from pdf2image import convert_from_path
import tempfile
import os

def ocr_pdf(pdf_file):
    """Extract the text of every page of a PDF using Tesseract OCR.

    Args:
        pdf_file: The PDF to process. Accepted forms:
            * ``bytes``-like data (what ``gr.File(type="binary")`` passes
              to the callback),
            * a file-like object exposing ``read()`` that returns bytes,
            * a filesystem path (``str`` or ``os.PathLike``),
            * ``None`` when nothing was uploaded.

    Returns:
        The OCR output of all pages concatenated in the order
        ``convert_from_path`` yields them, or an empty string when
        ``pdf_file`` is ``None``.
    """
    # Gradio calls the function with None when the user submits without
    # uploading; report "no text" instead of raising.
    if pdf_file is None:
        return ""

    with tempfile.TemporaryDirectory() as workdir:
        if isinstance(pdf_file, (str, os.PathLike)):
            # Already on disk: hand the path straight to pdf2image.
            pdf_path = os.fspath(pdf_file)
        else:
            if isinstance(pdf_file, (bytes, bytearray, memoryview)):
                # Raw upload content has no read() method.
                data = bytes(pdf_file)
            else:
                data = pdf_file.read()
            # convert_from_path needs a real file, so spill the bytes into
            # the temporary directory (removed when the with-block exits).
            pdf_path = os.path.join(workdir, "temp.pdf")
            with open(pdf_path, "wb") as f:
                f.write(data)

        images = convert_from_path(pdf_path)
        return "".join(pytesseract.image_to_string(image) for image in images)

# Input widget: a file upload configured with type="binary".
_pdf_upload = gr.File(label="Upload PDF", type="binary")

# Output widget: a text box that shows whatever ocr_pdf returns.
_text_output = gr.Textbox(label="Extracted Text")

# Wire the OCR function to the two widgets above.
iface = gr.Interface(
    title="PDF OCR with PyTesseract",
    description="Upload a PDF file to extract its text using PyTesseract.",
    fn=ocr_pdf,
    inputs=_pdf_upload,
    outputs=_text_output,
)

# Start the web UI only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()