# pdfextract / app.py
# GAS17's picture
# Update app.py
# f3320bb verified
# raw
# history blame contribute delete
# 779 Bytes
import gradio as gr
import pytesseract
from pdf2image import convert_from_path
import tempfile
import os
def ocr_pdf(pdf_file):
    """Run OCR over every page of an uploaded PDF and return the text.

    The Gradio input below is declared with ``type="binary"``, which hands
    this function the raw ``bytes`` of the upload rather than a file object,
    so calling ``.read()`` on it unconditionally fails.  To stay compatible
    with other ``gr.File`` modes, file-like objects and filesystem paths are
    accepted as well.

    Args:
        pdf_file: The uploaded PDF as ``bytes``/``bytearray``, an object with
            a ``read()`` method, a path to a PDF on disk, or ``None`` when no
            file was provided.

    Returns:
        The OCR text of all pages concatenated in page order, or an empty
        string when there is no input or the upload is empty.
    """
    # Gradio invokes the handler with None when the file input is cleared.
    if pdf_file is None:
        return ""

    if isinstance(pdf_file, (bytes, bytearray)):
        pdf_bytes = bytes(pdf_file)
    elif hasattr(pdf_file, "read"):
        pdf_bytes = pdf_file.read()
    else:
        # Anything else is treated as a path (str / os.PathLike).
        with open(pdf_file, "rb") as source:
            pdf_bytes = source.read()

    # An empty upload has no pages to rasterize; skip the PDF toolchain.
    if not pdf_bytes:
        return ""

    with tempfile.TemporaryDirectory() as workdir:
        # convert_from_path needs a real file on disk, so spill the bytes
        # into a scratch directory that is removed when the block exits.
        pdf_path = os.path.join(workdir, "temp.pdf")
        with open(pdf_path, "wb") as f:
            f.write(pdf_bytes)

        images = convert_from_path(pdf_path)
        return "".join(pytesseract.image_to_string(image) for image in images)
# Input widget: a single file upload whose contents are passed to the
# handler in "binary" mode.
_pdf_input = gr.File(label="Upload PDF", type="binary")

# Output widget: a text box that displays whatever the handler returns.
_text_output = gr.Textbox(label="Extracted Text")

# Wire the upload, the OCR handler, and the text box into one Gradio app.
iface = gr.Interface(
    title="PDF OCR with PyTesseract",
    description="Upload a PDF file to extract its text using PyTesseract.",
    fn=ocr_pdf,
    inputs=_pdf_input,
    outputs=_text_output,
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()