File size: 1,425 Bytes
9d34bba
 
 
010c4d7
9d34bba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
010c4d7
daf6dc3
010c4d7
9d34bba
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import gradio as gr
import tempfile
import pytesseract
import os
import fitz  # PyMuPDF, imported as fitz for backward compatibility reasons
from PIL import Image

def pdf_to_image(pdf_file, path, progress):
    """Render each page of a PDF to its own PNG file using PyMuPDF.

    Args:
        pdf_file: Object whose ``name`` attribute is the filesystem path of
            the PDF to open.
        path: Existing directory in which the PNG files are written.
        progress: Object exposing ``tqdm(iterable, desc=...)``; used to
            report per-page progress.

    Returns:
        A list of PNG file paths, one per page, in page order.
    """
    fnames = []
    # Close the document deterministically once all pages are rendered.
    with fitz.open(pdf_file.name) as doc:
        pages = progress.tqdm(doc, desc="Converting PDF to image")
        for page_number, page in enumerate(pages, start=1):
            pix = page.get_pixmap()
            # Each page needs a distinct file name; a shared name would make
            # every page overwrite the previous one.
            output = f"{path}/page_{page_number}.png"
            pix.save(output)
            fnames.append(output)
    return fnames


def tesseract_ocr(image, progress=gr.Progress()):
    """Run Tesseract OCR over every page of a PDF and return the text.

    Args:
        image: The PDF to process, passed through to ``pdf_to_image``
            (which reads its ``name`` attribute).
        progress: Progress tracker; the ``gr.Progress()`` default is kept
            from the original signature.

    Returns:
        The recognised text of all pages, in page order, joined with
        newlines. An empty string is returned for a PDF with no pages.
    """
    text_res = []
    # The page images only need to exist while OCR runs, so they live in a
    # temporary directory that is removed when the block exits.
    with tempfile.TemporaryDirectory() as path:
        images = pdf_to_image(image, path, progress)
        for img_path in progress.tqdm(images, desc="Running OCR"):
            with Image.open(img_path) as img:
                text_res.append(pytesseract.image_to_string(img))
    # Return the text of every page, not just the last one processed.
    return "\n".join(text_res)


if __name__ == "__main__":
    # Run from /code; per the original note, this is so the flagged/ dir
    # is created.
    os.chdir("/code")

    # Build the input and output components first, then wire them together.
    pdf_input = gr.File(label="PDF file")
    text_output = gr.Textbox(label="Text")

    iface = gr.Interface(
        fn=tesseract_ocr,
        inputs=[pdf_input],
        outputs=text_output,
        title="PDF to Text Converter",
        description="Converts a PDF file to text using Tesseract OCR.",
    )
    # Enable the request queue before launching.
    iface = iface.queue(concurrency_count=10)

    iface.launch()