Witold Wydmański committed on
Commit
9d34bba
0 Parent(s):
Files changed (3) hide show
  1. app.py +42 -0
  2. dockerfile +15 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import tempfile
3
+ import pytesseract
4
+
5
+ import fitz # PyMuPDF, imported as fitz for backward compatibility reasons
6
+ from PIL import Image
7
+
8
def pdf_to_image(pdf_file, path, progress):
    """Render every page of a PDF to its own PNG file using PyMuPDF.

    Args:
        pdf_file: Filesystem path of the PDF, or an object exposing that
            path through a ``name`` attribute.
        path: Existing directory the PNG files are written into.
        progress: Object providing ``tqdm(iterable, desc=...)``, used to
            report per-page progress.

    Returns:
        The paths of the written PNG files, one per page, in page order.
    """
    # Accept both a plain path string and a file wrapper carrying ``.name``.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    fnames = []
    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_path) as doc:
        pages = progress.tqdm(doc, desc="Converting PDF to image")
        for index, page in enumerate(pages):
            pix = page.get_pixmap()
            # One file per page: a shared name would make every page
            # overwrite the previous one.
            output = f"{path}/page-{index}.png"
            pix.save(output)
            fnames.append(output)
    return fnames


def tesseract_ocr(image, progress=gr.Progress()):
    """Run Tesseract OCR over all pages of a PDF and return the text.

    Args:
        image: The PDF to process, in any form ``pdf_to_image`` accepts
            (a path, or an object exposing the path as ``.name``).
        progress: Progress tracker; the ``gr.Progress()`` default is
            supplied so the caller does not have to pass one.

    Returns:
        The recognized text of every page, in page order, joined with
        newlines. An empty string if the PDF has no pages.
    """
    # Page images only live for the duration of this call.
    with tempfile.TemporaryDirectory() as path:
        images = pdf_to_image(image, path, progress)
        text_res = []
        for img_path in progress.tqdm(images, desc="Running OCR"):
            with Image.open(img_path) as img:
                text_res.append(pytesseract.image_to_string(img))
    # Return the text of all pages, not just the last one processed.
    return "\n".join(text_res)
31
+
32
+
33
if __name__ == "__main__":
    # Single-function UI: one PDF upload in, one textbox of OCR output out.
    iface = gr.Interface(
        fn=tesseract_ocr,
        inputs=[gr.File(label="PDF file")],
        outputs=gr.Textbox(label="Text"),
        title="PDF to Text Converter",
        description="Converts a PDF file to text using Tesseract OCR.",
    )
    # Enable the request queue before starting the server.
    # NOTE(review): `concurrency_count` is tied to the installed Gradio
    # version, and requirements.txt does not pin one - confirm.
    iface.queue(concurrency_count=10)
    iface.launch()
dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim
WORKDIR /code

# Install tesseract. The package lists are removed in the same layer so
# they do not end up in the final image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends tesseract-ocr \
    && rm -rf /var/lib/apt/lists/*

# Install python dependencies (no pip cache, to keep the layer small)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the source code
COPY app.py .

# Gradio binds to 127.0.0.1 by default, which is unreachable from outside
# the container; listen on all interfaces and document the default port.
ENV GRADIO_SERVER_NAME=0.0.0.0
EXPOSE 7860

# Run the app
CMD ["python", "app.py"]
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ pytesseract
2
+ pymupdf
3
+ gradio
4
+ pillow
5
+ tqdm