Spaces:

wwydmanski
/

tesseract-ocr

Runtime error

App Files Files Community

Witold Wydmański committed on Mar 12, 2023

Commit

02d986d

1 Parent(s): 6b6b8dc

feat: add tessdata

Browse files

Files changed (3) hide show

.gitattributes +1 -0
app.py +14 -6
tessdata/pol.traineddata +3 -0

.gitattributes CHANGED Viewed

@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tessdata/pol.traineddata filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -8,29 +8,32 @@ import logging
 logging.basicConfig(level=logging.INFO)
-def pdf_to_image(pdf_file, path, progress):
     # Convert the PDF to a PNG image using pdf2image
     doc = fitz.open(pdf_file.name)  # open document
     fnames = []
     idx = 1
-    for page in progress.tqdm(doc, desc="Converting PDF to image"):
         pix = page.get_pixmap()
         output = f"{path}/page-{idx}.png"
         pix.save(output)
         fnames.append(output)
         idx += 1
     return fnames
-def tesseract_ocr(image, progress=gr.Progress()):
     # Run OCR on the image using Tesseract
     with tempfile.TemporaryDirectory() as path:
-        images = pdf_to_image(image, path, progress)
         text_res = []
         for img in progress.tqdm(images, desc="Running OCR"):
             with open(img, 'rb') as f:
                 img = Image.open(f)
                 img.load()
-                text = pytesseract.image_to_string(img)
                 text_res.append(text)
     with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as file:
@@ -40,9 +43,14 @@ def tesseract_ocr(image, progress=gr.Progress()):
 if __name__ == "__main__":
     logging.info("Starting Tesseract OCR")
     iface = gr.Interface(
         fn=tesseract_ocr,
-        inputs=[gr.File(label="PDF file")],
         outputs=gr.File(label="Text file", type="file"),
         title="PDF to Text Converter",
         description="Converts a PDF file to text using Tesseract OCR."

 logging.basicConfig(level=logging.INFO)
+def pdf_to_image(pdf_file, path, progress, max_pages):
     # Convert the PDF to a PNG image using pdf2image
     doc = fitz.open(pdf_file.name)  # open document
     fnames = []
     idx = 1
+    total = len(doc) if max_pages == 0 else max_pages
+    for page in progress.tqdm(doc, desc="Converting PDF to image", total=total):
         pix = page.get_pixmap()
         output = f"{path}/page-{idx}.png"
         pix.save(output)
         fnames.append(output)
         idx += 1
+        if max_pages > 0 and idx > max_pages:
+            break
     return fnames
+def tesseract_ocr(image, language, max_pages, progress=gr.Progress()):
     # Run OCR on the image using Tesseract
     with tempfile.TemporaryDirectory() as path:
+        images = pdf_to_image(image, path, progress, max_pages)
         text_res = []
         for img in progress.tqdm(images, desc="Running OCR"):
             with open(img, 'rb') as f:
                 img = Image.open(f)
                 img.load()
+                text = pytesseract.image_to_string(img, lang=language)
                 text_res.append(text)
     with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as file:
 if __name__ == "__main__":
     logging.info("Starting Tesseract OCR")
+    os.environ["TESSDATA_PREFIX"] = "./tessdata"
     iface = gr.Interface(
         fn=tesseract_ocr,
+        inputs=[
+            gr.File(label="PDF file"),
+            gr.Dropdown(["eng", "pol"], label="Language", value="eng"),
+            gr.Number(label="Number of pages", value=0)
+        ],
         outputs=gr.File(label="Text file", type="file"),
         title="PDF to Text Converter",
         description="Converts a PDF file to text using Tesseract OCR."

tessdata/pol.traineddata ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:10b5a77c4e865ccaa79984879457df8aea7b6b0caabd9a5860733d485c913634
+size 25941386