Spaces:
Runtime error
Runtime error
Witold Wydmański
committed on
Commit
•
02d986d
1
Parent(s):
6b6b8dc
feat: add tessdata
Browse files- .gitattributes +1 -0
- app.py +14 -6
- tessdata/pol.traineddata +3 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
tessdata/pol.traineddata filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -8,29 +8,32 @@ import logging
|
|
8 |
|
9 |
logging.basicConfig(level=logging.INFO)
|
10 |
|
11 |
-
def pdf_to_image(pdf_file, path, progress):
|
12 |
# Convert the PDF to a PNG image using pdf2image
|
13 |
doc = fitz.open(pdf_file.name) # open document
|
14 |
fnames = []
|
15 |
idx = 1
|
16 |
-
|
|
|
17 |
pix = page.get_pixmap()
|
18 |
output = f"{path}/page-{idx}.png"
|
19 |
pix.save(output)
|
20 |
fnames.append(output)
|
21 |
idx += 1
|
|
|
|
|
22 |
return fnames
|
23 |
|
24 |
-
def tesseract_ocr(image, progress=gr.Progress()):
|
25 |
# Run OCR on the image using Tesseract
|
26 |
with tempfile.TemporaryDirectory() as path:
|
27 |
-
images = pdf_to_image(image, path, progress)
|
28 |
text_res = []
|
29 |
for img in progress.tqdm(images, desc="Running OCR"):
|
30 |
with open(img, 'rb') as f:
|
31 |
img = Image.open(f)
|
32 |
img.load()
|
33 |
-
text = pytesseract.image_to_string(img)
|
34 |
text_res.append(text)
|
35 |
|
36 |
with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as file:
|
@@ -40,9 +43,14 @@ def tesseract_ocr(image, progress=gr.Progress()):
|
|
40 |
|
41 |
if __name__ == "__main__":
|
42 |
logging.info("Starting Tesseract OCR")
|
|
|
43 |
iface = gr.Interface(
|
44 |
fn=tesseract_ocr,
|
45 |
-
inputs=[
|
|
|
|
|
|
|
|
|
46 |
outputs=gr.File(label="Text file", type="file"),
|
47 |
title="PDF to Text Converter",
|
48 |
description="Converts a PDF file to text using Tesseract OCR."
|
|
|
8 |
|
9 |
logging.basicConfig(level=logging.INFO)
|
10 |
|
11 |
+
def pdf_to_image(pdf_file, path, progress, max_pages):
|
12 |
# Convert the PDF to a PNG image using pdf2image
|
13 |
doc = fitz.open(pdf_file.name) # open document
|
14 |
fnames = []
|
15 |
idx = 1
|
16 |
+
total = len(doc) if max_pages == 0 else max_pages
|
17 |
+
for page in progress.tqdm(doc, desc="Converting PDF to image", total=total):
|
18 |
pix = page.get_pixmap()
|
19 |
output = f"{path}/page-{idx}.png"
|
20 |
pix.save(output)
|
21 |
fnames.append(output)
|
22 |
idx += 1
|
23 |
+
if max_pages > 0 and idx > max_pages:
|
24 |
+
break
|
25 |
return fnames
|
26 |
|
27 |
+
def tesseract_ocr(image, language, max_pages, progress=gr.Progress()):
|
28 |
# Run OCR on the image using Tesseract
|
29 |
with tempfile.TemporaryDirectory() as path:
|
30 |
+
images = pdf_to_image(image, path, progress, max_pages)
|
31 |
text_res = []
|
32 |
for img in progress.tqdm(images, desc="Running OCR"):
|
33 |
with open(img, 'rb') as f:
|
34 |
img = Image.open(f)
|
35 |
img.load()
|
36 |
+
text = pytesseract.image_to_string(img, lang=language)
|
37 |
text_res.append(text)
|
38 |
|
39 |
with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as file:
|
|
|
43 |
|
44 |
if __name__ == "__main__":
|
45 |
logging.info("Starting Tesseract OCR")
|
46 |
+
os.environ["TESSDATA_PREFIX"] = "./tessdata"
|
47 |
iface = gr.Interface(
|
48 |
fn=tesseract_ocr,
|
49 |
+
inputs=[
|
50 |
+
gr.File(label="PDF file"),
|
51 |
+
gr.Dropdown(["eng", "pol"], label="Language", value="eng"),
|
52 |
+
gr.Number(label="Number of pages", value=0)
|
53 |
+
],
|
54 |
outputs=gr.File(label="Text file", type="file"),
|
55 |
title="PDF to Text Converter",
|
56 |
description="Converts a PDF file to text using Tesseract OCR."
|
tessdata/pol.traineddata
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:10b5a77c4e865ccaa79984879457df8aea7b6b0caabd9a5860733d485c913634
|
3 |
+
size 25941386
|