Vishwas1 committed on
Commit
e8ea066
·
verified ·
1 Parent(s): 1a5c00e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -28
app.py CHANGED
@@ -1,34 +1,73 @@
1
- import gradio as gr
 
 
2
  import pytesseract
3
- from pdf2image import convert_from_bytes
4
- import shutil
5
-
6
- def ocr_marathi_from_pdf(pdf_file):
7
- # Verify if Poppler's `pdfinfo` is installed
8
- if not shutil.which("pdfinfo"):
9
- raise EnvironmentError("Poppler's pdfinfo utility is not installed or not in PATH.")
10
-
11
- # Verify if Tesseract is installed
12
- if not shutil.which("tesseract"):
13
- raise EnvironmentError("Tesseract-OCR is not installed or not in PATH.")
14
-
15
- # Process the PDF file
16
- images = convert_from_bytes(pdf_file, poppler_path='/usr/bin')
17
- all_text = []
18
- for img in images:
19
- text = pytesseract.image_to_string(img, lang='mar')
20
- all_text.append(text)
21
- return "\n".join(all_text)
22
-
23
- iface = gr.Interface(
24
- fn=ocr_marathi_from_pdf,
25
- inputs=gr.File(label="Upload PDF", type="binary"),
26
- outputs="text",
27
- title="Marathi OCR"
28
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  if __name__ == "__main__":
31
- iface.launch()
 
 
 
 
 
 
 
 
 
32
 
33
 
34
 
 
1
+ import os
2
+ from popplerqt5 import Poppler
3
+ from PyQt5.QtCore import QByteArray
4
  import pytesseract
5
+ from PIL import Image
6
+
7
# Tesseract configuration.
# The binary location is taken from the TESSERACT_CMD environment variable
# when it is set; otherwise the bare command name "tesseract" is used so the
# executable is resolved through PATH. A hard-coded placeholder path here
# would make every OCR call fail until the file was edited by hand.
pytesseract.pytesseract.tesseract_cmd = os.environ.get("TESSERACT_CMD", "tesseract")
marathi_lang = "mar"  # Requires the Marathi ("mar") traineddata to be installed in Tesseract
10
+
11
def extract_images_from_pdf(pdf_path, dpi=300):
    """
    Render every page of a PDF to an image using python-poppler-qt5.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Resolution, applied to both axes, used when rasterising each
            page. Defaults to 300, which generally gives better OCR results
            than screen resolution.

    Returns:
        A list of rendered Qt images in page order. Pages that cannot be
        loaded or rendered are skipped.

    Raises:
        ValueError: If the document cannot be opened or is password-locked.
    """
    document = Poppler.Document.load(pdf_path)
    if not document:
        raise ValueError(f"Unable to open {pdf_path}")
    # A locked document loads successfully but exposes no pages; fail loudly
    # instead of silently returning an empty result.
    if document.isLocked():
        raise ValueError(f"Unable to open {pdf_path}: document is password-protected")

    # Smooth glyph and line edges in the rendered output.
    document.setRenderHint(Poppler.Document.Antialiasing, True)
    document.setRenderHint(Poppler.Document.TextAntialiasing, True)

    images = []
    for page_index in range(document.numPages()):
        page = document.page(page_index)
        if not page:
            continue
        image = page.renderToImage(dpi, dpi)
        # renderToImage signals failure with a null image rather than an error.
        if image.isNull():
            continue
        images.append(image)

    return images
27
+
28
def perform_ocr_on_images(images):
    """
    Run Tesseract OCR over rendered page images.

    Args:
        images: Iterable of Qt images, e.g. the list returned by
            ``extract_images_from_pdf``.

    Returns:
        A list of recognised text strings, one per input image, in order.

    Raises:
        ValueError: If an image cannot be serialised for conversion to PIL.
    """
    import io

    from PyQt5.QtCore import QBuffer, QIODevice

    ocr_results = []
    for i, image in enumerate(images):
        # Convert the Qt image to a PIL image through an in-memory PNG.
        # Pillow's ImageQt bridge (Image.fromqimage) no longer supports PyQt5
        # in recent Pillow releases, so go through encoded bytes instead.
        buffer = QBuffer()
        buffer.open(QIODevice.ReadWrite)
        try:
            if not image.save(buffer, "PNG"):
                raise ValueError(f"Unable to convert page {i + 1} to a PIL image")
            png_bytes = bytes(buffer.data())
        finally:
            buffer.close()

        pil_image = Image.open(io.BytesIO(png_bytes))
        text = pytesseract.image_to_string(pil_image, lang=marathi_lang)
        ocr_results.append(text)
        print(f"OCR for Page {i + 1}: {text}")

    return ocr_results
41
+
42
def ocr_marathi_from_pdf(pdf_path):
    """
    Extract Marathi text from a PDF.

    Renders the pages of ``pdf_path`` to images, runs OCR on each image,
    and returns the per-page text joined with newlines. Progress and the
    combined result are printed to standard output.
    """
    # Rasterise the document first.
    print("Extracting images from PDF...")
    page_images = extract_images_from_pdf(pdf_path)

    # Then recognise text page by page.
    print("Performing OCR on images...")
    page_texts = perform_ocr_on_images(page_images)

    # One newline between consecutive pages.
    combined_text = "\n".join(page_texts)
    print(f"Combined OCR Text: {combined_text}")
    return combined_text
59
 
60
if __name__ == "__main__":
    import sys

    # Take the PDF path from the first command-line argument when given;
    # otherwise fall back to the placeholder path, as before.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "path/to/marathi/pdf.pdf"
    if not os.path.exists(pdf_path):
        print(f"PDF file not found: {pdf_path}")
    else:
        print("Processing Marathi PDF...")
        ocr_text = ocr_marathi_from_pdf(pdf_path)
        # Write as UTF-8 explicitly: Devanagari text is not representable in
        # many platform default encodings.
        with open("output.txt", "w", encoding="utf-8") as f:
            f.write(ocr_text)
        print("OCR text saved to output.txt")
70
+
71
 
72
 
73