Spaces:

Vishwas1
/

PDF2Marathi

Sleeping

App Files Files Community

Vishwas1 committed on Dec 7, 2024

Commit

e8ea066

verified ·

1 Parent(s): 1a5c00e

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -28

app.py CHANGED Viewed

@@ -1,34 +1,73 @@
-import gradio as gr
 import pytesseract
-from pdf2image import convert_from_bytes
-import shutil
-def ocr_marathi_from_pdf(pdf_file):
-    # Verify if Poppler's `pdfinfo` is installed
-    if not shutil.which("pdfinfo"):
-        raise EnvironmentError("Poppler's pdfinfo utility is not installed or not in PATH.")
-    # Verify if Tesseract is installed
-    if not shutil.which("tesseract"):
-        raise EnvironmentError("Tesseract-OCR is not installed or not in PATH.")
-    # Process the PDF file
-    images = convert_from_bytes(pdf_file, poppler_path='/usr/bin')
-    all_text = []
-    for img in images:
-        text = pytesseract.image_to_string(img, lang='mar')
-        all_text.append(text)
-    return "\n".join(all_text)
-iface = gr.Interface(
-    fn=ocr_marathi_from_pdf,
-    inputs=gr.File(label="Upload PDF", type="binary"),
-    outputs="text",
-    title="Marathi OCR"
-)
 if __name__ == "__main__":
-    iface.launch()

+import os
+from popplerqt5 import Poppler
+from PyQt5.QtCore import QByteArray
 import pytesseract
+from PIL import Image
+# Tesseract is located via PATH by default. Set the TESSERACT_CMD environment
+# variable to point pytesseract at a specific binary instead. The override is
+# skipped when the variable is unset so that a placeholder path never replaces
+# a working default installation.
+_tesseract_cmd = os.environ.get("TESSERACT_CMD")
+if _tesseract_cmd:
+    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd
+marathi_lang = "mar"  # Tesseract language code; the Marathi traineddata must be installed
+def extract_images_from_pdf(pdf_path, dpi=300):
+    """
+    Render every page of a PDF to an image using python-poppler-qt5.
+
+    Args:
+        pdf_path: Filesystem path of the PDF to open.
+        dpi: Resolution, applied to both axes, used when rendering each page.
+            Defaults to 300, which generally gives better OCR results than
+            screen resolution.
+
+    Returns:
+        A list of rendered page images in page order. Pages that cannot be
+        loaded or rendered are skipped, with a message printed for each.
+
+    Raises:
+        ValueError: If the document cannot be opened or is password-locked.
+    """
+    document = Poppler.Document.load(pdf_path)
+    if not document:
+        raise ValueError(f"Unable to open {pdf_path}")
+    # A locked document loads successfully but its pages cannot be rendered.
+    if document.isLocked():
+        raise ValueError(f"Unable to open {pdf_path}: document is password-protected")
+    images = []
+    for i in range(document.numPages()):
+        page = document.page(i)
+        if not page:
+            print(f"Skipping page {i + 1}: page could not be loaded")
+            continue
+        image = page.renderToImage(dpi, dpi)
+        # A null image signals a failed render; passing it on would only fail later in OCR.
+        if image.isNull():
+            print(f"Skipping page {i + 1}: page could not be rendered")
+            continue
+        images.append(image)
+    return images
+def perform_ocr_on_images(images):
+    """
+    Run Tesseract OCR on each rendered page image.
+
+    Args:
+        images: Iterable of Qt images (as returned by extract_images_from_pdf),
+            one per page.
+
+    Returns:
+        A list with the recognized text of each image, in input order.
+
+    Raises:
+        ValueError: If an image cannot be serialized for conversion to PIL.
+    """
+    # Local imports keep this function self-contained; PyQt5 is already a
+    # dependency of this file.
+    import io
+    from PyQt5.QtCore import QBuffer, QIODevice
+    ocr_results = []
+    for i, image in enumerate(images):
+        # Convert the Qt image to a PIL image through an in-memory PNG.
+        # Image.fromqimage is avoided because Pillow 10 removed PyQt5 support
+        # from its ImageQt bindings.
+        buffer = QBuffer()
+        buffer.open(QIODevice.WriteOnly)
+        saved = image.save(buffer, "PNG")
+        buffer.close()
+        if not saved:
+            raise ValueError(f"Unable to convert image for page {i + 1}")
+        pil_image = Image.open(io.BytesIO(buffer.data().data()))
+        text = pytesseract.image_to_string(pil_image, lang=marathi_lang)
+        ocr_results.append(text)
+        print(f"OCR for Page {i + 1}: {text}")
+    return ocr_results
+def ocr_marathi_from_pdf(pdf_path):
+    """
+    Run the complete Marathi OCR pipeline on one PDF.
+
+    Renders the PDF's pages to images, runs OCR on each image, joins the
+    per-page text with newlines, prints the combined text and returns it.
+    Progress messages are printed before each stage.
+    """
+    # Stage 1: render the PDF pages to images.
+    print("Extracting images from PDF...")
+    page_images = extract_images_from_pdf(pdf_path)
+    # Stage 2: OCR each image and merge the per-page text.
+    print("Performing OCR on images...")
+    combined_text = "\n".join(perform_ocr_on_images(page_images))
+    print(f"Combined OCR Text: {combined_text}")
+    return combined_text
 if __name__ == "__main__":
+    # Script entry point: OCR one PDF and write the text to output.txt.
+    import sys
+    # The PDF path comes from the first command-line argument. With no
+    # argument, fall back to the placeholder path.
+    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "path/to/marathi/pdf.pdf"
+    # isfile (rather than exists) so that a directory is reported as not found
+    # instead of failing inside the PDF loader.
+    if not os.path.isfile(pdf_path):
+        print(f"PDF file not found: {pdf_path}")
+    else:
+        print("Processing Marathi PDF...")
+        ocr_text = ocr_marathi_from_pdf(pdf_path)
+        # UTF-8 is required: the OCR output is Devanagari text.
+        with open("output.txt", "w", encoding="utf-8") as f:
+            f.write(ocr_text)
+        print("OCR text saved to output.txt")