# Hugging Face Spaces page header captured with this file -- Spaces: Sleeping
import shutil

import fitz  # PyMuPDF for PDF processing
import gradio as gr
import pytesseract
from PIL import Image
# Tesseract must be installed with Marathi language data, e.g.:
#   sudo apt-get install tesseract-ocr-mar
# Prefer whichever tesseract binary is found on PATH so the app is not tied to
# one filesystem layout; fall back to the default location on Linux-based
# HF Spaces when PATH lookup finds nothing.
pytesseract.pytesseract.tesseract_cmd = shutil.which("tesseract") or "/usr/bin/tesseract"
def extract_images_from_pdf(pdf_path, dpi=300):
    """Render every page of a PDF to a PIL image using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Resolution used when rasterising each page. Defaults to 300.

    Returns:
        A list containing one "RGB" ``PIL.Image.Image`` per page, in page
        order. The list is empty when the document has no pages.
    """
    images = []
    # The context manager guarantees the document is closed, even when
    # rendering a page raises; the original left the handle open.
    with fitz.open(pdf_path) as document:
        for page in document:
            pix = page.get_pixmap(dpi=dpi)
            # Image.frombytes copies the pixel data, so the resulting image
            # does not depend on the pixmap or document staying alive.
            # NOTE(review): "RGB" assumes get_pixmap's default RGB/no-alpha
            # output -- confirm if colorspace/alpha arguments are ever added.
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
    return images
def perform_ocr_on_images(images, lang="mar"):
    """Run Tesseract OCR over a sequence of page images and join the text.

    Args:
        images: Iterable of images accepted by ``pytesseract.image_to_string``
            (the rest of this file passes PIL images).
        lang: Tesseract language code to recognise. Defaults to ``"mar"``
            (Marathi); the matching language data must be installed.

    Returns:
        The recognised text of every image, in input order, separated by a
        newline. An empty iterable yields an empty string.
    """
    return "\n".join(
        pytesseract.image_to_string(img, lang=lang) for img in images
    )
def ocr_marathi_from_pdf(pdf_file_path):
    """Extract Marathi text from a PDF by rendering its pages and running OCR.

    Args:
        pdf_file_path: Filesystem path of the uploaded PDF.

    Returns:
        The OCR text produced by ``perform_ocr_on_images`` for the page
        images returned by ``extract_images_from_pdf``.
    """
    # Pipeline: PDF path -> page images -> recognised text.
    return perform_ocr_on_images(extract_images_from_pdf(pdf_file_path))
# Gradio UI: a PDF upload wired to the OCR pipeline, with the recognised text
# shown in a textbox.
# type="filepath" is what lets ocr_marathi_from_pdf receive the upload as a path.
pdf_upload = gr.File(type="filepath", label="Upload Marathi PDF")
text_output = gr.Textbox(label="Extracted Marathi Text")

interface = gr.Interface(
    fn=ocr_marathi_from_pdf,
    inputs=pdf_upload,
    outputs=text_output,
    title="Marathi PDF OCR",
    description="Upload a PDF containing Marathi text. The app will extract the text using OCR.",
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()