import fitz  # PyMuPDF for PDF processing
from PIL import Image
import pytesseract
import gradio as gr

# Point pytesseract at the Tesseract binary (default location on Linux-based
# Hugging Face Spaces). Marathi OCR additionally needs the language data:
#   sudo apt-get install tesseract-ocr-mar
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"


def extract_images_from_pdf(pdf_path):
    """Render every page of a PDF to a PIL image using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of RGB ``PIL.Image.Image`` objects, one per page, in page order.
    """
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdf_path) as document:
        images = []
        for page in document:
            # Render at 300 DPI; alpha=False keeps the sample buffer at three
            # bytes per pixel so it matches the "RGB" mode used below.
            pix = page.get_pixmap(dpi=300, alpha=False)
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
    return images


def perform_ocr_on_images(images):
    """Run Marathi OCR on each image and join the per-image text.

    Args:
        images: Iterable of images accepted by ``pytesseract.image_to_string``.

    Returns:
        The recognised text of all images, joined with newlines. An empty
        iterable yields an empty string.
    """
    # 'mar' selects Tesseract's Marathi language data.
    return "\n".join(
        pytesseract.image_to_string(img, lang="mar") for img in images
    )


def ocr_marathi_from_pdf(pdf_file_path):
    """Extract Marathi text from a PDF by rendering its pages and running OCR.

    Args:
        pdf_file_path: Path of the uploaded PDF, or ``None`` when no file was
            provided (e.g. the form was submitted without an upload).

    Returns:
        The OCR text of all pages joined with newlines; an empty string when
        no file was provided.
    """
    if pdf_file_path is None:
        # Nothing to process; avoid handing None to PyMuPDF.
        return ""
    images = extract_images_from_pdf(pdf_file_path)
    return perform_ocr_on_images(images)


# Gradio UI: a single file upload in, the extracted text out.
interface = gr.Interface(
    fn=ocr_marathi_from_pdf,
    # type="filepath" makes Gradio pass the upload to fn as a path string.
    inputs=gr.File(type="filepath", label="Upload Marathi PDF"),
    outputs=gr.Textbox(label="Extracted Marathi Text"),
    title="Marathi PDF OCR",
    description="Upload a PDF containing Marathi text. The app will extract the text using OCR.",
)

if __name__ == "__main__":
    interface.launch()