import fitz  # PyMuPDF for PDF processing
from PIL import Image
import pytesseract
import gradio as gr

# Point pytesseract at the Tesseract executable. The OCR call in this file
# uses lang='mar', so the Marathi language data must also be installed on
# the host, e.g.: sudo apt-get install tesseract-ocr-mar
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"  # Default on Linux-based HF Spaces

def extract_images_from_pdf(pdf_path):
    """
    Render every page of a PDF to a PIL image using PyMuPDF.

    Despite the name, this does not pull embedded images out of the PDF:
    each page is rasterised in full so that OCR sees all of its content.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of RGB PIL images, one per page, in page order.
    """
    images = []
    # The context manager closes the document even when rendering a page
    # raises, so the underlying file handle is not leaked between requests.
    with fitz.open(pdf_path) as document:
        for page in document:
            # 300 DPI gives Tesseract enough resolution for small glyphs.
            # get_pixmap() defaults to an RGB pixmap without alpha, which is
            # the layout the "RGB" mode below expects.
            pix = page.get_pixmap(dpi=300)
            # frombytes copies the samples, so the image stays valid after
            # the document has been closed.
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            images.append(img)
    return images

def perform_ocr_on_images(images):
    """
    Recognise Marathi text in each image and return it as one string.

    The images are processed in order with Tesseract's 'mar' language data,
    and the per-image results are joined with newline characters. An empty
    sequence of images yields an empty string.
    """
    page_texts = [
        pytesseract.image_to_string(page_image, lang='mar')  # 'mar' selects Marathi
        for page_image in images
    ]
    return "\n".join(page_texts)

def ocr_marathi_from_pdf(pdf_file_path):
    """
    Run Marathi OCR over every page of an uploaded PDF.

    Args:
        pdf_file_path: Filesystem path of the uploaded PDF, or None / an
            empty string when no file was provided.

    Returns:
        The recognised text of all pages as a single string, or an empty
        string when no file was provided.
    """
    # Gradio invokes the callback with None when the form is submitted
    # without an upload; answer with empty text instead of handing a
    # missing path to the PDF loader.
    if not pdf_file_path:
        return ""

    images = extract_images_from_pdf(pdf_file_path)  # One rendered image per page
    return perform_ocr_on_images(images)

# Define the Gradio interface: one PDF upload in, one text box out.
interface = gr.Interface(
    fn=ocr_marathi_from_pdf,
    # type="filepath" makes Gradio hand the upload to fn as a path on disk.
    inputs=gr.File(type="filepath", label="Upload Marathi PDF"),
    outputs=gr.Textbox(label="Extracted Marathi Text"),
    title="Marathi PDF OCR",
    description="Upload a PDF containing Marathi text. The app will extract the text using OCR.",
)

# Start the web app only when this file is run as a script, not on import.
if __name__ == "__main__":
    interface.launch()