Spaces:

Vishwas1
/

PDF2Marathi

Sleeping

File size: 1,788 Bytes

5c27db2
e8ea066
0d509f3
5c27db2
 
 
7a87bbf
5c27db2
e8ea066
 
 
5c27db2
e8ea066
 
0d509f3
 
 
 
 
 
e8ea066
 
 
 
5c27db2
e8ea066
 
0d509f3
 
e8ea066
0d509f3
e8ea066
f9c5f8b
e8ea066
5c27db2
e8ea066
f9c5f8b
0d509f3
 
9453fd8
5c27db2
 
 
f9c5f8b
5c27db2
 
 
 
 
9453fd8
5c27db2
 
f9c5f8b

import fitz  # PyMuPDF for PDF processing
from PIL import Image
import pytesseract
import gradio as gr

# Tell pytesseract which Tesseract executable to invoke. The path is hard-coded
# to what the original author notes is the default location on Linux-based
# Hugging Face Spaces; adjust it if Tesseract is installed somewhere else.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
# The OCR call in this file requests lang='mar', so the Marathi language data
# must be installed alongside Tesseract, e.g. on Debian/Ubuntu:
#   sudo apt-get install tesseract-ocr-mar

def extract_images_from_pdf(pdf_path):
    """
    Render every page of a PDF to a PIL image using PyMuPDF.

    Despite the name, this does not pull embedded images out of the PDF:
    each page is rasterized as a whole so it can be handed to OCR.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list with one RGB ``PIL.Image.Image`` per page, in page order.
    """
    images = []
    # Open the document as a context manager so the underlying file handle is
    # released even if rendering a page raises; the original never closed it.
    with fitz.open(pdf_path) as document:
        for page in document:
            # 300 DPI gives OCR enough resolution; alpha=False keeps the
            # sample buffer at 3 bytes per pixel to match the "RGB" mode below.
            pix = page.get_pixmap(dpi=300, alpha=False)
            # frombytes copies the pixel data, so the images stay valid after
            # the document has been closed.
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            images.append(img)
    return images

def perform_ocr_on_images(images):
    """
    Run Marathi OCR over each image and merge the recognized text.

    Args:
        images: Iterable of images accepted by ``pytesseract.image_to_string``.

    Returns:
        The text of all images joined with a newline between consecutive
        images; an empty string when no images are given.
    """
    page_texts = [
        pytesseract.image_to_string(page_image, lang='mar')  # 'mar' selects Marathi
        for page_image in images
    ]
    return "\n".join(page_texts)

def ocr_marathi_from_pdf(pdf_file_path):
    """
    Entry point for the app: turn an uploaded PDF into Marathi text.

    The PDF at ``pdf_file_path`` is first rendered to page images, which are
    then run through OCR; the combined text is returned.
    """
    # Render the uploaded file's pages, then OCR them in a single pipeline.
    return perform_ocr_on_images(extract_images_from_pdf(pdf_file_path))

# Gradio UI: one PDF upload wired to a text box that shows the OCR output.
interface = gr.Interface(
    title="Marathi PDF OCR",
    description="Upload a PDF containing Marathi text. The app will extract the text using OCR.",
    fn=ocr_marathi_from_pdf,
    # type="filepath" so the callback receives a path it can open directly.
    inputs=gr.File(label="Upload Marathi PDF", type="filepath"),
    outputs=gr.Textbox(label="Extracted Marathi Text"),
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()