# PDF2Marathi / app.py
# Page header captured together with the source (not part of the program):
#   Vishwas1's picture
#   Update app.py
#   7a87bbf verified
import fitz # PyMuPDF for PDF processing
from PIL import Image
import pytesseract
import gradio as gr
# Point pytesseract at the Tesseract executable. The OCR calls in this file
# request lang='mar', so the Marathi language data must be installed as well:
#   sudo apt-get install tesseract-ocr-mar
# "/usr/bin/tesseract" is the default location on Linux-based HF Spaces.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
def extract_images_from_pdf(pdf_path):
    """
    Render every page of a PDF file to a PIL image using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        A list holding one ``PIL.Image.Image`` (mode "RGB") per page, in
        page order. The list is empty when the document has no pages.
    """
    images = []
    # Open the document as a context manager so it is closed even when
    # rendering a page raises; previously the handle was never released.
    with fitz.open(pdf_path) as document:
        for page in document:
            pix = page.get_pixmap(dpi=300)  # Render page to an image with 300 DPI
            # The pixmap's raw sample bytes are interpreted as packed RGB;
            # this relies on get_pixmap producing RGB without an alpha
            # channel (NOTE(review): PyMuPDF default - confirm if the
            # get_pixmap arguments are ever changed).
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            images.append(img)
    return images
def perform_ocr_on_images(images):
    """
    Run Marathi OCR over each image and merge the recognised text.

    Every item of ``images`` is handed to ``pytesseract.image_to_string``
    with the language set to 'mar'. The per-image results are concatenated
    in input order, separated by a single newline, and returned as one
    string (an empty string when ``images`` yields nothing).
    """
    page_texts = (
        pytesseract.image_to_string(page_image, lang="mar")  # 'mar' = Marathi
        for page_image in images
    )
    return "\n".join(page_texts)
def ocr_marathi_from_pdf(pdf_file_path):
    """
    Extract Marathi text from a PDF file via OCR.

    The file at ``pdf_file_path`` is converted to page images with
    ``extract_images_from_pdf``; those images are then passed to
    ``perform_ocr_on_images``, whose string result is returned unchanged.
    """
    # pdf_file_path is the file path supplied by the upload component.
    return perform_ocr_on_images(extract_images_from_pdf(pdf_file_path))
# Gradio UI: a file-upload input (delivered to the callback as a file path)
# wired to ocr_marathi_from_pdf, with the OCR result shown in a text box.
interface = gr.Interface(
    title="Marathi PDF OCR",
    description="Upload a PDF containing Marathi text. The app will extract the text using OCR.",
    fn=ocr_marathi_from_pdf,
    inputs=gr.File(label="Upload Marathi PDF", type="filepath"),
    outputs=gr.Textbox(label="Extracted Marathi Text"),
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()