Spaces:
Sleeping
Sleeping
File size: 1,788 Bytes
5c27db2 e8ea066 0d509f3 5c27db2 7a87bbf 5c27db2 e8ea066 5c27db2 e8ea066 0d509f3 e8ea066 5c27db2 e8ea066 0d509f3 e8ea066 0d509f3 e8ea066 f9c5f8b e8ea066 5c27db2 e8ea066 f9c5f8b 0d509f3 9453fd8 5c27db2 f9c5f8b 5c27db2 9453fd8 5c27db2 f9c5f8b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import fitz # PyMuPDF for PDF processing
from PIL import Image
import pytesseract
import gradio as gr
# Point pytesseract at the Tesseract executable. This assignment only selects
# which binary is invoked; it does not install or verify any language data.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract" # Default on Linux-based HF Spaces
# The OCR call in this file requests lang='mar', so the Marathi language data
# must be installed separately, e.g.: sudo apt-get install tesseract-ocr-mar
def extract_images_from_pdf(pdf_path):
    """
    Render every page of a PDF to a PIL image using PyMuPDF.

    Each page is rasterized at 300 DPI and converted to an RGB image.
    The document is closed before the function returns, including when
    rendering a page raises.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list of RGB ``PIL.Image`` objects, one per page, in page order.
    """
    images = []
    # The context manager releases the underlying file handle on exit,
    # which the previous open-without-close version never did.
    with fitz.open(pdf_path) as document:
        for page in document:
            # 300 DPI gives Tesseract enough resolution for small glyphs.
            pix = page.get_pixmap(dpi=300)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            images.append(img)
    return images
def perform_ocr_on_images(images, *, lang="mar"):
    """
    Run Tesseract OCR over a sequence of page images and join the text.

    Args:
        images: Iterable of images accepted by
            ``pytesseract.image_to_string`` (here, PIL images of PDF pages).
        lang: Tesseract language code(s) passed through as ``lang``.
            Defaults to ``"mar"`` (Marathi), matching the previous
            hard-coded behavior. Tesseract also accepts several codes
            joined with ``+`` (e.g. ``"mar+eng"``), provided the
            corresponding language data is installed.

    Returns:
        The recognized text of all images, in input order, separated by
        a newline. An empty input yields an empty string.
    """
    return "\n".join(
        pytesseract.image_to_string(img, lang=lang) for img in images
    )
def ocr_marathi_from_pdf(pdf_file_path):
    """
    Extract Marathi text from a PDF: render its pages, then OCR them.

    Args:
        pdf_file_path: Path of the uploaded PDF file.

    Returns:
        The OCR text produced by ``perform_ocr_on_images`` for the page
        images produced by ``extract_images_from_pdf``.
    """
    # Step 1 rasterizes the pages; step 2 feeds them to Tesseract.
    page_images = extract_images_from_pdf(pdf_file_path)
    return perform_ocr_on_images(page_images)
# Gradio UI: a single PDF upload wired to the OCR pipeline, with the
# recognized text shown in a textbox.
interface = gr.Interface(
    title="Marathi PDF OCR",
    description="Upload a PDF containing Marathi text. The app will extract the text using OCR.",
    fn=ocr_marathi_from_pdf,
    # type="filepath" so the callback receives a path it can open directly.
    inputs=gr.File(label="Upload Marathi PDF", type="filepath"),
    outputs=gr.Textbox(label="Extracted Marathi Text"),
)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    interface.launch()
|