Spaces:

Vishwas1
/

PDF2Marathi

Sleeping

App Files Files Community

PDF2Marathi / app.py

Vishwas1

Update app.py

7a87bbf verified 7 months ago

raw

history blame contribute delete

1.79 kB

	import fitz # PyMuPDF for PDF processing
	from PIL import Image
	import pytesseract
	import gradio as gr

	# Point pytesseract at the Tesseract executable. This only selects the binary;
	# the OCR language itself is chosen per call via lang='mar'.
	pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract" # Default on Linux-based HF Spaces
	# Marathi OCR also requires the Marathi language data to be installed,
	# e.g.: sudo apt-get install tesseract-ocr-mar

	def extract_images_from_pdf(pdf_path):
	    """
	    Render every page of a PDF to a PIL image using PyMuPDF.

	    Args:
	        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

	    Returns:
	        A list with one RGB ``PIL.Image.Image`` per page, in page order.
	        The list is empty when the document has no pages.
	    """
	    images = []
	    # Open the document in a ``with`` block so the underlying file handle is
	    # released even if rendering a page raises; previously it was never closed.
	    with fitz.open(pdf_path) as document:
	        for page in document:
	            # 300 DPI gives the OCR engine enough resolution for small glyphs.
	            pix = page.get_pixmap(dpi=300)
	            # frombytes copies the pixel buffer, so the image stays valid
	            # after the document is closed.
	            images.append(
	                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
	            )
	    return images

	def perform_ocr_on_images(images):
	    """
	    Run Marathi OCR over each image and merge the recognised text.

	    Each item of ``images`` is passed to ``pytesseract.image_to_string`` with
	    ``lang='mar'``; the per-image strings are joined with a newline, keeping
	    the input order. An empty input yields an empty string.
	    """
	    page_texts = (
	        pytesseract.image_to_string(page_image, lang='mar')
	        for page_image in images
	    )
	    return "\n".join(page_texts)

	def ocr_marathi_from_pdf(pdf_file_path):
	    """
	    Entry point for the UI: turn an uploaded PDF into Marathi text.

	    The PDF at ``pdf_file_path`` is rendered to page images with
	    ``extract_images_from_pdf`` and those images are then fed to
	    ``perform_ocr_on_images``, whose combined text is returned.
	    """
	    return perform_ocr_on_images(extract_images_from_pdf(pdf_file_path))

	# Gradio UI: a single file upload wired to the OCR pipeline, with the
	# recognised text shown in a textbox.
	interface = gr.Interface(
	    title="Marathi PDF OCR",
	    description="Upload a PDF containing Marathi text. The app will extract the text using OCR.",
	    fn=ocr_marathi_from_pdf,
	    # type="filepath" makes Gradio hand the callback a path string.
	    inputs=gr.File(type="filepath", label="Upload Marathi PDF"),
	    outputs=gr.Textbox(label="Extracted Marathi Text"),
	)

	# Start the web app only when this file is executed as a script.
	if __name__ == "__main__":
	    interface.launch()