Spaces:

Sakshiw1
/

OCR-Document-Search-App

Sleeping

OCR-Document-Search-App / app.py

Update app.py

214c45f verified about 1 year ago

1.56 kB

	import gradio as gr
	import cv2
	import pytesseract
	import numpy as np
	import pkg_resources

	# Function to log installed packages (for debugging purposes)
	def log_installed_packages():
	    """Return a mapping of installed distribution names to versions.

	    Intended as a debugging aid for inspecting the runtime environment.
	    The lookup uses the standard library's ``importlib.metadata`` instead
	    of the deprecated ``pkg_resources`` API, so it does not depend on
	    setuptools being importable.

	    Returns:
	        dict: Lower-cased, normalized distribution name -> version.
	    """
	    # Function-scope imports keep this block self-contained.
	    import re
	    from importlib import metadata

	    installed_packages = {}
	    for dist in metadata.distributions():
	        name = dist.metadata.get("Name")
	        if not name:
	            # Broken installs can lack a name; there is nothing to report.
	            continue
	        # Reproduce the key format the pkg_resources version produced:
	        # runs of unusual characters collapse to "-", then lower-case.
	        key = re.sub(r"[^A-Za-z0-9.]+", "-", name).lower()
	        # Keep the first distribution found for a given name.
	        installed_packages.setdefault(key, dist.version)
	    return installed_packages

	# Function to process the image
	def preprocess_image_for_tesseract(image):
	    """Convert an image array to single-channel grayscale for OCR.

	    Args:
	        image: NumPy image array. Colour input is treated as RGB (or
	            RGBA), which is the channel order Gradio's
	            ``gr.Image(type="numpy")`` component delivers.

	    Returns:
	        A 2-D grayscale array. Input that is already single-channel is
	        returned without colour conversion.
	    """
	    # Already grayscale: cvtColor only accepts 3- or 4-channel input for
	    # the *2GRAY conversions, so hand the array back untouched.
	    if image.ndim == 2:
	        return image

	    channels = image.shape[2]
	    if channels == 1:
	        # (H, W, 1) -> (H, W)
	        return image[:, :, 0]
	    if channels == 4:
	        return cv2.cvtColor(image, cv2.COLOR_RGBA2GRAY)

	    # RGB, not BGR: using the BGR constant would swap the red and blue
	    # luminance weights for arrays coming from Gradio.
	    return cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

	def ocr_and_search(image, keyword):
	    """Run OCR on an image and collect the lines that contain a keyword.

	    Args:
	        image: Image array to recognise, or ``None`` when nothing was
	            supplied.
	        keyword: Text to look for; matching is case-insensitive. An empty
	            or ``None`` keyword yields no matching lines.

	    Returns:
	        tuple: ``(extracted_text, matching_lines)``. When OCR cannot run,
	        the first element is a human-readable message and the second is
	        an empty list.
	    """
	    # Submitting without an image would otherwise crash in preprocessing.
	    if image is None:
	        return "No image provided.", []

	    processed_image = preprocess_image_for_tesseract(image)

	    # get_tesseract_version() raises when the binary is missing rather
	    # than returning a falsy value, so the failure has to be caught for
	    # the message below to be reachable.
	    try:
	        tesseract_version = pytesseract.pytesseract.get_tesseract_version()
	    except pytesseract.pytesseract.TesseractNotFoundError:
	        tesseract_version = None
	    if not tesseract_version:
	        return "Tesseract is not available in PATH.", []

	    try:
	        # Extract text in both Hindi and English
	        extracted_text = pytesseract.image_to_string(processed_image, lang='hin+eng')
	    except Exception as e:
	        # UI boundary: report the failure instead of raising.
	        return f"Error during OCR: {e}", []

	    # Lower-case the keyword once instead of once per line.
	    needle = (keyword or "").lower()
	    if not needle:
	        # An empty needle is a substring of every line; treat it as
	        # "no search requested" rather than echoing the whole text.
	        return extracted_text, []

	    search_results = [
	        line for line in extracted_text.split('\n') if needle in line.lower()
	    ]
	    return extracted_text, search_results

	# Build the web UI: an image upload and a keyword box feed
	# ocr_and_search, whose two return values are shown as text outputs.
	iface = gr.Interface(
	    ocr_and_search,
	    [gr.Image(type="numpy"), gr.Textbox(label="Keyword")],
	    ["text", "text"],
	    title="OCR and Keyword Search",
	    description="Upload an image with text and search for a keyword.",
	)

	# Debugging aid: dump the installed package versions to stdout.
	print(log_installed_packages())

	# Start serving the interface.
	# NOTE(review): share=True requests a public share link; confirm it is
	# still wanted where this app is hosted.
	iface.launch(share=True)