Spaces:

ayanika02
/

OCR-IITRoorkie

Sleeping

File size: 2,678 Bytes

#!/usr/bin/env python
# coding: utf-8

# In[ ]:


# Dependencies for this notebook export.
#
# A bare `pip install ...` line is only understood by IPython/Jupyter; in a
# plain .py file it is a SyntaxError that prevents the whole script from being
# parsed. The commands are therefore kept as comments -- run them from a shell
# (or as `%pip install ...` inside a notebook cell) before starting the app.
#
#   pip install tesseract pytesseract pillow
#   pip install transformers
#   pip install torch
#   pip install gradio
#   pip install streamlit
#   pip install pillow


# In[10]:


import os

import pytesseract

# Use the default Windows install location only when that executable is
# actually present. On any other machine pytesseract's own setting is left
# alone, so the Tesseract binary is not forced to a path that cannot exist.
_WINDOWS_TESSERACT_EXE = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if os.path.isfile(_WINDOWS_TESSERACT_EXE):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_EXE


# In[11]:


import gradio as gr
import pytesseract
from PIL import Image
import re
import os
import sys

# Tesseract location: use the default Windows install path when that
# executable exists, otherwise fall back to the bare command name so the
# binary is resolved from PATH (e.g. on Linux/macOS hosts).
_WINDOWS_TESSERACT_EXE = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
pytesseract.pytesseract.tesseract_cmd = (
    _WINDOWS_TESSERACT_EXE
    if os.path.isfile(_WINDOWS_TESSERACT_EXE)
    else 'tesseract'
)

def ocr_image(image):
    """Run Tesseract OCR on an image and return the recognised text.

    The image is converted to RGB and handed to
    ``pytesseract.image_to_string`` with the ``eng+hin`` language setting.

    Any exception raised along the way is caught and reported in-band as a
    string of the form ``"OCR Error: <message>"`` rather than propagated.
    """
    try:
        rgb_image = image.convert('RGB')
        return pytesseract.image_to_string(rgb_image, lang='eng+hin')
    except Exception as exc:
        message = str(exc)
        return f"OCR Error: {message}"

def _highlight_matches(pattern, snippet):
    """Return *snippet* as HTML with every match of *pattern* in ``<mark>``.

    Text outside and inside the matches is HTML-escaped, and each match keeps
    its own original casing.
    """
    import html  # local import so this block does not depend on file-level imports

    pieces = []
    cursor = 0
    for hit in pattern.finditer(snippet):
        pieces.append(html.escape(snippet[cursor:hit.start()], quote=False))
        pieces.append(f"<mark>{html.escape(hit.group(), quote=False)}</mark>")
        cursor = hit.end()
    pieces.append(html.escape(snippet[cursor:], quote=False))
    return "".join(pieces)


def search_text(text, keyword):
    """Find *keyword* in *text* (case-insensitively) and return HTML snippets.

    Args:
        text: The text to search (here, the OCR output).
        keyword: Literal string to look for; it is regex-escaped, so special
            characters are matched verbatim.

    Returns:
        A prompt message if *keyword* is empty, ``"No matches found."`` if
        there is no hit, otherwise one ``...snippet...`` per match joined by
        ``<br><br>``. Each snippet spans up to 20 characters either side of
        the match, with every occurrence inside it wrapped in ``<mark>``.
    """
    if not keyword:
        return "Please enter a keyword to search."

    pattern = re.compile(re.escape(keyword), re.IGNORECASE)

    results = []
    for match in pattern.finditer(text):
        start = max(0, match.start() - 20)
        end = min(len(text), match.end() + 20)
        # Highlight piece by piece instead of pattern.sub(<matched text>, ...):
        # a string replacement is parsed as a template (backslashes in the
        # match would raise or be rewritten) and would overwrite the casing
        # of neighbouring occurrences. The text is also escaped because it
        # is rendered as HTML.
        highlighted = _highlight_matches(pattern, text[start:end])
        results.append(f"...{highlighted}...")

    if results:
        return "<br><br>".join(results)
    return "No matches found."

def process_image(image, keyword):
    """Run OCR on *image* and search the extracted text for *keyword*.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
        keyword: Search term passed through to ``search_text``.

    Returns:
        A ``(extracted_text, search_results)`` tuple. When no image is given
        the first element is a prompt to upload one and the second is empty.
        When OCR failed, the first element is the error message and the
        second is empty.
    """
    if image is None:
        return "Please upload an image.", ""

    extracted_text = ocr_image(image)

    # ocr_image reports failures as a string that begins with "OCR Error:".
    # Test for that prefix rather than the phrase anywhere in the text, so a
    # document that merely mentions "OCR Error" is still searched.
    if extracted_text.startswith("OCR Error:"):
        return extracted_text, ""

    return extracted_text, search_text(extracted_text, keyword)

# Debug information
print(f"Python version: {sys.version}")
try:
    print(f"Tesseract version: {pytesseract.get_tesseract_version()}")
except pytesseract.TesseractNotFoundError as exc:
    # A missing Tesseract binary must not stop the UI from starting:
    # ocr_image already reports OCR failures to the user as "OCR Error: ...".
    print(f"Tesseract version: unavailable ({exc})")
print(f"Tesseract path: {pytesseract.pytesseract.tesseract_cmd}")

# Create the Gradio interface: an image plus a keyword go in, the extracted
# text and the HTML-formatted search results come out of process_image.
iface = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Search Keyword")
    ],
    outputs=[
        gr.Textbox(label="Extracted Text"),
        gr.HTML(label="Search Results")
    ],
    title="OCR and Keyword Search",
    # OCR runs with lang='eng+hin', so the description mentions both languages.
    description="Upload an image with English or Hindi text, and optionally provide a keyword to search within the extracted text."
)

# Launch the app
iface.launch(share=True)