Spaces:
Sleeping
Sleeping
File size: 2,678 Bytes
62eafad e6f7108 62eafad e6f7108 62eafad |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
#!/usr/bin/env python
# coding: utf-8
# NOTE: The notebook's install cells are kept below as comments. A bare
# `pip install ...` line is a shell command, not Python, and raises a
# SyntaxError when this file is executed as a script. Run these commands in a
# shell (or list the packages in a requirements file) before starting the app.
# In[ ]:
# pip install tesseract pytesseract pillow
# In[ ]:
# pip install transformers
# In[ ]:
# pip install torch
# In[ ]:
# pip install gradio
# In[ ]:
# pip install streamlit
# In[ ]:
# pip install pillow
# In[10]:
import os
import re
import sys

import gradio as gr
import pytesseract
from PIL import Image

# In[11]:
# Default location of the Tesseract binary on a standard Windows install.
WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Override pytesseract's command only when that binary actually exists.
# Otherwise the existing tesseract_cmd setting is left untouched, so the
# script no longer points at a path that is missing on the current machine.
if os.path.isfile(WINDOWS_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = WINDOWS_TESSERACT_CMD
def ocr_image(image):
    """Run Tesseract OCR (English + Hindi) on an image and return its text.

    Args:
        image: Object exposing ``convert(mode)``; the Gradio input in this
            file is configured with ``type="pil"``.

    Returns:
        The recognised text, or a string of the form
        ``"OCR Error: <message>"`` if any step raises.
    """
    try:
        # Normalise to RGB before handing the image to Tesseract.
        rgb_image = image.convert('RGB')
        return pytesseract.image_to_string(rgb_image, lang='eng+hin')
    except Exception as exc:
        # Failures are reported in-band as text rather than raised.
        return "OCR Error: " + str(exc)
def search_text(text, keyword, context_chars=20):
    """Find case-insensitive occurrences of *keyword* in *text* as HTML.

    Each occurrence is returned with up to ``context_chars`` characters of
    surrounding text on either side. Every occurrence of the keyword inside a
    snippet is wrapped in ``<mark>`` tags and keeps its own original casing.
    All text taken from *text* is HTML-escaped, so it cannot inject markup
    into the rendered result.

    Args:
        text: The text to search. ``None`` or an empty string yields no matches.
        keyword: The literal string to look for (not a regular expression).
        context_chars: Number of characters of context kept on each side of a
            match. Defaults to 20.

    Returns:
        An HTML string with one ``...snippet...`` entry per match, joined by
        ``<br><br>``; ``"No matches found."`` when nothing matches; or
        ``"Please enter a keyword to search."`` when *keyword* is empty.
    """
    # Local import keeps this function self-contained; `html` is stdlib.
    import html

    if not keyword:
        return "Please enter a keyword to search."
    if not text:
        return "No matches found."

    # re.escape makes the keyword match literally, whatever it contains.
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)

    def _highlight(snippet):
        """Escape *snippet* for HTML and wrap each keyword hit in <mark>."""
        parts = []
        cursor = 0
        for hit in pattern.finditer(snippet):
            parts.append(html.escape(snippet[cursor:hit.start()], quote=False))
            # Build the markup by hand instead of pattern.sub(): a replacement
            # template would interpret backslashes in the matched text and
            # would force one casing onto every hit in the snippet.
            parts.append(
                "<mark>" + html.escape(hit.group(), quote=False) + "</mark>"
            )
            cursor = hit.end()
        parts.append(html.escape(snippet[cursor:], quote=False))
        return "".join(parts)

    results = []
    for match in pattern.finditer(text):
        start = max(0, match.start() - context_chars)
        end = min(len(text), match.end() + context_chars)
        results.append(f"...{_highlight(text[start:end])}...")

    if not results:
        return "No matches found."
    return "<br><br>".join(results)
def process_image(image, keyword):
    """Run OCR on *image* and search the extracted text for *keyword*.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
        keyword: The keyword to search for in the extracted text.

    Returns:
        A ``(extracted_text, search_results)`` tuple. When no image is given
        the first element is a prompt to upload one and the second is empty.
        When OCR fails the first element is the error message and the second
        is empty.
    """
    if image is None:
        return "Please upload an image.", ""

    extracted_text = ocr_image(image)

    # ocr_image reports failures as "OCR Error: <message>". Test for that
    # prefix rather than a substring, so an image whose real text merely
    # mentions "OCR Error" is still searched.
    if extracted_text.startswith("OCR Error:"):
        return extracted_text, ""

    return extracted_text, search_text(extracted_text, keyword)
# Debug information
print(f"Python version: {sys.version}")
try:
    tesseract_version = pytesseract.get_tesseract_version()
except pytesseract.TesseractNotFoundError as exc:
    # A missing Tesseract binary should not stop the app from starting just
    # because a diagnostic line could not be printed; ocr_image reports the
    # problem to the user when OCR is actually attempted.
    tesseract_version = f"unavailable ({exc})"
print(f"Tesseract version: {tesseract_version}")
print(f"Tesseract path: {pytesseract.pytesseract.tesseract_cmd}")
# Create the Gradio interface: an image and a keyword go in; the extracted
# text and the highlighted search results (rendered as HTML) come out.
iface = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Search Keyword"),
    ],
    outputs=[
        gr.Textbox(label="Extracted Text"),
        gr.HTML(label="Search Results"),
    ],
    title="OCR and Keyword Search",
    # OCR runs with lang='eng+hin', so the description mentions both languages.
    description=(
        "Upload an image with English or Hindi text, and optionally provide "
        "a keyword to search within the extracted text."
    ),
)
# Launch the app
iface.launch(share=True)
|