Spaces:
Runtime error
Runtime error
import gradio as gr | |
from pydantic import BaseModel, Field | |
from typing import Optional, Any | |
# Import statements that should only run once | |
if gr.NO_RELOAD: | |
import random | |
import os | |
from datetime import datetime | |
from huggingface_hub import HfApi | |
from typing import Optional | |
from PIL import Image # Needed for working with PIL images | |
import datasets | |
import numpy as np # Added to help handle numpy array images | |
import pandas as pd # Added for pandas DataFrame | |
import cv2 # Added for OpenCV | |
# Load environment variables from .env if available. | |
from dotenv import load_dotenv | |
load_dotenv() | |
# Prompt corpus: an ordered list of sentences on the history of handwriting OCR.
# Runs of consecutive sentences are sampled from this list and shown to
# contributors as the text to handwrite, so the order of entries matters.
sentences = [
    "Optical character recognition (OCR) is the process of converting images of text into machine-readable data.",
    "When applied to handwriting, OCR faces additional challenges because of the natural variability in individual penmanship.",
    "Over the last century, advances in computer vision and machine learning have transformed handwriting OCR from bulky, specialized hardware into highly accurate, software-driven systems.",
    "The origins of OCR date back to the early 20th century.",
    "Early pioneers explored how machines might read text.",
    "In the 1920s, inventors such as Emanuel Goldberg developed early devices that could capture printed characters by converting them into telegraph codes.",
    "Around the same time, Gustav Tauschek created the Reading Machine using template-matching methods to detect letters in images.",
    "These devices were designed for printed text and depended on fixed, machine-friendly fonts rather than natural handwriting.",
    "In the 1950s, systems like David Shepard's GISMO emerged to begin automating the conversion of paper records into digital form.",
    "Although these early OCR systems were limited in scope and accuracy, they laid the groundwork for later innovations.",
    "The 1960s saw OCR technology being applied to real-world tasks.",
    "In 1965, American inventor Jacob Rabinow developed an OCR machine specifically aimed at sorting mail by reading addresses.",
    "This was a critical step for the U.S. Postal Service.",
    "Soon after, research groups, including those at IBM, began developing machines such as the IBM 1287, which was capable of reading handprinted numbers on envelopes to facilitate automated mail processing.",
    "These systems marked the first attempts to apply computer vision to handwritten data on a large scale.",
    "By the late 1980s and early 1990s, researchers such as Yann LeCun and his colleagues developed neural network architectures to recognize handwritten digits.",
    "Their work, initially applied to reading ZIP codes on mail, demonstrated that carefully designed, constrained neural networks could achieve error rates as low as about 1% on USPS data.",
    "Sargur Srihari and his team at the Center of Excellence for Document Analysis and Recognition extended these ideas to develop complete handwritten address interpretation systems.",
    "These systems, deployed by the USPS and postal agencies worldwide, helped automate the routing of mail and revolutionized the sorting process.",
    "The development and evaluation of handwriting OCR have been driven in part by standard benchmark datasets.",
    "The MNIST dataset, introduced in the 1990s, consists of 70,000 images of handwritten digits and became the de facto benchmark for handwritten digit recognition.",
    "Complementing MNIST is the USPS dataset, which provides images of hand‐written digits derived from actual envelopes and captures real-world variability.",
    "Handwriting OCR entered a new era with the introduction of neural network models.",
    "In 1989, LeCun et al. applied backpropagation to a convolutional neural network tailored for handwritten digit recognition, an innovation that evolved into the LeNet series.",
    "By automatically learning features rather than relying on hand-designed templates, these networks drastically improved recognition performance.",
    "As computational power increased and large labeled datasets became available, deep learning models, particularly convolutional neural networks and recurrent neural networks, pushed the accuracy of handwriting OCR to near-human levels.",
    "Modern systems can handle both printed and cursive text, automatically segmenting and recognizing characters in complex handwritten documents.",
    "Cursive handwriting presents a classic challenge known as Sayre's paradox, where word recognition requires letter segmentation and letter segmentation requires word recognition.",
    "Contemporary approaches use implicit segmentation methods, often combined with hidden Markov models or end-to-end neural networks, to circumvent this paradox.",
    "Today's handwriting OCR systems are highly accurate and widely deployed.",
    "Modern systems combine OCR with artificial intelligence to not only recognize text but also extract meaning, verify data, and integrate into larger enterprise workflows.",
    "Projects such as In Codice Ratio use deep convolutional networks to transcribe historical handwritten documents, further expanding OCR applications.",
    "Despite impressive advances, handwriting OCR continues to face challenges with highly variable or degraded handwriting.",
    "Ongoing research aims to improve recognition accuracy, particularly for cursive and unconstrained handwriting, and to extend support across languages and historical scripts.",
    "With improvements in deep learning architectures, increased computing power, and large annotated datasets, future OCR systems are expected to become even more robust, handling real-world handwriting in diverse applications from postal services to archival digitization.",
    "Today's research in handwriting OCR benefits from a wide array of well-established datasets and ongoing evaluation challenges.",
    "These resources help drive the development of increasingly robust systems for both digit and full-text recognition.",
    "For handwritten digit recognition, the MNIST dataset remains the most widely used benchmark thanks to its simplicity and broad adoption.",
    "Complementing MNIST is the USPS dataset, which is derived from actual mail envelopes and provides additional challenges with real-world variability.",
    "The IAM Handwriting Database is one of the most popular datasets for unconstrained offline handwriting recognition and includes scanned pages of handwritten English text with corresponding transcriptions.",
    "It is frequently used to train and evaluate models that work on full-line or full-page recognition tasks.",
    "For systems designed to capture the dynamic aspects of handwriting, such as pen stroke trajectories, the IAM On-Line Handwriting Database offers valuable data.",
    "The CVL dataset provides multi-writer handwritten texts with a range of writing styles, making it useful for assessing the generalization capabilities of OCR systems across diverse handwriting samples.",
    "The RIMES dataset, developed for French handwriting recognition, contains scanned documents and is a key resource for evaluating systems in multilingual settings.",
    "Various ICDAR competitions, such as ICDAR 2013 and ICDAR 2017, have released datasets that reflect the complexities of real-world handwriting, including historical documents and unconstrained writing.",
    "For Arabic handwriting recognition, the KHATT dataset offers a collection of handwritten texts that capture the unique challenges of cursive and context-dependent scripts.",
    "These datasets, along with continual evaluation efforts through competitions hosted at ICDAR and ICFHR, ensure that the field keeps pushing toward higher accuracy, better robustness, and broader language coverage.",
    "Emerging benchmarks, often tailored to specific scripts, historical documents, or noisy real-world data, will further refine the state-of-the-art in handwriting OCR.",
    "This array of resources continues to shape the development of handwriting OCR systems today.",
    "This additional section outlines today's most influential datasets and benchmarks, highlighting how they continue to shape the development of handwriting OCR systems."
]
class SubmissionData(BaseModel):
    """Validated payload describing one handwriting submission.

    All fields are required except ``image``, which defaults to ``None``.
    ``max_words`` is constrained to the inclusive range 1..201.
    """

    # PIL.Image.Image is not a pydantic-native type, so arbitrary types
    # must be allowed for the ``image`` field to be accepted.
    model_config = {
        "arbitrary_types_allowed": True  # Allow PIL.Image.Image type
    }

    text: str = Field(
        default=...,
        description="Text to be handwritten",
    )
    profile: Any = Field(
        default=...,
        description="Gradio OAuth profile",
    )
    image: Optional[Image.Image] = Field(
        default=None,
        description="Uploaded handwritten image",
    )
    max_words: int = Field(
        default=...,
        ge=1,
        le=201,
        description="Maximum number of words",
    )
    public_checkbox: bool = Field(
        default=...,
        description="Submit to public dataset",
    )
class OCRDataCollector:
    """Serves text prompts to handwrite and builds the contributor leaderboard.

    Prompts are runs of 1-5 consecutive entries of the module-level
    ``sentences`` list, optionally truncated to a maximum word count.
    """

    def __init__(self):
        self.collected_pairs = []
        self.last_text_block = None
        # Index in ``sentences`` where the last served block started; used by
        # the deterministic fallback in get_random_text_block().
        self._last_start_index = None
        self.current_text_block = self.get_random_text_block(201)  # Default max words
        self.hf_api = HfApi()

    @staticmethod
    def _truncate_words(text, max_words):
        """Return ``text`` cut to its first ``max_words`` whitespace-separated words.

        Text that is already short enough is returned unchanged.
        """
        words = text.split()
        if len(words) > max_words:
            return " ".join(words[:max_words])
        return text

    @staticmethod
    def _rank_contributors(user_counts):
        """Turn a ``{username: contribution_count}`` mapping into a ranked table.

        Rows are ordered by contribution count, highest first (ties keep the
        mapping's order), so that rank 1 / the trophy really goes to the top
        contributor. Returns a DataFrame with the columns
        ``Rank``, ``Medal``, ``Username``, ``Contributions``.
        """
        # sorted() is stable, also with reverse=True, so ties keep input order.
        ordered = sorted(user_counts.items(), key=lambda pair: pair[1], reverse=True)
        df = pd.DataFrame(ordered, columns=['Username', 'Contributions'])
        df['Rank'] = range(1, len(df) + 1)
        medals = {1: "🏆", 2: "🥈", 3: "🥉"}
        df['Medal'] = df['Rank'].apply(lambda rank: medals.get(rank, "👏"))
        return df[['Rank', 'Medal', 'Username', 'Contributions']]

    def get_random_text_block(self, max_words: int):
        """Return a prompt that differs from the previously served one.

        Args:
            max_words: Upper bound on the number of words in the prompt; longer
                blocks are truncated. Coerced to ``int`` because UI sliders may
                deliver floats.

        Returns:
            The prompt text, or an empty string when ``sentences`` is empty.
        """
        max_words = int(max_words)
        if not sentences:
            return ""

        # Never ask for more sentences than exist (keeps randint's range valid).
        longest_run = min(5, len(sentences))
        max_attempts = 10  # Prevent an endless loop when few distinct blocks exist
        for _ in range(max_attempts):
            block_length = random.randint(1, longest_run)
            start_index = random.randint(0, len(sentences) - block_length)
            block = self._truncate_words(
                " ".join(sentences[start_index:start_index + block_length]),
                max_words,
            )
            # If this block is different from the last one, use it
            if block != self.last_text_block:
                self.last_text_block = block
                self._last_start_index = start_index
                return block

        # Every attempt repeated the previous block: fall back to the single
        # sentence after the one the previous block started at. The start index
        # is tracked explicitly because the block text may be truncated or
        # contain inner periods ("U.S.", "et al."), so it cannot be reliably
        # mapped back to an entry of ``sentences``.
        current_start = self._last_start_index if self._last_start_index is not None else 0
        next_start = (current_start + 1) % len(sentences)
        block = self._truncate_words(sentences[next_start], max_words)
        self.last_text_block = block
        self._last_start_index = next_start
        return block

    def submit_image(self, image, text_block, username: Optional[str] = None):
        """Record an (image, text) pair in memory and return the next prompt.

        The pair is only stored when both an image and a username are given.
        """
        if image is not None and username:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.collected_pairs.append({
                "text": text_block,
                "image": image,
                "timestamp": timestamp,
                "username": username
            })
        return self.get_random_text_block(201)

    def skip_text(self, text_block, username: Optional[str] = None):
        """Discard the current prompt and return a new one (arguments are unused)."""
        return self.get_random_text_block(201)

    def get_leaderboard(self):
        """Load the public dataset and return a styled contributor ranking.

        Returns:
            A pandas ``Styler`` wrapping the ranked table, or an empty DataFrame
            with the same columns if anything goes wrong (the error is printed).
        """
        try:
            dataset = datasets.load_dataset("rawwerks/handwriting-ocr-all", split="train")
            # Count contributions by non-anonymous users. Only the 'user' column
            # is read so the other fields of each row are not touched.
            user_counts = {}
            for user in dataset['user']:
                if user != 'anonymous':
                    user_counts[user] = user_counts.get(user, 0) + 1

            df = self._rank_contributors(user_counts)

            # Style the DataFrame
            cell_style = {
                'text-align': 'center',
                'font-size': '16px',
                'padding': '10px',
                'border': '1px solid #ddd'
            }
            table_styles = [
                {'selector': 'th', 'props': [
                    ('background-color', '#f4f4f4'),
                    ('color', '#333'),
                    ('font-weight', 'bold'),
                    ('text-align', 'center'),
                    ('padding', '12px'),
                    ('border', '1px solid #ddd')
                ]},
                {'selector': 'tr:nth-of-type(odd)', 'props': [
                    ('background-color', '#f9f9f9')
                ]},
                {'selector': 'tr:hover', 'props': [
                    ('background-color', '#f5f5f5')
                ]}
            ]
            return df.style.set_properties(**cell_style).set_table_styles(table_styles)
        except Exception as e:
            # Best effort: the UI should still render when the hub is unreachable.
            print(f"Error fetching leaderboard: {e}")
            return pd.DataFrame(columns=['Rank', 'Medal', 'Username', 'Contributions'])
def strip_metadata(image: Image.Image) -> Image.Image:
    """
    Return a copy of ``image`` that carries only pixel data and no metadata.

    A fresh image of the same mode and size is created and the pixel values
    are copied into it, so nothing attached to the original object (for
    example its ``info`` dictionary) is carried over.

    Raises:
        gr.Error: if ``image`` is None.
    """
    if image is None:
        raise gr.Error("No valid image provided")
    # Create a new image with the same pixel data but no metadata
    stripped_image = Image.new(image.mode, image.size)
    stripped_image.putdata(list(image.getdata()))
    # Palette-mode pixels are indices into a colour table. Without copying the
    # table the new image would use a default palette and show wrong colours.
    if image.mode in ("P", "PA"):
        palette = image.getpalette()
        if palette is not None:
            stripped_image.putpalette(palette)
    return stripped_image
def transform_webcam(image: np.ndarray) -> np.ndarray:
    """Un-mirror a webcam frame so that handwritten text reads correctly.

    Returns None when no frame is given; otherwise the frame flipped
    horizontally.
    """
    # flipCode=1 flips around the vertical axis, i.e. swaps left and right.
    return None if image is None else cv2.flip(image, 1)
class UserState:
    """Login status and username derived from a Gradio OAuth profile."""

    def __init__(self):
        # Start out logged out until a profile has been seen.
        self.is_logged_in = False
        self.username = None

    def update_from_profile(self, profile: gr.OAuthProfile | None) -> None:
        """Refresh the state from ``profile``.

        A user counts as logged in only when a profile is present and it
        exposes a non-None ``username``; otherwise the username is cleared.
        """
        has_username = profile is not None and getattr(profile, "username", None) is not None
        self.is_logged_in = has_username
        if has_username:
            self.username = profile.username
        else:
            self.username = None
def create_gradio_interface():
    """Build and return the "Handwriting OCR Dataset Creator" Blocks app.

    The page shows a contributor leaderboard, a Hugging Face login button, a
    text prompt to handwrite, and (for logged-in users only) an image upload
    with options to save the pair to the public dataset and/or to a private
    dataset on the user's own account.

    Login information is never cached in shared objects: every callback that
    needs it declares ``gr.OAuthProfile`` / ``gr.OAuthToken`` parameters, which
    Gradio fills in per request, so concurrent visitors cannot see or overwrite
    each other's identity.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    collector = OCRDataCollector()

    with gr.Blocks() as demo:
        gr.Markdown("# Handwriting OCR Dataset Creator")
        gr.Markdown("## After almost 100 years of research, handwriting recognition still sucks. Together, we can change that.")

        # Leaderboard section at the top; the empty side columns centre it.
        gr.Markdown("### 🏆 Top Contributors", show_label=False)
        with gr.Row():
            with gr.Column(scale=1):
                pass
            with gr.Column(scale=2, min_width=400):
                leaderboard = gr.Dataframe(
                    value=collector.get_leaderboard(),
                    elem_id="leaderboard",
                    visible=True,
                    interactive=False,
                    show_label=False
                )
            with gr.Column(scale=1):
                pass

        gr.Markdown("### Step 1: Log in with your Hugging Face account to use this app.")
        # Login section - centered
        with gr.Row():
            with gr.Column(scale=1):
                pass
            with gr.Column(scale=2, min_width=200):
                login_btn = gr.LoginButton(elem_id="login_btn")
                # Activate the login button so OAuth is correctly initialized.
                login_btn.activate()
                user_info = gr.Markdown(
                    value="<center>Please log in with your Hugging Face account to contribute to the dataset.</center>",
                    elem_id="user_info"
                )
                # Create a hidden state component to store the OAuth profile.
                profile_state = gr.State()
            with gr.Column(scale=1):
                pass

        # Update user info based on the OAuth profile.
        def update_user_info(profile: gr.OAuthProfile | None) -> str:
            if profile and getattr(profile, "username", None):
                return f"<center>Logged in as: {profile.username}</center>"
            else:
                return "<center>Please log in with your Hugging Face account to contribute to the dataset.</center>"

        demo.load(update_user_info, inputs=None, outputs=user_info)

        # Store the OAuth profile in the hidden state.
        def store_profile(profile: gr.OAuthProfile | None) -> gr.OAuthProfile | None:
            return profile

        demo.load(store_profile, inputs=None, outputs=profile_state)

        gr.Markdown(
            "### Step 2: Read the text. "
            "You will be shown between 1 and 5 consecutive sentences. Please handwrite them on paper and upload an image of your handwriting. "
            "You can change the maximum number of words you are willing to write by using the slider below. "
            "If you wish to skip the current text, click 'Skip'."
        )
        text_box = gr.Textbox(
            value=collector.current_text_block,
            label="Text to Handwrite",
            interactive=False,
            lines=10,
            show_copy_button=True,
            visible=True,
            elem_id="text_box"
        )
        max_words_slider = gr.Slider(
            1, 201, step=5, value=201,
            label="Maximum Number of Words",
            interactive=True,
            visible=True,
            elem_id="max_words_slider"
        )
        regenerate_btn = gr.Button(
            "Regenerate Text",
            visible=True,
            elem_id="regenerate_btn"
        )

        gr.Markdown("### Step 3: Upload an image of your handwritten version of the text")
        upload_info = gr.Markdown(
            value="You must be logged in to do this, to help us prevent spam submissions",
            elem_id="upload_info"
        )
        image_input = gr.Image(
            type="pil",
            label="Upload Handwritten Image",
            sources=["upload", "webcam"],
            mirror_webcam=False,  # Explicitly set to false to ensure text is readable
            visible=False,
            elem_id="image_input"
        )
        with gr.Column(visible=False) as dataset_options:
            private_checkbox = gr.Checkbox(
                value=True,
                label="Private",
                interactive=True,
                elem_id="private_cb"
            )
            gr.Markdown(
                "*Private: Creates a new dataset on your account named '/handwriting-ocr-private' and appends data there.*",
                elem_id="private_exp"
            )
            public_checkbox = gr.Checkbox(
                value=True,
                label="Public",
                interactive=True,
                elem_id="public_cb"
            )
            gr.Markdown(
                "*Public: Will be added to our [public Handwriting OCR dataset](https://huggingface.co/datasets/rawwerks/handwriting-ocr-all). By submitting, you are giving permission to be added to the dataset.*",
                elem_id="public_exp"
            )
            anonymous_checkbox = gr.Checkbox(
                value=False,
                label="Submit Anonymously",
                interactive=True,
                elem_id="anonymous_cb"
            )
            gr.Markdown(
                "*If un-checked, your HF username will be appended next to your submission and you will be added to the leaderboard. If checked, your submission will be anonymous in the public dataset.*",
                elem_id="anonymous_exp"
            )
        with gr.Row(visible=False) as button_row:
            submit_btn = gr.Button("Submit", elem_id="submit_btn")

        # Show or hide the upload widgets depending on the visitor's login status.
        def update_user_state(profile: gr.OAuthProfile | None):
            # A fresh UserState per call: login status must not leak between sessions.
            visitor = UserState()
            visitor.update_from_profile(profile)
            is_logged_in = visitor.is_logged_in
            message = "Please upload your handwritten image of the text below." if is_logged_in else "You must be logged in to do this, to help us prevent spam submissions"
            return {
                upload_info: gr.update(value=message),
                image_input: gr.update(visible=is_logged_in),
                dataset_options: gr.update(visible=is_logged_in),
                button_row: gr.update(visible=is_logged_in)
            }

        # Load initial state and update UI visibility. The profile is injected
        # by Gradio from the type hint, so no explicit inputs are needed.
        demo.load(update_user_state, inputs=None, outputs=[upload_info, image_input, dataset_options, button_row])
        # Also load leaderboard on page load
        demo.load(fn=lambda: collector.get_leaderboard(), outputs=leaderboard)

        def handle_submit(
            text: str,
            upload_image: Image.Image,
            max_words: int,
            save_public: bool,
            save_private: bool,
            submit_anonymously: bool,
            profile: gr.OAuthProfile | None = None,
            oauth_token: gr.OAuthToken | None = None,
        ):
            """Handle submission using separate credentials:
            - For public dataset updates, the master token is loaded from .env.
            - For private dataset updates, the user's OAuth token is used.

            ``profile`` and ``oauth_token`` are injected by Gradio for the
            requesting session. All validation happens before anything is
            uploaded, so a failed check never leaves a half-finished submission.

            Returns a tuple for (image_input, text_box, leaderboard): the image
            is cleared, a new prompt is shown and the leaderboard is refreshed.
            """
            import tempfile  # local import: only this handler needs it

            print(f"Debug - Initial params:")
            print(f"Text: {text[:50]}")
            print(f"Image type: {type(upload_image)}")
            print(f"Max words: {max_words}")
            print(f"Public checkbox: {save_public}")
            print(f"Private checkbox: {save_private}")
            print(f"Anonymous checkbox: {submit_anonymously}")

            # Identity comes from this request's profile, not from shared state.
            submitter = UserState()
            submitter.update_from_profile(profile)
            if not submitter.is_logged_in:
                raise gr.Error("Please log in to use this application")
            username = submitter.username

            # Ensure at least one checkbox is selected
            if not save_public and not save_private:
                raise gr.Error("Please select at least one dataset (public or private) to save to.")
            if not isinstance(upload_image, Image.Image):
                raise gr.Error("Please upload a valid image before submitting")

            master_token = None
            if save_public:
                master_token = os.getenv("PUBLIC_DATASET_TOKEN")
                if not master_token:
                    raise gr.Error("Master token for public dataset not configured in .env")
            if save_private:
                if oauth_token is None:
                    raise gr.Error("Authentication token is missing. Please log in again.")
                if not hasattr(oauth_token, 'token') or not oauth_token.token:
                    raise gr.Error("Invalid OAuth token. Please log in again with the required scopes (write-repos, manage-repos).")

            # Strip metadata from validated image
            stripped_image = strip_metadata(upload_image)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            # A private temporary directory per submission: files of concurrent
            # submissions cannot collide, and cleanup happens even on errors.
            with tempfile.TemporaryDirectory() as temp_dir:
                # Public dataset submission using master credentials from .env
                if save_public:
                    public_repo_id = "rawwerks/handwriting-ocr-all"
                    temp_path_public = os.path.join(temp_dir, f"{timestamp}_public.png")
                    stripped_image.save(temp_path_public)
                    try:
                        collector.hf_api.dataset_info(public_repo_id, token=master_token)
                    except Exception:
                        # Assume the repo does not exist yet and create it.
                        collector.hf_api.create_repo(
                            public_repo_id,
                            repo_type="dataset",
                            private=False,
                            token=master_token
                        )
                    features = datasets.Features({
                        'text': datasets.Value('string'),
                        'image': datasets.Image(),
                        'timestamp': datasets.Value('string'),
                        'user': datasets.Value('string')
                    })
                    try:
                        dataset = datasets.load_dataset(public_repo_id, split="train")
                    except Exception:
                        # NOTE(review): any load failure (not only "dataset is
                        # still empty") ends up here, and the push below would
                        # then replace the split with this single row - confirm
                        # whether transient errors should abort instead.
                        dataset = datasets.Dataset.from_dict({
                            'text': [],
                            'image': [],
                            'timestamp': [],
                            'user': []
                        }, features=features)
                    dataset = dataset.add_item({
                        'text': text,
                        'image': temp_path_public,
                        'timestamp': timestamp,
                        'user': 'anonymous' if submit_anonymously else username
                    })
                    dataset.push_to_hub(public_repo_id, split="train", token=master_token)

                # Private dataset submission using user's OAuth token
                if save_private:
                    private_repo_id = f"{username}/handwriting-ocr-private"
                    temp_path_private = os.path.join(temp_dir, f"{timestamp}_private.png")
                    stripped_image.save(temp_path_private)
                    try:
                        # Initialize HfApi with the OAuth token
                        hf_api = HfApi(token=oauth_token.token)
                        try:
                            # Try to get dataset info first
                            hf_api.dataset_info(private_repo_id)
                        except Exception:
                            # Create repo if it doesn't exist
                            hf_api.create_repo(
                                repo_id=private_repo_id,
                                repo_type="dataset",
                                private=True,
                                token=oauth_token.token  # Explicitly pass token here
                            )
                        features = datasets.Features({
                            'text': datasets.Value('string'),
                            'image': datasets.Image(),
                            'timestamp': datasets.Value('string')
                        })
                        try:
                            # Load dataset with explicit token
                            dataset = datasets.load_dataset(private_repo_id, split="train", token=oauth_token.token)
                        except Exception:
                            # If dataset doesn't exist yet, create an empty one
                            dataset = datasets.Dataset.from_dict({
                                'text': [],
                                'image': [],
                                'timestamp': []
                            }, features=features)
                        # Add the new item
                        dataset = dataset.add_item({
                            'text': text,
                            'image': temp_path_private,
                            'timestamp': timestamp
                        })
                        # Push to hub with explicit token
                        dataset.push_to_hub(
                            private_repo_id,
                            split="train",
                            token=oauth_token.token,
                            private=True
                        )
                    except Exception as e:
                        raise gr.Error(f"Failed to save to private dataset: {str(e)}") from e

            new_text = collector.get_random_text_block(int(max_words))
            return None, new_text, collector.get_leaderboard()

        # Submit button click handler. The OAuth profile and token are not
        # listed here: Gradio supplies them based on handle_submit's type hints.
        submit_btn.click(
            fn=handle_submit,
            inputs=[
                text_box,  # Text to handwrite
                image_input,  # Uploaded image
                max_words_slider,  # Max words
                public_checkbox,  # Public dataset option
                private_checkbox,  # Private dataset option
                anonymous_checkbox,  # Hide the username in the public dataset
            ],
            outputs=[image_input, text_box, leaderboard]
        )

        def handle_regenerate(text, max_words):
            # Allow anyone to regenerate text regardless of login status.
            return collector.get_random_text_block(int(max_words))

        regenerate_btn.click(
            fn=handle_regenerate,
            inputs=[text_box, max_words_slider],
            outputs=text_box
        )
    return demo
# Script entry point: build the Blocks app and start the Gradio server.
# Runs only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch()