import gradio as gr
from pydantic import BaseModel, Field
from typing import Optional, Any

# Import statements that should only run once
if gr.NO_RELOAD:
    import random
    import os
    from datetime import datetime
    from huggingface_hub import HfApi
    from PIL import Image  # Needed for working with PIL images
    import datasets
    import numpy as np  # Added to help handle numpy array images
    import pandas as pd  # Added for pandas DataFrame
    import cv2  # Added for OpenCV

    # Load environment variables from .env if available.
    from dotenv import load_dotenv
    load_dotenv()

# The pool of sentences used as handwriting prompts.
sentences = [
    "Optical character recognition (OCR) is the process of converting images of text into machine-readable data.",
    "When applied to handwriting, OCR faces additional challenges because of the natural variability in individual penmanship.",
    "Over the last century, advances in computer vision and machine learning have transformed handwriting OCR from bulky, specialized hardware into highly accurate, software-driven systems.",
    "The origins of OCR date back to the early 20th century.",
    "Early pioneers explored how machines might read text.",
    "In the 1920s, inventors such as Emanuel Goldberg developed early devices that could capture printed characters by converting them into telegraph codes.",
    "Around the same time, Gustav Tauschek created the Reading Machine using template-matching methods to detect letters in images.",
    "These devices were designed for printed text and depended on fixed, machine-friendly fonts rather than natural handwriting.",
    "In the 1950s, systems like David Shepard's GISMO emerged to begin automating the conversion of paper records into digital form.",
    "Although these early OCR systems were limited in scope and accuracy, they laid the groundwork for later innovations.",
    "The 1960s saw OCR technology being applied to real-world tasks.",
    "In 1965, American inventor Jacob Rabinow developed an OCR machine specifically aimed at sorting mail by reading addresses.",
    "This was a critical step for the U.S. Postal Service.",
    "Soon after, research groups, including those at IBM, began developing machines such as the IBM 1287, which was capable of reading handprinted numbers on envelopes to facilitate automated mail processing.",
    "These systems marked the first attempts to apply computer vision to handwritten data on a large scale.",
    "By the late 1980s and early 1990s, researchers such as Yann LeCun and his colleagues developed neural network architectures to recognize handwritten digits.",
    "Their work, initially applied to reading ZIP codes on mail, demonstrated that carefully designed, constrained neural networks could achieve error rates as low as about 1% on USPS data.",
    "Sargur Srihari and his team at the Center of Excellence for Document Analysis and Recognition extended these ideas to develop complete handwritten address interpretation systems.",
    "These systems, deployed by the USPS and postal agencies worldwide, helped automate the routing of mail and revolutionized the sorting process.",
    "The development and evaluation of handwriting OCR have been driven in part by standard benchmark datasets.",
    "The MNIST dataset, introduced in the 1990s, consists of 70,000 images of handwritten digits and became the de facto benchmark for handwritten digit recognition.",
    "Complementing MNIST is the USPS dataset, which provides images of handwritten digits derived from actual envelopes and captures real-world variability.",
    "Handwriting OCR entered a new era with the introduction of neural network models.",
    "In 1989, LeCun et al. applied backpropagation to a convolutional neural network tailored for handwritten digit recognition, an innovation that evolved into the LeNet series.",
    "By automatically learning features rather than relying on hand-designed templates, these networks drastically improved recognition performance.",
    "As computational power increased and large labeled datasets became available, deep learning models, particularly convolutional neural networks and recurrent neural networks, pushed the accuracy of handwriting OCR to near-human levels.",
    "Modern systems can handle both printed and cursive text, automatically segmenting and recognizing characters in complex handwritten documents.",
    "Cursive handwriting presents a classic challenge known as Sayre's paradox, where word recognition requires letter segmentation and letter segmentation requires word recognition.",
    "Contemporary approaches use implicit segmentation methods, often combined with hidden Markov models or end-to-end neural networks, to circumvent this paradox.",
    "Today's handwriting OCR systems are highly accurate and widely deployed.",
    "Modern systems combine OCR with artificial intelligence to not only recognize text but also extract meaning, verify data, and integrate into larger enterprise workflows.",
    "Projects such as In Codice Ratio use deep convolutional networks to transcribe historical handwritten documents, further expanding OCR applications.",
    "Despite impressive advances, handwriting OCR continues to face challenges with highly variable or degraded handwriting.",
    "Ongoing research aims to improve recognition accuracy, particularly for cursive and unconstrained handwriting, and to extend support across languages and historical scripts.",
    "With improvements in deep learning architectures, increased computing power, and large annotated datasets, future OCR systems are expected to become even more robust, handling real-world handwriting in diverse applications from postal services to archival digitization.",
"Today's research in handwriting OCR benefits from a wide array of well-established datasets and ongoing evaluation challenges.", "These resources help drive the development of increasingly robust systems for both digit and full-text recognition.", "For handwritten digit recognition, the MNIST dataset remains the most widely used benchmark thanks to its simplicity and broad adoption.", "Complementing MNIST is the USPS dataset, which is derived from actual mail envelopes and provides additional challenges with real-world variability.", "The IAM Handwriting Database is one of the most popular datasets for unconstrained offline handwriting recognition and includes scanned pages of handwritten English text with corresponding transcriptions.", "It is frequently used to train and evaluate models that work on full-line or full-page recognition tasks.", "For systems designed to capture the dynamic aspects of handwriting, such as pen stroke trajectories, the IAM On-Line Handwriting Database offers valuable data.", "The CVL dataset provides multi-writer handwritten texts with a range of writing styles, making it useful for assessing the generalization capabilities of OCR systems across diverse handwriting samples.", "The RIMES dataset, developed for French handwriting recognition, contains scanned documents and is a key resource for evaluating systems in multilingual settings.", "Various ICDAR competitions, such as ICDAR 2013 and ICDAR 2017, have released datasets that reflect the complexities of real-world handwriting, including historical documents and unconstrained writing.", "For Arabic handwriting recognition, the KHATT dataset offers a collection of handwritten texts that capture the unique challenges of cursive and context-dependent scripts.", "These datasets, along with continual evaluation efforts through competitions hosted at ICDAR and ICFHR, ensure that the field keeps pushing toward higher accuracy, better robustness, and broader language coverage.", "Emerging benchmarks, often tailored to specific scripts, historical documents, or noisy real-world data, will further refine the state-of-the-art in handwriting OCR.", "This array of resources continues to shape the development of handwriting OCR systems today.", "This additional section outlines today's most influential datasets and benchmarks, highlighting how they continue to shape the development of handwriting OCR systems." 
]


class SubmissionData(BaseModel):
    text: str = Field(..., description="Text to be handwritten")
    profile: Any = Field(..., description="Gradio OAuth profile")
    image: Optional[Image.Image] = Field(None, description="Uploaded handwritten image")
    max_words: int = Field(..., ge=1, le=201, description="Maximum number of words")
    public_checkbox: bool = Field(..., description="Submit to public dataset")

    model_config = {
        "arbitrary_types_allowed": True  # Allow PIL.Image.Image type
    }


class OCRDataCollector:
    def __init__(self):
        self.collected_pairs = []
        self.last_text_block = None
        self.current_text_block = self.get_random_text_block(201)  # Default max words
        self.hf_api = HfApi()

    def get_random_text_block(self, max_words: int):
        attempts = 0
        max_attempts = 10  # Prevent infinite loop in case of very small sentence list

        while attempts < max_attempts:
            block_length = random.randint(1, 5)
            start_index = random.randint(0, len(sentences) - block_length)
            block = " ".join(sentences[start_index:start_index + block_length])

            # Truncate to max_words if necessary
            words = block.split()
            if len(words) > max_words:
                block = " ".join(words[:max_words])

            # If this block is different from the last one, use it
            if block != self.last_text_block:
                self.last_text_block = block
                return block
            attempts += 1

        # If we couldn't find a different block after max attempts,
        # force a different block by using the next available sentences
        try:
            current_start = sentences.index(self.last_text_block.split('.')[0] + '.') if self.last_text_block else 0
        except ValueError:
            # The last block may have been truncated or joined from several
            # sentences, so its first sentence may not appear verbatim in the
            # list; fall back to the beginning in that case.
            current_start = 0
        next_start = (current_start + 1) % len(sentences)
        block = sentences[next_start]

        # Truncate to max_words if necessary
        words = block.split()
        if len(words) > max_words:
            block = " ".join(words[:max_words])

        self.last_text_block = block
        return block

    def submit_image(self, image, text_block, username: Optional[str] = None):
        if image is not None and username:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.collected_pairs.append({
                "text": text_block,
                "image": image,
                "timestamp": timestamp,
                "username": username
            })
        return self.get_random_text_block(201)

    def skip_text(self, text_block, username: Optional[str] = None):
        return self.get_random_text_block(201)

    def get_leaderboard(self):
        try:
            dataset = datasets.load_dataset("rawwerks/handwriting-ocr-all", split="train")

            # Count contributions by non-anonymous users
            user_counts = {}
            for item in dataset:
                if item['user'] != 'anonymous':
                    user_counts[item['user']] = user_counts.get(item['user'], 0) + 1

            # Create a pandas DataFrame for better styling
            df = pd.DataFrame(user_counts.items(), columns=['Username', 'Contributions'])
            # Rank contributors by contribution count, highest first
            df = df.sort_values('Contributions', ascending=False).reset_index(drop=True)
            df['Rank'] = range(1, len(df) + 1)
            df['Medal'] = df['Rank'].apply(lambda x: "🥇" if x == 1 else "🥈" if x == 2 else "🥉" if x == 3 else "🏅")

            # Reorder columns
            df = df[['Rank', 'Medal', 'Username', 'Contributions']]

            # Style the DataFrame
            styled_df = df.style\
                .set_properties(**{
                    'text-align': 'center',
                    'font-size': '16px',
                    'padding': '10px',
                    'border': '1px solid #ddd'
                })\
                .set_table_styles([
                    {'selector': 'th', 'props': [
                        ('background-color', '#f4f4f4'),
                        ('color', '#333'),
                        ('font-weight', 'bold'),
                        ('text-align', 'center'),
                        ('padding', '12px'),
                        ('border', '1px solid #ddd')
                    ]},
                    {'selector': 'tr:nth-of-type(odd)', 'props': [
                        ('background-color', '#f9f9f9')
                    ]},
                    {'selector': 'tr:hover', 'props': [
                        ('background-color', '#f5f5f5')
                    ]}
                ])

            return styled_df
        except Exception as e:
            print(f"Error fetching leaderboard: {e}")
            return pd.DataFrame(columns=['Rank', 'Medal', 'Username', 'Contributions'])
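

# Illustrative sketch (an assumption, not part of the original app): a minimal example
# of how OCRDataCollector and SubmissionData are expected to fit together. The helper
# below is hypothetical, is never called by the Gradio interface, and exists purely
# for documentation purposes.
def _example_submission_flow() -> None:
    collector = OCRDataCollector()
    prompt = collector.get_random_text_block(max_words=50)
    # A real submission would carry a PIL image captured from the UI; a blank
    # placeholder image stands in for it here.
    placeholder = Image.new("RGB", (200, 50), color="white")
    submission = SubmissionData(
        text=prompt,
        profile=None,  # the Gradio OAuth profile object in the real app
        image=placeholder,
        max_words=50,
        public_checkbox=False,
    )
    collector.submit_image(submission.image, submission.text, username="example-user")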


def strip_metadata(image: Image.Image) -> Image.Image:
    """Helper function to strip all metadata from the provided image data."""
    if image is None:
        raise gr.Error("No valid image provided")
    # Create a new image with the same pixel data but no metadata
    data = list(image.getdata())
    stripped_image = Image.new(image.mode, image.size)
    stripped_image.putdata(data)
    return stripped_image


def transform_webcam(image: np.ndarray) -> np.ndarray:
    """Transform webcam input to ensure text is readable"""
    if image is None:
        return None
    # Flip the image horizontally to un-mirror it
    return cv2.flip(image, 1)


class UserState:
    def __init__(self):
        self.username = None
        self.is_logged_in = False

    def update_from_profile(self, profile: gr.OAuthProfile | None) -> None:
        """Update user state from Gradio OAuth profile"""
        self.is_logged_in = profile is not None and getattr(profile, "username", None) is not None
        self.username = profile.username if self.is_logged_in else None


def create_gradio_interface():
    collector = OCRDataCollector()
    user_state = UserState()

    with gr.Blocks() as demo:
        gr.Markdown("# Handwriting OCR Dataset Creator")
        gr.Markdown("## After almost 100 years of research, handwriting recognition still sucks. Together, we can change that.")

        # Add leaderboard section at the top
        gr.Markdown("### 🏆 Top Contributors", show_label=False)
        with gr.Row():
            with gr.Column(scale=1):
                pass
            with gr.Column(scale=2, min_width=400):
                leaderboard = gr.Dataframe(
                    value=collector.get_leaderboard(),
                    elem_id="leaderboard",
                    visible=True,
                    interactive=False,
                    show_label=False
                )
            with gr.Column(scale=1):
                pass

        gr.Markdown("### Step 1: Log in with your Hugging Face account to use this app.")

        # Login section - centered
        with gr.Row():
            with gr.Column(scale=1):
                pass
            with gr.Column(scale=2, min_width=200):
                login_btn = gr.LoginButton(elem_id="login_btn")
                # Activate the login button so OAuth is correctly initialized.
                login_btn.activate()
                user_info = gr.Markdown(
                    value="