Spaces:

rawwerks
/

handwriting-ocr

Runtime error

handwriting-ocr / app.py

Raymond Weitekamp

link to dataset

a597e2b about 1 month ago

29 kB

	import gradio as gr
	from pydantic import BaseModel, Field
	from typing import Optional, Any
	# Import statements that should only run once
	if gr.NO_RELOAD:
	import random
	import os
	from datetime import datetime
	from huggingface_hub import HfApi
	from typing import Optional
	from PIL import Image # Needed for working with PIL images
	import datasets
	import numpy as np # Added to help handle numpy array images
	import pandas as pd # Added for pandas DataFrame
	import cv2 # Added for OpenCV

	# Load environment variables from .env if available.
	from dotenv import load_dotenv
	load_dotenv()

	# Source sentences for the handwriting prompts (a short history of handwriting OCR).
	# The order matters: get_random_text_block joins runs of consecutive entries from this list.
	sentences = [
	"Optical character recognition (OCR) is the process of converting images of text into machine-readable data.",
	"When applied to handwriting, OCR faces additional challenges because of the natural variability in individual penmanship.",
	"Over the last century, advances in computer vision and machine learning have transformed handwriting OCR from bulky, specialized hardware into highly accurate, software-driven systems.",
	"The origins of OCR date back to the early 20th century.",
	"Early pioneers explored how machines might read text.",
	"In the 1920s, inventors such as Emanuel Goldberg developed early devices that could capture printed characters by converting them into telegraph codes.",
	"Around the same time, Gustav Tauschek created the Reading Machine using template-matching methods to detect letters in images.",
	"These devices were designed for printed text and depended on fixed, machine-friendly fonts rather than natural handwriting.",
	"In the 1950s, systems like David Shepard's GISMO emerged to begin automating the conversion of paper records into digital form.",
	"Although these early OCR systems were limited in scope and accuracy, they laid the groundwork for later innovations.",
	"The 1960s saw OCR technology being applied to real-world tasks.",
	"In 1965, American inventor Jacob Rabinow developed an OCR machine specifically aimed at sorting mail by reading addresses.",
	"This was a critical step for the U.S. Postal Service.",
	"Soon after, research groups, including those at IBM, began developing machines such as the IBM 1287, which was capable of reading handprinted numbers on envelopes to facilitate automated mail processing.",
	"These systems marked the first attempts to apply computer vision to handwritten data on a large scale.",
	"By the late 1980s and early 1990s, researchers such as Yann LeCun and his colleagues developed neural network architectures to recognize handwritten digits.",
	"Their work, initially applied to reading ZIP codes on mail, demonstrated that carefully designed, constrained neural networks could achieve error rates as low as about 1% on USPS data.",
	"Sargur Srihari and his team at the Center of Excellence for Document Analysis and Recognition extended these ideas to develop complete handwritten address interpretation systems.",
	"These systems, deployed by the USPS and postal agencies worldwide, helped automate the routing of mail and revolutionized the sorting process.",
	"The development and evaluation of handwriting OCR have been driven in part by standard benchmark datasets.",
	"The MNIST dataset, introduced in the 1990s, consists of 70,000 images of handwritten digits and became the de facto benchmark for handwritten digit recognition.",
	"Complementing MNIST is the USPS dataset, which provides images of hand‐written digits derived from actual envelopes and captures real-world variability.",
	"Handwriting OCR entered a new era with the introduction of neural network models.",
	"In 1989, LeCun et al. applied backpropagation to a convolutional neural network tailored for handwritten digit recognition, an innovation that evolved into the LeNet series.",
	"By automatically learning features rather than relying on hand-designed templates, these networks drastically improved recognition performance.",
	"As computational power increased and large labeled datasets became available, deep learning models, particularly convolutional neural networks and recurrent neural networks, pushed the accuracy of handwriting OCR to near-human levels.",
	"Modern systems can handle both printed and cursive text, automatically segmenting and recognizing characters in complex handwritten documents.",
	"Cursive handwriting presents a classic challenge known as Sayre's paradox, where word recognition requires letter segmentation and letter segmentation requires word recognition.",
	"Contemporary approaches use implicit segmentation methods, often combined with hidden Markov models or end-to-end neural networks, to circumvent this paradox.",
	"Today's handwriting OCR systems are highly accurate and widely deployed.",
	"Modern systems combine OCR with artificial intelligence to not only recognize text but also extract meaning, verify data, and integrate into larger enterprise workflows.",
	"Projects such as In Codice Ratio use deep convolutional networks to transcribe historical handwritten documents, further expanding OCR applications.",
	"Despite impressive advances, handwriting OCR continues to face challenges with highly variable or degraded handwriting.",
	"Ongoing research aims to improve recognition accuracy, particularly for cursive and unconstrained handwriting, and to extend support across languages and historical scripts.",
	"With improvements in deep learning architectures, increased computing power, and large annotated datasets, future OCR systems are expected to become even more robust, handling real-world handwriting in diverse applications from postal services to archival digitization.",
	"Today's research in handwriting OCR benefits from a wide array of well-established datasets and ongoing evaluation challenges.",
	"These resources help drive the development of increasingly robust systems for both digit and full-text recognition.",
	"For handwritten digit recognition, the MNIST dataset remains the most widely used benchmark thanks to its simplicity and broad adoption.",
	"Complementing MNIST is the USPS dataset, which is derived from actual mail envelopes and provides additional challenges with real-world variability.",
	"The IAM Handwriting Database is one of the most popular datasets for unconstrained offline handwriting recognition and includes scanned pages of handwritten English text with corresponding transcriptions.",
	"It is frequently used to train and evaluate models that work on full-line or full-page recognition tasks.",
	"For systems designed to capture the dynamic aspects of handwriting, such as pen stroke trajectories, the IAM On-Line Handwriting Database offers valuable data.",
	"The CVL dataset provides multi-writer handwritten texts with a range of writing styles, making it useful for assessing the generalization capabilities of OCR systems across diverse handwriting samples.",
	"The RIMES dataset, developed for French handwriting recognition, contains scanned documents and is a key resource for evaluating systems in multilingual settings.",
	"Various ICDAR competitions, such as ICDAR 2013 and ICDAR 2017, have released datasets that reflect the complexities of real-world handwriting, including historical documents and unconstrained writing.",
	"For Arabic handwriting recognition, the KHATT dataset offers a collection of handwritten texts that capture the unique challenges of cursive and context-dependent scripts.",
	"These datasets, along with continual evaluation efforts through competitions hosted at ICDAR and ICFHR, ensure that the field keeps pushing toward higher accuracy, better robustness, and broader language coverage.",
	"Emerging benchmarks, often tailored to specific scripts, historical documents, or noisy real-world data, will further refine the state-of-the-art in handwriting OCR.",
	"This array of resources continues to shape the development of handwriting OCR systems today.",
	"This additional section outlines today's most influential datasets and benchmarks, highlighting how they continue to shape the development of handwriting OCR systems."
	]

	class SubmissionData(BaseModel):
	    """Pydantic model describing one handwriting submission.

	    Fields:
	        text: the prompt the user was asked to handwrite.
	        profile: the Gradio OAuth profile of the submitter (left untyped).
	        image: the uploaded PIL image, or None when nothing was uploaded.
	        max_words: word limit for the prompt, constrained to 1..201.
	        public_checkbox: whether the submission goes to the public dataset.
	    """

	    # PIL.Image.Image is not a type pydantic validates natively, so
	    # arbitrary types have to be allowed explicitly.
	    model_config = dict(arbitrary_types_allowed=True)

	    text: str = Field(default=..., description="Text to be handwritten")
	    profile: Any = Field(default=..., description="Gradio OAuth profile")
	    image: Optional[Image.Image] = Field(
	        default=None,
	        description="Uploaded handwritten image",
	    )
	    max_words: int = Field(
	        default=...,
	        ge=1,
	        le=201,
	        description="Maximum number of words",
	    )
	    public_checkbox: bool = Field(
	        default=...,
	        description="Submit to public dataset",
	    )

	class OCRDataCollector:
	    """Hand out text prompts to handwrite and build the contributor leaderboard.

	    Instance state:
	        collected_pairs: list of dicts (text, image, timestamp, username)
	            appended by ``submit_image``.
	        last_text_block: the prompt most recently returned by
	            ``get_random_text_block`` (None before the first call).
	        current_text_block: the prompt generated at construction time.
	        hf_api: an ``HfApi`` client created with default settings.
	    """

	    def __init__(self):
	        self.collected_pairs = []
	        self.last_text_block = None
	        # Index into ``sentences`` where the last returned block started.
	        # Must exist before the first get_random_text_block() call below.
	        self._last_start_index = None
	        self.current_text_block = self.get_random_text_block(201)  # Default max words
	        self.hf_api = HfApi()

	    @staticmethod
	    def _truncate_words(block, max_words):
	        """Return ``block`` cut to its first ``max_words`` whitespace-separated words."""
	        words = block.split()
	        if len(words) > max_words:
	            return " ".join(words[:max_words])
	        return block

	    def get_random_text_block(self, max_words: int):
	        """Return a prompt of 1-5 consecutive sentences, at most ``max_words`` words long.

	        Up to 10 random blocks are drawn; the first one that differs from the
	        previously returned block is used. If all draws repeat the previous
	        block, the sentences following the previous start position are
	        scanned for one that differs.

	        Args:
	            max_words: maximum number of words in the returned text.

	        Returns:
	            The selected (possibly truncated) text block as a string.
	        """
	        max_attempts = 10  # Prevent infinite loop in case of very small sentence list

	        for _ in range(max_attempts):
	            block_length = random.randint(1, 5)
	            start_index = random.randint(0, len(sentences) - block_length)
	            block = self._truncate_words(
	                " ".join(sentences[start_index:start_index + block_length]),
	                max_words,
	            )

	            # If this block is different from the last one, use it
	            if block != self.last_text_block:
	                self.last_text_block = block
	                self._last_start_index = start_index
	                return block

	        # Fallback: walk forward from where the previous block started.
	        # The start index is remembered instead of being recovered from the
	        # text, because a truncated block (or a sentence with an interior
	        # period such as "U.S.") cannot be found again via sentences.index()
	        # and used to raise ValueError here.
	        previous_start = getattr(self, "_last_start_index", None)
	        current_start = previous_start if previous_start is not None else 0

	        next_start = (current_start + 1) % len(sentences)
	        block = self._truncate_words(sentences[next_start], max_words)
	        for offset in range(1, len(sentences) + 1):
	            candidate_start = (current_start + offset) % len(sentences)
	            candidate = self._truncate_words(sentences[candidate_start], max_words)
	            if candidate != self.last_text_block:
	                next_start, block = candidate_start, candidate
	                break

	        self.last_text_block = block
	        self._last_start_index = next_start
	        return block

	    def submit_image(self, image, text_block, username: Optional[str] = None):
	        """Record an (image, text) pair in memory and return a fresh prompt.

	        The pair is stored only when an image is given and ``username`` is
	        non-empty; a new prompt (max 201 words) is returned either way.
	        """
	        if image is not None and username:
	            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	            self.collected_pairs.append({
	                "text": text_block,
	                "image": image,
	                "timestamp": timestamp,
	                "username": username
	            })
	        return self.get_random_text_block(201)

	    def skip_text(self, text_block, username: Optional[str] = None):
	        """Return a fresh prompt (max 201 words); both arguments are unused."""
	        return self.get_random_text_block(201)

	    def get_leaderboard(self):
	        """Build the contributor leaderboard from the public dataset.

	        Returns:
	            A pandas ``Styler`` with columns Rank, Medal, Username and
	            Contributions, ordered by contribution count (highest first).
	            If anything fails, the error is printed and an empty DataFrame
	            with the same columns is returned instead.
	        """
	        columns = ['Rank', 'Medal', 'Username', 'Contributions']
	        medals = {1: "🏆", 2: "🥈", 3: "🥉"}
	        try:
	            dataset = datasets.load_dataset("rawwerks/handwriting-ocr-all", split="train")

	            # Count contributions by non-anonymous users. Only the 'user'
	            # column is read; the other fields of each row are not needed.
	            user_counts = {}
	            for user in dataset['user']:
	                if user != 'anonymous':
	                    user_counts[user] = user_counts.get(user, 0) + 1

	            df = pd.DataFrame(list(user_counts.items()), columns=['Username', 'Contributions'])
	            # Rank by contribution count. Without this sort, ranks and medals
	            # followed first-appearance order in the dataset.
	            df = df.sort_values('Contributions', ascending=False, kind='stable').reset_index(drop=True)
	            df['Rank'] = range(1, len(df) + 1)
	            df['Medal'] = df['Rank'].apply(lambda rank: medals.get(rank, "👏"))

	            # Reorder columns
	            df = df[columns]

	            # Style the DataFrame
	            cell_style = {
	                'text-align': 'center',
	                'font-size': '16px',
	                'padding': '10px',
	                'border': '1px solid #ddd'
	            }
	            table_styles = [
	                {'selector': 'th', 'props': [
	                    ('background-color', '#f4f4f4'),
	                    ('color', '#333'),
	                    ('font-weight', 'bold'),
	                    ('text-align', 'center'),
	                    ('padding', '12px'),
	                    ('border', '1px solid #ddd')
	                ]},
	                {'selector': 'tr:nth-of-type(odd)', 'props': [
	                    ('background-color', '#f9f9f9')
	                ]},
	                {'selector': 'tr:hover', 'props': [
	                    ('background-color', '#f5f5f5')
	                ]}
	            ]
	            return df.style.set_properties(**cell_style).set_table_styles(table_styles)
	        except Exception as e:
	            # Best effort: the UI should still render when the hub is unreachable.
	            print(f"Error fetching leaderboard: {e}")
	            return pd.DataFrame(columns=columns)

	def strip_metadata(image: Image.Image) -> Image.Image:
	    """
	    Rebuild ``image`` as a brand-new PIL image so no metadata is carried over.

	    Only the mode, the size and the pixel values of the input are copied
	    into the returned image.

	    Raises:
	        gr.Error: if ``image`` is None.
	    """
	    if image is None:
	        raise gr.Error("No valid image provided")

	    # Start from an empty image of the same mode/size, then fill in the pixels.
	    clean_copy = Image.new(image.mode, image.size)
	    pixels = list(image.getdata())
	    clean_copy.putdata(pixels)
	    return clean_copy

	def transform_webcam(image: np.ndarray) -> np.ndarray:
	    """Un-mirror a webcam frame so handwritten text reads left-to-right.

	    Returns None unchanged when no frame is given; otherwise returns the
	    frame flipped with ``cv2.flip(image, 1)``.
	    """
	    # Flip code 1 requests a horizontal flip.
	    return None if image is None else cv2.flip(image, 1)

	class UserState:
	    """Login state derived from a Gradio OAuth profile.

	    Attributes:
	        username: the profile's username, or None when not logged in.
	        is_logged_in: True only when a profile with a non-None username was seen.
	    """

	    def __init__(self):
	        self.username = None
	        self.is_logged_in = False

	    def update_from_profile(self, profile: gr.OAuthProfile | None) -> None:
	        """Update user state from Gradio OAuth profile.

	        Args:
	            profile: the OAuth profile, or None when nobody is logged in.
	        """
	        # getattr() guards against profile objects without a username attribute.
	        self.is_logged_in = profile is not None and getattr(profile, "username", None) is not None
	        self.username = profile.username if self.is_logged_in else None

	def create_gradio_interface():
	    """Build the Gradio Blocks UI for collecting handwriting samples.

	    The page shows a contributor leaderboard, a Hugging Face login button,
	    a text prompt with a word-limit slider, and (once logged in) an image
	    upload with dataset options and a submit button.

	    Returns:
	        The configured ``gr.Blocks`` instance (not yet launched).
	    """
	    collector = OCRDataCollector()

	    with gr.Blocks() as demo:
	        gr.Markdown("# Handwriting OCR Dataset Creator")
	        gr.Markdown("## After almost 100 years of research, handwriting recognition still sucks. Together, we can change that.")

	        # Add leaderboard section at the top
	        gr.Markdown("### 🏆 Top Contributors", show_label=False)
	        with gr.Row():
	            with gr.Column(scale=1):
	                pass
	            with gr.Column(scale=2, min_width=400):
	                leaderboard = gr.Dataframe(
	                    value=collector.get_leaderboard(),
	                    elem_id="leaderboard",
	                    visible=True,
	                    interactive=False,
	                    show_label=False
	                )
	            with gr.Column(scale=1):
	                pass

	        gr.Markdown("### Step 1: Log in with your Hugging Face account to use this app.")
	        # Login section - centered
	        with gr.Row():
	            with gr.Column(scale=1):
	                pass
	            with gr.Column(scale=2, min_width=200):
	                login_btn = gr.LoginButton(elem_id="login_btn")
	                # Activate the login button so OAuth is correctly initialized.
	                login_btn.activate()
	                user_info = gr.Markdown(
	                    value="<center>Please log in with your Hugging Face account to contribute to the dataset.</center>",
	                    elem_id="user_info"
	                )
	                # Create a hidden state component to store the OAuth profile.
	                profile_state = gr.State()
	            with gr.Column(scale=1):
	                pass

	        # Update user info based on the OAuth profile.
	        def update_user_info(profile: gr.OAuthProfile | None) -> str:
	            """Return the login banner text for the given profile."""
	            if profile and getattr(profile, "username", None):
	                return f"<center>Logged in as: {profile.username}</center>"
	            else:
	                return "<center>Please log in with your Hugging Face account to contribute to the dataset.</center>"

	        demo.load(update_user_info, inputs=None, outputs=user_info)

	        # Store the OAuth profile in the hidden state.
	        def store_profile(profile: gr.OAuthProfile | None) -> gr.OAuthProfile | None:
	            """Pass the profile through so it lands in ``profile_state``."""
	            return profile
	        demo.load(store_profile, inputs=None, outputs=profile_state)

	        gr.Markdown(
	            "### Step 2: Read the text. "
	            "You will be shown between 1 and 5 consecutive sentences. Please handwrite them on paper and upload an image of your handwriting. "
	            "You can change the maximum number of words you are willing to write by using the slider below. "
	            "If you wish to skip the current text, click 'Skip'."
	        )

	        text_box = gr.Textbox(
	            value=collector.current_text_block,
	            label="Text to Handwrite",
	            interactive=False,
	            lines=10,
	            show_copy_button=True,
	            visible=True,
	            elem_id="text_box"
	        )

	        max_words_slider = gr.Slider(
	            1, 201, step=5, value=201,
	            label="Maximum Number of Words",
	            interactive=True,
	            visible=True,
	            elem_id="max_words_slider"
	        )

	        regenerate_btn = gr.Button(
	            "Regenerate Text",
	            visible=True,
	            elem_id="regenerate_btn"
	        )

	        gr.Markdown("### Step 3: Upload an image of your handwritten version of the text")

	        upload_info = gr.Markdown(
	            value="You must be logged in to do this, to help us prevent spam submissions",
	            elem_id="upload_info"
	        )

	        image_input = gr.Image(
	            type="pil",
	            label="Upload Handwritten Image",
	            sources=["upload", "webcam"],
	            mirror_webcam=False,  # Explicitly set to false to ensure text is readable
	            visible=False,
	            elem_id="image_input"
	        )

	        with gr.Column(visible=False) as dataset_options:
	            private_checkbox = gr.Checkbox(
	                value=True,
	                label="Private",
	                interactive=True,
	                elem_id="private_cb"
	            )
	            private_explanation = gr.Markdown(
	                "Private: Creates a new dataset on your account named '/handwriting-ocr-private' and appends data there.",
	                elem_id="private_exp"
	            )

	            public_checkbox = gr.Checkbox(
	                value=True,
	                label="Public",
	                interactive=True,
	                elem_id="public_cb"
	            )
	            public_explanation = gr.Markdown(
	                "Public: Will be added to our [public Handwriting OCR dataset](https://huggingface.co/datasets/rawwerks/handwriting-ocr-all). By submitting, you are giving permission to be added to the dataset.",
	                elem_id="public_exp"
	            )

	            anonymous_checkbox = gr.Checkbox(
	                value=False,
	                label="Submit Anonymously",
	                interactive=True,
	                elem_id="anonymous_cb"
	            )
	            anonymous_explanation = gr.Markdown(
	                "If un-checked, your HF username will be appended next to your submission and you will be added to the leaderboard. If checked, your submission will be anonymous in the public dataset.",
	                elem_id="anonymous_exp"
	            )

	        with gr.Row(visible=False) as button_row:
	            submit_btn = gr.Button("Submit", elem_id="submit_btn")

	        # Update UI visibility from the OAuth profile
	        def update_user_state(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None = None, *args):
	            """Show or hide the upload widgets depending on login status.

	            Login status is computed per call from ``profile``; nothing is
	            stored in an object shared between visitors. ``*args`` absorbs
	            the extra ``profile_state`` input wired up in ``demo.load``.
	            """
	            current_user = UserState()
	            current_user.update_from_profile(profile)
	            is_logged_in = current_user.is_logged_in
	            message = "Please upload your handwritten image of the text below." if is_logged_in else "You must be logged in to do this, to help us prevent spam submissions"

	            return {
	                upload_info: gr.update(value=message),
	                image_input: gr.update(visible=is_logged_in),
	                dataset_options: gr.update(visible=is_logged_in),
	                button_row: gr.update(visible=is_logged_in)
	            }

	        # Load initial state and update UI visibility
	        demo.load(update_user_state, inputs=profile_state, outputs=[upload_info, image_input, dataset_options, button_row])

	        # Also load leaderboard on page load
	        demo.load(fn=lambda: collector.get_leaderboard(), outputs=leaderboard)

	        def handle_submit(
	            text: str,
	            upload_image: Image.Image,
	            max_words: int,
	            public_checkbox: bool,
	            private_checkbox: bool,
	            anonymous_checkbox: bool,
	            collector: OCRDataCollector | None = None,
	            profile: gr.OAuthProfile | None = None,
	            oauth_token: gr.OAuthToken | None = None,
	            *args
	        ):
	            """Handle submission using separate credentials:
	            - For public dataset updates, the master token is loaded from .env.
	            - For private dataset updates, the user's OAuth token is used.

	            All validation happens before anything is uploaded, so a
	            rejected submission never leaves a partial upload behind.

	            Returns:
	                ``(None, new_text, leaderboard)`` for the outputs
	                ``[image_input, text_box, leaderboard]``: the image input is
	                cleared, a new prompt is shown, the leaderboard is refreshed.

	            Raises:
	                gr.Error: on missing login, image, dataset selection or
	                    token, and when saving to the private dataset fails.
	            """
	            print("Debug - Initial params:")
	            print(f"Text: {text[:50]}")
	            image = upload_image
	            print(f"Image type: {type(image)}")
	            print(f"Max words: {max_words}")
	            print(f"Public checkbox: {public_checkbox}")
	            print(f"Private checkbox: {private_checkbox}")
	            print(f"Anonymous checkbox: {anonymous_checkbox}")
	            print(f"Collector type: {type(collector)}")

	            if collector is None:
	                raise gr.Error("Internal error: OCR collector not initialized")

	            # Identify the submitter from the profile passed to THIS call. A
	            # single UserState shared by all handlers would let one visitor's
	            # login state and username leak into another visitor's submission.
	            submitter = UserState()
	            submitter.update_from_profile(profile)
	            if not submitter.is_logged_in:
	                raise gr.Error("Please log in to use this application")

	            if not isinstance(image, Image.Image):
	                raise gr.Error("Please upload a valid image before submitting")

	            # Ensure at least one checkbox is selected (checked before any upload)
	            if not public_checkbox and not private_checkbox:
	                raise gr.Error("Please select at least one dataset (public or private) to save to.")

	            master_token = None
	            if public_checkbox:
	                master_token = os.getenv("PUBLIC_DATASET_TOKEN")
	                if not master_token:
	                    raise gr.Error("Master token for public dataset not configured in .env")

	            if private_checkbox:
	                if oauth_token is None:
	                    raise gr.Error("Authentication token is missing. Please log in again.")

	                if not getattr(oauth_token, 'token', None):
	                    raise gr.Error("Invalid OAuth token. Please log in again with the required scopes (write-repos, manage-repos).")

	            # Strip metadata from validated image
	            stripped_image = strip_metadata(image)
	            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

	            temp_dir = "temp"
	            os.makedirs(temp_dir, exist_ok=True)

	            # Public dataset submission using master credentials from .env
	            if public_checkbox:
	                public_repo_id = "rawwerks/handwriting-ocr-all"
	                filename_public = f"{timestamp}_public.png"
	                temp_path_public = os.path.join(temp_dir, filename_public)

	                try:
	                    stripped_image.save(temp_path_public)

	                    # NOTE(review): these two calls use the collector's default
	                    # HfApi credentials, not master_token - confirm intended.
	                    try:
	                        collector.hf_api.dataset_info(public_repo_id)
	                    except Exception:
	                        collector.hf_api.create_repo(public_repo_id, repo_type="dataset", private=False)

	                    features = datasets.Features({
	                        'text': datasets.Value('string'),
	                        'image': datasets.Image(),
	                        'timestamp': datasets.Value('string'),
	                        'user': datasets.Value('string')
	                    })

	                    # NOTE(review): ANY load failure falls back to an empty
	                    # dataset, and the push below would then upload a 'train'
	                    # split holding only this item - confirm this cannot
	                    # overwrite existing data after a transient error.
	                    try:
	                        dataset = datasets.load_dataset(public_repo_id, split="train")
	                    except Exception:
	                        dataset = datasets.Dataset.from_dict({
	                            'text': [],
	                            'image': [],
	                            'timestamp': [],
	                            'user': []
	                        }, features=features)

	                    dataset = dataset.add_item({
	                        'text': text,
	                        'image': temp_path_public,
	                        'timestamp': timestamp,
	                        'user': 'anonymous' if anonymous_checkbox else submitter.username
	                    })

	                    dataset.push_to_hub(public_repo_id, split="train", token=master_token)
	                finally:
	                    # Remove the temp file even when the upload fails.
	                    if os.path.exists(temp_path_public):
	                        os.remove(temp_path_public)

	            # Private dataset submission using user's OAuth token
	            if private_checkbox:  # Only proceed with private dataset if checkbox is checked
	                private_repo_id = f"{submitter.username}/handwriting-ocr-private"
	                filename_private = f"{timestamp}_private.png"
	                temp_path_private = os.path.join(temp_dir, filename_private)

	                try:
	                    stripped_image.save(temp_path_private)

	                    # Initialize HfApi with the OAuth token
	                    hf_api = HfApi(token=oauth_token.token)

	                    try:
	                        # Try to get dataset info first
	                        hf_api.dataset_info(private_repo_id)
	                    except Exception:
	                        # Create repo if it doesn't exist
	                        hf_api.create_repo(
	                            repo_id=private_repo_id,
	                            repo_type="dataset",
	                            private=True,
	                            token=oauth_token.token  # Explicitly pass token here
	                        )

	                    features = datasets.Features({
	                        'text': datasets.Value('string'),
	                        'image': datasets.Image(),
	                        'timestamp': datasets.Value('string')
	                    })

	                    try:
	                        # Load dataset with explicit token
	                        dataset = datasets.load_dataset(private_repo_id, split="train", token=oauth_token.token)
	                    except Exception:
	                        # If dataset doesn't exist yet, create an empty one
	                        dataset = datasets.Dataset.from_dict({
	                            'text': [],
	                            'image': [],
	                            'timestamp': []
	                        }, features=features)

	                    # Add the new item
	                    dataset = dataset.add_item({
	                        'text': text,
	                        'image': temp_path_private,
	                        'timestamp': timestamp
	                    })

	                    # Push to hub with explicit token
	                    dataset.push_to_hub(
	                        private_repo_id,
	                        split="train",
	                        token=oauth_token.token,
	                        private=True
	                    )

	                except Exception as e:
	                    raise gr.Error(f"Failed to save to private dataset: {str(e)}") from e
	                finally:
	                    # Remove the temp file even when the upload fails.
	                    if os.path.exists(temp_path_private):
	                        os.remove(temp_path_private)

	            new_text = collector.get_random_text_block(max_words)
	            return None, new_text, collector.get_leaderboard()

	        # Submit button click handler
	        submit_btn.click(
	            fn=handle_submit,
	            inputs=[
	                text_box,  # Text to handwrite
	                image_input,  # Uploaded image
	                max_words_slider,  # Max words
	                public_checkbox,  # Public dataset option
	                private_checkbox,  # Private dataset option (value, not the component)
	                anonymous_checkbox,
	                gr.State(collector),
	                # Placeholders kept from the original wiring. Gradio is
	                # expected to supply profile/oauth_token from the handler's
	                # type hints, with these values ending up in *args.
	                # NOTE(review): confirm these two inputs are still needed.
	                gr.State(None),
	                gr.State(None)
	            ],
	            outputs=[image_input, text_box, leaderboard]
	        )

	        def handle_regenerate(text, max_words):
	            """Return a new prompt limited to ``max_words``; ``text`` is unused."""
	            # Allow anyone to regenerate text regardless of login status.
	            return collector.get_random_text_block(max_words)

	        regenerate_btn.click(
	            fn=handle_regenerate,
	            inputs=[text_box, max_words_slider],
	            outputs=text_box
	        )

	    return demo

	# Script entry point: build the Blocks app and start the Gradio server
	# only when this file is run directly, not when it is imported.
	if __name__ == "__main__":
	demo = create_gradio_interface()
	demo.launch()