Raymond Weitekamp committed on
Commit
0d3abd2
·
1 Parent(s): 31d39e4

feat: add environment variable support and improve dataset submission

Browse files
Files changed (3) hide show
  1. .gitignore +6 -1
  2. app.py +62 -24
  3. requirements.txt +2 -1
.gitignore CHANGED
@@ -29,4 +29,9 @@ test-image.png
29
 
30
  /chrome_profile/
31
 
32
- auth_state.json
 
 
 
 
 
 
29
 
30
  /chrome_profile/
31
 
32
+ auth_state.json
33
+
34
+ .env
35
+ *.env
36
+ .env.*
37
+
app.py CHANGED
@@ -12,6 +12,10 @@ if gr.NO_RELOAD:
12
  import datasets
13
  import numpy as np # Added to help handle numpy array images
14
 
 
 
 
 
15
  # The list of sentences from our previous conversation.
16
  sentences = [
17
  "Optical character recognition (OCR) is the process of converting images of text into machine-readable data.",
@@ -270,7 +274,9 @@ def create_gradio_interface():
270
  oauth_token: gr.OAuthToken | None = None,
271
  *args
272
  ):
273
- """Handle submission with clean parameter order"""
 
 
274
  print(f"Debug - Initial params:")
275
  print(f"Text: {text[:50]}")
276
  print(f"Image type: {type(image)}")
@@ -291,26 +297,24 @@ def create_gradio_interface():
291
  stripped_image = strip_metadata(image)
292
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
293
 
294
- # Define targets based on checkboxes
295
- targets = []
296
- if public_checkbox:
297
- targets.append(("public", "rawwerks/handwriting-ocr-all", "_public", False))
298
- targets.append(("private", f"{user_state.username}/handwriting-ocr-private", "_private", True))
299
-
300
  temp_dir = "temp"
301
  os.makedirs(temp_dir, exist_ok=True)
302
-
303
- for ds_type, repo_id, suffix, is_private in targets:
 
 
 
 
 
 
 
 
 
304
  try:
305
- collector.hf_api.dataset_info(repo_id)
306
  except Exception:
307
- collector.hf_api.create_repo(repo_id, repo_type="dataset", private=is_private)
308
-
309
- filename = f"{timestamp}{suffix}.png"
310
- temp_path = os.path.join(temp_dir, filename)
311
- stripped_image.save(temp_path)
312
 
313
- # Define features to properly handle image files
314
  features = datasets.Features({
315
  'text': datasets.Value('string'),
316
  'image': datasets.Image(),
@@ -318,26 +322,60 @@ def create_gradio_interface():
318
  })
319
 
320
  try:
321
- # Try to load existing dataset
322
- dataset = datasets.load_dataset(repo_id, split="train")
323
  except Exception:
324
- # If no existing dataset, create a new empty one
325
  dataset = datasets.Dataset.from_dict({
326
  'text': [],
327
  'image': [],
328
  'timestamp': []
329
  }, features=features)
330
 
331
- # Add the new item
332
  dataset = dataset.add_item({
333
  'text': text,
334
- 'image': temp_path,
335
  'timestamp': timestamp
336
  })
337
 
338
- # Push updates to hub
339
- dataset.push_to_hub(repo_id, split="train", token=oauth_token.token if oauth_token else None)
340
- os.remove(temp_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
 
342
  new_text = collector.get_random_text_block(max_words)
343
  return None, new_text
 
12
  import datasets
13
  import numpy as np # Added to help handle numpy array images
14
 
15
+ # Load environment variables from .env if available.
16
+ from dotenv import load_dotenv
17
+ load_dotenv()
18
+
19
  # The list of sentences from our previous conversation.
20
  sentences = [
21
  "Optical character recognition (OCR) is the process of converting images of text into machine-readable data.",
 
274
  oauth_token: gr.OAuthToken | None = None,
275
  *args
276
  ):
277
+ """Handle submission using separate credentials:
278
+ - For public dataset updates, the master token is loaded from .env.
279
+ - For private dataset updates, the user's token is used."""
280
  print(f"Debug - Initial params:")
281
  print(f"Text: {text[:50]}")
282
  print(f"Image type: {type(image)}")
 
297
  stripped_image = strip_metadata(image)
298
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
299
 
 
 
 
 
 
 
300
  temp_dir = "temp"
301
  os.makedirs(temp_dir, exist_ok=True)
302
+
303
+ # Public dataset submission using master credentials from .env
304
+ if public_checkbox:
305
+ master_token = os.getenv("PUBLIC_DATASET_TOKEN")
306
+ if not master_token:
307
+ raise gr.Error("Master token for public dataset not configured in .env")
308
+ public_repo_id = "rawwerks/handwriting-ocr-all"
309
+ filename_public = f"{timestamp}_public.png"
310
+ temp_path_public = os.path.join(temp_dir, filename_public)
311
+ stripped_image.save(temp_path_public)
312
+
313
  try:
314
+ collector.hf_api.dataset_info(public_repo_id)
315
  except Exception:
316
+ collector.hf_api.create_repo(public_repo_id, repo_type="dataset", private=False)
 
 
 
 
317
 
 
318
  features = datasets.Features({
319
  'text': datasets.Value('string'),
320
  'image': datasets.Image(),
 
322
  })
323
 
324
  try:
325
+ dataset = datasets.load_dataset(public_repo_id, split="train")
 
326
  except Exception:
 
327
  dataset = datasets.Dataset.from_dict({
328
  'text': [],
329
  'image': [],
330
  'timestamp': []
331
  }, features=features)
332
 
 
333
  dataset = dataset.add_item({
334
  'text': text,
335
+ 'image': temp_path_public,
336
  'timestamp': timestamp
337
  })
338
 
339
+ dataset.push_to_hub(public_repo_id, split="train", token=master_token)
340
+ os.remove(temp_path_public)
341
+
342
+ # Private dataset submission using user's credentials
343
+ if oauth_token is None:
344
+ raise gr.Error("User token is missing. Please log in properly.")
345
+
346
+ private_repo_id = f"{user_state.username}/handwriting-ocr-private"
347
+ filename_private = f"{timestamp}_private.png"
348
+ temp_path_private = os.path.join(temp_dir, filename_private)
349
+ stripped_image.save(temp_path_private)
350
+
351
+ try:
352
+ collector.hf_api.dataset_info(private_repo_id)
353
+ except Exception:
354
+ collector.hf_api.create_repo(private_repo_id, repo_type="dataset", private=True)
355
+
356
+ features = datasets.Features({
357
+ 'text': datasets.Value('string'),
358
+ 'image': datasets.Image(),
359
+ 'timestamp': datasets.Value('string')
360
+ })
361
+
362
+ try:
363
+ dataset = datasets.load_dataset(private_repo_id, split="train")
364
+ except Exception:
365
+ dataset = datasets.Dataset.from_dict({
366
+ 'text': [],
367
+ 'image': [],
368
+ 'timestamp': []
369
+ }, features=features)
370
+
371
+ dataset = dataset.add_item({
372
+ 'text': text,
373
+ 'image': temp_path_private,
374
+ 'timestamp': timestamp
375
+ })
376
+
377
+ dataset.push_to_hub(private_repo_id, split="train", token=oauth_token.token)
378
+ os.remove(temp_path_private)
379
 
380
  new_text = collector.get_random_text_block(max_words)
381
  return None, new_text
requirements.txt CHANGED
@@ -6,4 +6,5 @@ pytest-playwright>=0.4.0
6
  pytest-asyncio>=0.23.0
7
  playwright>=1.40.0
8
  datasets>=2.16.0
9
- pydantic>=2.6.1
 
 
6
  pytest-asyncio>=0.23.0
7
  playwright>=1.40.0
8
  datasets>=2.16.0
9
+ pydantic>=2.6.1
10
+ python-dotenv>=1.0.0