Raymond Weitekamp committed on
Commit
0d3abd2
·
1 Parent(s): 31d39e4

feat: add environment variable support and improve dataset submission

Browse files
Files changed (3) hide show
  1. .gitignore +6 -1
  2. app.py +62 -24
  3. requirements.txt +2 -1
.gitignore CHANGED
@@ -29,4 +29,9 @@ test-image.png
29
 
30
  /chrome_profile/
31
 
32
- auth_state.json
 
 
 
 
 
 
29
 
30
  /chrome_profile/
31
 
32
+ auth_state.json
33
+
34
+ .env
35
+ *.env
36
+ .env.*
37
+
app.py CHANGED
@@ -12,6 +12,10 @@ if gr.NO_RELOAD:
12
  import datasets
13
  import numpy as np # Added to help handle numpy array images
14
 
 
 
 
 
15
  # The list of sentences from our previous conversation.
16
  sentences = [
17
  "Optical character recognition (OCR) is the process of converting images of text into machine-readable data.",
@@ -270,7 +274,9 @@ def create_gradio_interface():
270
  oauth_token: gr.OAuthToken | None = None,
271
  *args
272
  ):
273
- """Handle submission with clean parameter order"""
 
 
274
  print(f"Debug - Initial params:")
275
  print(f"Text: {text[:50]}")
276
  print(f"Image type: {type(image)}")
@@ -291,26 +297,24 @@ def create_gradio_interface():
291
  stripped_image = strip_metadata(image)
292
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
293
 
294
- # Define targets based on checkboxes
295
- targets = []
296
- if public_checkbox:
297
- targets.append(("public", "rawwerks/handwriting-ocr-all", "_public", False))
298
- targets.append(("private", f"{user_state.username}/handwriting-ocr-private", "_private", True))
299
-
300
  temp_dir = "temp"
301
  os.makedirs(temp_dir, exist_ok=True)
302
-
303
- for ds_type, repo_id, suffix, is_private in targets:
 
 
 
 
 
 
 
 
 
304
  try:
305
- collector.hf_api.dataset_info(repo_id)
306
  except Exception:
307
- collector.hf_api.create_repo(repo_id, repo_type="dataset", private=is_private)
308
-
309
- filename = f"{timestamp}{suffix}.png"
310
- temp_path = os.path.join(temp_dir, filename)
311
- stripped_image.save(temp_path)
312
 
313
- # Define features to properly handle image files
314
  features = datasets.Features({
315
  'text': datasets.Value('string'),
316
  'image': datasets.Image(),
@@ -318,26 +322,60 @@ def create_gradio_interface():
318
  })
319
 
320
  try:
321
- # Try to load existing dataset
322
- dataset = datasets.load_dataset(repo_id, split="train")
323
  except Exception:
324
- # If no existing dataset, create a new empty one
325
  dataset = datasets.Dataset.from_dict({
326
  'text': [],
327
  'image': [],
328
  'timestamp': []
329
  }, features=features)
330
 
331
- # Add the new item
332
  dataset = dataset.add_item({
333
  'text': text,
334
- 'image': temp_path,
335
  'timestamp': timestamp
336
  })
337
 
338
- # Push updates to hub
339
- dataset.push_to_hub(repo_id, split="train", token=oauth_token.token if oauth_token else None)
340
- os.remove(temp_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
 
342
  new_text = collector.get_random_text_block(max_words)
343
  return None, new_text
 
12
  import datasets
13
  import numpy as np # Added to help handle numpy array images
14
 
15
+ # Load environment variables from .env if available.
16
+ from dotenv import load_dotenv
17
+ load_dotenv()
18
+
19
  # The list of sentences from our previous conversation.
20
  sentences = [
21
  "Optical character recognition (OCR) is the process of converting images of text into machine-readable data.",
 
274
  oauth_token: gr.OAuthToken | None = None,
275
  *args
276
  ):
277
+ """Handle submission using separate credentials:
278
+ - For public dataset updates, the master token is loaded from .env.
279
+ - For private dataset updates, the user's token is used."""
280
  print(f"Debug - Initial params:")
281
  print(f"Text: {text[:50]}")
282
  print(f"Image type: {type(image)}")
 
297
  stripped_image = strip_metadata(image)
298
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
299
 
 
 
 
 
 
 
300
  temp_dir = "temp"
301
  os.makedirs(temp_dir, exist_ok=True)
302
+
303
+ # Public dataset submission using master credentials from .env
304
+ if public_checkbox:
305
+ master_token = os.getenv("PUBLIC_DATASET_TOKEN")
306
+ if not master_token:
307
+ raise gr.Error("Master token for public dataset not configured in .env")
308
+ public_repo_id = "rawwerks/handwriting-ocr-all"
309
+ filename_public = f"{timestamp}_public.png"
310
+ temp_path_public = os.path.join(temp_dir, filename_public)
311
+ stripped_image.save(temp_path_public)
312
+
313
  try:
314
+ collector.hf_api.dataset_info(public_repo_id)
315
  except Exception:
316
+ collector.hf_api.create_repo(public_repo_id, repo_type="dataset", private=False)
 
 
 
 
317
 
 
318
  features = datasets.Features({
319
  'text': datasets.Value('string'),
320
  'image': datasets.Image(),
 
322
  })
323
 
324
  try:
325
+ dataset = datasets.load_dataset(public_repo_id, split="train")
 
326
  except Exception:
 
327
  dataset = datasets.Dataset.from_dict({
328
  'text': [],
329
  'image': [],
330
  'timestamp': []
331
  }, features=features)
332
 
 
333
  dataset = dataset.add_item({
334
  'text': text,
335
+ 'image': temp_path_public,
336
  'timestamp': timestamp
337
  })
338
 
339
+ dataset.push_to_hub(public_repo_id, split="train", token=master_token)
340
+ os.remove(temp_path_public)
341
+
342
+ # Private dataset submission using user's credentials
343
+ if oauth_token is None:
344
+ raise gr.Error("User token is missing. Please log in properly.")
345
+
346
+ private_repo_id = f"{user_state.username}/handwriting-ocr-private"
347
+ filename_private = f"{timestamp}_private.png"
348
+ temp_path_private = os.path.join(temp_dir, filename_private)
349
+ stripped_image.save(temp_path_private)
350
+
351
+ try:
352
+ collector.hf_api.dataset_info(private_repo_id)
353
+ except Exception:
354
+ collector.hf_api.create_repo(private_repo_id, repo_type="dataset", private=True)
355
+
356
+ features = datasets.Features({
357
+ 'text': datasets.Value('string'),
358
+ 'image': datasets.Image(),
359
+ 'timestamp': datasets.Value('string')
360
+ })
361
+
362
+ try:
363
+ dataset = datasets.load_dataset(private_repo_id, split="train")
364
+ except Exception:
365
+ dataset = datasets.Dataset.from_dict({
366
+ 'text': [],
367
+ 'image': [],
368
+ 'timestamp': []
369
+ }, features=features)
370
+
371
+ dataset = dataset.add_item({
372
+ 'text': text,
373
+ 'image': temp_path_private,
374
+ 'timestamp': timestamp
375
+ })
376
+
377
+ dataset.push_to_hub(private_repo_id, split="train", token=oauth_token.token)
378
+ os.remove(temp_path_private)
379
 
380
  new_text = collector.get_random_text_block(max_words)
381
  return None, new_text
requirements.txt CHANGED
@@ -6,4 +6,5 @@ pytest-playwright>=0.4.0
6
  pytest-asyncio>=0.23.0
7
  playwright>=1.40.0
8
  datasets>=2.16.0
9
- pydantic>=2.6.1
 
 
6
  pytest-asyncio>=0.23.0
7
  playwright>=1.40.0
8
  datasets>=2.16.0
9
+ pydantic>=2.6.1
10
+ python-dotenv>=1.0.0