Raymond Weitekamp committed on
Commit
8388b94
·
1 Parent(s): 4478f16

feat: add anonymous submission option for public dataset

Browse files
Files changed (2) hide show
  1. app.py +20 -3
  2. clear_dataset.py +35 -0
app.py CHANGED
@@ -244,6 +244,17 @@ def create_gradio_interface():
244
  "*Public: Will be added to our public dataset. By submitting, you are giving permission to be added to the dataset.*",
245
  elem_id="public_exp"
246
  )
 
 
 
 
 
 
 
 
 
 
 
247
 
248
  with gr.Row(visible=False) as button_row:
249
  submit_btn = gr.Button("Submit", elem_id="submit_btn")
@@ -269,6 +280,7 @@ def create_gradio_interface():
269
  image: Image.Image,
270
  max_words: int,
271
  public_checkbox: bool,
 
272
  collector: OCRDataCollector | None = None,
273
  profile: gr.OAuthProfile | None = None,
274
  oauth_token: gr.OAuthToken | None = None,
@@ -282,6 +294,7 @@ def create_gradio_interface():
282
  print(f"Image type: {type(image)}")
283
  print(f"Max words: {max_words}")
284
  print(f"Public checkbox: {public_checkbox}")
 
285
  print(f"Collector type: {type(collector)}")
286
 
287
  if collector is None:
@@ -318,7 +331,8 @@ def create_gradio_interface():
318
  features = datasets.Features({
319
  'text': datasets.Value('string'),
320
  'image': datasets.Image(),
321
- 'timestamp': datasets.Value('string')
 
322
  })
323
 
324
  try:
@@ -327,13 +341,15 @@ def create_gradio_interface():
327
  dataset = datasets.Dataset.from_dict({
328
  'text': [],
329
  'image': [],
330
- 'timestamp': []
 
331
  }, features=features)
332
 
333
  dataset = dataset.add_item({
334
  'text': text,
335
  'image': temp_path_public,
336
- 'timestamp': timestamp
 
337
  })
338
 
339
  dataset.push_to_hub(public_repo_id, split="train", token=master_token)
@@ -388,6 +404,7 @@ def create_gradio_interface():
388
  image_input, # Uploaded image
389
  max_words_slider, # Max words
390
  public_checkbox, # Public dataset option
 
391
  gr.State(collector),
392
  gr.State(None), # Profile will be filled by Gradio
393
  gr.State(None) # Token will be filled by Gradio
 
244
  "*Public: Will be added to our public dataset. By submitting, you are giving permission to be added to the dataset.*",
245
  elem_id="public_exp"
246
  )
247
+
248
+ anonymous_checkbox = gr.Checkbox(
249
+ value=False,
250
+ label="Submit Anonymously",
251
+ interactive=True,
252
+ elem_id="anonymous_cb"
253
+ )
254
+ anonymous_explanation = gr.Markdown(
255
+ "*If un-checked, your HF username will be appended next to your submission and you will be added to the leaderboard. If checked, your submission will be anonymous in the public dataset.*",
256
+ elem_id="anonymous_exp"
257
+ )
258
 
259
  with gr.Row(visible=False) as button_row:
260
  submit_btn = gr.Button("Submit", elem_id="submit_btn")
 
280
  image: Image.Image,
281
  max_words: int,
282
  public_checkbox: bool,
283
+ anonymous_checkbox: bool,
284
  collector: OCRDataCollector | None = None,
285
  profile: gr.OAuthProfile | None = None,
286
  oauth_token: gr.OAuthToken | None = None,
 
294
  print(f"Image type: {type(image)}")
295
  print(f"Max words: {max_words}")
296
  print(f"Public checkbox: {public_checkbox}")
297
+ print(f"Anonymous checkbox: {anonymous_checkbox}")
298
  print(f"Collector type: {type(collector)}")
299
 
300
  if collector is None:
 
331
  features = datasets.Features({
332
  'text': datasets.Value('string'),
333
  'image': datasets.Image(),
334
+ 'timestamp': datasets.Value('string'),
335
+ 'user': datasets.Value('string')
336
  })
337
 
338
  try:
 
341
  dataset = datasets.Dataset.from_dict({
342
  'text': [],
343
  'image': [],
344
+ 'timestamp': [],
345
+ 'user': []
346
  }, features=features)
347
 
348
  dataset = dataset.add_item({
349
  'text': text,
350
  'image': temp_path_public,
351
+ 'timestamp': timestamp,
352
+ 'user': 'anonymous' if anonymous_checkbox else user_state.username
353
  })
354
 
355
  dataset.push_to_hub(public_repo_id, split="train", token=master_token)
 
404
  image_input, # Uploaded image
405
  max_words_slider, # Max words
406
  public_checkbox, # Public dataset option
407
+ anonymous_checkbox,
408
  gr.State(collector),
409
  gr.State(None), # Profile will be filled by Gradio
410
  gr.State(None) # Token will be filled by Gradio
clear_dataset.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Reset the public handwriting OCR dataset on the Hugging Face Hub.

Pushes an empty dataset with the columns ``text``, ``image``, ``timestamp``
and ``user`` to ``rawwerks/handwriting-ocr-all``, replacing its current
contents.

Requires the ``PUBLIC_DATASET_TOKEN`` environment variable (a ``.env`` file
is loaded if present).

Raises:
    ValueError: if ``PUBLIC_DATASET_TOKEN`` is not set.
"""
import os

from datasets import Dataset, Features, Image, Value, load_dataset
from dotenv import load_dotenv
from huggingface_hub import HfApi

# Load environment variables (e.g. from a local .env file)
load_dotenv()

# Get the token from environment variable; fail early rather than
# attempting an unauthenticated push.
token = os.getenv('PUBLIC_DATASET_TOKEN')
if not token:
    raise ValueError('PUBLIC_DATASET_TOKEN not found in environment')

# Initialize the Hugging Face API client.
# NOTE(review): not used by the steps below; kept so the module still
# exposes `api` as before.
api = HfApi(token=token)

# Explicit schema for the empty dataset. It must be a real `Features`
# object that is passed to `from_dict`: with empty columns there is no data
# to infer types from, so a plain dict that is never applied would leave
# the pushed dataset without the string/image column types.
features = Features({
    'text': Value('string'),
    'image': Image(),
    'timestamp': Value('string'),
    'user': Value('string'),
})

# Create the empty dataset with the schema applied
empty_dataset = Dataset.from_dict({
    'text': [],
    'image': [],
    'timestamp': [],
    'user': [],
}, features=features)

# Push the empty dataset to hub
empty_dataset.push_to_hub('rawwerks/handwriting-ocr-all', private=False, token=token)