Spaces:
Runtime error
Runtime error
Raymond Weitekamp
committed on
Commit
·
8388b94
1
Parent(s):
4478f16
feat: add anonymous submission option for public dataset
Browse files
- app.py +20 -3
- clear_dataset.py +35 -0
app.py
CHANGED
@@ -244,6 +244,17 @@ def create_gradio_interface():
|
|
244 |
"*Public: Will be added to our public dataset. By submitting, you are giving permission to be added to the dataset.*",
|
245 |
elem_id="public_exp"
|
246 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
247 |
|
248 |
with gr.Row(visible=False) as button_row:
|
249 |
submit_btn = gr.Button("Submit", elem_id="submit_btn")
|
@@ -269,6 +280,7 @@ def create_gradio_interface():
|
|
269 |
image: Image.Image,
|
270 |
max_words: int,
|
271 |
public_checkbox: bool,
|
|
|
272 |
collector: OCRDataCollector | None = None,
|
273 |
profile: gr.OAuthProfile | None = None,
|
274 |
oauth_token: gr.OAuthToken | None = None,
|
@@ -282,6 +294,7 @@ def create_gradio_interface():
|
|
282 |
print(f"Image type: {type(image)}")
|
283 |
print(f"Max words: {max_words}")
|
284 |
print(f"Public checkbox: {public_checkbox}")
|
|
|
285 |
print(f"Collector type: {type(collector)}")
|
286 |
|
287 |
if collector is None:
|
@@ -318,7 +331,8 @@ def create_gradio_interface():
|
|
318 |
features = datasets.Features({
|
319 |
'text': datasets.Value('string'),
|
320 |
'image': datasets.Image(),
|
321 |
-
'timestamp': datasets.Value('string')
|
|
|
322 |
})
|
323 |
|
324 |
try:
|
@@ -327,13 +341,15 @@ def create_gradio_interface():
|
|
327 |
dataset = datasets.Dataset.from_dict({
|
328 |
'text': [],
|
329 |
'image': [],
|
330 |
-
'timestamp': []
|
|
|
331 |
}, features=features)
|
332 |
|
333 |
dataset = dataset.add_item({
|
334 |
'text': text,
|
335 |
'image': temp_path_public,
|
336 |
-
'timestamp': timestamp
|
|
|
337 |
})
|
338 |
|
339 |
dataset.push_to_hub(public_repo_id, split="train", token=master_token)
|
@@ -388,6 +404,7 @@ def create_gradio_interface():
|
|
388 |
image_input, # Uploaded image
|
389 |
max_words_slider, # Max words
|
390 |
public_checkbox, # Public dataset option
|
|
|
391 |
gr.State(collector),
|
392 |
gr.State(None), # Profile will be filled by Gradio
|
393 |
gr.State(None) # Token will be filled by Gradio
|
|
|
244 |
"*Public: Will be added to our public dataset. By submitting, you are giving permission to be added to the dataset.*",
|
245 |
elem_id="public_exp"
|
246 |
)
|
247 |
+
|
248 |
+
anonymous_checkbox = gr.Checkbox(
|
249 |
+
value=False,
|
250 |
+
label="Submit Anonymously",
|
251 |
+
interactive=True,
|
252 |
+
elem_id="anonymous_cb"
|
253 |
+
)
|
254 |
+
anonymous_explanation = gr.Markdown(
|
255 |
+
"*If un-checked, your HF username will be appended next to your submission and you will be added to the leaderboard. If checked, your submission will be anonymous in the public dataset.*",
|
256 |
+
elem_id="anonymous_exp"
|
257 |
+
)
|
258 |
|
259 |
with gr.Row(visible=False) as button_row:
|
260 |
submit_btn = gr.Button("Submit", elem_id="submit_btn")
|
|
|
280 |
image: Image.Image,
|
281 |
max_words: int,
|
282 |
public_checkbox: bool,
|
283 |
+
anonymous_checkbox: bool,
|
284 |
collector: OCRDataCollector | None = None,
|
285 |
profile: gr.OAuthProfile | None = None,
|
286 |
oauth_token: gr.OAuthToken | None = None,
|
|
|
294 |
print(f"Image type: {type(image)}")
|
295 |
print(f"Max words: {max_words}")
|
296 |
print(f"Public checkbox: {public_checkbox}")
|
297 |
+
print(f"Anonymous checkbox: {anonymous_checkbox}")
|
298 |
print(f"Collector type: {type(collector)}")
|
299 |
|
300 |
if collector is None:
|
|
|
331 |
features = datasets.Features({
|
332 |
'text': datasets.Value('string'),
|
333 |
'image': datasets.Image(),
|
334 |
+
'timestamp': datasets.Value('string'),
|
335 |
+
'user': datasets.Value('string')
|
336 |
})
|
337 |
|
338 |
try:
|
|
|
341 |
dataset = datasets.Dataset.from_dict({
|
342 |
'text': [],
|
343 |
'image': [],
|
344 |
+
'timestamp': [],
|
345 |
+
'user': []
|
346 |
}, features=features)
|
347 |
|
348 |
dataset = dataset.add_item({
|
349 |
'text': text,
|
350 |
'image': temp_path_public,
|
351 |
+
'timestamp': timestamp,
|
352 |
+
'user': 'anonymous' if anonymous_checkbox else user_state.username
|
353 |
})
|
354 |
|
355 |
dataset.push_to_hub(public_repo_id, split="train", token=master_token)
|
|
|
404 |
image_input, # Uploaded image
|
405 |
max_words_slider, # Max words
|
406 |
public_checkbox, # Public dataset option
|
407 |
+
anonymous_checkbox,
|
408 |
gr.State(collector),
|
409 |
gr.State(None), # Profile will be filled by Gradio
|
410 |
gr.State(None) # Token will be filled by Gradio
|
clear_dataset.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Push an empty, correctly-typed dataset to the public handwriting-OCR repo.

Running this script uploads a zero-row dataset with the columns ``text``,
``image``, ``timestamp`` and ``user`` to ``rawwerks/handwriting-ocr-all``.
The write token is read from the ``PUBLIC_DATASET_TOKEN`` environment
variable (a local ``.env`` file is loaded first).

Raises:
    ValueError: if ``PUBLIC_DATASET_TOKEN`` is not set.
"""
import os

from datasets import Dataset, Features, Image, Value, load_dataset
from dotenv import load_dotenv
from huggingface_hub import HfApi

# Load environment variables
load_dotenv()

# Get the token from environment variable
token = os.getenv('PUBLIC_DATASET_TOKEN')
if not token:
    raise ValueError('PUBLIC_DATASET_TOKEN not found in environment')

# Initialize the Hugging Face API client
# NOTE(review): `api` and `load_dataset` are not used below; kept so nothing
# that imports or extends this script breaks -- confirm and remove if unneeded.
api = HfApi(token=token)

# Explicit schema. Without it, empty columns carry no type information and the
# pushed dataset would not declare `image` as an image column or the other
# columns as strings, so the schema would differ from the one app.py uses
# when it appends submissions.
features = Features({
    'text': Value('string'),
    'image': Image(),
    'timestamp': Value('string'),
    'user': Value('string'),
})

# Create the empty dataset with the schema actually applied
empty_dataset = Dataset.from_dict(
    {
        'text': [],
        'image': [],
        'timestamp': [],
        'user': [],
    },
    features=features,
)

# Push the empty dataset to hub (split named explicitly, matching app.py)
empty_dataset.push_to_hub(
    'rawwerks/handwriting-ocr-all',
    split='train',
    private=False,
    token=token,
)