Raymond Weitekamp committed
Commit 0d3abd2 · 1 Parent(s): 31d39e4

feat: add environment variable support and improve dataset submission

Files changed:
- .gitignore (+6 -1)
- app.py (+62 -24)
- requirements.txt (+2 -1)
.gitignore CHANGED
@@ -29,4 +29,9 @@ test-image.png
 
 /chrome_profile/
 
-auth_state.json
+auth_state.json
+
+.env
+*.env
+.env.*
+
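The newly ignored auth_state.json and .env patterns keep credentials out of the repo; the .env file is where the Space's master token is expected to live. A minimal sketch of reading that token at startup, assuming the PUBLIC_DATASET_TOKEN name used in app.py below (the example .env line in the comment is hypothetical):

import os

from dotenv import load_dotenv

# Hypothetical .env contents (ignored by git, never committed):
#   PUBLIC_DATASET_TOKEN=hf_xxxxxxxxxxxxxxxx
load_dotenv()  # reads key=value pairs from .env into the process environment

master_token = os.getenv("PUBLIC_DATASET_TOKEN")
if not master_token:
    raise RuntimeError("PUBLIC_DATASET_TOKEN is not configured")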
app.py CHANGED
@@ -12,6 +12,10 @@ if gr.NO_RELOAD:
 import datasets
 import numpy as np  # Added to help handle numpy array images
 
+# Load environment variables from .env if available.
+from dotenv import load_dotenv
+load_dotenv()
+
 # The list of sentences from our previous conversation.
 sentences = [
     "Optical character recognition (OCR) is the process of converting images of text into machine-readable data.",
@@ -270,7 +274,9 @@ def create_gradio_interface():
         oauth_token: gr.OAuthToken | None = None,
         *args
     ):
-        """Handle submission
+        """Handle submission using separate credentials:
+        - For public dataset updates, the master token is loaded from .env.
+        - For private dataset updates, the user's token is used."""
         print(f"Debug - Initial params:")
         print(f"Text: {text[:50]}")
         print(f"Image type: {type(image)}")
@@ -291,26 +297,24 @@ def create_gradio_interface():
         stripped_image = strip_metadata(image)
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 
-        # Define targets based on checkboxes
-        targets = []
-        if public_checkbox:
-            targets.append(("public", "rawwerks/handwriting-ocr-all", "_public", False))
-        targets.append(("private", f"{user_state.username}/handwriting-ocr-private", "_private", True))
-
         temp_dir = "temp"
         os.makedirs(temp_dir, exist_ok=True)
-
-
+
+        # Public dataset submission using master credentials from .env
+        if public_checkbox:
+            master_token = os.getenv("PUBLIC_DATASET_TOKEN")
+            if not master_token:
+                raise gr.Error("Master token for public dataset not configured in .env")
+            public_repo_id = "rawwerks/handwriting-ocr-all"
+            filename_public = f"{timestamp}_public.png"
+            temp_path_public = os.path.join(temp_dir, filename_public)
+            stripped_image.save(temp_path_public)
+
             try:
-                collector.hf_api.dataset_info(
+                collector.hf_api.dataset_info(public_repo_id)
             except Exception:
-                collector.hf_api.create_repo(
-
-            filename = f"{timestamp}{suffix}.png"
-            temp_path = os.path.join(temp_dir, filename)
-            stripped_image.save(temp_path)
+                collector.hf_api.create_repo(public_repo_id, repo_type="dataset", private=False)
 
-            # Define features to properly handle image files
             features = datasets.Features({
                 'text': datasets.Value('string'),
                 'image': datasets.Image(),
@@ -318,26 +322,60 @@ def create_gradio_interface():
             })
 
             try:
-
-                dataset = datasets.load_dataset(repo_id, split="train")
+                dataset = datasets.load_dataset(public_repo_id, split="train")
             except Exception:
-                # If no existing dataset, create a new empty one
                 dataset = datasets.Dataset.from_dict({
                     'text': [],
                     'image': [],
                     'timestamp': []
                 }, features=features)
 
-            # Add the new item
             dataset = dataset.add_item({
                 'text': text,
-                'image':
+                'image': temp_path_public,
                 'timestamp': timestamp
             })
 
-
-
-
+            dataset.push_to_hub(public_repo_id, split="train", token=master_token)
+            os.remove(temp_path_public)
+
+        # Private dataset submission using user's credentials
+        if oauth_token is None:
+            raise gr.Error("User token is missing. Please log in properly.")
+
+        private_repo_id = f"{user_state.username}/handwriting-ocr-private"
+        filename_private = f"{timestamp}_private.png"
+        temp_path_private = os.path.join(temp_dir, filename_private)
+        stripped_image.save(temp_path_private)
+
+        try:
+            collector.hf_api.dataset_info(private_repo_id)
+        except Exception:
+            collector.hf_api.create_repo(private_repo_id, repo_type="dataset", private=True)
+
+        features = datasets.Features({
+            'text': datasets.Value('string'),
+            'image': datasets.Image(),
+            'timestamp': datasets.Value('string')
+        })
+
+        try:
+            dataset = datasets.load_dataset(private_repo_id, split="train")
+        except Exception:
+            dataset = datasets.Dataset.from_dict({
+                'text': [],
+                'image': [],
+                'timestamp': []
+            }, features=features)
+
+        dataset = dataset.add_item({
+            'text': text,
+            'image': temp_path_private,
+            'timestamp': timestamp
+        })
+
+        dataset.push_to_hub(private_repo_id, split="train", token=oauth_token.token)
+        os.remove(temp_path_private)
 
         new_text = collector.get_random_text_block(max_words)
         return None, new_text
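Both branches of the rewritten handler repeat the same load-or-create, append, and push pattern against the datasets and huggingface_hub APIs, differing only in repo id, token, and visibility. A standalone sketch of that pattern under hypothetical names (repo_id, token, image_path are illustrative, not the handler's actual variables):

import datasets
from huggingface_hub import HfApi


def append_sample(repo_id: str, token: str, image_path: str, text: str, timestamp: str) -> None:
    """Append one (text, image, timestamp) row to a Hub dataset, creating the repo if needed."""
    api = HfApi(token=token)

    # Create the dataset repo on the Hub the first time around.
    try:
        api.dataset_info(repo_id)
    except Exception:
        api.create_repo(repo_id, repo_type="dataset", private=True)

    features = datasets.Features({
        "text": datasets.Value("string"),
        "image": datasets.Image(),
        "timestamp": datasets.Value("string"),
    })

    # Load the existing train split, or start from an empty dataset with the same schema.
    try:
        dataset = datasets.load_dataset(repo_id, split="train")
    except Exception:
        dataset = datasets.Dataset.from_dict(
            {"text": [], "image": [], "timestamp": []}, features=features
        )

    # datasets.Image() accepts a file path and encodes the image when pushing.
    dataset = dataset.add_item({"text": text, "image": image_path, "timestamp": timestamp})
    dataset.push_to_hub(repo_id, split="train", token=token)

In the handler above, the public branch would pass the master token loaded from .env, and the private branch the user's OAuth token.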
requirements.txt CHANGED
@@ -6,4 +6,5 @@ pytest-playwright>=0.4.0
 pytest-asyncio>=0.23.0
 playwright>=1.40.0
 datasets>=2.16.0
-pydantic>=2.6.1
+pydantic>=2.6.1
+python-dotenv>=1.0.0
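python-dotenv is now pinned as a hard dependency, so the unconditional dotenv import added to app.py resolves inside the Space. For local runs where the package might be absent, a tolerant variant (an alternative sketch, not what this commit does) would fall back to the plain process environment:

# Optional .env loading: fall back to os.environ if python-dotenv is not installed.
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass  # rely on variables already exported in the environment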