Wauplin HF staff committed on
Commit
3531f81
·
1 Parent(s): a61fe81

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +111 -0
  2. app_1M_image.py +112 -0
  3. app_image.py +50 -0
  4. app_json.py +44 -0
app.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ import gradio as gr
4
+
5
+ from app_1M_image import get_demo as get_demo_1M_image
6
+ from app_image import get_demo as get_demo_image
7
+ from app_json import get_demo as get_demo_json
8
+ from huggingface_hub import logging
9
+
10
+
11
+ logging.set_verbosity_debug()
12
+
13
+
14
+ def _get_demo_code(path: str) -> str:
15
+ code = Path(path).read_text()
16
+ code = code.replace("def get_demo():", "with gr.Blocks() as demo:")
17
+ code += "\n\ndemo.launch()"
18
+ return code
19
+
20
+
21
# Markdown/HTML introduction; rendered first inside the top-level `gr.Blocks`.
DEMO_EXPLANATION = """
<h1 style='text-align: center; margin-bottom: 1rem'> How to persist data from a Space to a Dataset? </h1>

This demo shows how to leverage both `gradio` and `huggingface_hub` to save data from a Space to a Dataset on the Hub.
When doing so, a few things must be taken care of: file formats, concurrent writes, name collision, number of commits,
number of files,... The tabs below show different ways of implementing a "save to dataset" feature. Depending on the
complexity and usage of your app, you might want to use one or the other.

This Space demo comes as a pair with this guide. If you need more technical details, please refer to it.
"""
31
+
32
# Markdown shown at the top of the "JSON Dataset" tab.
JSON_DEMO_EXPLANATION = """
## Use case

- Save inputs and outputs
- Build an annotation platform

## Data

Json-able only: text and numeric but no binaries.

## Robustness

Works with concurrent users and replicas.

## Limitations

If you expect millions of lines, you will need to split the local JSON file into multiple files to avoid getting your file tracked as LFS (5MB) on the Hub.

## Demo
"""
52
+
53
# Markdown shown at the top of the "Image Dataset" tab.
IMAGE_DEMO_EXPLANATION = """
## Use case

Save images with metadata (caption, parameters, datetime,...).

## Robustness

Works with concurrent users and replicas.

## Limitations

- only 10k images/folder supported on the Hub. If you expect more usage, you must save data in subfolders.
- only 1M images/repo supported on the Hub. If you expect more usage, you can zip your data before upload. See the _1M images Dataset_ demo.

## Demo
"""
69
+
70
# Markdown shown at the top of the "1M images Dataset" tab.
IMAGE_1M_DEMO_EXPLANATION = """
## Use case

Same as _Image Dataset_ example, but with very high usage expected.

## Robustness

Works with concurrent users and replicas.

## Limitations

None.

## Demo
"""
85
+
86
def _add_demo_tab(title: str, explanation: str, build_demo, dataset_name: str, source_file: str) -> None:
    """Render one tab of the showcase inside the currently open `gr.Blocks`.

    Args:
        title: Label of the tab.
        explanation: Markdown displayed at the top of the tab.
        build_demo: Zero-argument callable that creates the live demo components.
        dataset_name: Name of the dataset (under the `Wauplin` namespace) linked as the result.
        source_file: Python file whose rewritten source is displayed in the tab.
    """
    with gr.Tab(title):
        gr.Markdown(explanation)
        build_demo()
        gr.Markdown(f"## Result\n\nhttps://huggingface.co/datasets/Wauplin/{dataset_name}\n\n## Code")
        with gr.Accordion("Source code", open=True):
            gr.Code(_get_demo_code(source_file), language="python")


# One page with an introduction followed by one tab per persistence strategy.
with gr.Blocks() as demo:
    gr.Markdown(DEMO_EXPLANATION)

    _add_demo_tab(
        "JSON Dataset", JSON_DEMO_EXPLANATION, get_demo_json, "example-commit-scheduler-json", "app_json.py"
    )
    _add_demo_tab(
        "Image Dataset", IMAGE_DEMO_EXPLANATION, get_demo_image, "example-commit-scheduler-image", "app_image.py"
    )
    _add_demo_tab(
        "1M images Dataset",
        IMAGE_1M_DEMO_EXPLANATION,
        get_demo_1M_image,
        "example-commit-scheduler-image-zip",
        "app_1M_image.py",
    )
demo.launch()
app_1M_image.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import tempfile
3
+ import zipfile
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+ from uuid import uuid4
7
+
8
+ import gradio as gr
9
+ import numpy as np
10
+ from PIL import Image
11
+
12
+ from huggingface_hub import CommitScheduler, InferenceClient
13
+
14
+
15
# Local staging folder. The uuid4 suffix yields a new folder name on every
# import of this module (presumably so restarts/replicas never share a folder).
IMAGE_DATASET_DIR = Path("image_dataset_1M") / f"train-{uuid4()}"
IMAGE_DATASET_DIR.mkdir(parents=True, exist_ok=True)

# JSON-lines file, stored next to the images, that receives one record per saved image.
IMAGE_JSONL_PATH = IMAGE_DATASET_DIR / "metadata.jsonl"
19
+
20
+
21
class ZipScheduler(CommitScheduler):
    """
    Example of a custom CommitScheduler with overwritten `push_to_hub` to zip images before pushing them to the Hub.

    Workflow:
    1. Read metadata + list PNG files.
    2. Zip png files in a single archive.
    3. Create commit (metadata + archive).
    4. Delete local png files to avoid re-uploading them later.

    Only step 1 requires to activate the lock. Once the metadata is read, the lock is released and the rest of the
    process can be done without blocking the Gradio app.

    If zipping or uploading fails, the metadata read in step 1 is written back to disk so that it is included in the
    next attempt instead of being lost.
    """

    def push_to_hub(self):
        """Zip the pending PNG files with their metadata and upload the archive as a single file.

        Returns:
            None, whether or not something was uploaded.

        Raises:
            Exception: Whatever zipping or `self.api.upload_file` raises. The metadata is restored locally before
                the exception is re-raised.
        """
        # 1. Read metadata + list PNG files
        with self.lock:
            png_files = list(self.folder_path.glob("*.png"))
            if not png_files:
                return None  # return early if nothing to commit

            # Read and delete metadata file. It can be missing while PNG files remain (e.g. leftovers that could
            # not be deleted after an earlier upload); the images are then archived with empty metadata.
            try:
                metadata = IMAGE_JSONL_PATH.read_text(encoding="utf-8")
            except FileNotFoundError:
                metadata = ""
            try:
                IMAGE_JSONL_PATH.unlink()
            except OSError:
                pass  # best effort: a file that stays in place is simply read again on the next run

        try:
            with tempfile.TemporaryDirectory() as tmpdir:
                # 2. Zip png files + metadata in a single archive
                archive_path = Path(tmpdir) / "train.zip"
                with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as archive:
                    # PNG files
                    for png_file in png_files:
                        archive.write(filename=png_file, arcname=png_file.name)

                    # Metadata (written straight from memory, no intermediate file needed)
                    archive.writestr("metadata.jsonl", metadata)

                # 3. Create commit
                self.api.upload_file(
                    repo_id=self.repo_id,
                    repo_type=self.repo_type,
                    revision=self.revision,
                    path_in_repo=f"train-{uuid4()}.zip",
                    path_or_fileobj=archive_path,
                )
        except Exception:
            # Nothing reached the Hub: put the metadata back before propagating the error.
            self._restore_metadata(metadata)
            raise

        # 4. Delete local png files to avoid re-uploading them later
        for png_file in png_files:
            try:
                png_file.unlink()
            except OSError:
                pass  # best effort: a leftover file is picked up again by the next run
        return None

    def _restore_metadata(self, metadata: str) -> None:
        """Write `metadata` back in front of any records appended since it was read."""
        if not metadata:
            return
        with self.lock:
            try:
                newer_records = IMAGE_JSONL_PATH.read_text(encoding="utf-8")
            except FileNotFoundError:
                newer_records = ""
            IMAGE_JSONL_PATH.write_text(metadata + newer_records, encoding="utf-8")
77
+
78
+
79
# Scheduler in charge of uploading the content of IMAGE_DATASET_DIR to the
# "example-commit-scheduler-image-zip" dataset repo through `ZipScheduler.push_to_hub`.
# Its `lock` is shared with `save_image` to serialize local writes and uploads.
scheduler = ZipScheduler(
    repo_id="example-commit-scheduler-image-zip",
    repo_type="dataset",
    folder_path=IMAGE_DATASET_DIR,
)

# Client used by `generate_image` for text-to-image inference (default settings).
client = InferenceClient()
86
+
87
+
88
def generate_image(prompt: str) -> Image.Image:
    """Generate an image from `prompt` using the module-level inference client.

    Args:
        prompt: Text description of the image to generate.

    Returns:
        The result of `client.text_to_image(prompt)`.
    """
    return client.text_to_image(prompt)
90
+
91
+
92
def save_image(prompt: str, image_array: np.ndarray) -> None:
    """Save an image as PNG in the staging folder and append its metadata record.

    Both writes happen while holding `scheduler.lock`, the same lock that
    `ZipScheduler.push_to_hub` takes while listing files and reading metadata.

    Args:
        prompt: Prompt associated with the image.
        image_array: Pixel data, converted with `PIL.Image.fromarray`.
    """
    print("Saving: " + prompt)
    image_path = IMAGE_DATASET_DIR / f"{uuid4()}.png"

    with scheduler.lock:
        Image.fromarray(image_array).save(image_path)
        record = {"prompt": prompt, "file_name": image_path.name, "datetime": datetime.now().isoformat()}
        with IMAGE_JSONL_PATH.open("a") as f:
            # One record per line, emitted with a single write call.
            f.write(json.dumps(record) + "\n")
101
+
102
+
103
def get_demo():
    # Creates the demo components; app.py calls this inside a `gr.Blocks()` context.
    # Documented with comments (not a docstring) and with the `def` line kept verbatim,
    # because app.py textually replaces that line when displaying this source.
    with gr.Row():
        prompt_value = gr.Textbox(label="Prompt")
        image_value = gr.Image(label="Generated image")
    text_to_image_btn = gr.Button("Generate")
    # Generate on click; `save_image` is chained with `.success(...)` after the generation step.
    text_to_image_btn.click(fn=generate_image, inputs=prompt_value, outputs=image_value).success(
        fn=save_image,
        inputs=[prompt_value, image_value],
        outputs=None,
    )
app_image.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from datetime import datetime
3
+ from pathlib import Path
4
+ from uuid import uuid4
5
+
6
+ import gradio as gr
7
+ import numpy as np
8
+ from PIL import Image
9
+
10
+ from huggingface_hub import CommitScheduler, InferenceClient
11
+
12
+
13
# Local staging folder. The uuid4 suffix yields a new folder name on every
# import of this module (presumably so restarts/replicas never share a folder).
IMAGE_DATASET_DIR = Path("image_dataset") / f"train-{uuid4()}"
IMAGE_DATASET_DIR.mkdir(parents=True, exist_ok=True)
# JSON-lines file, stored next to the images, that receives one record per saved image.
IMAGE_JSONL_PATH = IMAGE_DATASET_DIR / "metadata.jsonl"
16
+
17
# Scheduler in charge of uploading the content of IMAGE_DATASET_DIR to the
# "example-commit-scheduler-image" dataset repo, under a folder named after the
# local one. Its `lock` is taken by `save_image` around local writes.
scheduler = CommitScheduler(
    repo_id="example-commit-scheduler-image",
    repo_type="dataset",
    folder_path=IMAGE_DATASET_DIR,
    path_in_repo=IMAGE_DATASET_DIR.name,
)

# Client used by `generate_image` for text-to-image inference (default settings).
client = InferenceClient()
25
+
26
+
27
def generate_image(prompt: str) -> Image.Image:
    """Generate an image from `prompt` using the module-level inference client.

    Args:
        prompt: Text description of the image to generate.

    Returns:
        The result of `client.text_to_image(prompt)`.
    """
    return client.text_to_image(prompt)
29
+
30
+
31
def save_image(prompt: str, image_array: np.ndarray) -> None:
    """Save an image as PNG in the staging folder and append its metadata record.

    Both writes happen while holding `scheduler.lock`, so the image and its
    metadata line are added as one unit with respect to other lock holders.

    Args:
        prompt: Prompt associated with the image.
        image_array: Pixel data, converted with `PIL.Image.fromarray`.
    """
    image_path = IMAGE_DATASET_DIR / f"{uuid4()}.png"

    with scheduler.lock:
        Image.fromarray(image_array).save(image_path)
        record = {"prompt": prompt, "file_name": image_path.name, "datetime": datetime.now().isoformat()}
        with IMAGE_JSONL_PATH.open("a") as f:
            # One record per line, emitted with a single write call.
            f.write(json.dumps(record) + "\n")
39
+
40
+
41
def get_demo():
    # Creates the demo components; app.py calls this inside a `gr.Blocks()` context.
    # Documented with comments (not a docstring) and with the `def` line kept verbatim,
    # because app.py textually replaces that line when displaying this source.
    with gr.Row():
        prompt_value = gr.Textbox(label="Prompt")
        image_value = gr.Image(label="Generated image")
    text_to_image_btn = gr.Button("Generate")
    # Generate on click; `save_image` is chained with `.success(...)` after the generation step.
    text_to_image_btn.click(fn=generate_image, inputs=prompt_value, outputs=image_value).success(
        fn=save_image,
        inputs=[prompt_value, image_value],
        outputs=None,
    )
app_json.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from datetime import datetime
3
+ from pathlib import Path
4
+ from uuid import uuid4
5
+
6
+ import gradio as gr
7
+
8
+ from huggingface_hub import CommitScheduler
9
+
10
+
11
# Local folder holding the data files to upload.
JSON_DATASET_DIR = Path("json_dataset")
JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)

# File written by this process. The uuid4 suffix yields a new file name on every
# import of this module. NOTE: despite the ".json" extension, `save_json` appends
# one JSON object per line (JSON Lines layout).
JSON_DATASET_PATH = JSON_DATASET_DIR / f"train-{uuid4()}.json"
15
+
16
# Scheduler in charge of uploading the content of JSON_DATASET_DIR to the "data"
# folder of the "example-commit-scheduler-json" dataset repo. Its `lock` is taken
# by `save_json` around local writes.
scheduler = CommitScheduler(
    repo_id="example-commit-scheduler-json",
    repo_type="dataset",
    folder_path=JSON_DATASET_DIR,
    path_in_repo="data",
)
22
+
23
+
24
def greet(name: str) -> str:
    """Return the greeting `Hello <name>!` for the given name."""
    return f"Hello {name}!"
26
+
27
+
28
def save_json(name: str, greetings: str) -> None:
    """Append one record to the local dataset file.

    The record holds the name, the greeting and the current local time in ISO
    format. It is written as a single line while holding `scheduler.lock`.

    Args:
        name: Name entered by the user.
        greetings: Greeting produced for that name.
    """
    with scheduler.lock:
        record = {"name": name, "greetings": greetings, "datetime": datetime.now().isoformat()}
        with JSON_DATASET_PATH.open("a") as f:
            # One record per line, emitted with a single write call.
            f.write(json.dumps(record) + "\n")
33
+
34
+
35
def get_demo():
    # Creates the demo components; app.py calls this inside a `gr.Blocks()` context.
    # Documented with comments (not a docstring) and with the `def` line kept verbatim,
    # because app.py textually replaces that line when displaying this source.
    with gr.Row():
        greet_name = gr.Textbox(label="Name")
        greet_output = gr.Textbox(label="Greetings")
    greet_btn = gr.Button("Greet")
    # Greet on click; `save_json` is chained with `.success(...)` after the greeting step.
    greet_btn.click(fn=greet, inputs=greet_name, outputs=greet_output).success(
        fn=save_json,
        inputs=[greet_name, greet_output],
        outputs=None,
    )