asoria HF staff committed on
Commit
93c417c
1 Parent(s): 4604622

Add application file

Browse files
Files changed (1) hide show
  1. app.py +250 -0
app.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio_huggingfacehub_search import HuggingfaceHubSearch
3
+ import nbformat as nbf
4
+ from huggingface_hub import HfApi
5
+ from httpx import Client
6
+ import logging
7
+ import pandas as pd
8
+ from utils.notebook_utils import (
9
+ eda_cells,
10
+ replace_wildcards,
11
+ rag_cells,
12
+ embeggins_cells,
13
+ )
14
+ from dotenv import load_dotenv
15
+ import os
16
+
17
+ # TODOS:
18
+ # 2. Add template for RAG and embeddings
19
+ # 3. Improve templates
20
+
21
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Hub credentials and the repository that receives the generated notebooks.
HF_TOKEN = os.getenv("HF_TOKEN")
NOTEBOOKS_REPOSITORY = os.getenv("NOTEBOOKS_REPOSITORY")

# Explicit checks rather than `assert`: assertions are stripped when Python
# runs with -O, which would let the app start without required configuration.
if HF_TOKEN is None:
    raise RuntimeError("You need to set HF_TOKEN in your environment variables")
if NOTEBOOKS_REPOSITORY is None:
    raise RuntimeError(
        "You need to set NOTEBOOKS_REPOSITORY in your environment variables"
    )


BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}

# Single HTTP client shared by the datasets-server requests in this module.
client = Client(headers=HEADERS)

logging.basicConfig(level=logging.INFO)
37
+
38
+
39
def get_compatible_libraries(dataset: str):
    """Fetch the compatible-libraries description of a dataset.

    Calls the ``/compatible-libraries`` endpoint of the datasets server and
    returns the decoded JSON body.

    Args:
        dataset: Hub dataset id to look up.

    Returns:
        The parsed JSON response.

    Raises:
        Exception: any error raised while sending the request, checking the
            HTTP status or decoding the body is logged and re-raised as is.
    """
    try:
        # Pass the id through `params` so the client URL-encodes it instead of
        # splicing it verbatim into the query string.
        response = client.get(
            f"{BASE_DATASETS_SERVER_URL}/compatible-libraries",
            params={"dataset": dataset},
        )
        response.raise_for_status()
        return response.json()
    except Exception as e:
        logging.error(f"Error fetching compatible libraries: {e}")
        raise
49
+
50
+
51
def create_notebook_file(cells, notebook_name):
    """Write ``cells`` to ``notebook_name`` as a v4 Jupyter notebook.

    Args:
        cells: iterable of dicts with a ``"cell_type"`` key (``"code"`` yields
            a code cell, any other value a markdown cell) and a ``"source"``
            key holding either a string or a list of lines.
        notebook_name: path of the notebook file to create.
    """

    def _as_text(source):
        # A source may be a single string or a list of lines; the list form is
        # joined with newlines for both cell types so they are treated alike.
        return source if isinstance(source, str) else "\n".join(source)

    nb = nbf.v4.new_notebook()
    nb["cells"] = [
        nbf.v4.new_code_cell(_as_text(cmd["source"]))
        if cmd["cell_type"] == "code"
        else nbf.v4.new_markdown_cell(_as_text(cmd["source"]))
        for cmd in cells
    ]

    # Cell text may contain non-ASCII characters; do not rely on the platform's
    # default text encoding when writing the notebook JSON.
    with open(notebook_name, "w", encoding="utf-8") as f:
        nbf.write(nb, f)
    logging.info(f"Notebook {notebook_name} created successfully")
67
+
68
+
69
def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
    """Fetch the first rows of a dataset split as a small DataFrame.

    Calls the ``/first-rows`` endpoint of the datasets server, shuffles the
    returned rows and keeps at most ``limit`` of them.

    Args:
        dataset: Hub dataset id.
        config: dataset configuration name.
        split: split name within the configuration.
        limit: maximum number of rows kept in the returned DataFrame.

    Returns:
        A ``(features_dict, first_rows_df)`` tuple: a mapping of feature name
        to feature type taken from the response's ``"features"`` list, and the
        sampled rows as a ``pandas.DataFrame``.

    Raises:
        Exception: any error raised while fetching or decoding the response
            is logged and re-raised as is.
    """
    try:
        # `params` lets the client URL-encode the values instead of
        # interpolating them verbatim into the query string.
        resp = client.get(
            f"{BASE_DATASETS_SERVER_URL}/first-rows",
            params={"dataset": dataset, "config": config, "split": split},
        )
        resp.raise_for_status()
        content = resp.json()
        rows = [item["row"] for item in content["rows"]]
        # sample(frac=1) shuffles all rows before head() truncates, so the
        # returned rows are a random subset, not the leading ones.
        first_rows_df = pd.DataFrame(rows).sample(frac=1).head(limit)
        features_dict = {
            feature["name"]: feature["type"] for feature in content["features"]
        }
        return features_dict, first_rows_df
    except Exception as e:
        logging.error(f"Error fetching first rows: {e}")
        raise
85
+
86
+
87
def generate_eda_cells(dataset_id):
    """Stream the notebook preview built from the ``eda_cells`` template."""
    yield from generate_cells(dataset_id, cells=eda_cells, notebook_type="eda")
89
+
90
+
91
def generate_rag_cells(dataset_id):
    """Stream the notebook preview built from the ``rag_cells`` template."""
    yield from generate_cells(dataset_id, cells=rag_cells, notebook_type="rag")
93
+
94
+
95
def generate_embedding_cells(dataset_id):
    """Stream the notebook preview built from the ``embeggins_cells`` template."""
    yield from generate_cells(
        dataset_id, cells=embeggins_cells, notebook_type="embeddings"
    )
97
+
98
+
99
def _push_to_hub(
    dataset_id,
    notebook_file,
):
    """Upload a notebook file to ``NOTEBOOKS_REPOSITORY`` and return its URL.

    Args:
        dataset_id: id of the dataset the notebook was generated for; only
            used in log messages.
        notebook_file: local path of the notebook. Its last ``/``-separated
            component is used as the file name inside the repository.

    Returns:
        The ``https://huggingface.co/datasets/...`` link to the uploaded file.

    Raises:
        Exception: any error raised by the upload is logged and re-raised.
    """
    logging.info(f"Pushing notebook to hub: {dataset_id} on file {notebook_file}")

    notebook_name = notebook_file.split("/")[-1]
    api = HfApi(token=HF_TOKEN)
    try:
        logging.info(f"About to push {notebook_file} - {dataset_id}")
        api.upload_file(
            path_or_fileobj=notebook_file,
            path_in_repo=notebook_name,
            repo_id=NOTEBOOKS_REPOSITORY,
            repo_type="dataset",
        )
        link = f"https://huggingface.co/datasets/{NOTEBOOKS_REPOSITORY}/blob/main/{notebook_name}"
        logging.info(f"Notebook pushed to hub: {link}")
        return link
    except Exception as e:
        # The exception used to be passed as a %-format argument to a message
        # with no placeholder, which makes logging report a formatting error;
        # a failure also belongs at ERROR level rather than INFO.
        logging.error(f"Failed to push notebook: {e}")
        raise
121
+
122
+
123
def generate_cells(dataset_id, cells, notebook_type="eda"):
    """Build a notebook for ``dataset_id`` and stream a preview of its code.

    The template ``cells`` have their wildcards filled in, the resulting text
    is yielded incrementally for the code preview, and finally the notebook is
    written to disk, pushed to the Hub and a markdown link to it is yielded.

    Args:
        dataset_id: Hub dataset id selected in the UI.
        cells: template cells (dicts with at least a ``"source"`` key), passed
            through ``replace_wildcards``.
        notebook_type: suffix used in the generated notebook's file name.

    Yields:
        ``(preview_text, link_markdown)`` tuples; ``link_markdown`` is empty
        until the final item.

    Raises:
        gr.Error: when the compatible libraries cannot be fetched or the
            dataset has no pandas loading code.
    """
    # This function is a generator, so a `return <value>` never reaches the
    # caller and a bare `gr.Error(...)` expression shows nothing; the error
    # has to be raised to be reported.
    try:
        libraries = get_compatible_libraries(dataset_id)
    except Exception as err:
        logging.error(f"Failed to fetch compatible libraries: {err}")
        raise gr.Error("Unable to retrieve dataset info from HF Hub.") from err

    if not libraries:
        logging.error("Dataset not compatible with pandas library")
        raise gr.Error("Dataset not compatible with pandas library.")

    pandas_library = next(
        (lib for lib in libraries.get("libraries", []) if lib["library"] == "pandas"),
        None,
    )
    if not pandas_library:
        raise gr.Error("Dataset not compatible with pandas library.")

    first_config_loading_code = pandas_library["loading_codes"][0]
    first_code = first_config_loading_code["code"]
    first_config = first_config_loading_code["config_name"]
    first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
    # NOTE(review): the fetched features/rows are not used here; the call is
    # kept because it raises when the first rows cannot be retrieved. Confirm
    # whether it is still needed.
    get_first_rows_as_df(dataset_id, first_config, first_split, 3)

    html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
    wildcards = ["{dataset_name}", "{first_code}", "{html_code}"]
    replacements = [dataset_id, first_code, html_code]
    cells = replace_wildcards(cells, wildcards, replacements)
    generated_text = ""
    # Show only the first 40 lines, would like to have a scroll in gr.Code https://github.com/gradio-app/gradio/issues/9192
    for cell in cells:
        source = cell["source"]
        # create_notebook_file accepts a list of lines as source; accept the
        # same shape here instead of failing on `list + str`.
        if not isinstance(source, str):
            source = "\n".join(source)
        generated_text += source + "\n"
        yield generated_text, ""
        if generated_text.count("\n") > 38:
            generated_text += (
                "## See more lines available in the generated notebook :) ......"
            )
            yield generated_text, ""
            break
    notebook_name = f"{dataset_id.replace('/', '-')}-{notebook_type}.ipynb"
    create_notebook_file(cells, notebook_name=notebook_name)
    notebook_link = _push_to_hub(dataset_id, notebook_name)
    yield generated_text, f"## Here you have the [generated notebook]({notebook_link})"
170
+
171
+
172
# NOTE(review): the nesting below is reconstructed from a flattened diff view;
# confirm the layout against the original file.
with gr.Blocks(fill_height=True, fill_width=True) as demo:
    gr.Markdown("# 🤖 Dataset notebook creator 🕵️")
    with gr.Row(equal_height=True):
        # Left column: dataset picker, examples, viewer and action buttons.
        with gr.Column(scale=2):
            text_input = gr.Textbox(label="Suggested notebook type", visible=False)

            dataset_name = HuggingfaceHubSearch(
                label="Hub Dataset ID",
                placeholder="Search for dataset id on Huggingface",
                search_type="dataset",
                value="",
            )

            # Each example fills the dataset id and the hidden hint textbox.
            example_rows = [
                [
                    "infinite-dataset-hub/WorldPopCounts",
                    "Try this dataset for Exploratory Data Analysis",
                ],
                [
                    "infinite-dataset-hub/GlobaleCuisineRecipes",
                    "Try this dataset for Embeddings generation",
                ],
                [
                    "infinite-dataset-hub/GlobalBestSellersSummaries",
                    "Try this dataset for RAG generation",
                ],
            ]
            dataset_samples = gr.Examples(
                examples=example_rows,
                inputs=[dataset_name, text_input],
                cache_examples=False,
            )

            @gr.render(inputs=dataset_name)
            def embed(name):
                """Render the dataset viewer iframe for the selected dataset."""
                if not name:
                    return gr.Markdown("### No dataset provided")
                html_code = f"""
                <iframe
                src="https://huggingface.co/datasets/{name}/embed/viewer/default/train"
                frameborder="0"
                width="100%"
                height="350px"
                ></iframe>
                """
                return gr.HTML(value=html_code, elem_classes="viewer")

            with gr.Row():
                generate_eda_btn = gr.Button("Exploratory Data Analysis")
                generate_embedding_btn = gr.Button("Embeddings")
                generate_rag_btn = gr.Button("RAG")
                generate_training_btn = gr.Button(
                    "Training - Coming soon", interactive=False
                )

        # Right column: streamed code preview and link to the pushed notebook.
        with gr.Column(scale=2):
            code_component = gr.Code(
                language="python", label="Notebook Code Preview", lines=40
            )
            go_to_notebook = gr.Markdown("", visible=True)

    # Every generator button takes the dataset id and streams into the same
    # two outputs; registration order matches the original (EDA, embeddings, RAG).
    for button, handler in (
        (generate_eda_btn, generate_eda_cells),
        (generate_embedding_btn, generate_embedding_cells),
        (generate_rag_btn, generate_rag_cells),
    ):
        button.click(
            handler,
            inputs=[dataset_name],
            outputs=[code_component, go_to_notebook],
        )

demo.launch()