lhoestq HF staff committed on
Commit
4c86203
·
1 Parent(s): bf29377

add all functions

Browse files
app.py CHANGED
@@ -3,22 +3,19 @@ from functools import partial, lru_cache
3
  import duckdb
4
  import gradio as gr
5
  import pandas as pd
 
6
  import requests
7
  from huggingface_hub import HfApi
8
 
9
  READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
10
- EMPTY_DF = pd.DataFrame([{str(i): "" for i in range(4)}] * 10)
 
11
  MAX_NUM_COLUMNS = 20
 
 
12
  css = """
13
- @media (prefers-color-scheme: dark) {
14
- .transparent-dropdown, .transparent-dropdown .container .wrap {
15
- background: var(--bg-dark);
16
- }
17
- }
18
- @media (prefers-color-scheme: light) {
19
- .transparent-dropdown, .transparent-dropdown .container .wrap {
20
- background: var(--bg);
21
- }
22
  }
23
  input {
24
  -webkit-user-select: none;
@@ -32,9 +29,25 @@ input {
32
  thead {
33
  display: none;
34
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  """
36
  js = """
37
- function setDataFrameReadonly() {
 
38
  MutationObserver = window.MutationObserver || window.WebKitMutationObserver;
39
  var observer = new MutationObserver(function(mutations, observer) {
40
  // fired when a mutation occurs
@@ -46,38 +59,82 @@ function setDataFrameReadonly() {
46
  subtree: true,
47
  childList: true
48
  });
49
-
 
 
 
 
 
 
 
 
 
 
50
  }
51
  """
52
  text_functions_df = pd.read_csv("text_functions.tsv", delimiter="\t")
 
 
 
 
 
53
 
54
  @lru_cache(maxsize=3)
55
  def duckdb_sql(query: str) -> duckdb.DuckDBPyRelation:
56
  return duckdb.sql(query)
57
 
58
- def prepare_function(func: str, placeholder: str, column_name: str) -> str:
59
- if "(" in func:
60
- prepared_func = func.split("(")
61
- prepared_func[1] = prepared_func[1].replace(placeholder, column_name, 1)
62
- prepared_func = "(".join(prepared_func)
 
63
  else:
64
- prepared_func = func.replace(placeholder, column_name, 1)
65
- return prepared_func
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  with gr.Blocks(css=css, js=js) as demo:
68
  loading_codes_json = gr.JSON(visible=False)
69
  dataset_subset_split_textbox = gr.Textbox(visible=False)
70
- input_dataframe = gr.DataFrame(visible=False)
 
 
71
  with gr.Group():
72
  with gr.Row():
73
- dataset_dropdown = gr.Dropdown(label="Open Dataset", allow_custom_value=True, scale=10)
74
  subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
75
  split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
76
  gr.LoginButton()
77
  with gr.Row():
78
- transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for column_name in EMPTY_DF.columns]
79
- transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
80
  dataframe = gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns), interactive=True, elem_classes="readonly-dataframe")
 
 
81
 
82
  def show_subset_dropdown(dataset: str):
83
  if dataset and "/" not in dataset.strip().strip("/"):
@@ -93,79 +150,127 @@ with gr.Blocks(css=css, js=js) as demo:
93
  split = (splits or [""])[0]
94
  return dict(choices=splits, value=split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))
95
 
96
- def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> pd.DataFrame:
97
  pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
98
  if dataset and subset and split and pattern:
99
- df = duckdb_sql(f"SELECT * FROM 'hf://datasets/{dataset}/{pattern}' LIMIT 10").df()
100
- input_df = df
101
  else:
102
- input_df = EMPTY_DF
103
- new_transform_dropdowns = [dict(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for column_name in input_df.columns]
 
104
  new_transform_dropdowns += [dict(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(new_transform_dropdowns))]
105
- return [dict(value=df, column_widths=[f"{1/len(df.columns):.0%}"] * len(df.columns))] + new_transform_dropdowns
 
106
 
107
- def set_dataframe(input_df: pd.DataFrame, *transforms: tuple[str], column_index: int):
108
  try:
109
- return duckdb.sql(f"SELECT {', '.join(transform for transform in transforms if transform)} FROM input_df;").df()
110
  except Exception as e:
111
- gr.Error(f"{type(e).__name__}: {e}")
112
- return input_df
113
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  for column_index, transform_dropdown in enumerate(transform_dropdowns):
115
- transform_dropdown.select(partial(set_dataframe, column_index=column_index), inputs=[input_dataframe] + transform_dropdowns, outputs=dataframe)
116
 
117
- @demo.load(outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, input_dataframe, dataframe] + transform_dropdowns)
 
 
118
  def _fetch_datasets(request: gr.Request, oauth_token: gr.OAuthToken | None):
119
  api = HfApi(token=oauth_token.token if oauth_token else None)
120
- datasets = list(api.list_datasets(limit=3, sort="trendingScore", direction=-1, filter=["format:parquet"]))
121
  if oauth_token and (user := api.whoami().get("name")):
122
- datasets += list(api.list_datasets(limit=3, sort="trendingScore", direction=-1, filter=["format:parquet"], author=user))
123
  dataset = request.query_params.get("dataset") or datasets[0].id
124
  subsets, loading_codes = show_subset_dropdown(dataset)
125
  splits = show_split_dropdown(subsets["value"], loading_codes)
126
- input_df, *new_transform_dropdowns = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
 
127
  return {
128
  dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset),
129
  loading_codes_json: loading_codes,
130
  subset_dropdown: gr.Dropdown(**subsets),
131
  split_dropdown: gr.Dropdown(**splits),
132
- input_dataframe: gr.DataFrame(**input_df),
133
- dataframe: gr.DataFrame(**input_df),
134
- **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns]))
 
 
 
 
 
 
135
  }
136
 
137
- @dataset_dropdown.select(inputs=dataset_dropdown, outputs=[loading_codes_json, subset_dropdown, split_dropdown, input_dataframe, dataframe] + transform_dropdowns)
138
  def _show_subset_dropdown(dataset: str):
139
  subsets, loading_codes = show_subset_dropdown(dataset)
140
  splits = show_split_dropdown(subsets["value"], loading_codes)
141
- input_df, *new_transform_dropdowns = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
 
142
  return {
143
  loading_codes_json: loading_codes,
144
  subset_dropdown: gr.Dropdown(**subsets),
145
  split_dropdown: gr.Dropdown(**splits),
146
- input_dataframe: gr.DataFrame(**input_df),
147
- dataframe: gr.DataFrame(**input_df),
148
- **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns]))
 
 
 
 
 
 
149
  }
150
 
151
- @subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[split_dropdown, input_dataframe, dataframe] + transform_dropdowns)
152
  def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
153
  splits = show_split_dropdown(subset, loading_codes)
154
- input_df, *new_transform_dropdowns = show_input_dataframe(dataset, subset, splits["value"], loading_codes)
 
155
  return {
156
  split_dropdown: gr.Dropdown(**splits),
157
- input_dataframe: gr.DataFrame(**input_df),
158
- dataframe: gr.DataFrame(**input_df),
159
- **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns]))
 
 
 
 
 
 
160
  }
161
 
162
- @split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[input_dataframe, dataframe] + transform_dropdowns)
163
  def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> pd.DataFrame:
164
- input_df, *new_transform_dropdowns = show_input_dataframe(dataset, subset, split, loading_codes)
 
165
  return {
166
- input_dataframe: gr.DataFrame(**input_df),
167
- dataframe: gr.DataFrame(**input_df),
168
- **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns]))
 
 
 
 
 
 
169
  }
170
 
171
 
 
3
  import duckdb
4
  import gradio as gr
5
  import pandas as pd
6
+ import pyarrow as pa
7
  import requests
8
  from huggingface_hub import HfApi
9
 
10
  READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
11
+ EMPTY_TABLE = pa.Table.from_pylist([{str(i): "" for i in range(4)}] * 10)
12
+ EMPTY_DF: pd.DataFrame = EMPTY_TABLE.to_pandas()
13
  MAX_NUM_COLUMNS = 20
14
+ NUM_TRENDING_DATASETS = 10
15
+ NUM_USER_DATASETS = 10
16
  css = """
17
+ .transparent-dropdown, .transparent-dropdown .container .wrap, .transparent-accordion {
18
+ background: var(--body-background-fill);
 
 
 
 
 
 
 
19
  }
20
  input {
21
  -webkit-user-select: none;
 
29
  thead {
30
  display: none;
31
  }
32
+ .secondary-wrap:has(input[aria-expanded="true"]) {
33
+ background: var(--table-odd-background-fill);
34
+ }
35
+ .secondary-wrap:has(input[aria-expanded="true"])::after {
36
+ content: '↵';
37
+ margin-right: var(--size-10);
38
+ border-width: 1px;
39
+ border-color: var(--block-border-color);
40
+ border-radius: .23rem;
41
+ background-color: #141c2e;
42
+ padding-left: 2px;
43
+ font-size: .75rem;
44
+ color: var(--block-title-text-color);
45
+ }
46
+ var(--body-background-fill)
47
  """
48
  js = """
49
+ function load() {
50
+ // Set DataFrame readonly
51
  MutationObserver = window.MutationObserver || window.WebKitMutationObserver;
52
  var observer = new MutationObserver(function(mutations, observer) {
53
  // fired when a mutation occurs
 
59
  subtree: true,
60
  childList: true
61
  });
62
+
63
+ // Run query on Enter in transform dropdown
64
+ document.querySelectorAll("input").forEach(i => {
65
+ if (i.parentElement.parentElement.parentElement.parentElement.parentElement.classList.contains("transform_dropdown")) {
66
+ i.onkeydown = (event) => {
67
+ if (event.code == "Enter") {
68
+ document.getElementById("run_button").click();
69
+ }
70
+ }
71
+ }
72
+ })
73
  }
74
  """
75
  text_functions_df = pd.read_csv("text_functions.tsv", delimiter="\t")
76
+ date_functions_df = pd.read_csv("date_functions.tsv", delimiter="\t")
77
+ list_functions_df = pd.read_csv("list_functions.tsv", delimiter="\t")
78
+ numeric_functions_df = pd.read_csv("numeric_functions.tsv", delimiter="\t")
79
+ time_functions_df = pd.read_csv("time_functions.tsv", delimiter="\t")
80
+ timestamp_functions_df = pd.read_csv("timestamp_functions.tsv", delimiter="\t")
81
 
82
  @lru_cache(maxsize=3)
83
  def duckdb_sql(query: str) -> duckdb.DuckDBPyRelation:
84
  return duckdb.sql(query)
85
 
86
+ def prepare_function(func: str, placeholders: list[str], column_name: str) -> str:
87
+ prepared_func = func.split("(", 1)
88
+ for placeholder in placeholders:
89
+ if placeholder in prepared_func[-1]:
90
+ prepared_func[-1] = prepared_func[-1].replace(placeholder, column_name, 1)
91
+ return "(".join(prepared_func)
92
  else:
93
+ return None
94
+
95
def prettify_df(df: pd.DataFrame):
    """Return a new DataFrame in which every cell is replaced by ``str(cell)``."""

    def stringify_column(column):
        # Element-wise str() on one column; index and length are unchanged.
        return column.apply(str)

    return df.apply(stringify_column)
97
+
98
def get_prepared_functions_from_table(table: pa.Table) -> dict[str, list[str]]:
    """Build, for every field of ``table``, the list of suggested expressions for it.

    The Arrow type of each field selects one of the module-level function
    catalogues; every entry of that catalogue's ``Name`` column is passed
    through ``prepare_function`` with the field name. Struct fields get one
    ``<field>.<subfield>`` accessor per subfield instead. Fields of any other
    type get an empty list. Falsy results (templates without a usable
    placeholder) are dropped.

    Returns:
        A dict keyed by field name whose values are the retained expressions.
    """
    # (type predicate, function catalogue, placeholder names), checked in order.
    rules = [
        (lambda t: pa.types.is_integer(t) or pa.types.is_floating(t), numeric_functions_df, ["x"]),
        (pa.types.is_string, text_functions_df, ["string"]),
        (pa.types.is_date, date_functions_df, ["startdate", "date"]),
        (pa.types.is_list, list_functions_df, ["list"]),
        (pa.types.is_time, time_functions_df, ["starttime", "time"]),
        (pa.types.is_timestamp, timestamp_functions_df, ["startdate", "timestamp"]),
    ]
    suggestions = {}
    for field in table.schema:
        matched = next((rule for rule in rules if rule[0](field.type)), None)
        if matched is not None:
            _, functions_df, placeholders = matched
            candidates = [prepare_function(name, placeholders, field.name) for name in functions_df.Name]
        elif pa.types.is_struct(field.type):
            # Structs expose their members through dotted access rather than functions.
            candidates = [f"{field.name}.{subfield.name}" for subfield in field.type.fields]
        else:
            candidates = []
        # prepare_function yields None for templates it cannot specialise.
        suggestions[field.name] = [candidate for candidate in candidates if candidate]
    return suggestions
119
 
120
  with gr.Blocks(css=css, js=js) as demo:
121
  loading_codes_json = gr.JSON(visible=False)
122
  dataset_subset_split_textbox = gr.Textbox(visible=False)
123
+ input_table_state = gr.State()
124
+ run_button = gr.Button(visible=False, elem_id="run_button")
125
+ gr.Markdown("# Dataset Spreadsheets\n\nEdit any dataset on Hugging Face (full list [here](https://huggingface.co/datasets)) using DuckDB functions (documentation [here](https://duckdb.org/docs/sql/functions/overview))")
126
  with gr.Group():
127
  with gr.Row():
128
+ dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
129
  subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
130
  split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
131
  gr.LoginButton()
132
  with gr.Row():
133
+ transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True, elem_classes="transform_dropdown") for column_name in EMPTY_DF.columns]
134
+ transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False, elem_classes="transform_dropdown") for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
135
  dataframe = gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns), interactive=True, elem_classes="readonly-dataframe")
136
+ with gr.Accordion("Show SQL command", open=False, elem_classes="transparent-accordion"):
137
+ code_markdown = gr.Markdown()
138
 
139
  def show_subset_dropdown(dataset: str):
140
  if dataset and "/" not in dataset.strip().strip("/"):
 
150
  split = (splits or [""])[0]
151
  return dict(choices=splits, value=split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))
152
 
153
+ def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]):
154
  pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
155
  if dataset and subset and split and pattern:
156
+ table = duckdb_sql(f"SELECT * FROM 'hf://datasets/{dataset}/{pattern}' LIMIT 10").arrow()
 
157
  else:
158
+ table = EMPTY_TABLE
159
+ prepared_functions = get_prepared_functions_from_table(table)
160
+ new_transform_dropdowns = [dict(choices=[column_name] + prepared_functions[column_name], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for column_name in table.column_names]
161
  new_transform_dropdowns += [dict(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(new_transform_dropdowns))]
162
+ df = table.to_pandas()
163
+ return [table, dict(value=prettify_df(df), column_widths=[f"{1/len(df.columns):.0%}"] * len(df.columns))] + new_transform_dropdowns
164
 
165
+ def set_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict], input_table: pa.Table, df: pd.DataFrame, *transforms, show_warning=True):
166
  try:
167
+ table = duckdb.sql(f"SELECT {', '.join(transform for transform in transforms if transform)} FROM input_table;").arrow()
168
  except Exception as e:
169
+ if show_warning:
170
+ gr.Warning(f"{type(e).__name__}: {e}")
171
+ return {
172
+ dataframe: df
173
+ }
174
+ prepared_functions = get_prepared_functions_from_table(table)
175
+ new_transform_dropdowns = [dict(choices=list({original_column_name: None, column_name: None}) + prepared_functions[column_name], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for original_column_name, column_name in zip(input_table.column_names, table.column_names)]
176
+ new_transform_dropdowns += [dict(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(new_transform_dropdowns))]
177
+ pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
178
+ return {
179
+ dataframe: prettify_df(table.to_pandas()),
180
+ **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
181
+ code_markdown: (
182
+ "```sql\n"
183
+ + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
184
+ + f"FROM 'hf://datasets/{dataset}/{pattern}';"
185
+ + "\n```"
186
+ ) if pattern else "",
187
+ }
188
+
189
  for column_index, transform_dropdown in enumerate(transform_dropdowns):
190
+ transform_dropdown.select(partial(set_dataframe, show_warning=False), inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, input_table_state, dataframe] + transform_dropdowns, outputs=[dataframe, code_markdown] + transform_dropdowns)
191
 
192
+ run_button.click(set_dataframe, inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, input_table_state, dataframe] + transform_dropdowns, outputs=[dataframe, code_markdown] + transform_dropdowns)
193
+
194
+ @demo.load(outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, input_table_state, dataframe, code_markdown] + transform_dropdowns)
195
  def _fetch_datasets(request: gr.Request, oauth_token: gr.OAuthToken | None):
196
  api = HfApi(token=oauth_token.token if oauth_token else None)
197
+ datasets = list(api.list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"]))
198
  if oauth_token and (user := api.whoami().get("name")):
199
+ datasets += list(api.list_datasets(limit=NUM_USER_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"], author=user))
200
  dataset = request.query_params.get("dataset") or datasets[0].id
201
  subsets, loading_codes = show_subset_dropdown(dataset)
202
  splits = show_split_dropdown(subsets["value"], loading_codes)
203
+ input_table, input_dataframe, *new_transform_dropdowns = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
204
+ pattern = ([loading_code["arguments"]["splits"][splits["value"]] for loading_code in loading_codes if loading_code["config_name"] == subsets["value"]] or [None])[0]
205
  return {
206
  dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset),
207
  loading_codes_json: loading_codes,
208
  subset_dropdown: gr.Dropdown(**subsets),
209
  split_dropdown: gr.Dropdown(**splits),
210
+ input_table_state: input_table,
211
+ dataframe: gr.DataFrame(**input_dataframe),
212
+ **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
213
+ code_markdown: (
214
+ "```sql\n"
215
+ + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
216
+ + f"FROM 'hf://datasets/{dataset}/{pattern}';"
217
+ + "\n```"
218
+ ) if pattern else "",
219
  }
220
 
221
+ @dataset_dropdown.select(inputs=dataset_dropdown, outputs=[loading_codes_json, subset_dropdown, split_dropdown, input_table_state, dataframe, code_markdown] + transform_dropdowns)
222
  def _show_subset_dropdown(dataset: str):
223
  subsets, loading_codes = show_subset_dropdown(dataset)
224
  splits = show_split_dropdown(subsets["value"], loading_codes)
225
+ input_table, input_dataframe, *new_transform_dropdowns = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
226
+ pattern = ([loading_code["arguments"]["splits"][splits["value"]] for loading_code in loading_codes if loading_code["config_name"] == subsets["value"]] or [None])[0]
227
  return {
228
  loading_codes_json: loading_codes,
229
  subset_dropdown: gr.Dropdown(**subsets),
230
  split_dropdown: gr.Dropdown(**splits),
231
+ input_table_state: input_table,
232
+ dataframe: gr.DataFrame(**input_dataframe),
233
+ **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
234
+ code_markdown: (
235
+ "```sql\n"
236
+ + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
237
+ + f"FROM 'hf://datasets/{dataset}/{pattern}';"
238
+ + "\n```"
239
+ ) if pattern else "",
240
  }
241
 
242
+ @subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[split_dropdown, input_table_state, dataframe, code_markdown] + transform_dropdowns)
243
  def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
244
  splits = show_split_dropdown(subset, loading_codes)
245
+ input_table, input_dataframe, *new_transform_dropdowns = show_input_dataframe(dataset, subset, splits["value"], loading_codes)
246
+ pattern = ([loading_code["arguments"]["splits"][splits["value"]] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
247
  return {
248
  split_dropdown: gr.Dropdown(**splits),
249
+ input_table_state: input_table,
250
+ dataframe: gr.DataFrame(**input_dataframe),
251
+ **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
252
+ code_markdown: (
253
+ "```sql\n"
254
+ + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
255
+ + f"FROM 'hf://datasets/{dataset}/{pattern}';"
256
+ + "\n```"
257
+ ) if pattern else "",
258
  }
259
 
260
+ @split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[input_table_state, dataframe, code_markdown] + transform_dropdowns)
261
  def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> pd.DataFrame:
262
+ input_table, input_dataframe, *new_transform_dropdowns = show_input_dataframe(dataset, subset, split, loading_codes)
263
+ pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
264
  return {
265
+ input_table_state: input_table,
266
+ dataframe: gr.DataFrame(**input_dataframe),
267
+ **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
268
+ code_markdown: (
269
+ "```sql\n"
270
+ + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
271
+ + f"FROM 'hf://datasets/{dataset}/{pattern}';"
272
+ + "\n```"
273
+ ) if pattern else "",
274
  }
275
 
276
 
date_functions.tsv ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Name Description
2
+ current_date Current date (at start of current transaction).
3
+ date_add(date, interval) Add the interval to the date.
4
+ date_diff(part, startdate, enddate) The number of partition boundaries between the dates.
5
+ date_part(part, date) Get the subfield (equivalent to extract).
6
+ date_sub(part, startdate, enddate) The number of complete partitions between the dates.
7
+ date_trunc(part, date) Truncate to specified precision.
8
+ datediff(part, startdate, enddate) The number of partition boundaries between the dates. Alias of date_diff.
9
+ datepart(part, date) Get the subfield (equivalent to extract). Alias of date_part.
10
+ datesub(part, startdate, enddate) The number of complete partitions between the dates. Alias of date_sub.
11
+ datetrunc(part, date) Truncate to specified precision. Alias of date_trunc.
12
+ dayname(date) The (English) name of the weekday.
13
+ extract(part from date) Get subfield from a date.
14
+ greatest(date, date) The later of two dates.
15
+ isfinite(date) Returns true if the date is finite, false otherwise.
16
+ isinf(date) Returns true if the date is infinite, false otherwise.
17
+ last_day(date) The last day of the corresponding month in the date.
18
+ least(date, date) The earlier of two dates.
19
+ make_date(year, month, day) The date for the given parts.
20
+ monthname(date) The (English) name of the month.
21
+ strftime(date, format) Converts a date to a string according to the format string.
22
+ time_bucket(bucket_width, date[, offset]) Truncate date by the specified interval bucket_width. Buckets are offset by offset interval.
23
+ time_bucket(bucket_width, date[, origin]) Truncate date by the specified interval bucket_width. Buckets are aligned relative to origin date. origin defaults to 2000-01-03 for buckets that don't include a month or year interval, and to 2000-01-01 for month and year buckets.
24
+ today() Current date (start of current transaction).
list_functions.tsv ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Name Description
2
+ list[index] Bracket notation serves as an alias for list_extract.
3
+ list[begin:end] Bracket notation with colon is an alias for list_slice.
4
+ list[begin:end:step] list_slice in bracket notation with an added step feature.
5
+ array_pop_back(list) Returns the list without the last element.
6
+ array_pop_front(list) Returns the list without the first element.
7
+ flatten(list_of_lists) Concatenate a list of lists into a single list. This only flattens one level of the list (see examples).
8
+ len(list) Return the length of the list.
9
+ list_aggregate(list, name) Executes the aggregate function name on the elements of list. See the List Aggregates section for more details.
10
+ list_any_value(list) Returns the first non-null value in the list.
11
+ list_append(list, element) Appends element to list.
12
+ list_concat(list1, list2) Concatenate two lists. NULL inputs are skipped. See also ||
13
+ list_contains(list, element) Returns true if the list contains the element.
14
+ list_cosine_similarity(list1, list2) Compute the cosine similarity between two lists.
15
+ list_cosine_distance(list1, list2) Compute the cosine distance between two lists. Equivalent to 1.0 - list_cosine_similarity.
16
+ list_distance(list1, list2) Calculates the Euclidean distance between two points with coordinates given in two input lists of equal length.
17
+ list_distinct(list) Removes all duplicates and NULL values from a list. Does not preserve the original order.
18
+ list_dot_product(list1, list2) Computes the dot product of two same-sized lists of numbers.
19
+ list_negative_dot_product(list1, list2) Computes the negative dot product of two same-sized lists of numbers. Equivalent to - list_dot_product.
20
+ list_extract(list, index) Extract the indexth (1-based) value from the list.
21
+ list_filter(list, lambda) Constructs a list from those elements of the input list for which the lambda function returns true. See the Lambda Functions page for more details.
22
+ list_grade_up(list) Works like sort, but the results are the indexes that correspond to the position in the original list instead of the actual values.
23
+ list_has_all(list, sub-list) Returns true if all elements of sub-list exist in list.
24
+ list_has_any(list1, list2) Returns true if any elements exist in both lists.
25
+ list_intersect(list1, list2) Returns a list of all the elements that exist in both l1 and l2, without duplicates.
26
+ list_position(list, element) Returns the index of the element if the list contains the element. If the element is not found, it returns NULL.
27
+ list_prepend(element, list) Prepends element to list.
28
+ list_reduce(list, lambda) Returns a single value that is the result of applying the lambda function to each element of the input list. See the Lambda Functions page for more details.
29
+ list_resize(list, size[, value]) Resizes the list to contain size elements. Initializes new elements with value or NULL if value is not set.
30
+ list_reverse_sort(list) Sorts the elements of the list in reverse order. See the Sorting Lists section for more details about the NULL sorting order.
31
+ list_reverse(list) Reverses the list.
32
+ list_select(value_list, index_list) Returns a list based on the elements selected by the index_list.
33
+ list_slice(list, begin, end, step) list_slice with added step feature.
34
+ list_slice(list, begin, end) Extract a sublist using slice conventions. Negative values are accepted. See slicing.
35
+ list_sort(list) Sorts the elements of the list. See the Sorting Lists section for more details about the sorting order and the NULL sorting order.
36
+ list_transform(list, lambda) Returns a list that is the result of applying the lambda function to each element of the input list. See the Lambda Functions page for more details.
37
+ list_unique(list) Counts the unique elements of a list.
38
+ list_value(any, ...) Create a LIST containing the argument values.
39
+ list_where(value_list, mask_list) Returns a list with the BOOLEANs in mask_list applied as a mask to the value_list.
40
+ list_zip(list_1, list_2, ...[, truncate]) Zips k LISTs to a new LIST whose length will be that of the longest list. Its elements are structs of k elements from each list list_1, …, list_k, missing elements are replaced with NULL. If truncate is set, all lists are truncated to the smallest list length.
41
+ unnest(list) Unnests a list by one level. Note that this is a special function that alters the cardinality of the result. See the unnest page for more details.
numeric_functions.tsv ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Name Description
2
+ @(x) Absolute value. Parentheses are optional if x is a column name.
3
+ abs(x) Absolute value.
4
+ acos(x) Computes the arccosine of x.
5
+ add(x, y) Alias for x + y.
6
+ asin(x) Computes the arcsine of x.
7
+ atan(x) Computes the arctangent of x.
8
+ atan2(y, x) Computes the two-argument arctangent of (y, x).
9
+ bit_count(x) Returns the number of bits that are set.
10
+ cbrt(x) Returns the cube root of the number.
11
+ ceil(x) Rounds the number up.
12
+ ceiling(x) Rounds the number up. Alias of ceil.
13
+ cos(x) Computes the cosine of x.
14
+ cot(x) Computes the cotangent of x.
15
+ degrees(x) Converts radians to degrees.
16
+ divide(x, y) Alias for x // y.
17
+ even(x) Round to next even number by rounding away from zero.
18
+ exp(x) Computes e ** x.
19
+ factorial(x) See ! operator. Computes the product of the current integer and all integers below it.
20
+ fdiv(x, y) Performs integer division (x // y) but returns a DOUBLE value.
21
+ floor(x) Rounds the number down.
22
+ fmod(x, y) Calculates the modulo value. Always returns a DOUBLE value.
23
+ gamma(x) Interpolation of the factorial of x - 1. Fractional inputs are allowed.
24
+ gcd(x, y) Computes the greatest common divisor of x and y.
25
+ greatest_common_divisor(x, y) Computes the greatest common divisor of x and y.
26
+ greatest(x1, x2, ...) Selects the largest value.
27
+ isfinite(x) Returns true if the floating point value is finite, false otherwise.
28
+ isinf(x) Returns true if the floating point value is infinite, false otherwise.
29
+ isnan(x) Returns true if the floating point value is not a number, false otherwise.
30
+ lcm(x, y) Computes the least common multiple of x and y.
31
+ least_common_multiple(x, y) Computes the least common multiple of x and y.
32
+ least(x1, x2, ...) Selects the smallest value.
33
+ lgamma(x) Computes the log of the gamma function.
34
+ ln(x) Computes the natural logarithm of x.
35
+ log(x) Computes the base-10 logarithm of x.
36
+ log10(x) Alias of log. Computes the base-10 logarithm of x.
37
+ log2(x) Computes the base-2 log of x.
38
+ multiply(x, y) Alias for x * y.
39
+ nextafter(x, y) Return the next floating point value after x in the direction of y.
40
+ pi() Returns the value of pi.
41
+ pow(x, y) Computes x to the power of y.
42
+ power(x, y) Alias of pow. Computes x to the power of y.
43
+ radians(x) Converts degrees to radians.
44
+ random() Returns a random number x in the range 0.0 <= x < 1.0.
45
+ round_even(v NUMERIC, s INTEGER) Alias of roundbankers(v, s). Round to s decimal places using the rounding half to even rule. Values s < 0 are allowed.
46
+ round(v NUMERIC, s INTEGER) Round to s decimal places. Values s < 0 are allowed.
47
+ setseed(x) Sets the seed to be used for the random function.
48
+ sign(x) Returns the sign of x as -1, 0 or 1.
49
+ signbit(x) Returns whether the signbit is set or not.
50
+ sin(x) Computes the sine of x.
51
+ sqrt(x) Returns the square root of the number.
52
+ subtract(x, y) Alias for x - y.
53
+ tan(x) Computes the tangent of x.
54
+ trunc(x) Truncates the number.
55
+ xor(x, y) Bitwise XOR.
requirements.txt CHANGED
@@ -1 +1,2 @@
 
1
  duckdb
 
1
+ pyarrow
2
  duckdb
time_functions.tsv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Name Description
2
+ current_time Current time (start of current transaction).
3
+ date_diff(part, starttime, endtime) The number of partition boundaries between the times.
4
+ date_part(part, time) Get subfield (equivalent to extract).
5
+ date_sub(part, starttime, endtime) The number of complete partitions between the times.
6
+ datediff(part, starttime, endtime) Alias of date_diff. The number of partition boundaries between the times.
7
+ datepart(part, time) Alias of date_part. Get subfield (equivalent to extract).
8
+ datesub(part, starttime, endtime) Alias of date_sub. The number of complete partitions between the times.
9
+ extract(part FROM time) Get subfield from a time.
10
+ get_current_time() Current time (start of current transaction).
11
+ make_time(bigint, bigint, double) The time for the given parts.
timestamp_functions.tsv ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Name Description
2
+ age(timestamp, timestamp) Subtract arguments, resulting in the time difference between the two timestamps.
3
+ age(timestamp) Subtract from current_date.
4
+ century(timestamp) Extracts the century of a timestamp.
5
+ current_timestamp Returns the current timestamp (at the start of the transaction).
6
+ date_diff(part, startdate, enddate) The number of partition boundaries between the timestamps.
7
+ date_part([part, ...], timestamp) Get the listed subfields as a struct. The list must be constant.
8
+ date_part(part, timestamp) Get subfield (equivalent to extract).
9
+ date_sub(part, startdate, enddate) The number of complete partitions between the timestamps.
10
+ date_trunc(part, timestamp) Truncate to specified precision.
11
+ datediff(part, startdate, enddate) Alias of date_diff. The number of partition boundaries between the timestamps.
12
+ datepart([part, ...], timestamp) Alias of date_part. Get the listed subfields as a struct. The list must be constant.
13
+ datepart(part, timestamp) Alias of date_part. Get subfield (equivalent to extract).
14
+ datesub(part, startdate, enddate) Alias of date_sub. The number of complete partitions between the timestamps.
15
+ datetrunc(part, timestamp) Alias of date_trunc. Truncate to specified precision.
16
+ dayname(timestamp) The (English) name of the weekday.
17
+ epoch_ms(ms) Converts ms since epoch to a timestamp.
18
+ epoch_ms(timestamp) Converts a timestamp to milliseconds since the epoch.
19
+ epoch_ms(timestamp) Return the total number of milliseconds since the epoch.
20
+ epoch_ns(timestamp) Return the total number of nanoseconds since the epoch.
21
+ epoch_us(timestamp) Return the total number of microseconds since the epoch.
22
+ epoch(timestamp) Converts a timestamp to seconds since the epoch.
23
+ extract(field FROM timestamp) Get subfield from a timestamp.
24
+ greatest(timestamp, timestamp) The later of two timestamps.
25
+ isfinite(timestamp) Returns true if the timestamp is finite, false otherwise.
26
+ isinf(timestamp) Returns true if the timestamp is infinite, false otherwise.
27
+ last_day(timestamp) The last day of the month.
28
+ least(timestamp, timestamp) The earlier of two timestamps.
29
+ make_timestamp(bigint, bigint, bigint, bigint, bigint, double) The timestamp for the given parts.
30
+ make_timestamp(microseconds) The timestamp for the given number of µs since the epoch.
31
+ monthname(timestamp) The (English) name of the month.
32
+ strftime(timestamp, format) Converts timestamp to string according to the format string.
33
+ strptime(text, format-list) Converts the string text to timestamp applying the format strings in the list until one succeeds. Throws an error on failure. To return NULL on failure, use try_strptime.
34
+ strptime(text, format) Converts the string text to timestamp according to the format string. Throws an error on failure. To return NULL on failure, use try_strptime.
35
+ time_bucket(bucket_width, timestamp[, offset]) Truncate timestamp by the specified interval bucket_width. Buckets are offset by offset interval.
36
+ time_bucket(bucket_width, timestamp[, origin]) Truncate timestamp by the specified interval bucket_width. Buckets are aligned relative to origin timestamp. origin defaults to 2000-01-03 00:00:00 for buckets that don't include a month or year interval, and to 2000-01-01 00:00:00 for month and year buckets.
37
+ to_timestamp(double) Converts seconds since the epoch to a timestamp with time zone.
38
+ try_strptime(text, format-list) Converts the string text to timestamp applying the format strings in the list until one succeeds. Returns NULL on failure.
39
+ try_strptime(text, format) Converts the string text to timestamp according to the format string. Returns NULL on failure.