Deploy (see actual commits on https://github.com/mlcommons/croissant).
- components/tabs/frontend/build/asset-manifest.json +3 -3
- components/tabs/frontend/build/index.html +1 -1
- components/tabs/frontend/build/static/js/main.a44b10fc.js +0 -0
- components/tabs/frontend/build/static/js/main.a44b10fc.js.LICENSE.txt +73 -0
- components/tabs/frontend/build/static/js/main.a44b10fc.js.map +0 -0
- components/tabs/frontend/src/Tabs.tsx +1 -1
- core/constants.py +5 -0
- core/files.py +1 -1
- core/names.py +5 -0
- core/names_test.py +1 -0
- core/state.py +4 -0
- events/metadata.py +14 -1
- views/files.py +19 -9
- views/load.py +1 -1
- views/metadata.py +64 -1
- views/overview.py +17 -8
- views/previous_files.py +1 -1
- views/record_sets.py +31 -14
- views/source.py +46 -11
- views/splash.py +43 -6
components/tabs/frontend/build/asset-manifest.json
CHANGED

@@ -1,10 +1,10 @@
 {
   "files": {
-    "main.js": "./static/js/main.
+    "main.js": "./static/js/main.a44b10fc.js",
     "index.html": "./index.html",
-    "main.
+    "main.a44b10fc.js.map": "./static/js/main.a44b10fc.js.map"
   },
   "entrypoints": [
-    "static/js/main.
+    "static/js/main.a44b10fc.js"
   ]
 }
components/tabs/frontend/build/index.html
CHANGED

@@ -1 +1 @@
-<!doctype html><html lang="en"><head><title>Streamlit Tabs Component</title><meta charset="UTF-8"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Streamlit Tree Component"/><script defer="defer" src="./static/js/main.
+<!doctype html><html lang="en"><head><title>Streamlit Tabs Component</title><meta charset="UTF-8"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Streamlit Tree Component"/><script defer="defer" src="./static/js/main.a44b10fc.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
components/tabs/frontend/build/static/js/main.a44b10fc.js
ADDED

The diff for this file is too large to render. See raw diff.
components/tabs/frontend/build/static/js/main.a44b10fc.js.LICENSE.txt
ADDED

@@ -0,0 +1,73 @@
+/*
+object-assign
+(c) Sindre Sorhus
+@license MIT
+*/
+
+/**
+ * @license React
+ * react-dom.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @license React
+ * react-is.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @license React
+ * react-jsx-runtime.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @license React
+ * react.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @license React
+ * scheduler.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/** @license React v16.13.1
+ * react-is.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/** @license React v16.14.0
+ * react.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
components/tabs/frontend/build/static/js/main.a44b10fc.js.map
ADDED

The diff for this file is too large to render. See raw diff.
components/tabs/frontend/src/Tabs.tsx
CHANGED

@@ -84,7 +84,7 @@ function BasicTabs({
             whiteSpace: "nowrap",
           }}
         >
-
+          Export
         </Button>
       </span>
     </Tooltip>
core/constants.py
CHANGED

@@ -35,3 +35,8 @@ METADATA = "Metadata"
 RESOURCES = "Resources"
 RECORD_SETS = "Record Sets"
 TABS = [OVERVIEW, METADATA, RESOURCES, RECORD_SETS]
+
+NAMES_INFO = (
+    "Names are used as identifiers. They are unique and cannot contain special"
+    " characters. The interface will replace any special characters."
+)
core/files.py
CHANGED

@@ -204,7 +204,7 @@ def file_from_form(
     if type == FILE_OBJECT:
         return FileObject(name=find_unique_name(names, "file_object"), folder=folder)
     elif type == FILE_SET:
-        return FileSet(name=find_unique_name(names, "file_set")
+        return FileSet(name=find_unique_name(names, "file_set"))
     else:
         raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")
 
core/names.py
CHANGED

@@ -1,8 +1,13 @@
 """Module to handle naming of RecordSets and distribution."""
 
+import re
+
+NAME_PATTERN_REGEX = "[^a-zA-Z0-9\\-_\\.]"
+
 
 def find_unique_name(names: set[str], name: str):
     """Find a unique UID."""
+    name = re.sub(NAME_PATTERN_REGEX, "_", name)
     while name in names:
         name = f"{name}_0"
     return name
core/names_test.py
CHANGED

@@ -5,6 +5,7 @@ from .names import find_unique_name
 
 def test_find_unique_name():
     names = set(["first", "second", "first_0"])
+    assert find_unique_name(names, "are there spaces") == "are_there_spaces"
     assert find_unique_name(names, "first") == "first_0_0"
     assert find_unique_name(names, "second") == "second_0"
     assert find_unique_name(names, "third") == "third"
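Taken together, the new sanitization regex and the existing uniqueness loop behave as in this standalone sketch (expected outputs are the ones asserted in the test above):

```python
import re

NAME_PATTERN_REGEX = "[^a-zA-Z0-9\\-_\\.]"


def find_unique_name(names: set[str], name: str):
    """Find a unique UID."""
    # New: replace any character outside [a-zA-Z0-9-_.] with an underscore.
    name = re.sub(NAME_PATTERN_REGEX, "_", name)
    # Existing: append "_0" until the name no longer collides with a known name.
    while name in names:
        name = f"{name}_0"
    return name


names = {"first", "second", "first_0"}
print(find_unique_name(names, "are there spaces"))  # are_there_spaces
print(find_unique_name(names, "first"))             # first_0_0
print(find_unique_name(names, "third"))             # third
```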
core/state.py
CHANGED

@@ -183,11 +183,15 @@ class Metadata:
     name: str = ""
     description: str | None = None
     citation: str | None = None
+    data_biases: str | None = None
+    data_collection: str | None = None
     license: str | None = ""
+    personal_sensitive_information: str | None = None
     url: str = ""
     distribution: list[FileObject | FileSet] = dataclasses.field(default_factory=list)
     record_sets: list[RecordSet] = dataclasses.field(default_factory=list)
     rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
+    version: str | None = None
 
     def __bool__(self):
         return self.name != "" and self.url != ""
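A minimal sketch of constructing the extended dataclass with the new RAI and version fields; the values here are illustrative placeholders, not taken from a real dataset:

```python
from core.state import Metadata

metadata = Metadata(
    name="my_dataset",
    url="https://example.com/my_dataset",
    version="1.0.0",
    data_collection="Describe the key stages of the data collection process.",
    data_biases="Describe known or potential biases in the data.",
    personal_sensitive_information="Describe any personal or sensitive information.",
)
assert bool(metadata)  # True: both `name` and `url` are set.
```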
events/metadata.py
CHANGED

@@ -2,6 +2,7 @@ import enum
 
 import streamlit as st
 
+from core.names import find_unique_name
 from core.state import Metadata
 
 # List from:

@@ -93,11 +94,15 @@ class MetadataEvent(enum.Enum):
     URL = "URL"
     LICENSE = "LICENSE"
     CITATION = "CITATION"
+    VERSION = "VERSION"
+    DATA_BIASES = "DATA_BIASES"
+    DATA_COLLECTION = "DATA_COLLECTION"
+    PERSONAL_SENSITIVE_INFORMATION = "PERSONAL_SENSITIVE_INFORMATION"
 
 
 def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
     if event == MetadataEvent.NAME:
-        metadata.name = st.session_state[key]
+        metadata.name = find_unique_name(set(), st.session_state[key])
     elif event == MetadataEvent.DESCRIPTION:
         metadata.description = st.session_state[key]
     elif event == MetadataEvent.LICENSE:

@@ -106,3 +111,11 @@ def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
         metadata.citation = st.session_state[key]
     elif event == MetadataEvent.URL:
         metadata.url = st.session_state[key]
+    elif event == MetadataEvent.VERSION:
+        metadata.version = st.session_state[key]
+    elif event == MetadataEvent.DATA_BIASES:
+        metadata.data_biases = st.session_state[key]
+    elif event == MetadataEvent.DATA_COLLECTION:
+        metadata.data_collection = st.session_state[key]
+    elif event == MetadataEvent.PERSONAL_SENSITIVE_INFORMATION:
+        metadata.personal_sensitive_information = st.session_state[key]
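Note that the NAME handler passes an empty set to `find_unique_name`, so only the special-character sanitization applies to the dataset name (no `_0` uniqueness suffix). A one-line illustration with a hypothetical user-typed name:

```python
from core.names import find_unique_name

# Hypothetical dataset name typed by a user; spaces and parentheses become "_".
print(find_unique_name(set(), "Titanic dataset (v2)"))  # Titanic_dataset__v2_
```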
views/files.py
CHANGED

@@ -3,6 +3,7 @@ import streamlit as st
 from components.safe_button import button_with_confirmation
 from components.tree import render_tree
 from core.constants import DF_HEIGHT
+from core.constants import NAMES_INFO
 from core.constants import OAUTH_CLIENT_ID
 from core.files import code_to_index
 from core.files import file_from_form

@@ -39,16 +40,15 @@ resources on the web or manually create new resources."""
 def render_files():
     """Renders the views of the files: warnings and panels to display information."""
     _render_warnings()
-    col1, col2
+    col1, col2 = st.columns([1, 1], gap="small")
     with col1:
-        st.markdown("#####
+        st.markdown("##### Add a resource")
         _render_upload_panel()
-    with col2:
         st.markdown("##### Uploaded resources")
         files = st.session_state[Metadata].distribution
         resource = _render_resources_panel(files)
         st.session_state[SelectedResource] = resource
-    with
+    with col2:
         _render_right_panel()
 
 

@@ -111,9 +111,7 @@ def _render_resources_panel(files: list[Resource]) -> Resource | None:
 def _render_upload_panel():
     """Renders the form to upload from local or upload from URL."""
     with st.form(key="upload_form", clear_on_submit=True):
-        tab1, tab2, tab3 = st.tabs([
-            "Import from a local file", "Import from a URL", "Add manually"
-        ])
+        tab1, tab2, tab3 = st.tabs(["From a local file", "From a URL", "Add manually"])
 
         with tab1:
             st.file_uploader("Select a file", key=_LOCAL_FILE_KEY)

@@ -202,6 +200,11 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
         default=file.contained_in,
         options=parent_options,
         key=key,
+        help=(
+            "FileObjects and FileSets can be nested. Specifying `Parents` allows to"
+            " nest a FileObject/FileSet within another FileObject/FileSet. An example"
+            " of this is when images (FileSet) are nested within an archive (FileSet)."
+        ),
         on_change=handle_resource_change,
         args=(ResourceEvent.CONTAINED_IN, file, key),
     )

@@ -210,6 +213,7 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
         needed_field("Name"),
         value=file.name,
         key=key,
+        help=f"The name of the resource. {NAMES_INFO}",
         on_change=handle_resource_change,
         args=(ResourceEvent.NAME, file, key),
     )

@@ -217,7 +221,7 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
     st.text_area(
         "Description",
         value=file.description,
-        placeholder="Provide a
+        placeholder="Provide a description of the file.",
         key=key,
         on_change=handle_resource_change,
         args=(ResourceEvent.DESCRIPTION, file, key),

@@ -225,9 +229,10 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
     if is_file_object:
         key = f"{prefix}_content_url"
         st.text_input(
-            needed_field("Content URL"),
+            needed_field("Content URL or local path"),
             value=file.content_url,
             key=key,
+            help="The URL or local file path pointing to the original FileObject.",
            on_change=handle_resource_change,
            args=(ResourceEvent.CONTENT_URL, file, key),
        )

@@ -244,6 +249,7 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
            "Content size",
            value=file.content_size,
            key=key,
+           help="The size of the original FileObject in bytes.",
            on_change=handle_resource_change,
            args=(ResourceEvent.CONTENT_SIZE, file, key),
        )

@@ -262,6 +268,10 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
            index=code_to_index(file.encoding_format),
            options=FILE_TYPES.keys(),
            key=key,
+           help=(
+               "MIME type corresponding to"
+               " ([sc:encodingFormat](https://schema.org/encodingFormat))."
+           ),
            on_change=handle_resource_change,
            args=(ResourceEvent.ENCODING_FORMAT, file, key),
        )
views/load.py
CHANGED

@@ -30,5 +30,5 @@ def _on_file_upload(key):
 def render_load():
     key = "json-ld-file-upload"
     st.file_uploader(
-        "
+        "Drop a JSON-LD", type="json", key=key, on_change=_on_file_upload, args=(key,)
     )
views/metadata.py
CHANGED

@@ -10,7 +10,57 @@ from events.metadata import MetadataEvent
 
 def render_metadata():
     """Renders the `Metadata` view."""
-    metadata = st.session_state[Metadata]
+    metadata: Metadata = st.session_state[Metadata]
+    col1, col2 = st.columns([1, 1])
+    with col1.expander("**Generic metadata**", expanded=True):
+        _render_generic_metadata(metadata)
+    with col2.expander("**Responsible AI (RAI) metadata**", expanded=True):
+        _render_rai_metadata(metadata)
+
+
+def _render_rai_metadata(metadata: Metadata):
+    """Renders RAI (Responsible AI) metadata."""
+    key = "metadata-data-collection"
+    st.text_area(
+        label=(
+            "**Data collection**. Key stages of the data collection process encourage"
+            " its creators to reflect on the process and improves understanding for"
+            " users."
+        ),
+        key=key,
+        value=metadata.data_collection,
+        on_change=handle_metadata_change,
+        args=(MetadataEvent.DATA_COLLECTION, metadata, key),
+    )
+    key = "metadata-data-biases"
+    st.text_area(
+        label=(
+            "**Data biases**. Involves understanding the potential risks associated"
+            " with data usage and to prevent unintended and potentially harmful"
+            " consequences that may arise from using models trained on or evaluated"
+            " with the respective data."
+        ),
+        key=key,
+        value=metadata.data_biases,
+        on_change=handle_metadata_change,
+        args=(MetadataEvent.DATA_BIASES, metadata, key),
+    )
+    key = "metadata-personal-sensitive-information"
+    st.text_area(
+        label=(
+            "**Personal sensitive information**. Personal and sensitive information, if"
+            " contained within the dataset, can play an important role in the"
+            " mitigation of any risks and the responsible use of the datasets."
+        ),
+        key=key,
+        value=metadata.personal_sensitive_information,
+        on_change=handle_metadata_change,
+        args=(MetadataEvent.PERSONAL_SENSITIVE_INFORMATION, metadata, key),
+    )
+
+
+def _render_generic_metadata(metadata: Metadata):
+    """Renders all non-RAI generic metadata."""
     index = find_license_index(metadata.license)
     key = "metadata-url"
     st.text_input(

@@ -21,6 +71,19 @@ def render_metadata():
         on_change=handle_metadata_change,
         args=(MetadataEvent.URL, metadata, key),
     )
+    key = "metadata-version"
+    st.text_input(
+        label="Version (`MAJOR.MINOR.PATCH`)",
+        key=key,
+        help=(
+            "Refer to https://semver.org/spec/v2.0.0.html for more information on the"
+            " format."
+        ),
+        value=metadata.version,
+        placeholder="1.0.0",
+        on_change=handle_metadata_change,
+        args=(MetadataEvent.VERSION, metadata, key),
+    )
     key = "metadata-license"
     st.selectbox(
         label="License",
views/overview.py
CHANGED

@@ -3,6 +3,7 @@ from typing import Any
 
 import streamlit as st
 
+from core.constants import NAMES_INFO
 from core.state import Metadata
 import mlcroissant as mlc
 from utils import needed_field

@@ -22,9 +23,9 @@ _INFO_TEXT = """Croissant files are composed of three layers:
 (typically a file or set of files) and the structure of these records,
 expressed as a set of fields (e.g., the columns of a table).
 
-The next three tabs will guide you through filling those layers.
-
-the export button in the upper right corner."""
+The next three tabs will guide you through filling those layers. Any error will be
+displayed on the overview. Once the dataset is finished, you can download the dataset by
+clicking the export button in the upper right corner."""
 
 
 def _relevant_fields(class_or_instance: type):

@@ -51,6 +52,7 @@ def render_overview():
         label=needed_field("Name"),
         key=key,
         value=metadata.name,
+        help=f"The name of the dataset. {NAMES_INFO}",
         placeholder="Dataset",
         on_change=handle_metadata_change,
         args=(MetadataEvent.NAME, metadata, key),

@@ -62,7 +64,7 @@ def render_overview():
         label="Description",
         key=key,
         value=metadata.description,
-        placeholder="Provide a
+        placeholder="Provide a description of the dataset.",
         on_change=handle_metadata_change,
         args=(MetadataEvent.DESCRIPTION, metadata, key),
     )

@@ -82,10 +84,17 @@ def render_overview():
             * 100
             / (3 * metadata_weight)
         )
-        col_a.metric(
-
-
-
+        col_a.metric(
+            "Completion",
+            f"{completion}%",
+            help=(
+                "Approximation of the total completion based on the number of fields"
+                " that are filled."
+            ),
+        )
+        col_b.metric("Metadata fields", fields)
+        col_c.metric("Resources", len(metadata.distribution))
+        col_d.metric("RecordSets", len(metadata.record_sets))
     with col2:
         user_started_editing = metadata.record_sets or metadata.distribution
         if user_started_editing:
views/previous_files.py
CHANGED

@@ -50,4 +50,4 @@ def render_previous_files():
     except:
         pass
     if has_no_project:
-        st.write("No
+        st.write("No recent project to load. Create one on the left!")
views/record_sets.py
CHANGED

@@ -10,6 +10,7 @@ from rdflib import term
 import streamlit as st
 
 from components.safe_button import button_with_confirmation
+from core.constants import NAMES_INFO
 from core.data_types import MLC_DATA_TYPES
 from core.data_types import mlc_to_str_data_type
 from core.data_types import STR_DATA_TYPES

@@ -240,6 +241,7 @@ def _render_left_panel():
         needed_field("Name"),
         placeholder="Name without special character.",
         key=key,
+        help=f"The name of the RecordSet. {NAMES_INFO}",
         value=record_set.name,
         on_change=handle_record_set_change,
         args=(RecordSetEvent.NAME, record_set, key),

@@ -247,7 +249,7 @@ def _render_left_panel():
     key = f"{prefix}-description"
     col2.text_input(
         "Description",
-        placeholder="Provide a
+        placeholder="Provide a description of the RecordSet.",
         key=key,
         value=record_set.description,
         on_change=handle_record_set_change,

@@ -257,6 +259,13 @@ def _render_left_panel():
     st.checkbox(
         "The RecordSet is an enumeration",
         key=key,
+        help=(
+            "Enumerations indicate that the RecordSet takes its values from a"
+            " finite set. Similar to `ClassLabel` in"
+            " [TFDS](https://www.tensorflow.org/datasets/api_docs/python/tfds/features/ClassLabel)"
+            " or [Hugging"
+            " Face](https://huggingface.co/docs/datasets/v2.15.0/en/package_reference/main_classes#datasets.ClassLabel)."
+        ),
         value=record_set.is_enumeration,
         on_change=handle_record_set_change,
         args=(RecordSetEvent.IS_ENUMERATION, record_set, key),

@@ -265,6 +274,10 @@ def _render_left_panel():
     st.checkbox(
         "The RecordSet has in-line data",
         key=key,
+        help=(
+            "In-line data allows to embed data directly within the JSON-LD"
+            " without referencing another data source."
+        ),
         value=bool(record_set.data),
         on_change=handle_record_set_change,
         args=(RecordSetEvent.HAS_DATA, record_set, key),

@@ -324,8 +337,14 @@ def _render_left_panel():
     )
     data_editor_key = _data_editor_key(record_set_key, record_set)
     st.markdown(
-
-
+        needed_field("Fields"),
+        help=(
+            "Add/delete fields by directly editing the table. **Warning**: the"
+            " table contains information about the fields--not the data"
+            " directly. If you wish to embed data, tick the `The RecordSet is"
+            " an enumeration` box. To edit fields details, click the"
+            " button `Edit fields details` below."
+        ),
     )
     st.data_editor(
         fields,

@@ -437,6 +456,7 @@ def _render_right_panel():
         needed_field("Name"),
         placeholder="Name without special character.",
         key=key,
+        help=f"The name of the field. {NAMES_INFO}",
         value=field.name,
         on_change=handle_field_change,
         args=(FieldEvent.NAME, field, key),

@@ -444,38 +464,35 @@ def _render_right_panel():
     key = f"{prefix}-description"
     col2.text_input(
         "Description",
-        placeholder="Provide a
+        placeholder="Provide a description of the RecordSet.",
         key=key,
         on_change=handle_field_change,
         value=field.description,
         args=(FieldEvent.DESCRIPTION, field, key),
     )
+    data_type_index = None
     if field.data_types:
         data_type = field.data_types[0]
         if isinstance(data_type, str):
             data_type = term.URIRef(data_type)
         if data_type in MLC_DATA_TYPES:
             data_type_index = MLC_DATA_TYPES.index(data_type)
-        else:
-            data_type_index = None
-    else:
-        data_type_index = None
     key = f"{prefix}-datatypes"
     col3.selectbox(
         needed_field("Data type"),
         index=data_type_index,
         options=STR_DATA_TYPES,
         key=key,
+        help=(
+            "The type of the data. `Text` corresponds to"
+            " https://schema.org/Text, etc."
+        ),
         on_change=handle_field_change,
         args=(FieldEvent.DATA_TYPE, field, key),
     )
     possible_sources = _get_possible_sources(metadata)
-    render_source(
-
-    )
-    render_references(
-        record_set_key, record_set, field, field_key, possible_sources
-    )
+    render_source(record_set, field, possible_sources)
+    render_references(record_set, field, possible_sources)
 
     st.divider()
 
views/source.py
CHANGED

@@ -12,6 +12,15 @@ from events.fields import TransformType
 import mlcroissant as mlc
 from utils import needed_field
 
+_JSON_PATH_DOCUMENTATION = (
+    "The JSON path if the data source is a JSON (see"
+    " [documentation](https://www.ietf.org/archive/id/draft-goessner-dispatch-jsonpath-00.html))."
+)
+_EXTRACT_DOCUMENTATION = (
+    "The extraction method to get the value of the field (column in a CSV, etc)."
+)
+_COLUMN_NAME_DOCUMENTATION = "The name of the column if the data source is a CSV."
+
 
 class SourceType:
     """The type of the source (distribution or field)."""

@@ -105,10 +114,8 @@ def _handle_remove_reference(field):
 
 
 def render_source(
-    record_set_key: int,
     record_set: RecordSet,
     field: Field,
-    field_key: int,
     possible_sources: list[str],
 ):
     """Renders the form for the source."""

@@ -123,10 +130,13 @@ def render_source(
     index = None
     key = f"{prefix}-source"
     col1.selectbox(
-        needed_field("
+        needed_field("Data source"),
         index=index,
         options=options,
         key=key,
+        help=(
+            "Data sources can be other resources (FileObject, FileSet) or other fields."
+        ),
         on_change=handle_field_change,
         args=(FieldEvent.SOURCE, field, key),
     )

@@ -135,6 +145,7 @@ def render_source(
         needed_field("Extract"),
         index=_get_extract_index(source),
         key=f"{prefix}-extract",
+        help=_EXTRACT_DOCUMENTATION,
         options=EXTRACT_TYPES,
         on_change=handle_field_change,
         args=(FieldEvent.SOURCE_EXTRACT, field, key),

@@ -145,6 +156,7 @@ def render_source(
         needed_field("Column name"),
         value=source.extract.column,
         key=key,
+        help=_COLUMN_NAME_DOCUMENTATION,
         on_change=handle_field_change,
         args=(FieldEvent.SOURCE_EXTRACT_COLUMN, field, key),
     )

@@ -154,6 +166,7 @@ def render_source(
         needed_field("JSON path"),
         value=source.extract.json_path,
         key=key,
+        help=_JSON_PATH_DOCUMENTATION,
         on_change=handle_field_change,
         args=(FieldEvent.SOURCE_EXTRACT_JSON_PATH, field, key),
     )

@@ -170,18 +183,23 @@ def render_source(
             key=key,
             options=TRANSFORM_TYPES,
             on_change=handle_field_change,
+            help="One or more transformations to apply after extracting the field.",
             args=(FieldEvent.TRANSFORM, field, key),
             kwargs={"number": number},
         )
         if selected == TransformType.FORMAT:
             key = f"{prefix}-{number}-transform-format"
             col3.text_input(
-                needed_field("Format"),
+                needed_field("Format a date"),
                 value=transform.format,
                 key=key,
                 on_change=handle_field_change,
+                help=(
+                    "For dates, use [`Python format"
+                    " codes`](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)."
+                ),
                 args=(selected, field, key),
-                kwargs={"number": number
+                kwargs={"number": number},
             )
         elif selected == TransformType.JSON_PATH:
             key = f"{prefix}-{number}-jsonpath"

@@ -190,8 +208,9 @@ def render_source(
                 value=transform.json_path,
                 key=key,
                 on_change=handle_field_change,
+                help=_JSON_PATH_DOCUMENTATION,
                 args=(selected, field, key),
-                kwargs={"number": number
+                kwargs={"number": number},
             )
         elif selected == TransformType.REGEX:
             key = f"{prefix}-{number}-regex"

@@ -200,8 +219,14 @@ def render_source(
                 value=transform.regex,
                 key=key,
                 on_change=handle_field_change,
+                help=(
+                    "A regular expression following [`re` Python"
+                    " convention](https://docs.python.org/3/library/re.html#regular-expression-syntax)"
+                    " with one capturing group. The result of the operation will be"
+                    " the last captured group."
+                ),
                 args=(selected, field, key),
-                kwargs={"number": number
+                kwargs={"number": number},
             )
         elif selected == TransformType.REPLACE:
             key = f"{prefix}-{number}-replace"

@@ -210,8 +235,13 @@ def render_source(
                 value=transform.replace,
                 key=key,
                 on_change=handle_field_change,
+                help=(
+                    "A replace pattern separated by a `/`, i.e."
+                    " `string_to_replace/string_to_substitute` in order to replace"
+                    " `string_to_replace` by `string_to_substitute`."
+                ),
                 args=(selected, field, key),
-                kwargs={"number": number
+                kwargs={"number": number},
             )
         elif selected == TransformType.SEPARATOR:
             key = f"{prefix}-{number}-separator"

@@ -220,8 +250,9 @@ def render_source(
                 value=transform.separator,
                 key=key,
                 on_change=handle_field_change,
+                help="A separator to split strings on, e.g. `|` to split `a|b|c`.",
                 args=(selected, field, key),
-                kwargs={"number": number
+                kwargs={"number": number},
             )
 
     def _handle_remove_transform(field, number):

@@ -230,6 +261,7 @@ def render_source(
         col4.button(
             "✖️",
             key=f"{prefix}-{number}-remove-transform",
+            help="Remove the transformation.",
            on_click=_handle_remove_transform,
            args=(field, number),
        )

@@ -243,16 +275,15 @@ def render_source(
     col1.button(
         "Add transform on data",
         key=f"{prefix}-close-fields",
+        help="Add a transformation.",
         on_click=_handle_add_transform,
         args=(field,),
     )
 
 
 def render_references(
-    record_set_key: int,
     record_set: RecordSet,
     field: Field,
-    field_key: int,
     possible_sources: list[str],
 ):
     """Renders the form for references."""

@@ -286,6 +317,7 @@ def render_references(
         index=_get_extract_index(references),
         key=key,
         options=EXTRACT_TYPES,
+        help=_EXTRACT_DOCUMENTATION,
         on_change=handle_field_change,
         args=(FieldEvent.REFERENCE_EXTRACT, field, key),
     )

@@ -295,6 +327,7 @@ def render_references(
         needed_field("Column name"),
         value=references.extract.column,
         key=key,
+        help=_COLUMN_NAME_DOCUMENTATION,
         on_change=handle_field_change,
         args=(FieldEvent.REFERENCE_EXTRACT_COLUMN, field, key),
     )

@@ -304,12 +337,14 @@ def render_references(
         needed_field("JSON path"),
         value=references.extract.json_path,
         key=key,
+        help=_JSON_PATH_DOCUMENTATION,
         on_change=handle_field_change,
         args=(FieldEvent.REFERENCE_EXTRACT_JSON_PATH, field, key),
     )
     col4.button(
         "✖️",
         key=f"{key}-remove-reference",
+        help="Remove the join.",
         on_click=_handle_remove_reference,
         args=(field,),
     )
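The new help strings describe the transform semantics in prose; a rough, illustrative plain-Python rendering of those descriptions (not the mlcroissant implementation) may help read them:

```python
import re

# Illustrative only: plain-Python equivalents of the semantics described by the
# new help texts (separator, replace, regex).
value = "2023-11-05|TRAIN"

# Separator: e.g. `|` splits "a|b|c"-style values.
date, split_name = value.split("|")

# Replace pattern "string_to_replace/string_to_substitute":
pattern = "TRAIN/train"
old, new = pattern.split("/")
split_name = split_name.replace(old, new)

# Regex with one capturing group; the result is the (last) captured group.
match = re.match(r"(\d{4})-\d{2}-\d{2}", date)
year = match.group(1) if match else None

print(date, split_name, year)  # 2023-11-05 train 2023
```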
views/splash.py
CHANGED

@@ -13,6 +13,8 @@ import mlcroissant as mlc
 from views.load import render_load
 from views.previous_files import render_previous_files
 
+_HUGGING_FACE_URL = "https://huggingface.co/datasets/"
+
 _DATASETS = {
     "Titanic": ["data/embarkation_ports.csv", "data/genders.csv"],
     "FLORES-200": [],

@@ -23,8 +25,23 @@ _DATASETS = {
     "Bigcode-The-Stack": [],
 }
 
+_INFO = """[Croissant](https://mlcommons.org/croissant) 🥐 is a high-level format for
+machine learning datasets built
+on [schema.org](https://schema.org/) and its Dataset vocabulary. A croissant
+configuration file combines metadata, resource file descriptions, data structure, and
+default ML semantics of dataset. You can familiarize yourself with the editor by
+exploring the provided examples.
+
+The editor supports creating a new configuration from scratch, as well as uploading
+an existing Croissant JSON-MD file. Finally, you can also select any of your
+past projects from the list.
+
+You can change the project you are currently editing at any time by clicking
+the Menu button and then choosing one of the options on this page."""
+
 
 def render_splash():
+    st.info(_INFO, icon="💡")
     if OAUTH_CLIENT_ID:
         st.info(
             "**Disclaimer**: Do not put sensitive information or datasets here. The"

@@ -34,9 +51,7 @@ def render_splash():
     )
     col1, col2 = st.columns([1, 1], gap="large")
     with col1:
-        with st.expander("**
-            render_load()
-        with st.expander("**Create from scratch**", expanded=True):
+        with st.expander("**Create a new dataset**", expanded=True):
 
             def create_new_croissant():
                 st.session_state[Metadata] = Metadata()

@@ -47,7 +62,7 @@ def render_splash():
                 on_click=create_new_croissant,
                 type="primary",
             )
-        with st.expander("**
+        with st.expander("**Load an existing dataset**", expanded=True):
 
             def create_example(dataset: str):
                 base = f"https://raw.githubusercontent.com/mlcommons/croissant/main/datasets/{dataset.lower()}"

@@ -72,7 +87,7 @@ def render_splash():
             )
 
             dataset = st.selectbox(
-                label="
+                label="Canonical dataset",
                 options=_DATASETS.keys(),
             )
             st.button(

@@ -81,6 +96,28 @@ def render_splash():
                 type="primary",
                 args=(dataset,),
             )
+            url = st.text_input(
+                label="Hugging Face dataset",
+                placeholder="Example: https://huggingface.co/datasets/mnist",
+            )
+            if url.startswith(_HUGGING_FACE_URL):
+                name = url.replace(_HUGGING_FACE_URL, "")
+                api_url = (
+                    f"https://datasets-server.huggingface.co/croissant?dataset={name}"
+                )
+                json = requests.get(api_url, headers=None).json()
+                try:
+                    metadata = mlc.Metadata.from_json(mlc.Issues(), json, None)
+                    st.session_state[Metadata] = Metadata.from_canonical(metadata)
+                    save_current_project()
+                except Exception:
+                    st.error(f"Malformed JSON: {json}")
+            elif url:
+                st.error(
+                    f"Unknown URL {url}. Hugging Face URLS should look like"
+                    f" {_HUGGING_FACE_URL}somedataset."
+                )
+            render_load()
     with col2:
-        with st.expander("**
+        with st.expander("**Recent projects**", expanded=True):
             render_previous_files()