import dataclasses
from typing import Any

import mlcroissant as mlc
import streamlit as st

from core.state import Metadata
from utils import needed_field
from views.metadata import handle_metadata_change
from views.metadata import MetadataEvent

_NON_RELEVANT_METADATA = ["name", "distribution", "record_sets", "rdf"]

_INFO_TEXT = """Croissant files are composed of three layers:

- **Metadata** about the dataset, covering Responsible AI, licensing and the attributes of
[sc\:Dataset](https://schema.org/Dataset).
- **Resources**: the contents of the dataset as the underlying files
([`FileObject`](https://github.com/mlcommons/croissant/blob/main/docs/croissant-spec.md#fileobject))
and/or sets of files ([`FileSet`](https://github.com/mlcommons/croissant/blob/main/docs/croissant-spec.md#fileset)).
- **RecordSets**: the sets of structured records obtained from one or more resources
(typically a file or set of files) and the structure of these records,
expressed as a set of fields (e.g., the columns of a table).

The next three tabs will guide you through filling in those layers. Errors, if any, will be
displayed on this page. Once you are ready, you can download the dataset by clicking the
export button in the upper right corner."""
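
# A minimal sketch of the three layers described in _INFO_TEXT, expressed with
# the mlcroissant API (imported as `mlc`). Every name below (dataset, file,
# column) is hypothetical, and keyword names follow recent mlcroissant releases,
# so they may differ slightly depending on the pinned version:
#
#     example = mlc.Metadata(
#         name="my_dataset",                       # layer 1: dataset metadata
#         url="https://example.org/my_dataset",
#         distribution=[                           # layer 2: resources
#             mlc.FileObject(
#                 id="data.csv",
#                 name="data.csv",
#                 content_url="https://example.org/data.csv",
#                 encoding_format="text/csv",
#             ),
#         ],
#         record_sets=[                            # layer 3: record sets
#             mlc.RecordSet(
#                 id="records",
#                 name="records",
#                 fields=[
#                     mlc.Field(
#                         id="records/label",
#                         name="label",
#                         data_types=mlc.DataType.TEXT,
#                         source=mlc.Source(
#                             file_object="data.csv",
#                             extract=mlc.Extract(column="label"),
#                         ),
#                     ),
#                 ],
#             ),
#         ],
#     )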


def render_overview():
    """Renders the overview: required metadata fields, layer counts and validation status."""
    metadata: Metadata = st.session_state[Metadata]
    col1, col2 = st.columns([1, 1], gap="medium")
    with col1:
        key = "metadata-name"
        name = st.text_input(
            label=needed_field("Name"),
            key=key,
            value=metadata.name,
            placeholder="Dataset",
            on_change=handle_metadata_change,
            args=(MetadataEvent.NAME, metadata, key),
        )
        if not name:
            st.stop()
        key = "metadata-url"
        url = st.text_input(
            label=needed_field("URL"),
            key=key,
            value=metadata.url,
            placeholder="URL to the dataset.",
            on_change=handle_metadata_change,
            args=(MetadataEvent.URL, metadata, key),
        )
        if not url:
            st.stop()
        key = "metadata-description"
        st.text_area(
            label="Description",
            key=key,
            value=metadata.description,
            placeholder="Provide a clear description of the dataset.",
            on_change=handle_metadata_change,
            args=(MetadataEvent.DESCRIPTION, metadata, key),
        )
        st.divider()
        left, middle, right = st.columns([1, 1, 1])
        # Count the relevant metadata fields that have been filled in.
        fields = [
            field
            for field, value in dataclasses.asdict(metadata).items()
            if value and field not in _NON_RELEVANT_METADATA
        ]
        left.metric("Number of metadata fields", len(fields))
        middle.metric("Number of resources", len(metadata.distribution))
        right.metric("Number of RecordSets", len(metadata.record_sets))
    with col2:
        user_started_editing = metadata.record_sets or metadata.distribution
        if user_started_editing:
            warning = ""
            try:
                # Converting to the canonical mlcroissant representation surfaces
                # validation errors and warnings.
                issues = metadata.to_canonical().issues
                if issues.errors:
                    warning += "**Errors**\n"
                    for error in issues.errors:
                        warning += f"{error}\n"
                if issues.warnings:
                    warning += "**Warnings**\n"
                    for issue_warning in issues.warnings:
                        warning += f"{issue_warning}\n"
            except mlc.ValidationError as exception:
                warning += "**Errors**\n"
                warning += f"{str(exception)}\n"
            if warning:
                st.warning(warning, icon="⚠️")
            else:
                st.success("No validation issues detected!", icon="✅")
        st.info(_INFO_TEXT, icon="💡")
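
# Example wiring (a sketch, not part of this module): in the editor's main app,
# render_overview() would typically be called inside the first of the tabs that
# _INFO_TEXT refers to. The tab names below are assumptions for illustration.
#
#     overview, metadata_tab, resources, record_sets = st.tabs(
#         ["Overview", "Metadata", "Resources", "RecordSets"]
#     )
#     with overview:
#         render_overview()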