# Spaces: GEM / DatasetCardForm
# Runtime error
# File size: 10,621 Bytes

import json
import streamlit as st

from os.path import join as pjoin

from .streamlit_utils import (
    make_multiselect,
    make_selectbox,
    make_text_area,
    make_text_input,
    make_radio,
)

# Number of form fields rendered in each sub-section of the overview page.
# overview_summary() reports the number of stored entries against these.
N_FIELDS_WHERE = 9
N_FIELDS_LANGUAGES = 8
N_FIELDS_CREDIT = 3
N_FIELDS_STRUCTURE = 7

# Total number of fields across all overview sub-sections.
N_FIELDS = sum(
    (N_FIELDS_WHERE, N_FIELDS_LANGUAGES, N_FIELDS_CREDIT, N_FIELDS_STRUCTURE)
)


# Entries of the "subtags" list in resources/bcp47.json whose "type" is
# "language"; used to populate the language multiselect on the overview page.
# The file is opened in a `with` block so the handle is closed after parsing.
with open(pjoin("resources", "bcp47.json"), encoding="utf-8") as _bcp47_file:
    languages_bcp47 = [
        x for x in json.load(_bcp47_file)["subtags"] if x["type"] == "language"
    ]

# Parsed content of resources/licenses.json; passed as the options of the
# license selectbox on the overview page.
with open(pjoin("resources", "licenses.json"), encoding="utf-8") as _licenses_file:
    license_list = json.load(_licenses_file)


def overview_page():
    """Render the "overview" page of the dataset card form.

    Ensures ``st.session_state.card_dict["overview"]`` exists along with its
    four sub-dictionaries ("where", "languages", "credit", "structure"), then
    draws one collapsed Streamlit expander per sub-section. Every widget is
    created through a ``make_*`` helper and is given a ``key_list`` path of
    the form ``["overview", <section>, <field>]``.

    NOTE(review): the ``make_*`` helpers are defined in ``streamlit_utils``;
    presumably they store the widget value under ``key_list`` in
    ``card_dict`` -- confirm against that module.
    """
    card_dict = st.session_state.card_dict
    # Keep an existing "overview" dictionary, or start an empty one.
    overview = card_dict.get("overview", {})
    card_dict["overview"] = overview

    with st.expander("Where to find the data and its documentation", expanded=False):
        key_pref = ["overview", "where"]
        overview["where"] = overview.get("where", {})
        make_text_input(
            label="What is the webpage for the dataset (if it exists)?",
            key_list=key_pref + ["website"],
            help="[URL]",
        )
        make_text_input(
            label="What is the link to where the original dataset is hosted?",
            key_list=key_pref + ["data-url"],
            help="[URL]",
        )
        make_text_input(
            label="What is the link to the paper describing the dataset (open access preferred)?",
            key_list=key_pref + ["paper-url"],
            help="[URL]",
        )
        make_text_area(
            label="Provide the BibTex-formatted reference for the dataset.",
            key_list=key_pref + ["paper-bibtext"],
            help="[free text]",
        )
        make_radio(
            label="Does the dataset have an active leaderboard?",
            options=["no", "yes"],
            key_list=key_pref + ["has-leaderboard"],
            help="If no, enter N/A for the following two fields",
        )
        make_text_input(
            label="Provide a link to the leaderboard if it exists. Otherwise, enter N/A.",
            key_list=key_pref + ["leaderboard-url"],
            help="[URL] or N/A",
        )
        make_text_area(
            label="Briefly describe how the leaderboard evaluates models if it exists. Otherwise, enter N/A.",
            key_list=key_pref + ["leaderboard-description"],
            help="[free text; a paragraph] or N/A",
        )
        make_text_input(
            label="If known, provide the name of at least one person the reader can contact for questions about the dataset.",
            key_list=key_pref + ["contact-name"],
            help="[free text]",
        )
        make_text_input(
            label="If known, provide the email of at least one person the reader can contact for questions about the dataset.",
            key_list=key_pref + ["contact-email"],
            help="[free text]",
        )
    with st.expander("Languages and Intended Use", expanded=False):
        key_pref = ["overview", "languages"]
        overview["languages"] = overview.get("languages", {})
        make_radio(
            label="Is the dataset multilingual?",
            options=["no", "yes"],
            key_list=key_pref + ["is-multilingual"],
            help="More than one language present in all of the text fields",
        )
        make_multiselect(
            label="What languages/dialects are covered in the dataset?",
            key_list=key_pref + ["language-names"],
            # Each entry's "description" is joined into one display string.
            options=[", ".join(x["description"]) for x in languages_bcp47],
            help="This is a comprehensive list of languages obtained from the BCP-47 standard list.",
        )
        make_text_area(
            label="What dialects are covered? Are there multiple dialects per language?",
            key_list=key_pref + ["language-dialects"],
            help="[free text, paragraphs] - Describe the dialect(s) as appropriate.",
        )
        make_text_area(
            label="Whose language is in the dataset?",
            key_list=key_pref + ["language-speakers"],
            help="[free text, paragraphs] - Provide locally appropriate demographic information about the language producers, if available. Use ranges where reasonable in order to protect individuals’ privacy.",
        )
        make_text_area(
            label="What is the intended use of the dataset?",
            key_list=key_pref + ["intended-use"],
            help="[free text, paragraphs]",
        )
        make_selectbox(
            label="What is the license of the dataset?",
            key_list=key_pref + ["license"],
            options=license_list,
            help="select `other` if missing from list, `unkown` if not provided.",
        )
        make_selectbox(
            label="What primary task does the dataset support?",
            key_list=key_pref + ["task"],
            options=[
                "Content Transfer",
                "Data-to-Text",
                "Dialog Response Generation",
                "Paraphrasing",
                "Question Generation",
                "Reasoning",
                "Simplification",
                "Style Transfer",
                "Summarization",
                "Text-to-Slide",
            ],
            help="Select `other` if the task is not included in the list.",
        )
        make_text_area(
            label="Provide a short description of the communicative goal of a model trained for this task on this dataset.",
            key_list=key_pref + ["communicative"],
            help="[free text, a paragraph] (e.g., describe a restaurant from a structured representation of its attributes)",
        )
    with st.expander("Credit", expanded=False):
        key_pref = ["overview", "credit"]
        # Read the existing value from the "overview" dictionary, as the
        # other sections do. Looking it up at the top level of card_dict
        # would replace overview["credit"] with a fresh dict on every call.
        overview["credit"] = overview.get("credit", {})
        make_text_input(
            label="Who created the original dataset? List the people involved in collecting the dataset and their affiliation(s).",
            key_list=key_pref + ["creators"],
            help="name (affiliation); comma-separated",
        )
        make_text_input(
            label="Who funded the data creation?",
            key_list=key_pref + ["funding"],
            help="[free text] enter N/A if unkown",
        )
        make_text_input(
            label="Who contributed to the data card and adding the dataset to GEM? List the people+affiliations involved in creating this data card and who helped integrate this dataset into GEM.",
            key_list=key_pref + ["gem-added-by"],
            help="name (affiliation); comma-separated",
        )
    with st.expander("Structure", expanded=False):
        key_pref = ["overview", "structure"]
        overview["structure"] = overview.get("structure", {})
        data_fields_help = """
        [free text; paragraphs]
        - Mention their data type, and whether and how they are used as part of the generation pipeline.
        - Describe each fields' attributes, such as whether they are at the character level or word level, whether they are contiguous or not, etc.
        - If the datasets contain example IDs, state whether they have an inherent meaning, such as a mapping to other datasets or pointing to relationships between data points.
        """
        make_text_area(
            label="List and describe the fields present in the dataset.",
            key_list=key_pref + ["data-fields"],
            help=data_fields_help,
        )
        make_text_area(
            label="How was the dataset structure determined?",
            key_list=key_pref + ["structure-description"],
            help="[free text; paragraph]",
        )
        make_text_area(
            label="How were the labels chosen?",
            key_list=key_pref + ["structure-labels"],
            help="[free text; paragraph]",
        )
        make_text_area(
            label="Provide a JSON formatted example of a typical instance in the dataset.",
            key_list=key_pref + ["structure-example"],
            help="[JSON]",
        )
        make_text_area(
            label="Describe and name the splits in the dataset if there are more than one.",
            key_list=key_pref + ["structure-splits"],
            help="[free text, paragraphs] - As appropriate, provide any descriptive statistics for the features, such as size, average lengths of input and output.",
        )
        make_text_area(
            label="Describe any criteria for splitting the data, if used. If there are differences between the splits (e.g., if the training annotations are machine-generated and the dev and test ones are created by humans, or if different numbers of annotators contributed to each example), describe them here.",
            key_list=key_pref + ["structure-splits-criteria"],
            help="[free text, paragraphs]",
        )
        make_text_area(
            label="What does an outlier of the dataset in terms of length/perplexity/embedding look like?",
            key_list=key_pref + ["structure-outlier"],
            help="[free text + json formatted text/file for an example]",
        )


def overview_summary():
    """Show a collapsed expander summarising how much of the overview is filled.

    The count for a sub-section is the number of entries stored in its
    dictionary under ``st.session_state.card_dict["overview"]``; the overall
    count is the sum over every dictionary stored there. Missing dictionaries
    count as zero.
    """
    overview = st.session_state.card_dict.get("overview", {})
    total_filled = sum(len(section) for section in overview.values())

    # (display title, key in the overview dictionary, expected field count)
    sub_sections = (
        ("Where to find", "where", N_FIELDS_WHERE),
        ("Languages and Intended Use", "languages", N_FIELDS_LANGUAGES),
        ("Credit", "credit", N_FIELDS_CREDIT),
        ("Structure", "structure", N_FIELDS_STRUCTURE),
    )

    with st.expander(
        f"Dataset Overview Completion - {total_filled} of {N_FIELDS}", expanded=False
    ):
        parts = [
            f"- **Overall competion:**\n  - {total_filled} of {N_FIELDS} fields\n"
        ]
        for title, key, expected in sub_sections:
            filled = len(overview.get(key, {}))
            parts.append(
                f"- **Sub-section - {title}:**\n  - {filled} of {expected} fields\n"
            )
        st.markdown("".join(parts))