Spaces:

GEM
/

DatasetCardForm

Runtime error

File size: 6,772 Bytes

ac6c40f
 
13fd677
 
 
 
 
 
 
ac6c40f
0ea4f6d
dd1054a
 
13fd677
 
ac6c40f
57616af
ac6c40f
13fd677
 
 
 
 
 
 
 
0ea4f6d
dd1054a
0ea4f6d
 
 
13fd677
 
 
 
 
 
 
0ea4f6d
dd1054a
0ea4f6d
 
 
 
 
 
 
 
 
 
ead9ac7
d822486
13fd677
ead9ac7
 
 
 
 
 
 
 
0ea4f6d
dd1054a
 
 
 
 
 
 
 
 
 
 
13fd677
dd1054a
13fd677
ead9ac7
 
 
 
 
 
 
13fd677
dd1054a
13fd677
 
 
 
 
fcb0120
 
13fd677
fcb0120
13fd677
 
dd1054a
 
13fd677
dd1054a
13fd677
 
dd1054a
 
13fd677
dd1054a
13fd677
ac6c40f
57616af
ac6c40f
13fd677
 
 
 
3578aa2
13fd677
 
 
969e2c4
13fd677
 
 
dd1054a
13fd677

import streamlit as st

from .streamlit_utils import (
    make_multiselect,
    make_selectbox,
    make_text_area,
    make_text_input,
    make_radio,
)

# Expected number of entries per sub-section of the "considerations" part of
# the card; these are the denominators shown by the completion summary, which
# counts the keys present in each sub-section dict.
N_FIELDS_PII = 1  # risks-description
# dataset-restrictions, dataset-restrictions-other,
# data-copyright, data-copyright-other.
# The two "-other" keys are always populated (free text or "N/A"), so they
# must be counted here or the summary can report more filled fields than exist.
N_FIELDS_LICENSES = 4
# data-technical-limitations, data-unsuited-applications, data-discouraged-use
N_FIELDS_LIMITATIONS = 3

N_FIELDS = N_FIELDS_PII + N_FIELDS_LICENSES + N_FIELDS_LIMITATIONS


# Licensing statuses offered both for the dataset itself and for the
# underlying language data.
_LICENSE_STATUS_OPTIONS = (
    "public domain",
    "multiple licenses",
    "copyright - all rights reserved",
    "open license - commercial use allowed",
    "research use only",
    "non-commercial use only",
    "do not distribute",
    "other",
)


def _considerations_subsection(name):
    """Return the `name` sub-dict of the card's "considerations" entry.

    Both the "considerations" dict and the requested sub-dict are created
    (empty) when absent. The lookup always goes through
    `st.session_state.card_dict`, so the caller gets the current object.
    """
    considerations = st.session_state.card_dict.setdefault("considerations", {})
    return considerations.setdefault(name, {})


def _elaborate_if_other(key_pref, field, label):
    """Ask for free text when "other" is selected for `field`, else store "N/A".

    `key_pref` is the key path of the sub-section (its last element names the
    sub-section); the elaboration is stored under the key `field + "-other"`.
    """
    subsection = _considerations_subsection(key_pref[-1])
    other_key = field + "-other"
    if "other" in subsection.get(field, []):
        make_text_area(label=label, key_list=key_pref + [other_key])
    else:
        # Keep the key present so the entry exists even when not applicable.
        subsection[other_key] = "N/A"


def considerations_page():
    """Render the "considerations" form page.

    The page consists of three collapsed expanders -- PII risks, licenses and
    known technical limitations -- whose widgets are keyed under
    `card_dict["considerations"][<sub-section>]`. The sub-section dicts are
    created if they do not exist yet.
    """
    with st.expander("PII Risks and Liability", expanded=False):
        key_pref = ["considerations", "pii"]
        _considerations_subsection("pii")
        make_text_area(
            label=(
                "Considering your answers to the PII part of the Data Curation Section, "
                "describe any potential privacy risks to the data subjects and creators "
                "when using the dataset."
            ),
            key_list=key_pref + ["risks-description"],
            help="In terms for example of having models memorize private information of data subjects or other breaches of privacy.",
        )

    with st.expander("Licenses", expanded=False):
        key_pref = ["considerations", "licenses"]
        _considerations_subsection("licenses")

        make_multiselect(
            label="Based on your answers in the Intended Use part of the Data Overview Section, which of the following best describe the copyright and licensing status of the dataset?",
            # Fresh list per call, as each widget originally got its own list.
            options=list(_LICENSE_STATUS_OPTIONS),
            key_list=key_pref + ["dataset-restrictions"],
            help="Does the license restrict how the dataset can be used?",
        )
        _elaborate_if_other(
            key_pref,
            "dataset-restrictions",
            "You selected `other` for the dataset licensing status, please elaborate here:",
        )

        make_multiselect(
            label="Based on your answers in the Language part of the Data Curation Section, which of the following best describe the copyright and licensing status of the underlying language data?",
            options=list(_LICENSE_STATUS_OPTIONS),
            key_list=key_pref + ["data-copyright"],
            help="For example if the dataset uses data from Wikipedia, we are asking about the status of Wikipedia text in general.",
        )
        _elaborate_if_other(
            key_pref,
            "data-copyright",
            "You selected `other` for the source data licensing status, please elaborate here:",
        )

    with st.expander("Known Technical Limitations", expanded=False):
        key_pref = ["considerations", "limitations"]
        _considerations_subsection("limitations")
        make_text_area(
            label=(
                "Describe any known technical limitations, such as spurious correlations, "
                "train/test overlap, annotation biases, or mis-annotations, "
                "and cite the works that first identified these limitations when possible."
            ),
            key_list=key_pref + ["data-technical-limitations"],
            help="Outline any properties of the dataset that might lead a trained model with good performance on the metric to not behave as expected.",
        )
        make_text_area(
            label=(
                "When using a model trained on this dataset in a setting where users or the public may interact with its predictions, what are some pitfalls to look out for? "
                "In particular, describe some applications of the general task featured in this dataset that its curation or properties make it less suitable for."
            ),
            key_list=key_pref + ["data-unsuited-applications"],
            help="For example, outline language varieties or domains that the model might underperform for.",
        )
        make_text_area(
            label=(
                "What are some discouraged use cases of a model trained to maximize the proposed metrics on this dataset? "
                "In particular, think about settings where decisions made by a model that performs reasonably well on the metric may still have strong negative consequences for users or members of the public."
            ),
            key_list=key_pref + ["data-discouraged-use"],
            help="For example, think about application settings where certain types of mistakes (such as missing a negation) might have a particularly strong negative impact but are not particularly singled out by the aggregated evaluation.",
        )


def considerations_summary():
    """Display a collapsed expander summarising completion of the considerations page.

    The filled count for a sub-section is the number of keys in its dict
    under `card_dict["considerations"]`; the overall count is the sum over
    all sub-section dicts. A missing "considerations" entry counts as empty.
    """
    section = st.session_state.card_dict.get("considerations", {})
    filled_total = sum(len(entries) for entries in section.values())

    # (displayed title, sub-section key, expected number of fields)
    subsections = (
        ("PII Risks and Liability", "pii", N_FIELDS_PII),
        ("Licenses", "licenses", N_FIELDS_LICENSES),
        ("Known Technical Limitations", "limitations", N_FIELDS_LIMITATIONS),
    )

    title = f"Considerations for Using Data Completion - {filled_total} of {N_FIELDS}"
    with st.expander(title, expanded=False):
        parts = [f"- **Overall completion:**\n  - {filled_total} of {N_FIELDS} fields\n"]
        for heading, key, expected in subsections:
            filled = len(section.get(key, {}))
            parts.append(
                f"- **Sub-section - {heading}:**\n  - {filled} of {expected} fields\n"
            )
        st.markdown("".join(parts))