Yacine Jernite
curation part 1
d1a58c9
raw
history blame
8.04 kB
import streamlit as st
from .streamlit_utils import (
make_text_input
)
from .streamlit_utils import (
make_multiselect,
make_selectbox,
make_text_area,
make_text_input,
make_radio,
)
# Number of form fields expected in each sub-section of the curation page.
# Sub-sections set to 0 have no input widgets in curation_page() yet.
N_FIELDS_ORIGINAL = 4
N_FIELDS_LANGUAGE = 12
N_FIELDS_ANNOTATIONS = 0
N_FIELDS_CONSENT = 0
N_FIELDS_PII = 0
N_FIELDS_MAINTENANCE = 0
N_FIELDS_GEM = 0

# Total number of fields across every curation sub-section.
N_FIELDS = sum(
    (
        N_FIELDS_ORIGINAL,
        N_FIELDS_LANGUAGE,
        N_FIELDS_ANNOTATIONS,
        N_FIELDS_CONSENT,
        N_FIELDS_PII,
        N_FIELDS_MAINTENANCE,
        N_FIELDS_GEM,
    )
)
"""
What were the selection criteria? [Describe the process for selecting instances to include in the dataset, including any tools used.]
"""
def curation_page():
    """Render the dataset-curation page of the data card form.

    Makes sure ``st.session_state.card_dict["curation"]`` exists, together
    with one nested dictionary per sub-section, and draws one collapsed
    expander per sub-section. Every input widget receives a ``key_list`` of
    the form ``["curation", <sub-section>, <field>]``.

    NOTE(review): the ``make_*`` helpers come from ``streamlit_utils``;
    presumably they store each answer in ``card_dict`` under the ``key_list``
    path -- confirm against that module.
    """
    # setdefault keeps any previously stored answers and only creates the
    # dictionary the first time the page is rendered.
    curation = st.session_state.card_dict.setdefault("curation", {})

    with st.expander("Original Curation", expanded=False):
        key_pref = ["curation", "original"]
        curation.setdefault("original", {})
        make_text_area(
            label="Original curation rationale",
            key_list=key_pref + ["rationale"],
            help="Describe the curation rationale behind the original dataset(s).",
        )
        make_text_area(
            label="What was the communicative goal?",
            key_list=key_pref + ["communicative"],
            help="Describe the communicative goal that the original dataset(s) was trying to represent.",
        )
        make_radio(
            label="Is the dataset aggregated from different data sources?",
            options=["no", "yes"],
            key_list=key_pref + ["is-aggregated"],
            help="e.g. Wikipedia, movie dialogues, etc.",
        )
        make_text_area(
            label="If yes, list the sources",
            key_list=key_pref + ["aggregated-sources"],
            help="Otherwise, type N/A",
        )

    with st.expander("Language Data", expanded=False):
        key_pref = ["curation", "language"]
        curation.setdefault("language", {})
        make_multiselect(
            label="How was the language data obtained?",
            options=["found", "created for the dataset", "crowdsourced", "machine-generated", "other"],
            key_list=key_pref + ["obtained"],
        )
        make_multiselect(
            label="If found, where from?",
            options=["website", "offline media collection", "other", "N/A"],
            key_list=key_pref + ["found"],
            help="select N/A if none of the language data was found",
        )
        make_multiselect(
            label="If crowdsourced, where from?",
            options=["Amazon Mechanical Turk", "other crowdworker platform", "participatory experiment", "other", "N/A"],
            key_list=key_pref + ["crowdsourced"],
            help="select N/A if none of the language data was crowdsourced",
        )
        make_text_area(
            label="If created for the dataset, describe the creation process.",
            key_list=key_pref + ["created"],
        )
        make_text_area(
            label="What further information do we have on the language producers?",
            key_list=key_pref + ["producers-description"],
            help="Provide a description of the context in which the language was produced and who produced it.",
        )
        make_text_input(
            label="If text was machine-generated for the dataset, provide a link to the generation method if available (N/A otherwise).",
            key_list=key_pref + ["machine-generated"],
            help="if the generation code is unavailable, enter N/A",
        )
        make_selectbox(
            label="Was the text validated by a different worker or a data curator?",
            options=["not validated", "validated by crowdworker", "validated by data curator", "other"],
            key_list=key_pref + ["validated"],
            help="this question is about human or human-in-the-loop validation only",
        )
        make_multiselect(
            label="In what kind of organization did the curation happen?",
            options=["industry", "academic", "independent", "other"],
            key_list=key_pref + ["organization-type"],
        )
        make_text_input(
            label="Name the organization(s).",
            key_list=key_pref + ["organization-names"],
            help="comma-separated",
        )
        make_text_area(
            label="How was the text data pre-processed? (Enter N/A if the text was not pre-processed)",
            key_list=key_pref + ["pre-processed"],
            help="List the steps in preprocessing the data for the dataset. Enter N/A if no steps were taken.",
        )
        make_selectbox(
            label="Were text instances selected or filtered?",
            options=["not filtered", "manually", "algorithmically", "hybrid"],
            key_list=key_pref + ["is-filtered"],
        )
        make_text_area(
            label="What were the selection criteria?",
            key_list=key_pref + ["filtered-criteria"],
            help="Describe the process for selecting instances to include in the dataset, including any tools used. If no selection was done, enter N/A.",
        )

    # Placeholder sub-sections: they have no input widgets yet, so each one
    # only shows an empty expander and gets its (empty) dictionary created.
    placeholder_sections = (
        ("Structured Annotations", "annotations"),
        ("Consent", "consent"),
        ("Private Identifying Information (PII)", "pii"),
        ("Maintenance", "maintenance"),
        ("GEM Additional Curation", "gem"),
    )
    for title, section_key in placeholder_sections:
        with st.expander(title, expanded=False):
            curation.setdefault(section_key, {})
def curation_summary():
    """Show how many curation fields have been filled in so far.

    Reads ``st.session_state.card_dict["curation"]`` (treated as empty when
    missing), counts the entries stored in each sub-section dictionary, and
    renders a collapsed expander containing a markdown list with the overall
    count followed by one line per sub-section.
    """
    curation = st.session_state.card_dict.get("curation", {})
    # Every key present in a sub-section dictionary counts as one filled field.
    total_filled = sum(len(section) for section in curation.values())

    # (display title, key inside card_dict["curation"], expected field count)
    sections = (
        ("Original Curation", "original", N_FIELDS_ORIGINAL),
        ("Language Data", "language", N_FIELDS_LANGUAGE),
        ("Structured Annotations", "annotations", N_FIELDS_ANNOTATIONS),
        ("Consent", "consent", N_FIELDS_CONSENT),
        ("PII", "pii", N_FIELDS_PII),
        ("Maintenance", "maintenance", N_FIELDS_MAINTENANCE),
        ("GEM Curation", "gem", N_FIELDS_GEM),
    )

    with st.expander(f"Dataset Curation Completion - {total_filled} of {N_FIELDS}", expanded=False):
        parts = [f"- **Overall completion:**\n - {total_filled} of {N_FIELDS} fields\n"]
        for title, section_key, expected in sections:
            # A sub-section that was never initialised counts as 0 filled.
            filled = len(curation.get(section_key, {}))
            parts.append(f"- **Sub-section - {title}:**\n - {filled} of {expected} fields\n")
        st.markdown("".join(parts))