Spaces:
Runtime error
Runtime error
import streamlit as st | |
from .streamlit_utils import make_text_input | |
from .streamlit_utils import ( | |
make_multiselect, | |
make_selectbox, | |
make_text_area, | |
make_text_input, | |
make_radio, | |
) | |
# Number of form widgets rendered in each expander of ``curation_page``.
# These totals are what ``curation_summary`` reports as "x of N fields", so
# each value has to be updated whenever a widget is added to or removed from
# the matching section.
N_FIELDS_ORIGINAL = 4  # "Original Curation"
N_FIELDS_LANGUAGE = 12  # "Language Data"
N_FIELDS_ANNOTATIONS = 10  # "Structured Annotations"
N_FIELDS_CONSENT = 0  # "Consent" (no widgets yet)
N_FIELDS_PII = 0  # "Private Identifying Information (PII)" (no widgets yet)
N_FIELDS_MAINTENANCE = 0  # "Maintenance" (no widgets yet)
N_FIELDS_GEM = 0  # "GEM Additional Curation" (no widgets yet)

# Total number of widgets across every curation sub-section.
N_FIELDS = (
    N_FIELDS_ORIGINAL
    + N_FIELDS_LANGUAGE
    + N_FIELDS_ANNOTATIONS
    + N_FIELDS_CONSENT
    + N_FIELDS_PII
    + N_FIELDS_MAINTENANCE
    + N_FIELDS_GEM
)
""" | |
What were the selection criteria? [Describe the process for selecting instances to include in the dataset, including any tools used.]
""" | |
def curation_page():
    """Render the dataset-curation form.

    Makes sure ``st.session_state.card_dict["curation"]`` and one nested dict
    per sub-section exist (existing content is kept), then draws one
    collapsed expander per sub-section and fills it with the ``make_*``
    widgets from ``streamlit_utils``.

    Every widget is given a ``key_list`` of the form
    ``["curation", <section>, <field>]``; presumably the helper uses it to
    read and store the answer at that path in ``card_dict`` -- confirm in
    ``streamlit_utils``.

    The "Consent", "PII", "Maintenance" and "GEM Additional Curation"
    expanders only initialise their dict; they contain no widgets yet.
    """
    card_dict = st.session_state.card_dict
    # Create the top-level section on first visit, keep it otherwise.
    card_dict["curation"] = card_dict.get("curation", {})
    curation = card_dict["curation"]

    with st.expander("Original Curation", expanded=False):
        key_pref = ["curation", "original"]
        curation["original"] = curation.get("original", {})
        make_text_area(
            label="Original curation rationale",
            key_list=key_pref + ["rationale"],
            help="Describe the curation rationale behind the original dataset(s).",
        )
        make_text_area(
            label="What was the communicative goal?",
            key_list=key_pref + ["communicative"],
            help="Describe the communicative goal that the original dataset(s) was trying to represent.",
        )
        make_radio(
            label="Is the dataset aggregated from different data sources?",
            options=["no", "yes"],
            key_list=key_pref + ["is-aggregated"],
            help="e.g. Wikipedia, movie dialogues, etc.",
        )
        make_text_area(
            label="If yes, list the sources",
            key_list=key_pref + ["aggregated-sources"],
            help="Otherwise, type N/A",
        )

    with st.expander("Language Data", expanded=False):
        key_pref = ["curation", "language"]
        curation["language"] = curation.get("language", {})
        make_multiselect(
            label="How was the language data obtained?",
            options=[
                "Found",
                "Created for the dataset",
                "Crowdsourced",
                "Machine-generated",
                "Other",
            ],
            key_list=key_pref + ["obtained"],
        )
        make_multiselect(
            label="If found, where from?",
            options=[
                "Multiple websites",
                "Single website",
                "Offline media collection",
                "Other",
                "N/A",
            ],
            key_list=key_pref + ["found"],
            help="select N/A if none of the language data was found",
        )
        make_multiselect(
            label="If crowdsourced, where from?",
            options=[
                "Amazon Mechanical Turk",
                "Other crowdworker platform",
                "Participatory experiment",
                "Other",
                "N/A",
            ],
            key_list=key_pref + ["crowdsourced"],
            help="select N/A if none of the language data was crowdsourced",
        )
        make_text_area(
            label="If created for the dataset, describe the creation process.",
            key_list=key_pref + ["created"],
        )
        make_text_area(
            label="What further information do we have on the language producers?",
            key_list=key_pref + ["producers-description"],
            help="Provide a description of the context in which the language was produced and who produced it.",
        )
        make_text_input(
            label="If text was machine-generated for the dataset, provide a link to the generation method if available (N/A otherwise).",
            key_list=key_pref + ["machine-generated"],
            help="if the generation code is unavailable, enter N/A",
        )
        make_selectbox(
            label="Was the text validated by a different worker or a data curator?",
            options=[
                "not validated",
                "validated by crowdworker",
                "validated by data curator",
                "other",
            ],
            key_list=key_pref + ["validated"],
            help="this question is about human or human-in-the-loop validation only",
        )
        make_multiselect(
            label="In what kind of organization did the curation happen?",
            options=["industry", "academic", "independent", "other"],
            key_list=key_pref + ["organization-type"],
        )
        make_text_input(
            label="Name the organization(s).",
            key_list=key_pref + ["organization-names"],
            help="comma-separated",
        )
        make_text_area(
            label="How was the text data pre-processed? (Enter N/A if the text was not pre-processed)",
            key_list=key_pref + ["pre-processed"],
            help="List the steps in preprocessing the data for the dataset. Enter N/A if no steps were taken.",
        )
        make_selectbox(
            label="Were text instances selected or filtered?",
            options=["not filtered", "manually", "algorithmically", "hybrid"],
            key_list=key_pref + ["is-filtered"],
        )
        make_text_area(
            label="What were the selection criteria?",
            key_list=key_pref + ["filtered-criteria"],
            help="Describe the process for selecting instances to include in the dataset, including any tools used. If no selection was done, enter N/A.",
        )

    with st.expander("Structured Annotations", expanded=False):
        key_pref = ["curation", "annotations"]
        curation["annotations"] = curation.get("annotations", {})
        make_radio(
            label="Does the dataset have additional annotations for each instance?",
            options=[
                "none",
                "found",
                "automatically created",
                "expert created",
                "crowd-sourced",
            ],
            key_list=key_pref + ["origin"],
            help="Was any additional data collected?",
        )
        # TODO: If yes....
        # If expert or crowdsourced, this branch
        # NOTE(review): read literally, the buckets below skip 2, 10, 11, 50,
        # 51 and 100. The option strings are left unchanged because they may
        # already be stored in saved cards -- confirm before renaming them.
        make_radio(
            label="What is the number of raters ",
            options=["unknown", "1", "2<n<10", "11<n<50", "51<n<100", "n>100"],
            key_list=key_pref + ["rater-number"],
            help="How many raters were used to create the additional annotations?",
        )
        make_text_area(
            label="Describe the qualifications required of an annotator.",
            key_list=key_pref + ["rater-qualifications"],
            help="e.g., languages or dialects they speak, education requirements, number of HITs (if MTurk).",
        )
        make_radio(
            label="How many annotators saw each training example?",
            options=["0", "1", "2", "3", "4", "5", ">5"],
            key_list=key_pref + ["rater-training-num"],
            help="",
        )
        make_radio(
            label="How many annotators saw each test example?",
            options=["0", "1", "2", "3", "4", "5", ">5"],
            key_list=key_pref + ["rater-test-num"],
            help="",
        )
        make_radio(
            label="Was an annotation service used?",
            options=["yes", "no", "unknown"],
            key_list=key_pref + ["rater-annotation-service-bool"],
            help="",
        )
        # TODO if yes
        make_multiselect(
            label="Which annotation services were used?",
            options=[
                "Amazon Mechanical Turk",
                "Prolific Academic",
                "Upwork",
                "Appen",
                "Crowdflower",
                "other",
            ],
            key_list=key_pref + ["rater-annotation-service"],
        )
        make_text_area(
            label="Purpose and values for each annotation",
            key_list=key_pref + ["values"],
            help="Describe the purpose and possible values for each kind of annotation.",
        )
        make_multiselect(
            label="Quality control measures?",
            options=[
                "none",
                "unknown",
                "validated by another rater",
                "validated by data curators",
                "validated through automated script",
                "other",
            ],
            key_list=key_pref + ["quality-control"],
            help="How was annotation quality controlled for / what control measures were put in place to ensure annotation quality?",
        )
        # TODO: If not none / unknown
        make_text_area(
            label="Describe the quality control measures that were taken.",
            key_list=key_pref + ["quality-control-details"],
            help="Describe how quality was ensured in the data curation process.",
        )

    # The remaining sections have no widgets yet: each expander only makes
    # sure its dict exists and records the key prefix future fields will use.
    with st.expander("Consent", expanded=False):
        key_pref = ["curation", "consent"]
        curation["consent"] = curation.get("consent", {})

    with st.expander("Private Identifying Information (PII)", expanded=False):
        key_pref = ["curation", "pii"]
        curation["pii"] = curation.get("pii", {})

    with st.expander("Maintenance", expanded=False):
        key_pref = ["curation", "maintenance"]
        curation["maintenance"] = curation.get("maintenance", {})

    with st.expander("GEM Additional Curation", expanded=False):
        key_pref = ["curation", "gem"]
        curation["gem"] = curation.get("gem", {})
def curation_summary():
    """Show how many curation fields currently have an entry.

    Counts the keys stored in each sub-section dict under
    ``st.session_state.card_dict["curation"]`` (a missing section counts as
    zero) and renders, inside a collapsed expander, the overall total
    followed by one "x of N fields" bullet per sub-section, using the
    module-level ``N_FIELDS*`` constants as the expected counts.
    """
    curation = st.session_state.card_dict.get("curation", {})
    total_filled = sum(len(dct) for dct in curation.values())

    # (displayed title, key under card_dict["curation"], expected field count)
    sections = (
        ("Original Curation", "original", N_FIELDS_ORIGINAL),
        ("Language Data", "language", N_FIELDS_LANGUAGE),
        ("Structured Annotations", "annotations", N_FIELDS_ANNOTATIONS),
        ("Consent", "consent", N_FIELDS_CONSENT),
        ("PII", "pii", N_FIELDS_PII),
        ("Maintenance", "maintenance", N_FIELDS_MAINTENANCE),
        ("GEM Curation", "gem", N_FIELDS_GEM),
    )

    with st.expander(
        f"Dataset Curation Completion - {total_filled} of {N_FIELDS}", expanded=False
    ):
        parts = [
            f"- **Overall completion:**\n - {total_filled} of {N_FIELDS} fields\n"
        ]
        parts.extend(
            f"- **Sub-section - {title}:**\n - {len(curation.get(key, {}))} of {expected} fields\n"
            for title, key, expected in sections
        )
        st.markdown("".join(parts))