import streamlit as st

from .streamlit_utils import (
    make_multiselect,
    make_selectbox,
    make_text_area,
    make_text_input,
    make_radio,
)

# Number of questions in each sub-section, used by curation_summary() to
# report completion progress.
N_FIELDS_ORIGINAL = 4
N_FIELDS_LANGUAGE = 12
N_FIELDS_ANNOTATIONS = 10
N_FIELDS_CONSENT = 0
N_FIELDS_PII = 0
N_FIELDS_MAINTENANCE = 0
N_FIELDS_GEM = 0

N_FIELDS = (
    N_FIELDS_ORIGINAL
    + N_FIELDS_LANGUAGE
    + N_FIELDS_ANNOTATIONS
    + N_FIELDS_CONSENT
    + N_FIELDS_PII
    + N_FIELDS_MAINTENANCE
    + N_FIELDS_GEM
)


def curation_page():
    # Bootstrap the "curation" sub-dict (and, below, each sub-section dict)
    # so reruns of the script do not clobber earlier answers.
    st.session_state.card_dict["curation"] = st.session_state.card_dict.get(
        "curation", {}
    )
    with st.expander("Original Curation", expanded=False):
        key_pref = ["curation", "original"]
        st.session_state.card_dict["curation"]["original"] = st.session_state.card_dict[
            "curation"
        ].get("original", {})
        make_text_area(
            label="Original curation rationale",
            key_list=key_pref + ["rationale"],
            help="Describe the curation rationale behind the original dataset(s).",
        )
        make_text_area(
            label="What was the communicative goal?",
            key_list=key_pref + ["communicative"],
            help="Describe the communicative goal that the original dataset(s) was trying to represent.",
        )
        make_radio(
            label="Is the dataset aggregated from different data sources?",
            options=["no", "yes"],
            key_list=key_pref + ["is-aggregated"],
            help="e.g. Wikipedia, movie dialogues, etc.",
        )
        make_text_area(
            label="If yes, list the sources",
            key_list=key_pref + ["aggregated-sources"],
            help="Otherwise, type N/A",
        )
    with st.expander("Language Data", expanded=False):
        key_pref = ["curation", "language"]
        st.session_state.card_dict["curation"]["language"] = st.session_state.card_dict[
            "curation"
        ].get("language", {})
        make_multiselect(
            label="How was the language data obtained?",
            options=[
                "Found",
                "Created for the dataset",
                "Crowdsourced",
                "Machine-generated",
                "Other",
            ],
            key_list=key_pref + ["obtained"],
        )
        make_multiselect(
            label="If found, where from?",
            options=[
                "Multiple websites",
                "Single website",
                "Offline media collection",
                "Other",
                "N/A",
            ],
            key_list=key_pref + ["found"],
            help="select N/A if none of the language data was found",
        )
        make_multiselect(
            label="If crowdsourced, where from?",
            options=[
                "Amazon Mechanical Turk",
                "Other crowdworker platform",
                "Participatory experiment",
                "Other",
                "N/A",
            ],
            key_list=key_pref + ["crowdsourced"],
            help="select N/A if none of the language data was crowdsourced",
        )
        make_text_area(
            label="If created for the dataset, describe the creation process.",
            key_list=key_pref + ["created"],
        )
        make_text_area(
            label="What further information do we have on the language producers?",
            key_list=key_pref + ["producers-description"],
            help="Provide a description of the context in which the language was produced and who produced it.",
        )
        make_text_input(
            label="If text was machine-generated for the dataset, provide a link to the generation method if available (N/A otherwise).",
            key_list=key_pref + ["machine-generated"],
            help="if the generation code is unavailable, enter N/A",
        )
        make_selectbox(
            label="Was the text validated by a different worker or a data curator?",
            options=[
                "not validated",
                "validated by crowdworker",
                "validated by data curator",
                "other",
            ],
            key_list=key_pref + ["validated"],
            help="this question is about human or human-in-the-loop validation only",
        )
        make_multiselect(
            label="In what kind of organization did the curation happen?",
            options=["industry", "academic", "independent", "other"],
key_list=key_pref + ["organization-type"], ) make_text_input( label="Name the organization(s).", key_list=key_pref + ["organization-names"], help="comma-separated", ) make_text_area( label="How was the text data pre-processed? (Enter N/A if the text was not pre-processed)", key_list=key_pref + ["pre-processed"], help="List the steps in preprocessing the data for the dataset. Enter N/A if no steps were taken.", ) make_selectbox( label="Were text instances selected or filtered?", options=["not filtered", "manually", "algorithmically", "hybrid"], key_list=key_pref + ["is-filtered"], ) make_text_area( label="What were the selection criteria?", key_list=key_pref + ["filtered-criteria"], help="Describe the process for selecting instances to include in the dataset, including any tools used. If no selection was done, enter N/A.", ) with st.expander("Structured Annotations", expanded=False): key_pref = ["curation", "annotations"] st.session_state.card_dict["curation"][ "annotations" ] = st.session_state.card_dict["curation"].get("annotations", {}) make_radio( label="Does the dataset have additional annotations for each instance?", options=["none", "found", "automatically created", "expert created", "crowd-sourced"], key_list=key_pref + ["origin"], help="Was any additional data collected?", ) # TODO: If yes.... # If expert or crowdsourced, this branch make_radio( label="What is the number of raters ", options=["unknown", "1", "2100"], key_list=key_pref + ["rater-number"], help="How many raters were used to create the additional annotations?", ) make_text_area( label="Describe the qualifications required of an annotator.", key_list=key_pref + ["rater-qualifications"], help="e.g., languages or dialects they speak, education requirements, number of HITs (if MTurk).", ) make_radio( label="How many annotators saw each training example?", options=["0", "1", "2", "3", "4", "5", ">5"], key_list=key_pref + ["rater-training-num"], help="", ) make_radio( label="How many annotators saw each test example?", options=["0", "1", "2", "3", "4", "5", ">5"], key_list=key_pref + ["rater-test-num"], help="", ) make_radio( label="Was an annotation service used?", options=["yes", "no", "unknown"], key_list=key_pref + ["rater-annotation-service-bool"], help="", ) # TODO if yes make_multiselect( label="Which annotation services were used?", options=[ "Amazon Mechanical Turk", "Prolific Academic", "Upwork", "Appen", "Crowdflower", "other" ], key_list=key_pref + ["rater-annotation-service"], ) make_text_area( label="Purpose and values for each annoation", key_list=key_pref + ["values"], help="Describe the purpose and possible values for each kind of annotation.", ) make_multiselect( label="Quality control measures?", options=["none", "unknown", "validated by another rater", "validated by data curators", "validated through automated script", "other"], key_list=key_pref + ["quality-control"], help="How was annotation quality controlled for / what control measures were put in place to ensure annotation quality?", ) # TODO: If not none / unknown make_text_area( label="Describe the quality control measures that were taken.", key_list=key_pref + ["quality-control-details"], help="Describe how quality was ensured in the data curation process.", ) with st.expander("Consent", expanded=False): key_pref = ["curation", "consent"] st.session_state.card_dict["curation"]["consent"] = st.session_state.card_dict[ "curation" ].get("consent", {}) with st.expander("Private Identifying Information (PII)", expanded=False): key_pref = ["curation", 
"pii"] st.session_state.card_dict["curation"]["pii"] = st.session_state.card_dict[ "curation" ].get("pii", {}) with st.expander("Maintenance", expanded=False): key_pref = ["curation", "maintenance"] st.session_state.card_dict["curation"][ "maintenance" ] = st.session_state.card_dict["curation"].get("maintenance", {}) with st.expander("GEM Additional Curation", expanded=False): key_pref = ["curation", "gem"] st.session_state.card_dict["curation"]["gem"] = st.session_state.card_dict[ "curation" ].get("gem", {}) def curation_summary(): total_filled = sum( [len(dct) for dct in st.session_state.card_dict.get("curation", {}).values()] ) with st.expander( f"Dataset Curation Completion - {total_filled} of {N_FIELDS}", expanded=False ): completion_markdown = "" completion_markdown += ( f"- **Overall competion:**\n - {total_filled} of {N_FIELDS} fields\n" ) completion_markdown += f"- **Sub-section - Original Curation:**\n - {len(st.session_state.card_dict.get('curation', {}).get('original', {}))} of {N_FIELDS_ORIGINAL} fields\n" completion_markdown += f"- **Sub-section - Language Data:**\n - {len(st.session_state.card_dict.get('curation', {}).get('language', {}))} of {N_FIELDS_LANGUAGE} fields\n" completion_markdown += f"- **Sub-section - Structured Annotations:**\n - {len(st.session_state.card_dict.get('curation', {}).get('annotations', {}))} of {N_FIELDS_ANNOTATIONS} fields\n" completion_markdown += f"- **Sub-section - Consent:**\n - {len(st.session_state.card_dict.get('curation', {}).get('consent', {}))} of {N_FIELDS_CONSENT} fields\n" completion_markdown += f"- **Sub-section - PII:**\n - {len(st.session_state.card_dict.get('curation', {}).get('pii', {}))} of {N_FIELDS_PII} fields\n" completion_markdown += f"- **Sub-section - Maintenance:**\n - {len(st.session_state.card_dict.get('curation', {}).get('maintenance', {}))} of {N_FIELDS_MAINTENANCE} fields\n" completion_markdown += f"- **Sub-section - GEM Curation:**\n - {len(st.session_state.card_dict.get('curation', {}).get('gem', {}))} of {N_FIELDS_GEM} fields\n" st.markdown(completion_markdown)