import streamlit as st

from .streamlit_utils import (
    make_multiselect,
    make_selectbox,
    make_text_area,
    make_text_input,
    make_radio,
)

N_FIELDS_ORIGINAL = 4
N_FIELDS_LANGUAGE = 12
N_FIELDS_ANNOTATIONS = 10
N_FIELDS_CONSENT = 0
N_FIELDS_PII = 0
N_FIELDS_MAINTENANCE = 0
N_FIELDS_GEM = 0

N_FIELDS = (
    N_FIELDS_ORIGINAL
    + N_FIELDS_LANGUAGE
    + N_FIELDS_ANNOTATIONS
    + N_FIELDS_CONSENT
    + N_FIELDS_PII
    + N_FIELDS_MAINTENANCE
    + N_FIELDS_GEM
)
"""
What was the selection criteria? [Describe the process for selecting instances to include in the dataset, including any tools used.]
"""

def curation_page():
    """Render the Dataset Curation section of the data card form."""
    st.session_state.card_dict["curation"] = st.session_state.card_dict.get(
        "curation", {}
    )
    with st.expander("Original Curation", expanded=False):
        key_pref = ["curation", "original"]
        st.session_state.card_dict["curation"]["original"] = st.session_state.card_dict[
            "curation"
        ].get("original", {})
        make_text_area(
            label="Original curation rationale",
            key_list=key_pref + ["rationale"],
            help="Describe the curation rationale behind the original dataset(s).",
        )
        make_text_area(
            label="What was the communicative goal?",
            key_list=key_pref + ["communicative"],
            help="Describe the communicative goal that the original dataset(s) was trying to represent.",
        )
        make_radio(
            label="Is the dataset aggregated from different data sources?",
            options=["no", "yes"],
            key_list=key_pref + ["is-aggregated"],
            help="e.g., Wikipedia, movie dialogues, etc.",
        )
        make_text_area(
            label="If yes, list the sources",
            key_list=key_pref + ["aggregated-sources"],
            help="Otherwise, type N/A.",
        )
    with st.expander("Language Data", expanded=False):
        key_pref = ["curation", "language"]
        st.session_state.card_dict["curation"]["language"] = st.session_state.card_dict[
            "curation"
        ].get("language", {})
        make_multiselect(
            label="How was the language data obtained?",
            options=[
                "Found",
                "Created for the dataset",
                "Crowdsourced",
                "Machine-generated",
                "Other",
            ],
            key_list=key_pref + ["obtained"],
        )
        make_multiselect(
            label="If found, where from?",
            options=[
                "Multiple websites",
                "Single website",
                "Offline media collection",
                "Other",
                "N/A",
            ],
            key_list=key_pref + ["found"],
            help="Select N/A if none of the language data was found.",
        )
        make_multiselect(
            label="If crowdsourced, where from?",
            options=[
                "Amazon Mechanical Turk",
                "Other crowdworker platform",
                "Participatory experiment",
                "Other",
                "N/A",
            ],
            key_list=key_pref + ["crowdsourced"],
            help="Select N/A if none of the language data was crowdsourced.",
        )
        make_text_area(
            label="If created for the dataset, describe the creation process.",
            key_list=key_pref + ["created"],
        )
        make_text_area(
            label="What further information do we have on the language producers?",
            key_list=key_pref + ["producers-description"],
            help="Provide a description of the context in which the language was produced and who produced it.",
        )
        make_text_input(
            label="If text was machine-generated for the dataset, provide a link to the generation method if available (N/A otherwise).",
            key_list=key_pref + ["machine-generated"],
            help="If the generation code is unavailable, enter N/A.",
        )
        make_selectbox(
            label="Was the text validated by a different worker or a data curator?",
            options=[
                "not validated",
                "validated by crowdworker",
                "validated by data curator",
                "other",
            ],
            key_list=key_pref + ["validated"],
            help="This question is about human or human-in-the-loop validation only.",
        )
        make_multiselect(
            label="In what kind of organization did the curation happen?",
            options=["industry", "academic", "independent", "other"],
            key_list=key_pref + ["organization-type"],
        )
        make_text_input(
            label="Name the organization(s).",
            key_list=key_pref + ["organization-names"],
            help="comma-separated",
        )
        make_text_area(
            label="How was the text data pre-processed? (Enter N/A if the text was not pre-processed.)",
            key_list=key_pref + ["pre-processed"],
            help="List the steps taken to pre-process the data. Enter N/A if no steps were taken.",
        )
        make_selectbox(
            label="Were text instances selected or filtered?",
            options=["not filtered", "manually", "algorithmically", "hybrid"],
            key_list=key_pref + ["is-filtered"],
        )
        make_text_area(
            label="What were the selection criteria?",
            key_list=key_pref + ["filtered-criteria"],
            help="Describe the process for selecting instances to include in the dataset, including any tools used. If no selection was done, enter N/A.",
        )
    with st.expander("Structured Annotations", expanded=False):
        key_pref = ["curation", "annotations"]
        st.session_state.card_dict["curation"][
            "annotations"
        ] = st.session_state.card_dict["curation"].get("annotations", {})
        make_radio(
            label="Does the dataset have additional annotations for each instance?",
            options=[
                "none",
                "found",
                "automatically created",
                "expert created",
                "crowd-sourced",
            ],
            key_list=key_pref + ["origin"],
            help="Was any additional data collected?",
        )
        # TODO: Only show the questions below if the answer above is not "none".
        # The rater questions apply when annotations are expert created or crowd-sourced.
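        # A possible gating sketch (an assumption, not part of the original code):
        # read the stored answer back from the card dict and branch on it, e.g.
        #
        #     origin = st.session_state.card_dict["curation"]["annotations"].get("origin")
        #     if origin in ("expert created", "crowd-sourced"):
        #         ...  # render the rater questions below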
        make_radio(
            label="What is the number of raters?",
            options=["unknown", "1", "2<=n<=10", "11<=n<=50", "51<=n<=100", "n>100"],
            key_list=key_pref + ["rater-number"],
            help="How many raters were used to create the additional annotations?",
        )
        make_text_area(
            label="Describe the qualifications required of an annotator.",
            key_list=key_pref + ["rater-qualifications"],
            help="e.g., languages or dialects they speak, education requirements, number of HITs (if MTurk).",
        )
        make_radio(
            label="How many annotators saw each training example?",
            options=["0", "1", "2", "3", "4", "5", ">5"],
            key_list=key_pref + ["rater-training-num"],
        )
        make_radio(
            label="How many annotators saw each test example?",
            options=["0", "1", "2", "3", "4", "5", ">5"],
            key_list=key_pref + ["rater-test-num"],
        )
        make_radio(
            label="Was an annotation service used?",
            options=["yes", "no", "unknown"],
            key_list=key_pref + ["rater-annotation-service-bool"],
        )
        # TODO: Only show the service multiselect if the answer above is "yes".
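        # The same hedged gating sketch as above would apply here (an assumption,
        # not original code), keyed on the previous answer:
        #
        #     used_service = st.session_state.card_dict["curation"]["annotations"].get(
        #         "rater-annotation-service-bool"
        #     )
        #     if used_service == "yes":
        #         ...  # render the service multiselect below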
        make_multiselect(
            label="Which annotation services were used?",
            options=[
                "Amazon Mechanical Turk",
                "Prolific Academic",
                "Upwork",
                "Appen",
                "CrowdFlower",
                "other",
            ],
            key_list=key_pref + ["rater-annotation-service"],
        )
        make_text_area(
            label="Purpose and values for each annotation",
            key_list=key_pref + ["values"],
            help="Describe the purpose and possible values for each kind of annotation.",
        )
        make_multiselect(
            label="Quality control measures?",
            options=[
                "none",
                "unknown",
                "validated by another rater",
                "validated by data curators",
                "validated through automated script",
                "other",
            ],
            key_list=key_pref + ["quality-control"],
            help="How was annotation quality controlled for / what control measures were put in place to ensure annotation quality?",
        )
        # TODO: Only show the details field if quality control is not "none" or "unknown".
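        # A hedged sketch of that gating (an assumption, following the pattern
        # above; the multiselect stores a list):
        #
        #     qc = st.session_state.card_dict["curation"]["annotations"].get("quality-control", [])
        #     if qc and set(qc) - {"none", "unknown"}:
        #         ...  # render the details text area below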
        make_text_area(
            label="Describe the quality control measures that were taken.",
            key_list=key_pref + ["quality-control-details"],
            help="Describe how quality was ensured in the data curation process.",
        )
    # The remaining sub-sections are placeholders: they initialize their card_dict
    # entries and key prefixes but define no fields yet, matching the N_FIELDS_*
    # constants of 0 above.
    with st.expander("Consent", expanded=False):
        key_pref = ["curation", "consent"]
        st.session_state.card_dict["curation"]["consent"] = st.session_state.card_dict[
            "curation"
        ].get("consent", {})
    with st.expander("Private Identifying Information (PII)", expanded=False):
        key_pref = ["curation", "pii"]
        st.session_state.card_dict["curation"]["pii"] = st.session_state.card_dict[
            "curation"
        ].get("pii", {})
    with st.expander("Maintenance", expanded=False):
        key_pref = ["curation", "maintenance"]
        st.session_state.card_dict["curation"][
            "maintenance"
        ] = st.session_state.card_dict["curation"].get("maintenance", {})
    with st.expander("GEM Additional Curation", expanded=False):
        key_pref = ["curation", "gem"]
        st.session_state.card_dict["curation"]["gem"] = st.session_state.card_dict[
            "curation"
        ].get("gem", {})

def curation_summary():
    """Summarize how many curation fields have been filled in."""
    total_filled = sum(
        len(dct) for dct in st.session_state.card_dict.get("curation", {}).values()
    )
    with st.expander(
        f"Dataset Curation Completion - {total_filled} of {N_FIELDS}", expanded=False
    ):
        completion_markdown = ""
        completion_markdown += (
            f"- **Overall completion:**\n  - {total_filled} of {N_FIELDS} fields\n"
        )
        sub_sections = [
            ("Original Curation", "original", N_FIELDS_ORIGINAL),
            ("Language Data", "language", N_FIELDS_LANGUAGE),
            ("Structured Annotations", "annotations", N_FIELDS_ANNOTATIONS),
            ("Consent", "consent", N_FIELDS_CONSENT),
            ("PII", "pii", N_FIELDS_PII),
            ("Maintenance", "maintenance", N_FIELDS_MAINTENANCE),
            ("GEM Curation", "gem", N_FIELDS_GEM),
        ]
        for name, key, n_fields in sub_sections:
            n_filled = len(st.session_state.card_dict.get("curation", {}).get(key, {}))
            completion_markdown += (
                f"- **Sub-section - {name}:**\n  - {n_filled} of {n_fields} fields\n"
            )
        st.markdown(completion_markdown)
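
# Minimal usage sketch (an assumption about the surrounding app, not part of
# this module): an entry point imports both functions and renders them inside
# a Streamlit session, e.g.
#
#     import streamlit as st
#     from .curation import curation_page, curation_summary
#
#     if "card_dict" not in st.session_state:
#         st.session_state.card_dict = {}
#     curation_page()
#     curation_summary()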