import streamlit as st
from .streamlit_utils import (
make_multiselect,
make_selectbox,
make_text_area,
make_text_input,
make_radio,
)
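# Each make_* helper (imported from streamlit_utils above) renders one form
# widget; as used in this module, the key_list argument gives the nested path
# in st.session_state.card_dict under which the answer is stored.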
N_FIELDS = 7  # one multiselect, one radio, and five text areas below
def results_page():
    """Render the "Previous Results" section of the data card form."""
    st.session_state.card_dict["results"] = st.session_state.card_dict.get(
        "results", {}
    )
    with st.expander("Previous Results", expanded=False):
        key_pref = ["results", "results"]
        st.session_state.card_dict["results"]["results"] = st.session_state.card_dict[
            "results"
        ].get("results", {})
        make_multiselect(
            label="What metrics are typically used for this task?",
            key_list=key_pref + ["metrics"],
            options=[
                "BERT-Score",
                "BLEU",
                "BLEURT",
                "ChrF",
                "Entailment",
                "FeQA",
                "METEOR",
                "MoverScore",
                "QAGS",
                "ROUGE",
                "WER",
            ],
            help="Select all metrics that are typically used when evaluating models for this task.",
        )
        make_text_area(
            label="Describe the metrics and evaluation methodology that the dataset creators used when introducing this task.",
            key_list=key_pref + ["original-evaluation"],
            help="If the generation task was not evaluated when this dataset was introduced, write N/A.",
        )
        make_radio(
            label="Are previous results available?",
            options=["no", "yes"],
            key_list=key_pref + ["has-previous-results"],
            help="Have papers evaluated models on this task? If no, write N/A for the following three questions.",
        )
        make_text_area(
            label="What evaluation approaches have others used?",
            key_list=key_pref + ["modern-evaluation"],
            help="If the modern evaluation strategy diverges from the original one, describe how models are being evaluated.",
        )
        make_text_area(
            label="What are the previous results?",
            key_list=key_pref + ["previous-results"],
            help="List the source and performance metrics for models evaluated on this dataset.",
        )
        make_text_area(
            label="Definitions",
            key_list=key_pref + ["definitions"],
            help="If the evaluation strategies in the previous questions go beyond the list of metrics above, add descriptions and/or definitions for each metric.",
        )
        make_text_area(
            label="What aspect of model ability can be measured with this dataset?",
            key_list=key_pref + ["model-abilities"],
            help="What kind of abilities should a model that performs well on the task of this dataset exhibit (e.g., reasoning capability, morphological inflection)?",
        )
def results_summary():
    return None
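# Minimal usage sketch (an assumption: the module name "results" and the
# surrounding app structure are not shown in this file). The page expects
# card_dict to exist in session state before it runs:
#
#     import streamlit as st
#     from .results import results_page  # hypothetical module name
#
#     if "card_dict" not in st.session_state:
#         st.session_state.card_dict = {}
#     results_page()  # renders the "Previous Results" expander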