import streamlit as st

from .streamlit_utils import (
    make_multiselect,
    make_selectbox,
    make_text_area,
    make_text_input,
    make_radio,
)

# Number of form fields rendered by results_page().
N_FIELDS = 7


def results_page():
    """Render the "Previous Results" section of the dataset card form."""
    # Ensure the nested "results" dicts exist in the session-state card.
    st.session_state.card_dict["results"] = st.session_state.card_dict.get(
        "results", {}
    )
    with st.expander("Previous Results", expanded=False):
        key_pref = ["results", "results"]
        st.session_state.card_dict["results"]["results"] = st.session_state.card_dict[
            "results"
        ].get("results", {})
        make_multiselect(
            label="What metrics are typically used for this task?",
            key_list=key_pref + ["metrics"],
            options=[
                "BERT-Score",
                "BLEU",
                "BLEURT",
                "ChrF",
                "Entailment",
                "FeQA",
                "METEOR",
                "MoverScore",
                "QAGS",
                "ROUGE",
                "WER",
            ],
            help="Select all metrics that are typically used when evaluating models for this task.",
        )
        make_text_area(
            label="Describe the metrics and evaluation methodology that the dataset creators used when introducing this task.",
            key_list=key_pref + ["original-evaluation"],
            help="If the generation task was not evaluated when this dataset was introduced, write N/A.",
        )
        make_radio(
            label="Are previous results available?",
            options=["no", "yes"],
            key_list=key_pref + ["has-previous-results"],
            help="Have papers evaluated models on this task? If no, write N/A for the following three questions.",
        )
        make_text_area(
            label="What evaluation approaches have others used?",
            key_list=key_pref + ["modern-evaluation"],
            help="If the current evaluation strategy diverges from the original one, describe how models are now being evaluated.",
        )
        make_text_area(
            label="What are the previous results?",
            key_list=key_pref + ["previous-results"],
            help="List the source and performance metrics for models evaluated on this dataset.",
        )
        make_text_area(
            label="Definitions",
            key_list=key_pref + ["definitions"],
            help="If the evaluation strategies in the previous questions go beyond the list of metrics above, add descriptions and/or definitions for each metric.",
        )
        make_text_area(
            label="What aspect of model ability can be measured with this dataset?",
            key_list=key_pref + ["model-abilities"],
            help="What kinds of abilities should a model that performs well on this dataset's task exhibit (e.g., reasoning capability, morphological inflection)?",
        )


def results_summary():
    # No summary text is generated for this section.
    return None
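
# A minimal sketch of how this page could be mounted from a host Streamlit
# app; it assumes the host initializes `st.session_state.card_dict` before
# dispatching to the page functions (the dispatch shown here is hypothetical,
# and the relative import above means this module is not runnable standalone):
#
#     import streamlit as st
#
#     if "card_dict" not in st.session_state:
#         st.session_state.card_dict = {}
#     results_page()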