File size: 3,500 Bytes
ac6c40f
 
 
57616af
 
 
 
 
ac6c40f
 
88dffd6
ac6c40f
57616af
ac6c40f
88dffd6
 
 
 
57616af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
396d1e7
57616af
 
 
396d1e7
57616af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac6c40f
 
396d1e7
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import streamlit as st

from .streamlit_utils import (
    make_multiselect,
    make_selectbox,
    make_text_area,
    make_text_input,
    make_radio,
)

# Total number of fields in the results section (results_page() renders seven
# widgets); used as the denominator in the counts shown by results_summary().
N_FIELDS = 7


def results_page():
    """Render the "Previous Results" section of the form.

    Makes sure ``st.session_state.card_dict["results"]["results"]`` exists,
    then lays out one multiselect, two text areas' worth of methodology
    questions, a yes/no radio and four further text areas inside a collapsed
    expander. Every widget is created through a ``streamlit_utils`` helper and
    receives a ``key_list`` path rooted at ``["results", "results"]``.
    """
    # NOTE(review): the make_* helpers presumably persist widget values at the
    # ``key_list`` path inside ``card_dict`` -- confirm in streamlit_utils.
    section = st.session_state.card_dict.setdefault("results", {})

    with st.expander("Previous Results", expanded=False):
        base_path = ["results", "results"]
        section.setdefault("results", {})

        def path_to(field):
            # Fresh list on every call so helpers never share a key path.
            return [*base_path, field]

        metric_choices = [
            "BERT-Score",
            "BLEU",
            "BLEURT",
            "ChrF",
            "Entailment",
            "FeQA",
            "METEOR",
            "MoverScore",
            "QAGS",
            "ROUGE",
            "WER",
            "Other: Other Metrics",
        ]
        make_multiselect(
            label="What metrics are typically used for this task?",
            key_list=path_to("metrics"),
            options=metric_choices,
            help="Select all metrics that are typically used when evaluating models for this task.",
        )
        make_text_area(
            label="Describe the metrics and evaluation methodology that the dataset creators used when introducing this task.",
            key_list=path_to("original-evaluation"),
            help="When the generation task was not evaluated when this dataset was introduced, write N/A.",
        )
        make_radio(
            label="Are previous results available?",
            options=["no", "yes"],
            key_list=path_to("has-previous-results"),
            help="Have papers evaluated models on this task? If no, write N/A for the following three questions.",
        )

        # The remaining questions are all free-text areas, rendered in order.
        follow_up_questions = (
            (
                "What evaluation approaches have others used?",
                "modern-evaluation",
                "If the modern evaluation strategy diverts from the original, describe how models are being evaluated.",
            ),
            (
                "What are previous results",
                "previous-results",
                "List the source and performance metrics for models on this dataset.",
            ),
            (
                "Definitions",
                "definitions",
                "If the evaluation strategies in the previous questions go beyond the list of metrics above, add descriptions and/or definitions for each metric.",
            ),
            (
                "What aspect of model ability can be measured with this dataset?",
                "model-abilities",
                "What kind of abilities should a model exhibit that performs well on the task of this dataset (e.g., reasoning capability, morphological inflection)?.",
            ),
        )
        for question, field, hint in follow_up_questions:
            make_text_area(
                label=question,
                key_list=path_to(field),
                help=hint,
            )



def results_summary():
    """Render a collapsed expander summarising completion of the results section.

    Counts the entries stored in each sub-section dict under
    ``st.session_state.card_dict["results"]`` (an absent section counts as
    empty) and reports both the overall total and the "Previous Results"
    sub-section total against ``N_FIELDS`` as a short markdown list.
    """
    # Look the section up once; a missing section is treated as empty.
    results = st.session_state.card_dict.get("results", {})
    total_filled = sum(len(dct) for dct in results.values())
    previous_filled = len(results.get("results", {}))

    with st.expander(
        f"Previous Results Completion - {total_filled} of {N_FIELDS}", expanded=False
    ):
        completion_markdown = (
            f"- **Overall completion:**\n  - {total_filled} of {N_FIELDS} fields\n"
            f"- **Sub-section - Previous Results:**\n  - {previous_filled} of {N_FIELDS} fields\n"
        )
        st.markdown(completion_markdown)