File size: 1,446 Bytes
8c49cb6
460d762
 
 
 
12cea14
460d762
 
8c49cb6
 
 
460d762
b323764
460d762
 
 
 
 
12cea14
460d762
 
8c49cb6
 
 
460d762
b323764
460d762
 
 
 
 
12cea14
460d762
 
8c49cb6
 
 
460d762
b323764
460d762
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from src.display_models.utils import AutoEvalColumn, model_hyperlink

# Hard-coded reference row for GPT-4, keyed by AutoEvalColumn column names.
# Keys are inserted in the same order the columns are listed below.
gpt4_values = {}

# Identification: the model cell is a hyperlink to arXiv:2303.08774 and the
# revision cell carries the label "tech report"; no precision is recorded.
gpt4_values[AutoEvalColumn.model.name] = model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4")
gpt4_values[AutoEvalColumn.revision.name] = "tech report"
gpt4_values[AutoEvalColumn.precision.name] = None

# Scores: the average agrees with the mean of the four benchmark values
# (84.25) when shown to one decimal place.
gpt4_values[AutoEvalColumn.average.name] = 84.3
gpt4_values[AutoEvalColumn.arc.name] = 96.3
gpt4_values[AutoEvalColumn.hellaswag.name] = 95.3
gpt4_values[AutoEvalColumn.mmlu.name] = 86.4
gpt4_values[AutoEvalColumn.truthfulqa.name] = 59.0

# Plain-text (non-HTML) name and an empty model type.
gpt4_values[AutoEvalColumn.dummy.name] = "GPT-4"
gpt4_values[AutoEvalColumn.model_type.name] = ""

# Hard-coded reference row for GPT-3.5, keyed by AutoEvalColumn column names.
# Keys are inserted in the same order the columns are listed below.
gpt35_values = {}

# Identification: the model cell is a hyperlink to arXiv:2303.08774 and the
# revision cell carries the label "tech report"; no precision is recorded.
gpt35_values[AutoEvalColumn.model.name] = model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt3.5")
gpt35_values[AutoEvalColumn.revision.name] = "tech report"
gpt35_values[AutoEvalColumn.precision.name] = None

# Scores: the average agrees with the mean of the four benchmark values
# (71.925) when shown to one decimal place.
gpt35_values[AutoEvalColumn.average.name] = 71.9
gpt35_values[AutoEvalColumn.arc.name] = 85.2
gpt35_values[AutoEvalColumn.hellaswag.name] = 85.5
gpt35_values[AutoEvalColumn.mmlu.name] = 70.0
gpt35_values[AutoEvalColumn.truthfulqa.name] = 47.0

# Plain-text (non-HTML) name and an empty model type.
gpt35_values[AutoEvalColumn.dummy.name] = "GPT-3.5"
gpt35_values[AutoEvalColumn.model_type.name] = ""

# Hard-coded "Baseline" row, keyed by AutoEvalColumn column names.
# Keys are inserted in the same order the columns are listed below.
baseline = {}

# Identification: a plain HTML paragraph instead of a hyperlink, with no
# revision ("N/A") and no precision.
baseline[AutoEvalColumn.model.name] = "<p>Baseline</p>"
baseline[AutoEvalColumn.revision.name] = "N/A"
baseline[AutoEvalColumn.precision.name] = None

# Scores: every benchmark, and therefore the average, is fixed at 25.0.
# NOTE(review): presumably the random-guess score on 4-way multiple choice — confirm.
baseline[AutoEvalColumn.average.name] = 25.0
baseline[AutoEvalColumn.arc.name] = 25.0
baseline[AutoEvalColumn.hellaswag.name] = 25.0
baseline[AutoEvalColumn.mmlu.name] = 25.0
baseline[AutoEvalColumn.truthfulqa.name] = 25.0

# Plain-text (non-HTML) name and an empty model type.
baseline[AutoEvalColumn.dummy.name] = "baseline"
baseline[AutoEvalColumn.model_type.name] = ""