File size: 4,551 Bytes
460d762
 
 
 
4aff44e
460d762
d6b3d82
460d762
 
 
 
d16cee2
 
460d762
d16cee2
460d762
d16cee2
 
460d762
 
 
 
 
 
 
 
 
 
6eaad72
b323764
 
460d762
 
 
 
 
 
 
 
 
b323764
12cea14
b323764
460d762
 
 
d350941
460d762
 
d16cee2
460d762
 
 
 
 
 
 
 
d16cee2
460d762
 
4aff44e
97b27da
 
 
 
460d762
4aff44e
 
 
 
d16cee2
 
 
 
 
 
 
6eaad72
 
d16cee2
 
 
 
 
2bb5ded
460d762
d16cee2
 
2bb5ded
460d762
d16cee2
460d762
c6b775f
d16cee2
 
d350941
d16cee2
6eaad72
d16cee2
460d762
d16cee2
460d762
 
699e8ff
d6b3d82
d16cee2
d6b3d82
 
 
 
 
 
6eaad72
d6b3d82
 
 
6eaad72
d6b3d82
6eaad72
 
 
d6b3d82
 
460d762
d16cee2
 
 
 
 
 
460d762
 
 
 
 
 
699e8ff
 
460d762
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from dataclasses import dataclass

import glob
import json
import os
from typing import Dict, List, Tuple
import dateutil

from src.utils_display import AutoEvalColumn, make_clickable_model
import numpy as np

METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
BENCH_TO_NAME = {
    "arc:challenge": AutoEvalColumn.arc.name,
    "hellaswag": AutoEvalColumn.hellaswag.name,
    "hendrycksTest": AutoEvalColumn.mmlu.name,
    "truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
}


@dataclass
class EvalResult:
    eval_name: str
    org: str
    model: str
    revision: str
    results: dict
    precision: str = ""
    model_type: str = ""
    weight_type: str = ""

    def to_dict(self):
        if self.org is not None:
            base_model = f"{self.org}/{self.model}"
        else:
            base_model = f"{self.model}"
        data_dict = {}

        data_dict["eval_name"] = self.eval_name # not a column, just a save name
        data_dict["weight_type"] = self.weight_type  # not a column, just a save name
        data_dict[AutoEvalColumn.precision.name] = self.precision
        data_dict[AutoEvalColumn.model_type.name] = self.model_type
        data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
        data_dict[AutoEvalColumn.dummy.name] = base_model
        data_dict[AutoEvalColumn.revision.name] = self.revision
        data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 4.0

        for benchmark in BENCHMARKS:
            if benchmark not in self.results.keys():
                self.results[benchmark] = None

        for k, v in BENCH_TO_NAME.items():
            data_dict[v] = self.results[k]

        return data_dict


def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
    with open(json_filepath) as fp:
        data = json.load(fp)

    
    for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
        if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
            return None, [] # we skip models with the wrong version 

    try:
        config = data["config"]
    except KeyError:
        config = data["config_general"]
    model = config.get("model_name", None)
    if model is None:
        model = config.get("model_args", None)

    model_sha = config.get("model_sha", "")
    model_split = model.split("/", 1)

    precision = config.get("model_dtype")

    model = model_split[-1]

    if len(model_split) == 1:
        org = None
        model = model_split[0]
        result_key = f"{model}_{model_sha}_{precision}"
    else:
        org = model_split[0]
        model = model_split[1]
        result_key =  f"{org}_{model}_{model_sha}_{precision}"

    eval_results = []
    for benchmark, metric in zip(BENCHMARKS, METRICS):
        accs = np.array([v.get(metric, 0) for k, v in data["results"].items() if benchmark in k])
        if accs.size == 0:
            continue
        mean_acc = np.mean(accs) * 100.0
        eval_results.append(EvalResult(
            eval_name=result_key, org=org, model=model, revision=model_sha, results={benchmark: mean_acc}, precision=precision, #todo model_type=, weight_type=
        ))

    return result_key, eval_results


def get_eval_results() -> List[EvalResult]:
    json_filepaths = []

    for root, dir, files in os.walk("eval-results"):
        # We should only have json files in model results
        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
            continue

        # Sort the files by date
        # store results by precision maybe?
        try:
            files.sort(key=lambda x:  dateutil.parser.parse(x.split("_", 1)[-1][:-5]))
        except dateutil.parser._parser.ParserError:
            files = [files[-1]]

        #up_to_date = files[-1]
        for file in files:
            json_filepaths.append(os.path.join(root, file))

    eval_results = {}
    for json_filepath in json_filepaths:
        result_key, results = parse_eval_result(json_filepath)
        for eval_result in results:
            if result_key in eval_results.keys():
                eval_results[result_key].results.update(eval_result.results)
            else:
                eval_results[result_key] = eval_result

    eval_results = [v for v in eval_results.values()]

    return eval_results


def get_eval_results_dicts() -> List[Dict]:
    eval_results = get_eval_results()

    return [e.to_dict() for e in eval_results]