Clémentine
committed on
Commit
•
6eaad72
1
Parent(s):
d350941
added precision
Browse files
- src/auto_leaderboard/load_results.py +11 -8
- src/utils_display.py +2 -2
src/auto_leaderboard/load_results.py
CHANGED
@@ -26,7 +26,7 @@ class EvalResult:
|
|
26 |
model: str
|
27 |
revision: str
|
28 |
results: dict
|
29 |
-
precision: str = "
|
30 |
model_type: str = ""
|
31 |
weight_type: str = ""
|
32 |
|
@@ -77,16 +77,18 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
|
|
77 |
eval_sha = config.get("lighteval_sha", "")
|
78 |
model_split = model.split("/", 1)
|
79 |
|
|
|
|
|
80 |
model = model_split[-1]
|
81 |
|
82 |
if len(model_split) == 1:
|
83 |
org = None
|
84 |
model = model_split[0]
|
85 |
-
result_key = f"{model}_{model_sha}_{eval_sha}"
|
86 |
else:
|
87 |
org = model_split[0]
|
88 |
model = model_split[1]
|
89 |
-
result_key = f"{org}_{model}_{model_sha}_{eval_sha}"
|
90 |
|
91 |
eval_results = []
|
92 |
for benchmark, metric in zip(BENCHMARKS, METRICS):
|
@@ -95,7 +97,7 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
|
|
95 |
continue
|
96 |
mean_acc = np.mean(accs) * 100.0
|
97 |
eval_results.append(EvalResult(
|
98 |
-
eval_name=result_key, org=org, model=model, revision=model_sha, results={benchmark: mean_acc}, #todo model_type=, weight_type=
|
99 |
))
|
100 |
|
101 |
return result_key, eval_results
|
@@ -110,14 +112,15 @@ def get_eval_results(is_public) -> List[EvalResult]:
|
|
110 |
continue
|
111 |
|
112 |
# Sort the files by date
|
|
|
113 |
try:
|
114 |
files.sort(key=lambda x: dateutil.parser.parse(x.split("_", 1)[-1][:-5]))
|
115 |
except dateutil.parser._parser.ParserError:
|
116 |
-
|
117 |
-
|
118 |
-
up_to_date = files[-1]
|
119 |
|
120 |
-
|
|
|
|
|
121 |
|
122 |
eval_results = {}
|
123 |
for json_filepath in json_filepaths:
|
|
|
26 |
model: str
|
27 |
revision: str
|
28 |
results: dict
|
29 |
+
precision: str = ""
|
30 |
model_type: str = ""
|
31 |
weight_type: str = ""
|
32 |
|
|
|
77 |
eval_sha = config.get("lighteval_sha", "")
|
78 |
model_split = model.split("/", 1)
|
79 |
|
80 |
+
precision = config.get("model_dtype")
|
81 |
+
|
82 |
model = model_split[-1]
|
83 |
|
84 |
if len(model_split) == 1:
|
85 |
org = None
|
86 |
model = model_split[0]
|
87 |
+
result_key = f"{model}_{model_sha}_{eval_sha}_{precision}"
|
88 |
else:
|
89 |
org = model_split[0]
|
90 |
model = model_split[1]
|
91 |
+
result_key = f"{org}_{model}_{model_sha}_{eval_sha}_{precision}"
|
92 |
|
93 |
eval_results = []
|
94 |
for benchmark, metric in zip(BENCHMARKS, METRICS):
|
|
|
97 |
continue
|
98 |
mean_acc = np.mean(accs) * 100.0
|
99 |
eval_results.append(EvalResult(
|
100 |
+
eval_name=result_key, org=org, model=model, revision=model_sha, results={benchmark: mean_acc}, precision=precision, #todo model_type=, weight_type=
|
101 |
))
|
102 |
|
103 |
return result_key, eval_results
|
|
|
112 |
continue
|
113 |
|
114 |
# Sort the files by date
|
115 |
+
# store results by precision maybe?
|
116 |
try:
|
117 |
files.sort(key=lambda x: dateutil.parser.parse(x.split("_", 1)[-1][:-5]))
|
118 |
except dateutil.parser._parser.ParserError:
|
119 |
+
files = [files[-1]]
|
|
|
|
|
120 |
|
121 |
+
#up_to_date = files[-1]
|
122 |
+
for file in files:
|
123 |
+
json_filepaths.append(os.path.join(root, file))
|
124 |
|
125 |
eval_results = {}
|
126 |
for json_filepath in json_filepaths:
|
src/utils_display.py
CHANGED
@@ -22,7 +22,7 @@ class AutoEvalColumn: # Auto evals column
|
|
22 |
mmlu = ColumnContent("MMLU", "number", True)
|
23 |
truthfulqa = ColumnContent("TruthfulQA", "number", True)
|
24 |
model_type = ColumnContent("Type", "str", False)
|
25 |
-
precision = ColumnContent("Precision", "str", False
|
26 |
license = ColumnContent("Hub License", "str", False)
|
27 |
params = ColumnContent("#Params (B)", "number", False)
|
28 |
likes = ColumnContent("Hub ❤️", "number", False)
|
@@ -43,7 +43,7 @@ class EvalQueueColumn: # Queue column
|
|
43 |
model = ColumnContent("model", "markdown", True)
|
44 |
revision = ColumnContent("revision", "str", True)
|
45 |
private = ColumnContent("private", "bool", True)
|
46 |
-
precision = ColumnContent("precision", "
|
47 |
weight_type = ColumnContent("weight_type", "str", "Original")
|
48 |
status = ColumnContent("status", "str", True)
|
49 |
|
|
|
22 |
mmlu = ColumnContent("MMLU", "number", True)
|
23 |
truthfulqa = ColumnContent("TruthfulQA", "number", True)
|
24 |
model_type = ColumnContent("Type", "str", False)
|
25 |
+
precision = ColumnContent("Precision", "str", False) #, True)
|
26 |
license = ColumnContent("Hub License", "str", False)
|
27 |
params = ColumnContent("#Params (B)", "number", False)
|
28 |
likes = ColumnContent("Hub ❤️", "number", False)
|
|
|
43 |
model = ColumnContent("model", "markdown", True)
|
44 |
revision = ColumnContent("revision", "str", True)
|
45 |
private = ColumnContent("private", "bool", True)
|
46 |
+
precision = ColumnContent("precision", "str", True)
|
47 |
weight_type = ColumnContent("weight_type", "str", "Original")
|
48 |
status = ColumnContent("status", "str", True)
|
49 |
|