Display results on the dashboard even when some benchmark results are missing, by filling the corresponding columns with the placeholder "missing"
Browse files
src/leaderboard/read_evals.py
CHANGED
@@ -86,6 +86,9 @@ class EvalResult:
|
|
86 |
missing_benchmarks = task_benchmarks - results.keys()
|
87 |
if missing_benchmarks:
|
88 |
print(f"(Missing results) Model {model} is missing {', '.join(missing_benchmarks)} from result files")
|
|
|
|
|
|
|
89 |
|
90 |
|
91 |
return self(
|
@@ -157,11 +160,16 @@ class EvalResult:
|
|
157 |
# Calculate the mean for each category and add to data_dict
|
158 |
data_dict = {}
|
159 |
for category, scores in category_averages.items():
|
160 |
-
average
|
|
|
|
|
|
|
|
|
|
|
161 |
data_dict[category] = average
|
162 |
|
163 |
# Overall average
|
164 |
-
total_scores = [v for v in self.results.values() if v
|
165 |
overall_average = sum(total_scores) / len(total_scores) if total_scores else 0
|
166 |
|
167 |
# Add other columns
|
|
|
86 |
missing_benchmarks = task_benchmarks - results.keys()
|
87 |
if missing_benchmarks:
|
88 |
print(f"(Missing results) Model {model} is missing {', '.join(missing_benchmarks)} from result files")
|
89 |
+
for benchmark in missing_benchmarks:
|
90 |
+
results[benchmark] = "missing"
|
91 |
+
|
92 |
|
93 |
|
94 |
return self(
|
|
|
160 |
# Calculate the mean for each category and add to data_dict
|
161 |
data_dict = {}
|
162 |
for category, scores in category_averages.items():
|
163 |
+
# Calculate the average if there are valid scores, otherwise set to 0
|
164 |
+
valid_scores = [score for score in scores if score != "missing"]
|
165 |
+
if valid_scores:
|
166 |
+
average = sum(valid_scores) / len(valid_scores)
|
167 |
+
else:
|
168 |
+
average = 0
|
169 |
data_dict[category] = average
|
170 |
|
171 |
# Overall average
|
172 |
+
total_scores = [v for v in self.results.values() if v != "missing"]
|
173 |
overall_average = sum(total_scores) / len(total_scores) if total_scores else 0
|
174 |
|
175 |
# Add other columns
|