Spaces:
Runtime error
Runtime error
keep old Average
Browse files
- app.py +1 -1
- src/display/utils.py +1 -0
- src/leaderboard/read_evals.py +7 -1
app.py
CHANGED
@@ -76,7 +76,7 @@ def style_df(df: pd.DataFrame) -> Styler:
|
|
76 |
rounding = {'#Params (B)': "{:.1f}"}
|
77 |
for task in Tasks:
|
78 |
rounding[task.value.col_name] = "{:.2f}"
|
79 |
-
for column_name in ["Average ⬆️", "Avg g", "Avg mc"]:
|
80 |
rounding[column_name] = "{:.2f}"
|
81 |
leaderboard_df_styled = leaderboard_df_styled.format(rounding)
|
82 |
return leaderboard_df_styled
|
|
|
76 |
rounding = {'#Params (B)': "{:.1f}"}
|
77 |
for task in Tasks:
|
78 |
rounding[task.value.col_name] = "{:.2f}"
|
79 |
+
for column_name in ["Average ⬆️", "Avg g", "Avg mc", "Average old"]:
|
80 |
rounding[column_name] = "{:.2f}"
|
81 |
leaderboard_df_styled = leaderboard_df_styled.format(rounding)
|
82 |
return leaderboard_df_styled
|
src/display/utils.py
CHANGED
@@ -30,6 +30,7 @@ auto_eval_column_dict.append(["lang", ColumnContent, ColumnContent("Lang", "str"
|
|
30 |
auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("n_shot", "str", True)])
|
31 |
#Scores
|
32 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
|
|
33 |
auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
|
34 |
auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
|
35 |
for task in Tasks:
|
|
|
30 |
auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("n_shot", "str", True)])
|
31 |
#Scores
|
32 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
33 |
+
auto_eval_column_dict.append(["average_old", ColumnContent, ColumnContent("Average old", "number", False)])
|
34 |
auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
|
35 |
auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
|
36 |
for task in Tasks:
|
src/leaderboard/read_evals.py
CHANGED
@@ -157,10 +157,11 @@ class EvalResult:
|
|
157 |
g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
|
158 |
mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
|
159 |
all_tasks = g_tasks + mc_tasks
|
|
|
160 |
|
161 |
baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
|
162 |
|
163 |
-
|
164 |
# average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
|
165 |
# average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
|
166 |
# print('XXXXXXXXXXXX')
|
@@ -249,6 +250,11 @@ class EvalResult:
|
|
249 |
except AttributeError:
|
250 |
print(f"AttributeError revision")
|
251 |
|
|
|
|
|
|
|
|
|
|
|
252 |
try:
|
253 |
data_dict[AutoEvalColumn.average.name] = average
|
254 |
except KeyError:
|
|
|
157 |
g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
|
158 |
mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
|
159 |
all_tasks = g_tasks + mc_tasks
|
160 |
+
all_tasks_wo_polqa = [task for task in all_tasks if 'polqa' not in task]
|
161 |
|
162 |
baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
|
163 |
|
164 |
+
average_old = sum([v for task, v in self.results.items() if v is not None and task in all_tasks_wo_polqa]) / len(all_tasks_wo_polqa)
|
165 |
# average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
|
166 |
# average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
|
167 |
# print('XXXXXXXXXXXX')
|
|
|
250 |
except AttributeError:
|
251 |
print(f"AttributeError revision")
|
252 |
|
253 |
+
try:
|
254 |
+
data_dict[AutoEvalColumn.average_old.name] = average_old
|
255 |
+
except KeyError:
|
256 |
+
print(f"Could not find average_old")
|
257 |
+
|
258 |
try:
|
259 |
data_dict[AutoEvalColumn.average.name] = average
|
260 |
except KeyError:
|