normalize scores to majority class baseline
Browse files
src/leaderboard/read_evals.py
CHANGED
@@ -160,13 +160,23 @@ class EvalResult:
|
|
160 |
|
161 |
baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
|
162 |
|
163 |
-
average = sum([v for task, v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
|
164 |
-
average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
|
165 |
-
average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
|
166 |
-
|
167 |
-
#
|
168 |
-
#
|
169 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
|
171 |
data_dict = {}
|
172 |
# data_dict = {
|
|
|
160 |
|
161 |
baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
|
162 |
|
163 |
+
# average = sum([v for task, v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
|
164 |
+
# average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
|
165 |
+
# average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
|
166 |
+
# print('XXXXXXXXXXXX')
|
167 |
+
# print(self.eval_name)
|
168 |
+
# print(all_tasks)
|
169 |
+
# print(baselines)
|
170 |
+
# print(self.results)
|
171 |
+
# print('XXXXXXXXXXXX')
|
172 |
+
|
173 |
+
# average = sum([((v if v is not None else 0)-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if task in all_tasks]) / len(all_tasks)
|
174 |
+
# average_g = sum([((v if v is not None else 0)-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if task in g_tasks]) / len(g_tasks)
|
175 |
+
# average_mc = sum([((v if v is not None else 0)-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if task in mc_tasks]) / len(mc_tasks)
|
176 |
+
|
177 |
+
average = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in all_tasks]) / len(all_tasks)
|
178 |
+
average_g = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in g_tasks]) / len(g_tasks)
|
179 |
+
average_mc = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in mc_tasks]) / len(mc_tasks)
|
180 |
|
181 |
data_dict = {}
|
182 |
# data_dict = {
|