djstrong committed on
Commit
1bea7de
·
1 Parent(s): 665a818

normalize scores to majority class baseline

Browse files
Files changed (1) hide show
  1. src/leaderboard/read_evals.py +17 -7
src/leaderboard/read_evals.py CHANGED
@@ -160,13 +160,23 @@ class EvalResult:
160
 
161
  baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
162
 
163
- average = sum([v for task, v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
164
- average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
165
- average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
166
-
167
- # average = sum([(v-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
168
- # average_g = sum([(v-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
169
- # average_mc = sum([(v-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
 
 
 
 
 
 
 
 
 
 
170
 
171
  data_dict = {}
172
  # data_dict = {
 
160
 
161
  baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
162
 
163
+ # average = sum([v for task, v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
164
+ # average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
165
+ # average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
166
+ # print('XXXXXXXXXXXX')
167
+ # print(self.eval_name)
168
+ # print(all_tasks)
169
+ # print(baselines)
170
+ # print(self.results)
171
+ # print('XXXXXXXXXXXX')
172
+
173
+ # average = sum([((v if v is not None else 0)-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if task in all_tasks]) / len(all_tasks)
174
+ # average_g = sum([((v if v is not None else 0)-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if task in g_tasks]) / len(g_tasks)
175
+ # average_mc = sum([((v if v is not None else 0)-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if task in mc_tasks]) / len(mc_tasks)
176
+
177
+ average = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in all_tasks]) / len(all_tasks)
178
+ average_g = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in g_tasks]) / len(g_tasks)
179
+ average_mc = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in mc_tasks]) / len(mc_tasks)
180
 
181
  data_dict = {}
182
  # data_dict = {