Aaron Mueller committed
Commit 81490fa · 1 Parent(s): 4ca4431

separate txt and vision avgs
Files changed:
- src/leaderboard/read_evals.py (+7 -6)
- src/populate.py (+2 -2)
src/leaderboard/read_evals.py
CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, Tasks, TasksMultimodal
+from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, Tasks, TasksMultimodal
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -100,20 +100,21 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
+        eval_column = AutoEvalColumnMultimodal if self.track.lower() == "multimodal" else AutoEvalColumn
         vision_tasks = ("VQA", "Winoground", "DevBench")
         text_average = sum([v for k, v in self.results.items() if v is not None and k not in vision_tasks]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.text_average.name: text_average,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            eval_column.model.name: make_clickable_model(self.full_model),
+            eval_column.revision.name: self.revision,
+            eval_column.text_average.name: text_average,
+            eval_column.still_on_hub.name: self.still_on_hub,
         }
 
         if self.track.lower() == "multimodal":
             taskset = TasksMultimodal
             vision_average = sum([v for k, v in self.results.items() if v is not None and k in vision_tasks]) / len(Tasks)
-            data_dict[AutoEvalColumn.vision_average.name] = vision_average
+            data_dict[eval_column.vision_average.name] = vision_average
         else:
             taskset = Tasks
         for task in taskset:
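The core of this change is picking the column class per track: multimodal rows get the extra vision columns, text-only rows do not. Below is a minimal, self-contained sketch of that dispatch and of the separated text/vision averages. The Col stand-in and the column labels are hypothetical, not the Space's real src.display.utils definitions, and for simplicity each average divides by the number of contributing scores, whereas the committed code divides by len(Tasks).

    # Hypothetical stand-ins for AutoEvalColumn / AutoEvalColumnMultimodal:
    # each column is an object exposing a .name used as the dataframe header.
    from dataclasses import dataclass

    @dataclass
    class Col:
        name: str

    class AutoEvalColumn:
        model = Col("Model")
        text_average = Col("Text Average")

    class AutoEvalColumnMultimodal(AutoEvalColumn):
        vision_average = Col("Vision Average")  # vision-only column

    VISION_TASKS = ("VQA", "Winoground", "DevBench")

    def to_dict(track: str, results: dict) -> dict:
        # Same dispatch as the commit: multimodal rows use the multimodal columns.
        eval_column = AutoEvalColumnMultimodal if track.lower() == "multimodal" else AutoEvalColumn
        text_scores = [v for k, v in results.items() if v is not None and k not in VISION_TASKS]
        data_dict = {eval_column.text_average.name: sum(text_scores) / len(text_scores)}
        if track.lower() == "multimodal":
            # Vision tasks only contribute to their own average now.
            vision_scores = [v for k, v in results.items() if v is not None and k in VISION_TASKS]
            data_dict[eval_column.vision_average.name] = sum(vision_scores) / len(vision_scores)
        return data_dict

    print(to_dict("multimodal", {"BLiMP": 80.0, "VQA": 50.0, "Winoground": 60.0}))
    # -> {'Text Average': 80.0, 'Vision Average': 55.0}

Subclassing keeps the two column sets in sync: the multimodal class inherits every text column and only adds the vision ones, so a single eval_column variable can serve both tracks.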
src/populate.py
CHANGED
@@ -4,7 +4,7 @@ import os
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
@@ -23,7 +23,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 
     df = pd.DataFrame.from_records(all_data_json)
     print(df)
-    df = df.sort_values(by=[AutoEvalColumn.
+    df = df.sort_values(by=[AutoEvalColumn.text_average.name], ascending=False)
     df = df[cols].round(decimals=1)
 
     # filter out if any of the benchmarks have not been produced
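With the averages separated, the leaderboard is now ordered by the text average, best first. A tiny illustration of the new sort_values call (the column label here is hypothetical; the real one comes from AutoEvalColumn.text_average.name):

    import pandas as pd

    # Rows arrive unordered from the per-model result dicts.
    df = pd.DataFrame({"Model": ["a", "b", "c"], "Text Average": [71.2, 84.5, 63.9]})

    # Sort best-first by the text-average column, as in the commit.
    df = df.sort_values(by=["Text Average"], ascending=False)
    print(df)
    #   Model  Text Average
    # 1     b          84.5
    # 0     a          71.2
    # 2     c          63.9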