Spaces:
Runtime error
Runtime error
add metadata and filters
Browse files- app.py +3 -2
- src/about.py +20 -19
- src/display/utils.py +9 -7
- src/leaderboard/read_evals.py +35 -2
- src/populate.py +23 -22
app.py
CHANGED
@@ -173,7 +173,7 @@ with demo:
|
|
173 |
value=[t.to_str() for t in ModelType],
|
174 |
interactive=True,
|
175 |
elem_id="filter-columns-type",
|
176 |
-
visible=
|
177 |
)
|
178 |
filter_columns_precision = gr.CheckboxGroup(
|
179 |
label="Precision",
|
@@ -189,7 +189,7 @@ with demo:
|
|
189 |
value=list(NUMERIC_INTERVALS.keys()),
|
190 |
interactive=True,
|
191 |
elem_id="filter-columns-size",
|
192 |
-
visible=
|
193 |
)
|
194 |
filter_columns_nshot = gr.CheckboxGroup(
|
195 |
label="N-shot",
|
@@ -238,6 +238,7 @@ with demo:
|
|
238 |
interactive=False,
|
239 |
visible=True,
|
240 |
# column_widths=["2%", "33%"]
|
|
|
241 |
)
|
242 |
|
243 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
|
|
173 |
value=[t.to_str() for t in ModelType],
|
174 |
interactive=True,
|
175 |
elem_id="filter-columns-type",
|
176 |
+
visible=True,
|
177 |
)
|
178 |
filter_columns_precision = gr.CheckboxGroup(
|
179 |
label="Precision",
|
|
|
189 |
value=list(NUMERIC_INTERVALS.keys()),
|
190 |
interactive=True,
|
191 |
elem_id="filter-columns-size",
|
192 |
+
visible=True,
|
193 |
)
|
194 |
filter_columns_nshot = gr.CheckboxGroup(
|
195 |
label="N-shot",
|
|
|
238 |
interactive=False,
|
239 |
visible=True,
|
240 |
# column_widths=["2%", "33%"]
|
241 |
+
height=900
|
242 |
)
|
243 |
|
244 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
src/about.py
CHANGED
@@ -1,35 +1,36 @@
|
|
1 |
from dataclasses import dataclass
|
2 |
from enum import Enum
|
3 |
|
4 |
-
@dataclass
|
5 |
class Task:
|
6 |
benchmark: str
|
7 |
metric: str
|
8 |
col_name: str
|
|
|
9 |
|
10 |
|
11 |
# Select your tasks here
|
12 |
# ---------------------------------------------------
|
13 |
class Tasks(Enum):
|
14 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
15 |
-
task2 = Task("belebele_pol_Latn", "acc,none", "belebele_pol_Latn")
|
16 |
-
task3 = Task("polemo2_in", "exact_match,score-first", "polemo2-in_g")
|
17 |
-
task4 = Task("polemo2_in_multiple_choice", "acc,none", "
|
18 |
-
task5 = Task("polemo2_out", "exact_match,score-first", "
|
19 |
-
task6 = Task("polemo2_out_multiple_choice", "acc,none", "
|
20 |
-
task7 = Task("polish_8tags_multiple_choice", "acc,none", "8tags_mc")
|
21 |
-
task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g")
|
22 |
-
task9 = Task("polish_belebele_regex", "exact_match,score-first", "belebele_g")
|
23 |
-
task10 = Task("polish_dyk_multiple_choice", "f1,none", "dyk_mc")
|
24 |
-
task11 = Task("polish_dyk_regex", "f1,score-first", "dyk_g")
|
25 |
-
task12 = Task("polish_ppc_multiple_choice", "acc,none", "ppc_mc")
|
26 |
-
task13 = Task("polish_ppc_regex", "exact_match,score-first", "ppc_g")
|
27 |
-
task14 = Task("polish_psc_multiple_choice", "f1,none", "psc_mc")
|
28 |
-
task15 = Task("polish_psc_regex", "f1,score-first", "psc_g")
|
29 |
-
task16 = Task("polish_cbd_multiple_choice", "f1,none", "cbd_mc")
|
30 |
-
task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g")
|
31 |
-
task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc")
|
32 |
-
task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g")
|
33 |
|
34 |
NUM_FEWSHOT = 0 # Change with your few shot
|
35 |
# ---------------------------------------------------
|
|
|
1 |
from dataclasses import dataclass
|
2 |
from enum import Enum
|
3 |
|
4 |
+
@dataclass(frozen=True)
|
5 |
class Task:
|
6 |
benchmark: str
|
7 |
metric: str
|
8 |
col_name: str
|
9 |
+
type: str
|
10 |
|
11 |
|
12 |
# Select your tasks here
|
13 |
# ---------------------------------------------------
|
14 |
class Tasks(Enum):
|
15 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
16 |
+
task2 = Task("belebele_pol_Latn", "acc,none", "belebele_pol_Latn", "multiple_choice")
|
17 |
+
task3 = Task("polemo2_in", "exact_match,score-first", "polemo2-in_g", "generate_until")
|
18 |
+
task4 = Task("polemo2_in_multiple_choice", "acc,none", "polemo2-in_mc", "multiple_choice")
|
19 |
+
task5 = Task("polemo2_out", "exact_match,score-first", "polemo2-out_g", "generate_until")
|
20 |
+
task6 = Task("polemo2_out_multiple_choice", "acc,none", "polemo2-out_mc", "multiple_choice")
|
21 |
+
task7 = Task("polish_8tags_multiple_choice", "acc,none", "8tags_mc", "multiple_choice")
|
22 |
+
task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g", "generate_until")
|
23 |
+
task9 = Task("polish_belebele_regex", "exact_match,score-first", "belebele_g", "generate_until")
|
24 |
+
task10 = Task("polish_dyk_multiple_choice", "f1,none", "dyk_mc", "multiple_choice")
|
25 |
+
task11 = Task("polish_dyk_regex", "f1,score-first", "dyk_g", "generate_until")
|
26 |
+
task12 = Task("polish_ppc_multiple_choice", "acc,none", "ppc_mc", "multiple_choice")
|
27 |
+
task13 = Task("polish_ppc_regex", "exact_match,score-first", "ppc_g", "generate_until")
|
28 |
+
task14 = Task("polish_psc_multiple_choice", "f1,none", "psc_mc", "multiple_choice")
|
29 |
+
task15 = Task("polish_psc_regex", "f1,score-first", "psc_g", "generate_until")
|
30 |
+
task16 = Task("polish_cbd_multiple_choice", "f1,none", "cbd_mc", "multiple_choice")
|
31 |
+
task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until")
|
32 |
+
task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice")
|
33 |
+
task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until")
|
34 |
|
35 |
NUM_FEWSHOT = 0 # Change with your few shot
|
36 |
# ---------------------------------------------------
|
src/display/utils.py
CHANGED
@@ -29,15 +29,17 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
|
|
29 |
auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("n_shot", "str", True)])
|
30 |
#Scores
|
31 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
|
|
|
|
32 |
for task in Tasks:
|
33 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
34 |
# Model information
|
35 |
-
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str",
|
36 |
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
37 |
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
38 |
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
|
39 |
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
|
40 |
-
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number",
|
41 |
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
|
42 |
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
43 |
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
@@ -67,9 +69,9 @@ class ModelDetails:
|
|
67 |
|
68 |
class ModelType(Enum):
|
69 |
PT = ModelDetails(name="pretrained", symbol="🟢")
|
70 |
-
|
71 |
IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
|
72 |
-
RL = ModelDetails(name="RL-tuned", symbol="
|
73 |
Unknown = ModelDetails(name="", symbol="?")
|
74 |
|
75 |
def to_str(self, separator=" "):
|
@@ -77,11 +79,11 @@ class ModelType(Enum):
|
|
77 |
|
78 |
@staticmethod
|
79 |
def from_str(type):
|
80 |
-
if "
|
81 |
-
return ModelType.
|
82 |
if "pretrained" in type or "🟢" in type:
|
83 |
return ModelType.PT
|
84 |
-
if "RL-tuned" in type or "
|
85 |
return ModelType.RL
|
86 |
if "instruction-tuned" in type or "⭕" in type:
|
87 |
return ModelType.IFT
|
|
|
29 |
auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("n_shot", "str", True)])
|
30 |
#Scores
|
31 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
32 |
+
auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
|
33 |
+
auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
|
34 |
for task in Tasks:
|
35 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
36 |
# Model information
|
37 |
+
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", True)])
|
38 |
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
39 |
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
40 |
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
|
41 |
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
|
42 |
+
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", True)])
|
43 |
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
|
44 |
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
45 |
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
|
|
69 |
|
70 |
class ModelType(Enum):
|
71 |
PT = ModelDetails(name="pretrained", symbol="🟢")
|
72 |
+
CPT = ModelDetails(name="continuously pretrained", symbol="🟩")
|
73 |
IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
|
74 |
+
RL = ModelDetails(name="RL-tuned", symbol="💬")
|
75 |
Unknown = ModelDetails(name="", symbol="?")
|
76 |
|
77 |
def to_str(self, separator=" "):
|
|
|
79 |
|
80 |
@staticmethod
|
81 |
def from_str(type):
|
82 |
+
if "continuously pretrained" in type or "🟩" in type:
|
83 |
+
return ModelType.CPT
|
84 |
if "pretrained" in type or "🟢" in type:
|
85 |
return ModelType.PT
|
86 |
+
if "RL-tuned" in type or "💬" in type:
|
87 |
return ModelType.RL
|
88 |
if "instruction-tuned" in type or "⭕" in type:
|
89 |
return ModelType.IFT
|
src/leaderboard/read_evals.py
CHANGED
@@ -106,8 +106,22 @@ class EvalResult:
|
|
106 |
n_shot=NShotType.from_str(n_shot_num)
|
107 |
)
|
108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
def update_with_request_file(self, requests_path):
|
110 |
"""Finds the relevant request file for the current model and updates info with it"""
|
|
|
111 |
request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
|
112 |
|
113 |
try:
|
@@ -125,7 +139,13 @@ class EvalResult:
|
|
125 |
def to_dict(self):
|
126 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
127 |
average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
|
128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
data_dict={}
|
130 |
# data_dict = {
|
131 |
# "eval_name": self.eval_name, # not a column, just a save name,
|
@@ -202,6 +222,16 @@ class EvalResult:
|
|
202 |
except KeyError:
|
203 |
print(f"Could not find average")
|
204 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
205 |
try:
|
206 |
data_dict[AutoEvalColumn.license.name] = self.license
|
207 |
except KeyError:
|
@@ -267,7 +297,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
|
|
267 |
return request_file
|
268 |
|
269 |
|
270 |
-
def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
|
271 |
"""From the path of the results folder root, extract all needed info for results"""
|
272 |
model_result_filepaths = []
|
273 |
|
@@ -291,6 +321,9 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
291 |
# Creation of result
|
292 |
eval_result = EvalResult.init_from_json_file(model_result_filepath, n_shot_num=n_shot)
|
293 |
eval_result.update_with_request_file(requests_path)
|
|
|
|
|
|
|
294 |
|
295 |
# Store results of same eval together
|
296 |
eval_name = f"{eval_result.eval_name}_{n_shot}-shot"
|
|
|
106 |
n_shot=NShotType.from_str(n_shot_num)
|
107 |
)
|
108 |
|
109 |
+
def update_with_metadata(self, metadata):
|
110 |
+
#print('UPDATE', self.full_model, self.model, self.eval_name)
|
111 |
+
try:
|
112 |
+
meta=metadata[self.full_model]
|
113 |
+
self.model_type = ModelType.from_str(meta.get("type", "?"))
|
114 |
+
self.num_params = meta.get("params", 0)
|
115 |
+
self.license = meta.get("license", "?")
|
116 |
+
# self.lang = meta.get("lang", "?") #TODO
|
117 |
+
#TODO desc name
|
118 |
+
except KeyError:
|
119 |
+
print(f"Could not find metadata for {self.full_model}")
|
120 |
+
|
121 |
+
|
122 |
def update_with_request_file(self, requests_path):
|
123 |
"""Finds the relevant request file for the current model and updates info with it"""
|
124 |
+
return
|
125 |
request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
|
126 |
|
127 |
try:
|
|
|
139 |
def to_dict(self):
|
140 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
141 |
average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
|
142 |
+
g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
|
143 |
+
mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
|
144 |
+
|
145 |
+
average_g = sum([v for task,v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
|
146 |
+
average_mc = sum([v for task,v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
|
147 |
+
|
148 |
+
|
149 |
data_dict={}
|
150 |
# data_dict = {
|
151 |
# "eval_name": self.eval_name, # not a column, just a save name,
|
|
|
222 |
except KeyError:
|
223 |
print(f"Could not find average")
|
224 |
|
225 |
+
try:
|
226 |
+
data_dict[AutoEvalColumn.average_g.name] = average_g
|
227 |
+
except KeyError:
|
228 |
+
print(f"Could not find average_g")
|
229 |
+
|
230 |
+
try:
|
231 |
+
data_dict[AutoEvalColumn.average_mc.name] = average_mc
|
232 |
+
except KeyError:
|
233 |
+
print(f"Could not find average_mc")
|
234 |
+
|
235 |
try:
|
236 |
data_dict[AutoEvalColumn.license.name] = self.license
|
237 |
except KeyError:
|
|
|
297 |
return request_file
|
298 |
|
299 |
|
300 |
+
def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> list[EvalResult]:
|
301 |
"""From the path of the results folder root, extract all needed info for results"""
|
302 |
model_result_filepaths = []
|
303 |
|
|
|
321 |
# Creation of result
|
322 |
eval_result = EvalResult.init_from_json_file(model_result_filepath, n_shot_num=n_shot)
|
323 |
eval_result.update_with_request_file(requests_path)
|
324 |
+
#update with metadata
|
325 |
+
eval_result.update_with_metadata(metadata)
|
326 |
+
|
327 |
|
328 |
# Store results of same eval together
|
329 |
eval_name = f"{eval_result.eval_name}_{n_shot}-shot"
|
src/populate.py
CHANGED
@@ -9,7 +9,8 @@ from src.leaderboard.read_evals import get_raw_eval_results
|
|
9 |
|
10 |
|
11 |
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
12 |
-
|
|
|
13 |
all_data_json = [v.to_dict() for v in raw_data]
|
14 |
print(all_data_json)
|
15 |
df = pd.DataFrame.from_records(all_data_json)
|
@@ -25,27 +26,27 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
|
25 |
entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
|
26 |
all_evals = []
|
27 |
|
28 |
-
for entry in entries:
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
|
50 |
pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
|
51 |
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
|
|
|
9 |
|
10 |
|
11 |
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
12 |
+
metadata=json.load(open(f"{requests_path}/metadata.json"))
|
13 |
+
raw_data = get_raw_eval_results(results_path, requests_path, metadata)
|
14 |
all_data_json = [v.to_dict() for v in raw_data]
|
15 |
print(all_data_json)
|
16 |
df = pd.DataFrame.from_records(all_data_json)
|
|
|
26 |
entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
|
27 |
all_evals = []
|
28 |
|
29 |
+
# for entry in entries:
|
30 |
+
# if ".json" in entry:
|
31 |
+
# file_path = os.path.join(save_path, entry)
|
32 |
+
# with open(file_path) as fp:
|
33 |
+
# data = json.load(fp)
|
34 |
+
#
|
35 |
+
# data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
|
36 |
+
# data[EvalQueueColumn.revision.name] = data.get("revision", "main")
|
37 |
+
#
|
38 |
+
# all_evals.append(data)
|
39 |
+
# elif ".md" not in entry:
|
40 |
+
# # this is a folder
|
41 |
+
# sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
|
42 |
+
# for sub_entry in sub_entries:
|
43 |
+
# file_path = os.path.join(save_path, entry, sub_entry)
|
44 |
+
# with open(file_path) as fp:
|
45 |
+
# data = json.load(fp)
|
46 |
+
#
|
47 |
+
# data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
|
48 |
+
# data[EvalQueueColumn.revision.name] = data.get("revision", "main")
|
49 |
+
# all_evals.append(data)
|
50 |
|
51 |
pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
|
52 |
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
|