Spaces:
Restarting
Restarting
sort results by date
Browse files
- src/about.py +23 -22
- src/leaderboard/read_evals.py +68 -37
src/about.py
CHANGED
@@ -7,35 +7,36 @@ class Task:
|
|
7 |
metric: str
|
8 |
col_name: str
|
9 |
type: str
|
|
|
10 |
|
11 |
|
12 |
# Select your tasks here
|
13 |
# ---------------------------------------------------
|
14 |
class Tasks(Enum):
|
15 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
16 |
-
# task2 = Task("belebele_pol_Latn", "acc,none", "belebele_pol_Latn", "multiple_choice")
|
17 |
-
task3 = Task("polemo2_in", "exact_match,score-first", "polemo2-in_g", "generate_until")
|
18 |
-
task4 = Task("polemo2_in_multiple_choice", "acc,none", "polemo2-in_mc", "multiple_choice")
|
19 |
-
task5 = Task("polemo2_out", "exact_match,score-first", "polemo2-out_g", "generate_until")
|
20 |
-
task6 = Task("polemo2_out_multiple_choice", "acc,none", "polemo2-out_mc", "multiple_choice")
|
21 |
-
task7 = Task("polish_8tags_multiple_choice", "acc,none", "8tags_mc", "multiple_choice")
|
22 |
-
task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g", "generate_until")
|
23 |
-
task9a = Task("polish_belebele_mc", "acc,none", "belebele_mc", "multiple_choice")
|
24 |
-
task9 = Task("polish_belebele_regex", "exact_match,score-first", "belebele_g", "generate_until")
|
25 |
-
task10 = Task("polish_dyk_multiple_choice", "f1,none", "dyk_mc", "multiple_choice")
|
26 |
-
task11 = Task("polish_dyk_regex", "f1,score-first", "dyk_g", "generate_until")
|
27 |
-
task12 = Task("polish_ppc_multiple_choice", "acc,none", "ppc_mc", "multiple_choice")
|
28 |
-
task13 = Task("polish_ppc_regex", "exact_match,score-first", "ppc_g", "generate_until")
|
29 |
-
task14 = Task("polish_psc_multiple_choice", "f1,none", "psc_mc", "multiple_choice")
|
30 |
-
task15 = Task("polish_psc_regex", "f1,score-first", "psc_g", "generate_until")
|
31 |
-
task16 = Task("polish_cbd_multiple_choice", "f1,none", "cbd_mc", "multiple_choice")
|
32 |
-
task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until")
|
33 |
-
task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice")
|
34 |
-
task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until")
|
35 |
task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
|
36 |
-
task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "
|
37 |
-
task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "
|
38 |
-
task23 = Task("polish_polqa_closed_book", "levenshtein,none", "polqa_closed_book_g", "
|
39 |
|
40 |
NUM_FEWSHOT = 0 # Change with your few shot
|
41 |
# ---------------------------------------------------
|
|
|
7 |
metric: str
|
8 |
col_name: str
|
9 |
type: str
|
10 |
+
baseline: float = 0.0
|
11 |
|
12 |
|
13 |
# Select your tasks here
|
14 |
# ---------------------------------------------------
|
15 |
class Tasks(Enum):
|
16 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
17 |
+
# task2 = Task("belebele_pol_Latn", "acc,none", "belebele_pol_Latn", "multiple_choice", 0.279)
|
18 |
+
task3 = Task("polemo2_in", "exact_match,score-first", "polemo2-in_g", "generate_until", 0.416)
|
19 |
+
task4 = Task("polemo2_in_multiple_choice", "acc,none", "polemo2-in_mc", "multiple_choice", 0.416)
|
20 |
+
task5 = Task("polemo2_out", "exact_match,score-first", "polemo2-out_g", "generate_until", 0.368)
|
21 |
+
task6 = Task("polemo2_out_multiple_choice", "acc,none", "polemo2-out_mc", "multiple_choice", 0.368)
|
22 |
+
task7 = Task("polish_8tags_multiple_choice", "acc,none", "8tags_mc", "multiple_choice", 0.143)
|
23 |
+
task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g", "generate_until", 0.143)
|
24 |
+
task9a = Task("polish_belebele_mc", "acc,none", "belebele_mc", "multiple_choice", 0.279)
|
25 |
+
task9 = Task("polish_belebele_regex", "exact_match,score-first", "belebele_g", "generate_until", 0.279)
|
26 |
+
task10 = Task("polish_dyk_multiple_choice", "f1,none", "dyk_mc", "multiple_choice", 0.289)
|
27 |
+
task11 = Task("polish_dyk_regex", "f1,score-first", "dyk_g", "generate_until", 0.289)
|
28 |
+
task12 = Task("polish_ppc_multiple_choice", "acc,none", "ppc_mc", "multiple_choice", 0.419)
|
29 |
+
task13 = Task("polish_ppc_regex", "exact_match,score-first", "ppc_g", "generate_until", 0.419)
|
30 |
+
task14 = Task("polish_psc_multiple_choice", "f1,none", "psc_mc", "multiple_choice", 0.466)
|
31 |
+
task15 = Task("polish_psc_regex", "f1,score-first", "psc_g", "generate_until", 0.466)
|
32 |
+
task16 = Task("polish_cbd_multiple_choice", "f1,none", "cbd_mc", "multiple_choice", 0.149)
|
33 |
+
task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until", 0.149)
|
34 |
+
task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice", 0.343)
|
35 |
+
task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until", 0.343)
|
36 |
task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
|
37 |
+
# task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "multiple_choice", 0.5335588952710677) # multiple_choice
|
38 |
+
# task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "generate_until", 0.0) # generate_until
|
39 |
+
# task23 = Task("polish_polqa_closed_book", "levenshtein,none", "polqa_closed_book_g", "generate_until", 0.0) # generate_until
|
40 |
|
41 |
NUM_FEWSHOT = 0 # Change with your few shot
|
42 |
# ---------------------------------------------------
|
src/leaderboard/read_evals.py
CHANGED
@@ -14,26 +14,28 @@ from src.submission.check_validity import is_model_on_hub
|
|
14 |
|
15 |
NUM_FEWSHOT = 0
|
16 |
|
|
|
17 |
@dataclass
|
18 |
class EvalResult:
|
19 |
-
eval_name: str
|
20 |
-
full_model: str
|
21 |
-
org: str
|
22 |
model: str
|
23 |
-
revision: str
|
24 |
results: dict
|
25 |
precision: Precision = Precision.Unknown
|
26 |
-
model_type: ModelType = ModelType.Unknown
|
27 |
-
weight_type: WeightType = WeightType.Original
|
28 |
-
architecture: str = "Unknown"
|
29 |
license: str = "?"
|
30 |
lang: str = "?"
|
31 |
likes: int = 0
|
32 |
num_params: int = 0
|
33 |
-
date: str = ""
|
34 |
still_on_hub: bool = False
|
35 |
n_shot: NShotType = NShotType.n0
|
36 |
org_and_model: str = ""
|
|
|
37 |
|
38 |
@classmethod
|
39 |
def init_from_json_file(self, json_filepath, n_shot_num):
|
@@ -43,6 +45,7 @@ class EvalResult:
|
|
43 |
|
44 |
config = data.get("config")
|
45 |
n_shot = data.get("n-shot")
|
|
|
46 |
|
47 |
# Precision
|
48 |
precision = Precision.from_str(config.get("model_dtype"))
|
@@ -54,14 +57,17 @@ class EvalResult:
|
|
54 |
|
55 |
if re.match(r"^pretrained=(.*/(plgkwrobel|plggspkl)/)(models/)?", org_and_model):
|
56 |
org_and_model = re.sub(r"^pretrained=(.*/(plgkwrobel|plggspkl)/)(models/)?", SPICHLERZ_ORG, org_and_model)
|
57 |
-
org_and_model = org_and_model.replace(",dtype=bfloat16", "")
|
58 |
|
59 |
-
org_and_model=org_and_model.replace("
|
60 |
-
org_and_model=org_and_model.replace("
|
|
|
|
|
|
|
61 |
|
62 |
org_and_model = re.sub(r"^pretrained=", "", org_and_model)
|
63 |
org_and_model = org_and_model.replace(",trust_remote_code=True", "")
|
64 |
org_and_model = re.sub(",prefix_token_id=\d+", "", org_and_model)
|
|
|
65 |
|
66 |
org_and_model = org_and_model.split("/", 1)
|
67 |
|
@@ -90,7 +96,8 @@ class EvalResult:
|
|
90 |
task = task.value
|
91 |
|
92 |
# We average all scores of a given metric (not all metrics are present in all files)
|
93 |
-
accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if
|
|
|
94 |
if accs.size == 0 or any([acc is None for acc in accs]):
|
95 |
continue
|
96 |
|
@@ -98,7 +105,8 @@ class EvalResult:
|
|
98 |
mean_acc = np.mean(accs)
|
99 |
else:
|
100 |
mean_acc = np.mean(accs) * 100.0
|
101 |
-
results[task.benchmark] = mean_acc
|
|
|
102 |
|
103 |
return self(
|
104 |
eval_name=result_key,
|
@@ -106,27 +114,27 @@ class EvalResult:
|
|
106 |
org=org,
|
107 |
model=model,
|
108 |
results=results,
|
109 |
-
precision=precision,
|
110 |
-
revision=
|
111 |
still_on_hub=still_on_hub,
|
112 |
architecture=architecture,
|
113 |
n_shot=NShotType.from_str(n_shot_num),
|
114 |
-
org_and_model=orig_org_and_model
|
|
|
115 |
)
|
116 |
|
117 |
def update_with_metadata(self, metadata):
|
118 |
-
#print('UPDATE', self.full_model, self.model, self.eval_name)
|
119 |
try:
|
120 |
-
meta=metadata[self.full_model]
|
121 |
self.model_type = ModelType.from_str(meta.get("type", "?"))
|
122 |
self.num_params = meta.get("params", 0)
|
123 |
self.license = meta.get("license", "?")
|
124 |
self.lang = meta.get("lang", "?")
|
125 |
-
#TODO desc name
|
126 |
except KeyError:
|
127 |
print(f"Could not find metadata for {self.full_model}")
|
128 |
|
129 |
-
|
130 |
def update_with_request_file(self, requests_path):
|
131 |
"""Finds the relevant request file for the current model and updates info with it"""
|
132 |
return
|
@@ -149,12 +157,18 @@ class EvalResult:
|
|
149 |
g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
|
150 |
mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
|
151 |
all_tasks = g_tasks + mc_tasks
|
152 |
-
average = sum([v for task,v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
|
153 |
-
average_g = sum([v for task,v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
|
154 |
-
average_mc = sum([v for task,v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
|
155 |
|
|
|
156 |
|
157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
# data_dict = {
|
159 |
# "eval_name": self.eval_name, # not a column, just a save name,
|
160 |
# AutoEvalColumn.precision.name: self.precision.value.name,
|
@@ -188,7 +202,6 @@ class EvalResult:
|
|
188 |
except KeyError:
|
189 |
print(f"Could not find model type")
|
190 |
|
191 |
-
|
192 |
try:
|
193 |
data_dict[AutoEvalColumn.model_type_symbol.name] = self.model_type.value.symbol
|
194 |
except KeyError:
|
@@ -209,7 +222,8 @@ class EvalResult:
|
|
209 |
print(f"AttributeError architecture")
|
210 |
|
211 |
try:
|
212 |
-
data_dict[AutoEvalColumn.model.name] = make_clickable_model(
|
|
|
213 |
except KeyError:
|
214 |
print(f"Could not find model")
|
215 |
|
@@ -305,8 +319,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
|
|
305 |
with open(tmp_request_file, "r") as f:
|
306 |
req_content = json.load(f)
|
307 |
if (
|
308 |
-
|
309 |
-
|
310 |
):
|
311 |
request_file = tmp_request_file
|
312 |
return request_file
|
@@ -330,30 +344,48 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
|
|
330 |
for file in files:
|
331 |
model_result_filepaths.append(os.path.join(root, file))
|
332 |
|
|
|
|
|
333 |
eval_results = {}
|
334 |
-
for n_shot in [0,5]:
|
335 |
for model_result_filepath in model_result_filepaths:
|
336 |
# Creation of result
|
337 |
eval_result = EvalResult.init_from_json_file(model_result_filepath, n_shot_num=n_shot)
|
338 |
eval_result.update_with_request_file(requests_path)
|
339 |
-
#update with metadata
|
340 |
eval_result.update_with_metadata(metadata)
|
341 |
|
342 |
-
|
343 |
# Store results of same eval together
|
344 |
eval_name = f"{eval_result.eval_name}_{n_shot}-shot"
|
345 |
if eval_name in eval_results.keys():
|
346 |
-
|
347 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
348 |
else:
|
349 |
eval_results[eval_name] = eval_result
|
350 |
|
|
|
|
|
|
|
351 |
results = []
|
352 |
for v in eval_results.values():
|
353 |
try:
|
354 |
print(v)
|
355 |
-
v.to_dict()
|
356 |
-
#if v.results:
|
357 |
results.append(v)
|
358 |
except KeyError: # not all eval values present
|
359 |
print(f"not all eval values present {v.eval_name} {v.full_model}")
|
@@ -370,7 +402,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
|
|
370 |
missing_results_for_task[task_name].append(f"{v.full_model}|{v.org_and_model}")
|
371 |
else:
|
372 |
missing_results_for_task[task_name] = [f"{v.full_model}|{v.org_and_model}"]
|
373 |
-
if r[AutoEvalColumn.lang.name] is None or r[AutoEvalColumn.lang.name]=="?":
|
374 |
missing_metadata.append(f"{v.full_model}")
|
375 |
|
376 |
# print('missing_results_for_task', missing_results_for_task)
|
@@ -386,5 +418,4 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
|
|
386 |
print(model)
|
387 |
print()
|
388 |
|
389 |
-
|
390 |
return results
|
|
|
14 |
|
15 |
NUM_FEWSHOT = 0
|
16 |
|
17 |
+
|
18 |
@dataclass
|
19 |
class EvalResult:
|
20 |
+
eval_name: str # org_model_precision (uid)
|
21 |
+
full_model: str # org/model (path on hub)
|
22 |
+
org: str
|
23 |
model: str
|
24 |
+
revision: str # commit hash, "" if main
|
25 |
results: dict
|
26 |
precision: Precision = Precision.Unknown
|
27 |
+
model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
|
28 |
+
weight_type: WeightType = WeightType.Original # Original or Adapter
|
29 |
+
architecture: str = "Unknown"
|
30 |
license: str = "?"
|
31 |
lang: str = "?"
|
32 |
likes: int = 0
|
33 |
num_params: int = 0
|
34 |
+
date: str = "" # submission date of request file
|
35 |
still_on_hub: bool = False
|
36 |
n_shot: NShotType = NShotType.n0
|
37 |
org_and_model: str = ""
|
38 |
+
start_date: float = 0
|
39 |
|
40 |
@classmethod
|
41 |
def init_from_json_file(self, json_filepath, n_shot_num):
|
|
|
45 |
|
46 |
config = data.get("config")
|
47 |
n_shot = data.get("n-shot")
|
48 |
+
start_date = data.get("date", 0)
|
49 |
|
50 |
# Precision
|
51 |
precision = Precision.from_str(config.get("model_dtype"))
|
|
|
57 |
|
58 |
if re.match(r"^pretrained=(.*/(plgkwrobel|plggspkl)/)(models/)?", org_and_model):
|
59 |
org_and_model = re.sub(r"^pretrained=(.*/(plgkwrobel|plggspkl)/)(models/)?", SPICHLERZ_ORG, org_and_model)
|
|
|
60 |
|
61 |
+
org_and_model = org_and_model.replace(",dtype=bfloat16", "")
|
62 |
+
org_and_model = org_and_model.replace(",dtype=float16", "")
|
63 |
+
|
64 |
+
org_and_model = org_and_model.replace("models/hf_v7_e1", "APT3-1B-Instruct-e1")
|
65 |
+
org_and_model = org_and_model.replace("models/hf_v7_e2", "APT3-1B-Instruct-e2")
|
66 |
|
67 |
org_and_model = re.sub(r"^pretrained=", "", org_and_model)
|
68 |
org_and_model = org_and_model.replace(",trust_remote_code=True", "")
|
69 |
org_and_model = re.sub(",prefix_token_id=\d+", "", org_and_model)
|
70 |
+
org_and_model = re.sub("/$", "", org_and_model)
|
71 |
|
72 |
org_and_model = org_and_model.split("/", 1)
|
73 |
|
|
|
96 |
task = task.value
|
97 |
|
98 |
# We average all scores of a given metric (not all metrics are present in all files)
|
99 |
+
accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if
|
100 |
+
task.benchmark == k and n_shot.get(k, -1) == n_shot_num])
|
101 |
if accs.size == 0 or any([acc is None for acc in accs]):
|
102 |
continue
|
103 |
|
|
|
105 |
mean_acc = np.mean(accs)
|
106 |
else:
|
107 |
mean_acc = np.mean(accs) * 100.0
|
108 |
+
results[task.benchmark] = (mean_acc, start_date)
|
109 |
+
# results[task.benchmark] = mean_acc
|
110 |
|
111 |
return self(
|
112 |
eval_name=result_key,
|
|
|
114 |
org=org,
|
115 |
model=model,
|
116 |
results=results,
|
117 |
+
precision=precision,
|
118 |
+
revision=config.get("model_sha", ""),
|
119 |
still_on_hub=still_on_hub,
|
120 |
architecture=architecture,
|
121 |
n_shot=NShotType.from_str(n_shot_num),
|
122 |
+
org_and_model=orig_org_and_model,
|
123 |
+
start_date=start_date
|
124 |
)
|
125 |
|
126 |
def update_with_metadata(self, metadata):
|
127 |
+
# print('UPDATE', self.full_model, self.model, self.eval_name)
|
128 |
try:
|
129 |
+
meta = metadata[self.full_model]
|
130 |
self.model_type = ModelType.from_str(meta.get("type", "?"))
|
131 |
self.num_params = meta.get("params", 0)
|
132 |
self.license = meta.get("license", "?")
|
133 |
self.lang = meta.get("lang", "?")
|
134 |
+
# TODO desc name
|
135 |
except KeyError:
|
136 |
print(f"Could not find metadata for {self.full_model}")
|
137 |
|
|
|
138 |
def update_with_request_file(self, requests_path):
|
139 |
"""Finds the relevant request file for the current model and updates info with it"""
|
140 |
return
|
|
|
157 |
g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
|
158 |
mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
|
159 |
all_tasks = g_tasks + mc_tasks
|
|
|
|
|
|
|
160 |
|
161 |
+
baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
|
162 |
|
163 |
+
average = sum([v for task, v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
|
164 |
+
average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
|
165 |
+
average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
|
166 |
+
|
167 |
+
# average = sum([(v-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
|
168 |
+
# average_g = sum([(v-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
|
169 |
+
# average_mc = sum([(v-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
|
170 |
+
|
171 |
+
data_dict = {}
|
172 |
# data_dict = {
|
173 |
# "eval_name": self.eval_name, # not a column, just a save name,
|
174 |
# AutoEvalColumn.precision.name: self.precision.value.name,
|
|
|
202 |
except KeyError:
|
203 |
print(f"Could not find model type")
|
204 |
|
|
|
205 |
try:
|
206 |
data_dict[AutoEvalColumn.model_type_symbol.name] = self.model_type.value.symbol
|
207 |
except KeyError:
|
|
|
222 |
print(f"AttributeError architecture")
|
223 |
|
224 |
try:
|
225 |
+
data_dict[AutoEvalColumn.model.name] = make_clickable_model(
|
226 |
+
self.full_model) if self.still_on_hub else self.full_model
|
227 |
except KeyError:
|
228 |
print(f"Could not find model")
|
229 |
|
|
|
319 |
with open(tmp_request_file, "r") as f:
|
320 |
req_content = json.load(f)
|
321 |
if (
|
322 |
+
req_content["status"] in ["FINISHED"]
|
323 |
+
and req_content["precision"] == precision.split(".")[-1]
|
324 |
):
|
325 |
request_file = tmp_request_file
|
326 |
return request_file
|
|
|
344 |
for file in files:
|
345 |
model_result_filepaths.append(os.path.join(root, file))
|
346 |
|
347 |
+
# print('PATHS:', model_result_filepaths)
|
348 |
+
|
349 |
eval_results = {}
|
350 |
+
for n_shot in [0, 5]:
|
351 |
for model_result_filepath in model_result_filepaths:
|
352 |
# Creation of result
|
353 |
eval_result = EvalResult.init_from_json_file(model_result_filepath, n_shot_num=n_shot)
|
354 |
eval_result.update_with_request_file(requests_path)
|
355 |
+
# update with metadata
|
356 |
eval_result.update_with_metadata(metadata)
|
357 |
|
|
|
358 |
# Store results of same eval together
|
359 |
eval_name = f"{eval_result.eval_name}_{n_shot}-shot"
|
360 |
if eval_name in eval_results.keys():
|
361 |
+
|
362 |
+
for k, (v, start_date) in eval_result.results.items():
|
363 |
+
if v is not None:
|
364 |
+
if k in eval_results[eval_name].results:
|
365 |
+
if start_date > eval_results[eval_name].results[k][1]:
|
366 |
+
print(
|
367 |
+
f"Overwriting {eval_name}.results {k} {eval_results[eval_name].results[k]} with {v}: {model_result_filepath} {n_shot} {eval_result.start_date} {eval_results[eval_name].start_date}")
|
368 |
+
eval_results[eval_name].results[k] = (v, start_date)
|
369 |
+
else:
|
370 |
+
print(
|
371 |
+
f"Skipping {eval_name} {eval_result.start_date} {eval_results[eval_name].start_date}: {model_result_filepath} {n_shot}")
|
372 |
+
else:
|
373 |
+
eval_results[eval_name].results[k] = (v, start_date)
|
374 |
+
# eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
|
375 |
+
# TODO: log updated
|
376 |
+
|
377 |
else:
|
378 |
eval_results[eval_name] = eval_result
|
379 |
|
380 |
+
for k,v in eval_results.items():
|
381 |
+
v.results = {k: v for k, (v, start_date) in v.results.items()}
|
382 |
+
|
383 |
results = []
|
384 |
for v in eval_results.values():
|
385 |
try:
|
386 |
print(v)
|
387 |
+
v.to_dict() # we test if the dict version is complete
|
388 |
+
# if v.results:
|
389 |
results.append(v)
|
390 |
except KeyError: # not all eval values present
|
391 |
print(f"not all eval values present {v.eval_name} {v.full_model}")
|
|
|
402 |
missing_results_for_task[task_name].append(f"{v.full_model}|{v.org_and_model}")
|
403 |
else:
|
404 |
missing_results_for_task[task_name] = [f"{v.full_model}|{v.org_and_model}"]
|
405 |
+
if r[AutoEvalColumn.lang.name] is None or r[AutoEvalColumn.lang.name] == "?":
|
406 |
missing_metadata.append(f"{v.full_model}")
|
407 |
|
408 |
# print('missing_results_for_task', missing_results_for_task)
|
|
|
418 |
print(model)
|
419 |
print()
|
420 |
|
|
|
421 |
return results
|