Commit ead2260
Parent(s): f12b6ec

fix: Correct evaluation result mapping and display
- Update evaluation functions to return results with dataset names as keys
- Modify read_evals.py to map metric values correctly to dataset names
- Improve leaderboard display by:
  - Increasing decimal precision to 4 places
  - Re-enabling NaN value filtering
  - Maintaining proper sorting by average score
- src/evaluator/evaluate.py +4 -4
- src/leaderboard/read_evals.py +18 -8
- src/populate.py +2 -4
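For reference, a minimal sketch of the result shape the first bullet describes (the numeric values below are illustrative placeholders, not real scores): each evaluation function now returns a dict keyed by the Hub dataset name rather than by the metric name.

    # Illustrative only: the per-task result shape after this commit.
    tsac_results = {"fbougares/tsac": 0.81}                      # sentiment accuracy
    tunisian_results = {"arbml/Tunisian_Dialect_Corpus": 0.94}   # token coverage

    # Downstream code can look a score up by dataset name instead of metric name.
    for dataset_name, score in {**tsac_results, **tunisian_results}.items():
        print(f"{dataset_name}: {score:.4f}")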
src/evaluator/evaluate.py CHANGED

@@ -149,7 +149,7 @@ def evaluate_tsac_sentiment(model, tokenizer, device):
         print(f"Total predictions: {total}")
         print(f"Accuracy: {accuracy:.4f}")

-        return {"
+        return {"fbougares/tsac": accuracy}
     except Exception as e:
         print(f"\n=== Error in TSAC evaluation: {str(e)} ===")
         print(f"Full traceback: {traceback.format_exc()}")
@@ -187,7 +187,7 @@ def evaluate_tunisian_corpus_coverage(model, tokenizer, device):

         coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
         print(f"Tunisian Corpus Coverage: {coverage:.2%}")
-        return {"
+        return {"arbml/Tunisian_Dialect_Corpus": coverage}
     except Exception as e:
         print(f"Error in Tunisian Corpus evaluation: {str(e)}")
         print(f"Full traceback: {traceback.format_exc()}")
@@ -289,8 +289,8 @@ def evaluate_model(model_name: str, revision: str, precision: str, weight_type:
             precision=precision,
             weight_type=weight_type,
             results={
-
-
+                Tasks.tsac_sentiment.value.metric: tsac_results.get(Tasks.tsac_sentiment.value.metric),
+                Tasks.tunisian_corpus.value.metric: tunisian_results.get(Tasks.tunisian_corpus.value.metric)
             }
         )
     except Exception as e:
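The hunks above reference Tasks.*.value.metric, and the read_evals.py change below reads task.value.benchmark and task.value.col_name, which suggests a small Task dataclass wrapped in an enum. A hedged sketch of that assumed structure (the real definitions live elsewhere in the repo; the column labels here are made up for illustration):

    # Hedged sketch of the Task/Tasks structure this diff appears to rely on.
    # Field names come from the diff; the concrete labels are assumptions.
    from dataclasses import dataclass
    from enum import Enum

    @dataclass
    class Task:
        benchmark: str   # Hub dataset name, e.g. "fbougares/tsac"
        metric: str      # key used for the stored metric value
        col_name: str    # column label shown on the leaderboard

    class Tasks(Enum):
        tsac_sentiment = Task("fbougares/tsac", "accuracy", "TSAC Sentiment")
        tunisian_corpus = Task("arbml/Tunisian_Dialect_Corpus", "coverage", "Tunisian Corpus Coverage")

    # Under this assumption, the evaluation functions key their return values by
    # Task.benchmark (the dataset name), while the results dict passed to the
    # eval entry above is keyed by Task.metric.
    print(Tasks.tsac_sentiment.value.metric)   # -> "accuracy"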
src/leaderboard/read_evals.py CHANGED

@@ -154,9 +154,17 @@ class EvalResult:
             AutoEvalColumnInstance.still_on_hub.name: True if isinstance(self.still_on_hub, tuple) and self.still_on_hub[0] else False,
         }

+        # Map dataset names to their metric values
+        tsac_result = self.results.get("fbougares/tsac")
+        tunisian_result = self.results.get("arbml/Tunisian_Dialect_Corpus")
+
+        # Map metric values to their corresponding dataset names
         for task in Tasks:
-
-
+            if task.value.benchmark == "fbougares/tsac":
+                data_dict[task.value.col_name] = self.results.get("accuracy")
+            elif task.value.benchmark == "arbml/Tunisian_Dialect_Corpus":
+                data_dict[task.value.col_name] = self.results.get("coverage")
+        print("data_dict : ", data_dict)
         return data_dict


@@ -217,24 +225,26 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:

         # Store results of same eval together
         eval_name = eval_result.eval_name
+        print("eval_name : ", eval_name)
         if eval_name in eval_results.keys():
             # If we already have results for this eval, append to list
             eval_results[eval_name].append(eval_result)
         else:
             # Initialize list for this eval name
             eval_results[eval_name] = [eval_result]
-
+        print("eval_results : ", eval_results)
     # Process final results
     final_results = {}
     for eval_name, eval_list in eval_results.items():
         # Create merged results from all evaluations, ensuring all required task keys are present
-        merged_results = {task.value.
+        merged_results = {task.value.metric: None for task in Tasks}
         for eval_result in eval_list:
             merged_results.update({k: v for k, v in eval_result.results.items() if v is not None})

         # Take the first eval_result as base and update with merged results
-        print("evaluation list : ", eval_list)
+        print("evaluation list : ", len(eval_list))
         base_result = eval_list[0]
+        print("base_result : ", base_result)
         # print(base_result)
         final_results[eval_name] = EvalResult(
             eval_name=eval_name,
@@ -249,12 +259,12 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
             date=base_result.date,
             still_on_hub=base_result.still_on_hub
         )
-    print(final_results)
+    print(len(final_results))
+    print(final_results.keys())
+    print(final_results.values())

    results = []
    for v in final_results.values():
-        print("v : ",v)
-        print("Merged results: ", v.results)
        try:
            v.to_dict() # we test if the dict version is complete
            results.append(v)
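To make the merge step above concrete, here is a small self-contained sketch using toy values (not real leaderboard data); task_metrics stands in for the metric keys drawn from Tasks:

    # Toy illustration of the merge logic in get_raw_eval_results: start from a
    # dict with a None entry per task metric, then overlay non-None values from
    # each partial result that shares the same eval name.
    task_metrics = ["accuracy", "coverage"]      # stand-in for {t.value.metric for t in Tasks}

    partial_results = [
        {"accuracy": 0.81, "coverage": None},    # e.g. from one results file
        {"accuracy": None, "coverage": 0.94},    # e.g. from another run of the same eval
    ]

    merged_results = {metric: None for metric in task_metrics}
    for result in partial_results:
        merged_results.update({k: v for k, v in result.items() if v is not None})

    print(merged_results)   # {'accuracy': 0.81, 'coverage': 0.94}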
src/populate.py CHANGED

@@ -20,11 +20,9 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list
        print("No evaluation results found. Returning empty DataFrame with correct columns.")
        return pd.DataFrame(columns=cols)
    df = df.sort_values(by=[AutoEvalColumn().average.name], ascending=False)
-    print(df)
-    df = df[cols].round(decimals=2)
-    print(df)
-    # df = df[has_no_nan_values(df, benchmark_cols)]
    # print(df)
+    df = df[cols].round(decimals=4)
+    df = df[has_no_nan_values(df, benchmark_cols)]
    return df
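A brief sketch of the rounding and NaN filtering this change re-enables, on a toy DataFrame; has_no_nan_values is re-implemented here only for illustration, since the project ships its own helper:

    import pandas as pd

    def has_no_nan_values(df: pd.DataFrame, columns: list) -> pd.Series:
        # Illustrative stand-in for the project's helper: True for rows whose
        # benchmark columns are all populated.
        return df[columns].notna().all(axis=1)

    benchmark_cols = ["TSAC Sentiment", "Tunisian Corpus Coverage"]
    cols = ["Model", "Average"] + benchmark_cols
    df = pd.DataFrame(
        {
            "Model": ["model-a", "model-b"],
            "Average": [0.87654321, 0.5],
            "TSAC Sentiment": [0.81234567, None],   # model-b is missing a score
            "Tunisian Corpus Coverage": [0.94, 0.60],
        }
    )

    df = df.sort_values(by=["Average"], ascending=False)
    df = df[cols].round(decimals=4)                 # 4 decimal places, as in the new code
    df = df[has_no_nan_values(df, benchmark_cols)]  # drop rows with missing benchmark scores
    print(df)                                       # only model-a remains, scores rounded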