hamzabouajila committed
Commit ead2260 · 1 Parent(s): f12b6ec

fix: Correct evaluation result mapping and display

- Update evaluation functions to return results with dataset names as keys (see the sketch after this list)
- Modify read_evals.py to map metric values correctly to dataset names
- Improve leaderboard display by:
  - Increasing decimal precision to 4 places
  - Re-enabling NaN value filtering
  - Maintaining proper sorting by average score
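
For reference, a minimal sketch of the dataset-keyed results payload the first bullet describes; the dataset IDs are taken from the diffs below, while the scores are invented placeholders:

```python
# Hypothetical per-model results mapping after this commit (values are made up).
results = {
    "fbougares/tsac": 0.8123,                 # TSAC sentiment accuracy
    "arbml/Tunisian_Dialect_Corpus": 0.6481,  # Tunisian corpus token coverage
}
```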

src/evaluator/evaluate.py CHANGED
@@ -149,7 +149,7 @@ def evaluate_tsac_sentiment(model, tokenizer, device):
             print(f"Total predictions: {total}")
             print(f"Accuracy: {accuracy:.4f}")
 
-            return {"accuracy": accuracy}
+            return {"fbougares/tsac": accuracy}
     except Exception as e:
         print(f"\n=== Error in TSAC evaluation: {str(e)} ===")
         print(f"Full traceback: {traceback.format_exc()}")
@@ -187,7 +187,7 @@ def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
 
         coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
         print(f"Tunisian Corpus Coverage: {coverage:.2%}")
-        return {"coverage": coverage}
+        return {"arbml/Tunisian_Dialect_Corpus": coverage}
     except Exception as e:
         print(f"Error in Tunisian Corpus evaluation: {str(e)}")
         print(f"Full traceback: {traceback.format_exc()}")
@@ -289,8 +289,8 @@ def evaluate_model(model_name: str, revision: str, precision: str, weight_type:
             precision=precision,
             weight_type=weight_type,
             results={
-                **tsac_results,
-                **tunisian_results
+                Tasks.tsac_sentiment.value.metric: tsac_results.get(Tasks.tsac_sentiment.value.metric),
+                Tasks.tunisian_corpus.value.metric: tunisian_results.get(Tasks.tunisian_corpus.value.metric)
             }
         )
     except Exception as e:
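
The `evaluate_model` hunk above indexes a `Tasks` enum that is not part of this diff. A minimal sketch of the structure it appears to assume, in the style of the standard leaderboard template, is shown below; the dataset IDs and metric names mirror the diff, while the `Task` dataclass layout and the column titles are assumptions:

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # dataset identifier on the Hugging Face Hub
    metric: str     # key under which the score is reported
    col_name: str   # column title shown on the leaderboard

class Tasks(Enum):
    # Column titles are illustrative guesses; IDs and metrics follow the diff above.
    tsac_sentiment = Task("fbougares/tsac", "accuracy", "TSAC Sentiment")
    tunisian_corpus = Task("arbml/Tunisian_Dialect_Corpus", "coverage", "Tunisian Corpus Coverage")

# With this layout, Tasks.tsac_sentiment.value.metric == "accuracy", so the
# results dict built in evaluate_model is keyed by metric name.
```

Under that assumption, the keys produced by `evaluate_model` ("accuracy", "coverage") line up with the `self.results.get("accuracy")` and `self.results.get("coverage")` lookups added to read_evals.py below.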
src/leaderboard/read_evals.py CHANGED
@@ -154,9 +154,17 @@ class EvalResult:
             AutoEvalColumnInstance.still_on_hub.name: True if isinstance(self.still_on_hub, tuple) and self.still_on_hub[0] else False,
         }
 
+        # Map dataset names to their metric values
+        tsac_result = self.results.get("fbougares/tsac")
+        tunisian_result = self.results.get("arbml/Tunisian_Dialect_Corpus")
+
+        # Map metric values to their corresponding dataset names
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
+            if task.value.benchmark == "fbougares/tsac":
+                data_dict[task.value.col_name] = self.results.get("accuracy")
+            elif task.value.benchmark == "arbml/Tunisian_Dialect_Corpus":
+                data_dict[task.value.col_name] = self.results.get("coverage")
+        print("data_dict : ", data_dict)
         return data_dict
 
 
@@ -217,24 +225,26 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
 
         # Store results of same eval together
        eval_name = eval_result.eval_name
+        print("eval_name : ", eval_name)
        if eval_name in eval_results.keys():
            # If we already have results for this eval, append to list
            eval_results[eval_name].append(eval_result)
        else:
            # Initialize list for this eval name
            eval_results[eval_name] = [eval_result]
-
+        print("eval_results : ", eval_results)
    # Process final results
    final_results = {}
    for eval_name, eval_list in eval_results.items():
        # Create merged results from all evaluations, ensuring all required task keys are present
-        merged_results = {task.value.benchmark: None for task in Tasks}
+        merged_results = {task.value.metric: None for task in Tasks}
        for eval_result in eval_list:
            merged_results.update({k: v for k, v in eval_result.results.items() if v is not None})
 
        # Take the first eval_result as base and update with merged results
-        print("evaluation list : ", eval_list)
+        print("evaluation list : ", len(eval_list))
        base_result = eval_list[0]
+        print("base_result : ", base_result)
        # print(base_result)
        final_results[eval_name] = EvalResult(
            eval_name=eval_name,
@@ -249,12 +259,12 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
            date=base_result.date,
            still_on_hub=base_result.still_on_hub
        )
-        print(final_results)
+        print(len(final_results))
+        print(final_results.keys())
+        print(final_results.values())
 
    results = []
    for v in final_results.values():
-        print("v : ",v)
-        print("Merged results: ", v.results)
        try:
            v.to_dict()  # we test if the dict version is complete
            results.append(v)
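
To make the merge step in `get_raw_eval_results` concrete, here is a small, self-contained illustration of the pattern the hunk above keeps: seed one `None` slot per task metric, then overlay every non-`None` score from the partial results. The data is invented; only the dict-merging logic mirrors the diff:

```python
# Toy stand-ins for eval_result.results across several partial EvalResult objects.
partial_results = [
    {"accuracy": 0.81, "coverage": None},  # run that only produced the TSAC score
    {"coverage": 0.64},                    # run that only produced the coverage score
]

# One slot per task metric, mirroring {task.value.metric: None for task in Tasks}.
merged_results = {"accuracy": None, "coverage": None}
for result in partial_results:
    merged_results.update({k: v for k, v in result.items() if v is not None})

print(merged_results)  # {'accuracy': 0.81, 'coverage': 0.64}
```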
src/populate.py CHANGED
@@ -20,11 +20,9 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
        print("No evaluation results found. Returning empty DataFrame with correct columns.")
        return pd.DataFrame(columns=cols)
    df = df.sort_values(by=[AutoEvalColumn().average.name], ascending=False)
-    print(df)
-    df = df[cols].round(decimals=2)
-    print(df)
-    # df = df[has_no_nan_values(df, benchmark_cols)]
    # print(df)
+    df = df[cols].round(decimals=4)
+    df = df[has_no_nan_values(df, benchmark_cols)]
    return df
 
 
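
The re-enabled filter calls a `has_no_nan_values` helper that is not shown in this diff. In the stock leaderboard template it is a small boolean mask along the lines sketched below; treat the body as an assumption rather than the project's actual code:

```python
import pandas as pd

def has_no_nan_values(df: pd.DataFrame, columns: list) -> pd.Series:
    # True for rows whose benchmark columns are all populated.
    return df[columns].notna().all(axis=1)

# Example: a row missing a benchmark score is dropped before display.
df = pd.DataFrame({"Average": [0.72, 0.55], "TSAC Sentiment": [0.81, None]})
print(df[has_no_nan_values(df, ["TSAC Sentiment"])])
```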