djstrong committed on
Commit
8445932
1 Parent(s): 8d84e4a
Files changed (1) hide show
  1. src/leaderboard/read_evals.py +6 -0
src/leaderboard/read_evals.py CHANGED
@@ -450,6 +450,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
450
  for_run=[]
451
  for v in eval_results.values():
452
  r = v.to_dict()
 
453
  for task in Tasks:
454
  if r[task.value.col_name] is None:
455
  task_name = f"{r['n_shot']}|{task.value.benchmark}"
@@ -457,12 +458,17 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
457
  missing_results_for_task[task_name].append(f"{v.full_model}|{v.org_and_model}")
458
  if v.still_on_hub and task.value.benchmark in all_tasks:
459
  for_run.append([r["n_shot"], task.value.benchmark, v.full_model])
 
460
  # print(f'sbatch start.sh "bash eval_model_task_bs1.sh {r["n_shot"]} {task.value.benchmark} {v.full_model}"')
461
  else:
462
  missing_results_for_task[task_name] = [f"{v.full_model}|{v.org_and_model}"]
463
  if v.still_on_hub and task.value.benchmark in all_tasks:
464
  for_run.append([r["n_shot"], task.value.benchmark, v.full_model])
 
465
  # print(f'sbatch start.sh "bash eval_model_task_bs1.sh {r["n_shot"]} {task.value.benchmark} {v.full_model}"')
 
 
 
466
  if r[AutoEvalColumn.lang.name] is None or r[AutoEvalColumn.lang.name] == "?":
467
  missing_metadata.append(f"{v.full_model}")
468
  all_models.append((v.full_model, v.num_params, v.still_on_hub))
 
450
  for_run=[]
451
  for v in eval_results.values():
452
  r = v.to_dict()
453
+ in_progress=False
454
  for task in Tasks:
455
  if r[task.value.col_name] is None:
456
  task_name = f"{r['n_shot']}|{task.value.benchmark}"
 
458
  missing_results_for_task[task_name].append(f"{v.full_model}|{v.org_and_model}")
459
  if v.still_on_hub and task.value.benchmark in all_tasks:
460
  for_run.append([r["n_shot"], task.value.benchmark, v.full_model])
461
+ in_progress=True
462
  # print(f'sbatch start.sh "bash eval_model_task_bs1.sh {r["n_shot"]} {task.value.benchmark} {v.full_model}"')
463
  else:
464
  missing_results_for_task[task_name] = [f"{v.full_model}|{v.org_and_model}"]
465
  if v.still_on_hub and task.value.benchmark in all_tasks:
466
  for_run.append([r["n_shot"], task.value.benchmark, v.full_model])
467
+ in_progress=True
468
  # print(f'sbatch start.sh "bash eval_model_task_bs1.sh {r["n_shot"]} {task.value.benchmark} {v.full_model}"')
469
+ if in_progress:
470
+ v.model = '⚠️' + v.model
471
+
472
  if r[AutoEvalColumn.lang.name] is None or r[AutoEvalColumn.lang.name] == "?":
473
  missing_metadata.append(f"{v.full_model}")
474
  all_models.append((v.full_model, v.num_params, v.still_on_hub))