missing
Browse files
src/leaderboard/read_evals.py
CHANGED
@@ -447,6 +447,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
|
|
447 |
all_models = []
|
448 |
missing_results_for_task = {}
|
449 |
missing_metadata = []
|
|
|
450 |
for v in eval_results.values():
|
451 |
r = v.to_dict()
|
452 |
for task in Tasks:
|
@@ -454,8 +455,12 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
|
|
454 |
task_name = f"{r['n_shot']}|{task.value.benchmark}"
|
455 |
if task_name in missing_results_for_task:
|
456 |
missing_results_for_task[task_name].append(f"{v.full_model}|{v.org_and_model}")
|
|
|
|
|
457 |
else:
|
458 |
missing_results_for_task[task_name] = [f"{v.full_model}|{v.org_and_model}"]
|
|
|
|
|
459 |
if r[AutoEvalColumn.lang.name] is None or r[AutoEvalColumn.lang.name] == "?":
|
460 |
missing_metadata.append(f"{v.full_model}")
|
461 |
all_models.append((v.full_model, v.num_params, v.still_on_hub))
|
|
|
447 |
all_models = []
|
448 |
missing_results_for_task = {}
|
449 |
missing_metadata = []
|
450 |
+
for_run=[]
|
451 |
for v in eval_results.values():
|
452 |
r = v.to_dict()
|
453 |
for task in Tasks:
|
|
|
455 |
task_name = f"{r['n_shot']}|{task.value.benchmark}"
|
456 |
if task_name in missing_results_for_task:
|
457 |
missing_results_for_task[task_name].append(f"{v.full_model}|{v.org_and_model}")
|
458 |
+
if v.still_on_hub and task.value.benchmark in all_tasks:
|
459 |
+
print(f'batch start.sh "bash eval_model_task_bs1.sh {r["n_shot"]} {task.value.benchmark} {v.full_model}"')
|
460 |
else:
|
461 |
missing_results_for_task[task_name] = [f"{v.full_model}|{v.org_and_model}"]
|
462 |
+
if v.still_on_hub and task.value.benchmark in all_tasks:
|
463 |
+
print(f'batch start.sh "bash eval_model_task_bs1.sh {r["n_shot"]} {task.value.benchmark} {v.full_model}"')
|
464 |
if r[AutoEvalColumn.lang.name] is None or r[AutoEvalColumn.lang.name] == "?":
|
465 |
missing_metadata.append(f"{v.full_model}")
|
466 |
all_models.append((v.full_model, v.num_params, v.still_on_hub))
|