gardarjuto committed on
Commit
7fdb5f5
1 Parent(s): b61f534

fix: show partial results even if some evaluations haven't finished

Browse files
Files changed (3) hide show
  1. app.py +1 -1
  2. src/leaderboard/read_evals.py +4 -1
  3. src/populate.py +4 -5
app.py CHANGED
@@ -65,7 +65,7 @@ def update_table(
65
 
66
  def apply_format_styling(df: pd.DataFrame) -> style.Styler:
67
  df = df.style.format(
68
- {c: "{:.1f}" for c in BENCHMARK_COLS} | {AutoEvalColumn.average.name: "{:.2f}"}
69
  )
70
  return df
71
 
 
65
 
66
  def apply_format_styling(df: pd.DataFrame) -> style.Styler:
67
  df = df.style.format(
68
+ {c: "{:.1f}" for c in BENCHMARK_COLS} | {AutoEvalColumn.average.name: "{:.2f}"}, na_rep="-"
69
  )
70
  return df
71
 
src/leaderboard/read_evals.py CHANGED
@@ -124,7 +124,10 @@ class EvalResult:
124
  }
125
 
126
  for task in Tasks:
127
- data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
 
 
128
 
129
  return data_dict
130
 
 
124
  }
125
 
126
  for task in Tasks:
127
+ if task.value.benchmark in self.results.keys():
128
+ data_dict[task.value.col_name] = self.results[task.value.benchmark]
129
+ else:
130
+ data_dict[task.value.col_name] = None
131
 
132
  return data_dict
133
 
src/populate.py CHANGED
@@ -1,11 +1,12 @@
1
  import pandas as pd
2
 
3
- from src.display.formatting import has_no_nan_values
4
  from src.display.utils import AutoEvalColumn
5
- from src.leaderboard.read_evals import get_raw_eval_results
6
 
7
 
8
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
 
 
9
  """Creates a dataframe from all the individual experiment results"""
10
  raw_data = get_raw_eval_results(results_path, requests_path)
11
  all_data_json = [v.to_dict() for v in raw_data]
@@ -14,6 +15,4 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
14
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
15
  df = df[cols].round(decimals=2)
16
 
17
- # filter out if any of the benchmarks have not been produced
18
- df = df[has_no_nan_values(df, benchmark_cols)]
19
  return raw_data, df
 
1
  import pandas as pd
2
 
 
3
  from src.display.utils import AutoEvalColumn
4
+ from src.leaderboard.read_evals import get_raw_eval_results, EvalResult
5
 
6
 
7
+ def get_leaderboard_df(
8
+ results_path: str, requests_path: str, cols: list, benchmark_cols: list
9
+ ) -> tuple[list[EvalResult], pd.DataFrame]:
10
  """Creates a dataframe from all the individual experiment results"""
11
  raw_data = get_raw_eval_results(results_path, requests_path)
12
  all_data_json = [v.to_dict() for v in raw_data]
 
15
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
16
  df = df[cols].round(decimals=2)
17
 
 
 
18
  return raw_data, df