yzabc007 committed on
Commit
b47be80
•
1 Parent(s): d4a35ea

Update space

Browse files
Files changed (4) hide show
  1. app.py +3 -3
  2. src/display/utils.py +3 -2
  3. src/leaderboard/read_evals.py +13 -12
  4. src/populate.py +12 -4
app.py CHANGED
@@ -96,8 +96,8 @@ def init_leaderboard(dataframe):
96
  interactive=False,
97
  )
98
 
99
-
100
- model_leaderboard_df = get_model_leaderboard_df()
101
 
102
  def overall_leaderboard(dataframe):
103
  if dataframe is None or dataframe.empty:
@@ -129,7 +129,7 @@ with demo:
129
 
130
 
131
  with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
132
- leaderboard = overall_leaderboard(LEADERBOARD_DF)
133
 
134
  with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
135
 
 
96
  interactive=False,
97
  )
98
 
99
+ model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
100
+ model_leaderboard_df = get_model_leaderboard_df(model_result_path)
101
 
102
  def overall_leaderboard(dataframe):
103
  if dataframe is None or dataframe.empty:
 
129
 
130
 
131
  with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
132
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
133
 
134
  with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
135
 
src/display/utils.py CHANGED
@@ -80,9 +80,9 @@ auto_eval_column_dict.append(["revision", ColumnContent, field(default_factory=l
80
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
81
  AutoEvalColumn = AutoEvalColumn()
82
  # print all attributes of AutoEvalColumn
83
- print(AutoEvalColumn.__annotations__.keys())
84
  # print precision attribute
85
- print(AutoEvalColumn.precision)
86
 
87
 
88
  ## For the queue columns in the submission tab
@@ -144,6 +144,7 @@ class Precision(Enum):
144
 
145
  # Column selection
146
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
147
 
148
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
149
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
80
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
81
  AutoEvalColumn = AutoEvalColumn()
82
  # print all attributes of AutoEvalColumn
83
+ # print(AutoEvalColumn.__annotations__.keys())
84
  # print precision attribute
85
+ # print(AutoEvalColumn.precision)
86
 
87
 
88
  ## For the queue columns in the submission tab
 
144
 
145
  # Column selection
146
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
147
+ # print(COLS)
148
 
149
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
150
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
src/leaderboard/read_evals.py CHANGED
@@ -53,7 +53,8 @@ class ModelResult:
53
  def to_dict(self):
54
  """Converts the Eval Result to a dict compatible with our dataframe display"""
55
 
56
- average = 1 / self.results[Domains.dim0.dimension] if self.results[Domains.dim0.dimension] != 0 else 0
 
57
  # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
58
  data_dict = {
59
  "eval_name": self.eval_name, # not a column, just a save name,
@@ -62,16 +63,16 @@ class ModelResult:
62
  AutoEvalColumn.organization.name: self.org,
63
  AutoEvalColumn.knowledge_cutoff.name: self.knowledge_cutoff,
64
 
65
- AutoEvalColumn.precision.name: self.precision.value.name,
66
- AutoEvalColumn.model_type.name: self.model_type.value.name,
67
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
68
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
69
- AutoEvalColumn.architecture.name: self.architecture,
70
- AutoEvalColumn.revision.name: self.revision,
71
- AutoEvalColumn.average.name: average,
72
- AutoEvalColumn.likes.name: self.likes,
73
- AutoEvalColumn.params.name: self.num_params,
74
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
75
  }
76
 
77
  for task in Tasks:
@@ -180,7 +181,7 @@ class EvalResult:
180
  def to_dict(self):
181
  """Converts the Eval Result to a dict compatible with our dataframe display"""
182
  average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
183
- print(AutoEvalColumn.precision.name, self.precision.value.name)
184
  data_dict = {
185
  "eval_name": self.eval_name, # not a column, just a save name,
186
  AutoEvalColumn.precision.name: self.precision.value.name,
 
53
  def to_dict(self):
54
  """Converts the Eval Result to a dict compatible with our dataframe display"""
55
 
56
+ # average = 1 / self.results[Domains.dim0.dimension] if self.results[Domains.dim0.dimension] != 0 else 0
57
+ average = 1
58
  # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
59
  data_dict = {
60
  "eval_name": self.eval_name, # not a column, just a save name,
 
63
  AutoEvalColumn.organization.name: self.org,
64
  AutoEvalColumn.knowledge_cutoff.name: self.knowledge_cutoff,
65
 
66
+ # AutoEvalColumn.precision.name: self.precision.value.name,
67
+ # AutoEvalColumn.model_type.name: self.model_type.value.name,
68
+ # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
69
+ # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
70
+ # AutoEvalColumn.architecture.name: self.architecture,
71
+ # AutoEvalColumn.revision.name: self.revision,
72
+ # AutoEvalColumn.average.name: average,
73
+ # AutoEvalColumn.likes.name: self.likes,
74
+ # AutoEvalColumn.params.name: self.num_params,
75
+ # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
76
  }
77
 
78
  for task in Tasks:
 
181
  def to_dict(self):
182
  """Converts the Eval Result to a dict compatible with our dataframe display"""
183
  average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
184
+ # print(AutoEvalColumn.precision.name, self.precision.value.name)
185
  data_dict = {
186
  "eval_name": self.eval_name, # not a column, just a save name,
187
  AutoEvalColumn.precision.name: self.precision.value.name,
src/populate.py CHANGED
@@ -8,14 +8,18 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
  from src.leaderboard.read_evals import get_raw_eval_results, get_raw_model_results
9
 
10
 
11
- def get_model_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
13
  raw_data = get_raw_model_results(results_path)
14
  all_data_json = [v.to_dict() for v in raw_data]
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
- df = df[cols].round(decimals=2)
 
 
 
 
19
 
20
  # filter out if any of the benchmarks have not been produced
21
  # df = df[has_no_nan_values(df, benchmark_cols)]
@@ -31,7 +35,11 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
31
 
32
  df = pd.DataFrame.from_records(all_data_json)
33
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
34
- df = df[cols].round(decimals=2)
 
 
 
 
35
 
36
  # filter out if any of the benchmarks have not been produced
37
  df = df[has_no_nan_values(df, benchmark_cols)]
 
8
  from src.leaderboard.read_evals import get_raw_eval_results, get_raw_model_results
9
 
10
 
11
+ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: list=[], benchmark_cols: list=[]) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
13
  raw_data = get_raw_model_results(results_path)
14
  all_data_json = [v.to_dict() for v in raw_data]
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
+ # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
+ for col in cols:
19
+ if col not in df.columns:
20
+ df[col] = None
21
+ else:
22
+ df = df[cols].round(decimals=2)
23
 
24
  # filter out if any of the benchmarks have not been produced
25
  # df = df[has_no_nan_values(df, benchmark_cols)]
 
35
 
36
  df = pd.DataFrame.from_records(all_data_json)
37
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
38
+ for col in cols:
39
+ if col not in df.columns:
40
+ df[col] = None
41
+ else:
42
+ df[col] = df[col].round(decimals=2)
43
 
44
  # filter out if any of the benchmarks have not been produced
45
  df = df[has_no_nan_values(df, benchmark_cols)]