mirageco committed
Commit 91f5d94
1 Parent(s): b28b782

Add average checkbox for each individual category

Files changed (4)
  1. app.py +7 -4
  2. src/display/utils.py +10 -0
  3. src/leaderboard/read_evals.py +54 -5
  4. src/populate.py +20 -4
app.py CHANGED
@@ -101,17 +101,20 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
         AutoEvalColumn.model.name,
     ]

-    # Ensure no duplicates when never_hidden and displayed_by_default are both True
+    # Ensure no duplicates and add the new average columns
     unique_columns = set(always_here_cols + columns)

     # We use COLS to maintain sorting
-    filtered_df = df[
-        [c for c in COLS if c in df.columns and c in unique_columns]
-    ]
+    filtered_df = df[[c for c in COLS if c in df.columns and c in unique_columns]]
+
+    # Debugging print to see if the new columns are included
+    print(f"Columns included in DataFrame: {filtered_df.columns.tolist()}")
+
     return filtered_df



+
 def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
     final_df = []
     if query != "":
src/display/utils.py CHANGED
@@ -27,7 +27,17 @@ auto_eval_column_dict = []
 # Model Information
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, category="Model Information", never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, category="Model Information", never_hidden=True)])
+
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True, category="Model Information")])
+auto_eval_column_dict.append(["average_IE", ColumnContent, ColumnContent("Average IE ⬆️", "number", False, category="Information Extraction (IE)")])
+auto_eval_column_dict.append(["average_TA", ColumnContent, ColumnContent("Average TA ⬆️", "number", False, category="Textual Analysis (TA)")])
+auto_eval_column_dict.append(["average_QA", ColumnContent, ColumnContent("Average QA ⬆️", "number", False, category="Question Answering (QA)")])
+auto_eval_column_dict.append(["average_TG", ColumnContent, ColumnContent("Average TG ⬆️", "number", False, category="Text Generation (TG)")])
+auto_eval_column_dict.append(["average_RM", ColumnContent, ColumnContent("Average RM ⬆️", "number", False, category="Risk Management (RM)")])
+auto_eval_column_dict.append(["average_FO", ColumnContent, ColumnContent("Average FO ⬆️", "number", False, category="Forecasting (FO)")])
+auto_eval_column_dict.append(["average_DM", ColumnContent, ColumnContent("Average DM ⬆️", "number", False, category="Decision-Making (DM)")])
+auto_eval_column_dict.append(["average_Spanish", ColumnContent, ColumnContent("Average Spanish ⬆️", "number", False, category="Spanish")])
+
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False, category="Model Information")])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False, category="Model Information")])
 auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, category="Model Information", hidden=True)])
src/leaderboard/read_evals.py CHANGED
@@ -117,8 +117,53 @@ class EvalResult:

     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-        data_dict = {
+
+        # Initialize category averages
+        category_averages = {
+            "average_IE": [],
+            "average_TA": [],
+            "average_QA": [],
+            "average_TG": [],
+            "average_RM": [],
+            "average_FO": [],
+            "average_DM": [],
+            "average_Spanish": []
+        }
+
+        # Calculate averages for each task
+        for task in Tasks:
+            score = self.results.get(task.value.benchmark)
+            if score is not None:
+                # Append score to the appropriate category
+                if task.value.category == "Information Extraction (IE)":
+                    category_averages["average_IE"].append(score)
+                elif task.value.category == "Textual Analysis (TA)":
+                    category_averages["average_TA"].append(score)
+                elif task.value.category == "Question Answering (QA)":
+                    category_averages["average_QA"].append(score)
+                elif task.value.category == "Text Generation (TG)":
+                    category_averages["average_TG"].append(score)
+                elif task.value.category == "Risk Management (RM)":
+                    category_averages["average_RM"].append(score)
+                elif task.value.category == "Forecasting (FO)":
+                    category_averages["average_FO"].append(score)
+                elif task.value.category == "Decision-Making (DM)":
+                    category_averages["average_DM"].append(score)
+                elif task.value.category == "Spanish":
+                    category_averages["average_Spanish"].append(score)
+
+        # Calculate the mean for each category and add to data_dict
+        data_dict = {}
+        for category, scores in category_averages.items():
+            average = sum(scores) / len(scores) if scores else 0
+            data_dict[category] = average
+
+        # Overall average
+        total_scores = [v for v in self.results.values() if v is not None]
+        overall_average = sum(total_scores) / len(total_scores) if total_scores else 0
+
+        # Add other columns
+        data_dict.update({
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
@@ -127,19 +172,23 @@ class EvalResult:
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
+            AutoEvalColumn.average.name: overall_average,
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-        }
+        })

+        # Add task results to the data dictionary
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+            data_dict[task.value.col_name] = self.results.get(task.value.benchmark)

         return data_dict


+
+
+
 def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     request_files = os.path.join(
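
As a side note, the if/elif chain above keys each score by task.value.category; the same grouping can be expressed with a lookup table. A self-contained sketch with toy scores (the real Tasks enum and its category strings are assumptions standing in for the repo's definitions):

# Map the category strings used in Tasks to the internal average keys (assumed to match exactly).
CATEGORY_TO_KEY = {
    "Information Extraction (IE)": "average_IE",
    "Textual Analysis (TA)": "average_TA",
    "Question Answering (QA)": "average_QA",
    "Text Generation (TG)": "average_TG",
    "Risk Management (RM)": "average_RM",
    "Forecasting (FO)": "average_FO",
    "Decision-Making (DM)": "average_DM",
    "Spanish": "average_Spanish",
}

# Toy benchmark scores and categories standing in for self.results and Tasks.
results = {"task_a": 55.0, "task_b": 45.0, "task_c": 70.0, "task_d": None}
task_category = {"task_a": "Spanish", "task_b": "Spanish",
                 "task_c": "Forecasting (FO)", "task_d": "Forecasting (FO)"}

category_scores = {key: [] for key in CATEGORY_TO_KEY.values()}
for benchmark, score in results.items():
    if score is not None:
        category_scores[CATEGORY_TO_KEY[task_category[benchmark]]].append(score)

# Empty categories fall back to 0, mirroring the `if scores else 0` guard above.
category_averages = {k: sum(v) / len(v) if v else 0 for k, v in category_scores.items()}
print(category_averages["average_Spanish"], category_averages["average_FO"])  # 50.0 70.0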
src/populate.py CHANGED
@@ -1,6 +1,5 @@
 import json
 import os
-
 import pandas as pd

 from src.display.formatting import has_no_nan_values, make_clickable_model
@@ -14,8 +13,23 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict() for v in raw_data]

     df = pd.DataFrame.from_records(all_data_json)
+
+    # Add category average columns with default values
+    category_avg_columns = {
+        "Average IE ⬆️": "average_IE",
+        "Average TA ⬆️": "average_TA",
+        "Average QA ⬆️": "average_QA",
+        "Average TG ⬆️": "average_TG",
+        "Average RM ⬆️": "average_RM",
+        "Average FO ⬆️": "average_FO",
+        "Average DM ⬆️": "average_DM",
+        "Average Spanish ⬆️": "average_Spanish"
+    }
+
+    for display_name, internal_name in category_avg_columns.items():
+        df[display_name] = df[internal_name]
+
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)

     # Apply the transformation for MCC values
     mcc_tasks = ["German", "Australian", "LendingClub", "ccf", "ccfraud", "polish", "taiwan", "portoseguro", "travelinsurance"]
@@ -23,14 +37,16 @@
         if task in df.columns:
             df[task] = (df[task] + 100) / 2.0

+    # Now, select the columns that were passed to the function
+    df = df[cols].round(decimals=2)
+
     # Filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df


-
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requestes"""
+    """Creates the different dataframes for the evaluation queues requests"""
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
     all_evals = []

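
For reference, to_dict() now emits the per-category averages under internal keys (average_IE, ...), while cols and the display layer use the decorated names (Average IE ⬆️, ...), so get_leaderboard_df copies each internal column to its display name before df[cols] is applied. A toy sketch of that aliasing step (values made up):

import pandas as pd

# Records as produced by EvalResult.to_dict(), keyed by the internal names (toy values).
df = pd.DataFrame.from_records([{"average_IE": 61.34, "average_TA": 47.91}])

category_avg_columns = {"Average IE ⬆️": "average_IE", "Average TA ⬆️": "average_TA"}
for display_name, internal_name in category_avg_columns.items():
    df[display_name] = df[internal_name]   # copy, so both names remain available

cols = ["Average IE ⬆️", "Average TA ⬆️"]  # display-layer column selection
print(df[cols].round(decimals=2))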