Add average checkbox for each individual category
- app.py +7 -4
- src/display/utils.py +10 -0
- src/leaderboard/read_evals.py +54 -5
- src/populate.py +20 -4
app.py
CHANGED
@@ -101,17 +101,20 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
         AutoEvalColumn.model.name,
     ]
 
-    # Ensure no duplicates
+    # Ensure no duplicates and add the new average columns
     unique_columns = set(always_here_cols + columns)
 
     # We use COLS to maintain sorting
-    filtered_df = df[
-        …
-        …
+    filtered_df = df[[c for c in COLS if c in df.columns and c in unique_columns]]
+
+    # Debugging print to see if the new columns are included
+    print(f"Columns included in DataFrame: {filtered_df.columns.tolist()}")
+
     return filtered_df
 
 
 
+
 def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
     final_df = []
     if query != "":
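The new select_columns body is essentially an order-preserving intersection: COLS fixes the display order, and a column survives only if it exists in the dataframe and was either always-on or ticked by the user. A minimal standalone sketch of that behaviour (the column names below are illustrative stand-ins, not the leaderboard's exact COLS list):

# Illustrative stand-ins for the real COLS / always_here_cols definitions in app.py
COLS = ["T", "Model", "Average ⬆️", "Average IE ⬆️", "Average TA ⬆️", "#Params (B)"]
always_here_cols = ["T", "Model"]

def select_columns_sketch(df_columns: list, picked: list) -> list:
    unique_columns = set(always_here_cols + picked)
    # Iterate over COLS so the original ordering is kept, then filter
    return [c for c in COLS if c in df_columns and c in unique_columns]

print(select_columns_sketch(COLS, ["Average IE ⬆️"]))
# ['T', 'Model', 'Average IE ⬆️']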
src/display/utils.py
CHANGED
@@ -27,7 +27,17 @@ auto_eval_column_dict = []
 # Model Information
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, category="Model Information", never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, category="Model Information", never_hidden=True)])
+
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True, category="Model Information")])
+auto_eval_column_dict.append(["average_IE", ColumnContent, ColumnContent("Average IE ⬆️", "number", False, category="Information Extraction (IE)")])
+auto_eval_column_dict.append(["average_TA", ColumnContent, ColumnContent("Average TA ⬆️", "number", False, category="Textual Analysis (TA)")])
+auto_eval_column_dict.append(["average_QA", ColumnContent, ColumnContent("Average QA ⬆️", "number", False, category="Question Answering (QA)")])
+auto_eval_column_dict.append(["average_TG", ColumnContent, ColumnContent("Average TG ⬆️", "number", False, category="Text Generation (TG)")])
+auto_eval_column_dict.append(["average_RM", ColumnContent, ColumnContent("Average RM ⬆️", "number", False, category="Risk Management (RM)")])
+auto_eval_column_dict.append(["average_FO", ColumnContent, ColumnContent("Average FO ⬆️", "number", False, category="Forecasting (FO)")])
+auto_eval_column_dict.append(["average_DM", ColumnContent, ColumnContent("Average DM ⬆️", "number", False, category="Decision-Making (DM)")])
+auto_eval_column_dict.append(["average_Spanish", ColumnContent, ColumnContent("Average Spanish ⬆️", "number", False, category="Spanish")])
+
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False, category="Model Information")])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False, category="Model Information")])
 auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, category="Model Information", hidden=True)])
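The ColumnContent definition itself is not part of this diff; the appends above assume it accepts a category keyword used to group the checkboxes. A hedged sketch of what that layout plausibly looks like, and of how the [attribute, type, default] triples usually become the AutoEvalColumn accessor (the field names here are an assumption, not taken from the actual file):

from dataclasses import dataclass, make_dataclass

@dataclass
class ColumnContent:
    name: str                    # header text shown in the leaderboard table
    type: str                    # gradio dtype: "str", "number", "markdown", ...
    displayed_by_default: bool   # whether the column's checkbox starts ticked
    hidden: bool = False
    never_hidden: bool = False
    category: str = "Model Information"  # group used to cluster the checkboxes

auto_eval_column_dict = []
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True, category="Model Information")])
auto_eval_column_dict.append(["average_IE", ColumnContent, ColumnContent("Average IE ⬆️", "number", False, category="Information Extraction (IE)")])

# The [attribute, type, default] triples become fields of a frozen dataclass, so other
# modules can refer to AutoEvalColumn.average_IE.name, .category, and so on.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
print(AutoEvalColumn.average_IE.name)      # Average IE ⬆️
print(AutoEvalColumn.average_IE.category)  # Information Extraction (IE)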
src/leaderboard/read_evals.py
CHANGED
@@ -117,8 +117,53 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        …
-        …
+
+        # Initialize category averages
+        category_averages = {
+            "average_IE": [],
+            "average_TA": [],
+            "average_QA": [],
+            "average_TG": [],
+            "average_RM": [],
+            "average_FO": [],
+            "average_DM": [],
+            "average_Spanish": []
+        }
+
+        # Calculate averages for each task
+        for task in Tasks:
+            score = self.results.get(task.value.benchmark)
+            if score is not None:
+                # Append score to the appropriate category
+                if task.value.category == "Information Extraction (IE)":
+                    category_averages["average_IE"].append(score)
+                elif task.value.category == "Textual Analysis (TA)":
+                    category_averages["average_TA"].append(score)
+                elif task.value.category == "Question Answering (QA)":
+                    category_averages["average_QA"].append(score)
+                elif task.value.category == "Text Generation (TG)":
+                    category_averages["average_TG"].append(score)
+                elif task.value.category == "Risk Management (RM)":
+                    category_averages["average_RM"].append(score)
+                elif task.value.category == "Forecasting (FO)":
+                    category_averages["average_FO"].append(score)
+                elif task.value.category == "Decision-Making (DM)":
+                    category_averages["average_DM"].append(score)
+                elif task.value.category == "Spanish":
+                    category_averages["average_Spanish"].append(score)
+
+        # Calculate the mean for each category and add to data_dict
+        data_dict = {}
+        for category, scores in category_averages.items():
+            average = sum(scores) / len(scores) if scores else 0
+            data_dict[category] = average
+
+        # Overall average
+        total_scores = [v for v in self.results.values() if v is not None]
+        overall_average = sum(total_scores) / len(total_scores) if total_scores else 0
+
+        # Add other columns
+        data_dict.update({
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
@@ -127,19 +172,23 @@ class EvalResult:
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: …
+            AutoEvalColumn.average.name: overall_average,
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-        }
+        })
 
+        # Add task results to the data dictionary
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results …
+            data_dict[task.value.col_name] = self.results.get(task.value.benchmark)
 
         return data_dict
 
 
+
+
+
 def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     request_files = os.path.join(
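The bookkeeping added to to_dict() reduces to: group each task's score by its category, average each group, and fall back to 0 for empty groups, so a model with no runs in a category shows 0 rather than a blank. A standalone sketch of the same computation, using a lookup table instead of the if/elif chain; the benchmark names and scores are fabricated examples, not real leaderboard data:

CATEGORY_TO_COLUMN = {
    "Information Extraction (IE)": "average_IE",
    "Textual Analysis (TA)": "average_TA",
    "Question Answering (QA)": "average_QA",
    "Text Generation (TG)": "average_TG",
    "Risk Management (RM)": "average_RM",
    "Forecasting (FO)": "average_FO",
    "Decision-Making (DM)": "average_DM",
    "Spanish": "average_Spanish",
}

def category_averages(results: dict, task_categories: dict) -> dict:
    """results: benchmark -> score (or None); task_categories: benchmark -> category name."""
    buckets = {col: [] for col in CATEGORY_TO_COLUMN.values()}
    for benchmark, score in results.items():
        col = CATEGORY_TO_COLUMN.get(task_categories.get(benchmark))
        if col is not None and score is not None:
            buckets[col].append(score)
    # Empty categories fall back to 0, matching the behaviour in the diff above.
    return {col: (sum(s) / len(s) if s else 0) for col, s in buckets.items()}

print(category_averages(
    {"finqa": 80.0, "fpb": 70.0, "headlines": None},
    {"finqa": "Question Answering (QA)", "fpb": "Textual Analysis (TA)", "headlines": "Textual Analysis (TA)"},
))

Note that defaulting an empty category to 0 is a deliberate simplification of the diff's logic: a category with no evaluated tasks becomes indistinguishable from one with a genuinely zero average.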
src/populate.py
CHANGED
@@ -1,6 +1,5 @@
 import json
 import os
-
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
@@ -14,8 +13,23 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
+
+    # Add category average columns with default values
+    category_avg_columns = {
+        "Average IE ⬆️": "average_IE",
+        "Average TA ⬆️": "average_TA",
+        "Average QA ⬆️": "average_QA",
+        "Average TG ⬆️": "average_TG",
+        "Average RM ⬆️": "average_RM",
+        "Average FO ⬆️": "average_FO",
+        "Average DM ⬆️": "average_DM",
+        "Average Spanish ⬆️": "average_Spanish"
+    }
+
+    for display_name, internal_name in category_avg_columns.items():
+        df[display_name] = df[internal_name]
+
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
 
     # Apply the transformation for MCC values
     mcc_tasks = ["German", "Australian", "LendingClub", "ccf", "ccfraud", "polish", "taiwan", "portoseguro", "travelinsurance"]
@@ -23,14 +37,16 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
         if task in df.columns:
             df[task] = (df[task] + 100) / 2.0
 
+    # Now, select the columns that were passed to the function
+    df = df[cols].round(decimals=2)
+
     # Filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
 
 
-
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues …
+    """Creates the different dataframes for the evaluation queues requests"""
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
     all_evals = []
 
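In populate.py the new block simply mirrors the internal average_* keys emitted by EvalResult.to_dict() onto the display names declared in src/display/utils.py, so those columns already exist when df[cols] is applied later; the column selection was also moved after the MCC rescaling, so rounding now happens on the rescaled values. A toy sketch of the renaming step with fabricated values:

import pandas as pd

# Display name (as declared in utils.py) -> internal key produced by to_dict()
category_avg_columns = {
    "Average IE ⬆️": "average_IE",
    "Average TA ⬆️": "average_TA",
}

df = pd.DataFrame.from_records([
    {"Model": "model-a", "average_IE": 61.2, "average_TA": 54.8},
    {"Model": "model-b", "average_IE": 47.9, "average_TA": 66.3},
])

for display_name, internal_name in category_avg_columns.items():
    df[display_name] = df[internal_name]  # keep the internal column, add the display alias

print(df[["Model", "Average IE ⬆️", "Average TA ⬆️"]])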