Spaces:
Sleeping
Sleeping
Update src/leaderboard/read_evals.py
#6
by
jcole1
- opened
- app.py +34 -175
- requirements.txt +2 -1
- src/display/utils.py +1 -26
- src/envs.py +1 -1
- src/leaderboard/read_evals.py +20 -8
- src/populate.py +13 -5
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
-
import subprocess
|
2 |
import gradio as gr
|
|
|
3 |
import pandas as pd
|
4 |
from apscheduler.schedulers.background import BackgroundScheduler
|
5 |
from huggingface_hub import snapshot_download
|
@@ -18,8 +18,6 @@ from src.display.utils import (
|
|
18 |
COLS,
|
19 |
EVAL_COLS,
|
20 |
EVAL_TYPES,
|
21 |
-
NUMERIC_INTERVALS,
|
22 |
-
TYPES,
|
23 |
AutoEvalColumn,
|
24 |
ModelType,
|
25 |
fields,
|
@@ -34,6 +32,7 @@ from src.submission.submit import add_new_eval
|
|
34 |
def restart_space():
|
35 |
API.restart_space(repo_id=REPO_ID)
|
36 |
|
|
|
37 |
try:
|
38 |
print(EVAL_REQUESTS_PATH)
|
39 |
snapshot_download(
|
@@ -50,8 +49,7 @@ except Exception:
|
|
50 |
restart_space()
|
51 |
|
52 |
|
53 |
-
|
54 |
-
leaderboard_df = original_df.copy()
|
55 |
|
56 |
(
|
57 |
finished_eval_queue_df,
|
@@ -59,77 +57,36 @@ leaderboard_df = original_df.copy()
|
|
59 |
pending_eval_queue_df,
|
60 |
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
61 |
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
)
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
return filtered_df
|
93 |
-
|
94 |
-
|
95 |
-
def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
|
96 |
-
final_df = []
|
97 |
-
if query != "":
|
98 |
-
queries = [q.strip() for q in query.split(";")]
|
99 |
-
for _q in queries:
|
100 |
-
_q = _q.strip()
|
101 |
-
if _q != "":
|
102 |
-
temp_filtered_df = search_table(filtered_df, _q)
|
103 |
-
if len(temp_filtered_df) > 0:
|
104 |
-
final_df.append(temp_filtered_df)
|
105 |
-
if len(final_df) > 0:
|
106 |
-
filtered_df = pd.concat(final_df)
|
107 |
-
filtered_df = filtered_df.drop_duplicates(
|
108 |
-
subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
|
109 |
-
)
|
110 |
-
|
111 |
-
return filtered_df
|
112 |
-
|
113 |
-
|
114 |
-
def filter_models(
|
115 |
-
df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
|
116 |
-
) -> pd.DataFrame:
|
117 |
-
# Show all models
|
118 |
-
if show_deleted:
|
119 |
-
filtered_df = df
|
120 |
-
else: # Show only still on the hub models
|
121 |
-
filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
|
122 |
-
|
123 |
-
type_emoji = [t[0] for t in type_query]
|
124 |
-
filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
|
125 |
-
filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
|
126 |
-
|
127 |
-
numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
|
128 |
-
params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
|
129 |
-
mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
|
130 |
-
filtered_df = filtered_df.loc[mask]
|
131 |
-
|
132 |
-
return filtered_df
|
133 |
|
134 |
|
135 |
demo = gr.Blocks(css=custom_css)
|
@@ -139,105 +96,7 @@ with demo:
|
|
139 |
|
140 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
141 |
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
142 |
-
|
143 |
-
with gr.Column():
|
144 |
-
with gr.Row():
|
145 |
-
search_bar = gr.Textbox(
|
146 |
-
placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
|
147 |
-
show_label=False,
|
148 |
-
elem_id="search-bar",
|
149 |
-
)
|
150 |
-
with gr.Row():
|
151 |
-
shown_columns = gr.CheckboxGroup(
|
152 |
-
choices=[
|
153 |
-
c.name
|
154 |
-
for c in fields(AutoEvalColumn)
|
155 |
-
if not c.hidden and not c.never_hidden
|
156 |
-
],
|
157 |
-
value=[
|
158 |
-
c.name
|
159 |
-
for c in fields(AutoEvalColumn)
|
160 |
-
if c.displayed_by_default and not c.hidden and not c.never_hidden
|
161 |
-
],
|
162 |
-
label="Select columns to show",
|
163 |
-
elem_id="column-select",
|
164 |
-
interactive=True,
|
165 |
-
)
|
166 |
-
with gr.Row():
|
167 |
-
deleted_models_visibility = gr.Checkbox(
|
168 |
-
value=False, label="Show gated/private/deleted models", interactive=True
|
169 |
-
)
|
170 |
-
with gr.Column(min_width=320):
|
171 |
-
#with gr.Box(elem_id="box-filter"):
|
172 |
-
filter_columns_type = gr.CheckboxGroup(
|
173 |
-
label="Model types",
|
174 |
-
choices=[t.to_str() for t in ModelType],
|
175 |
-
value=[t.to_str() for t in ModelType],
|
176 |
-
interactive=True,
|
177 |
-
elem_id="filter-columns-type",
|
178 |
-
)
|
179 |
-
filter_columns_precision = gr.CheckboxGroup(
|
180 |
-
label="Precision",
|
181 |
-
choices=[i.value.name for i in Precision],
|
182 |
-
value=[i.value.name for i in Precision],
|
183 |
-
interactive=True,
|
184 |
-
elem_id="filter-columns-precision",
|
185 |
-
)
|
186 |
-
filter_columns_size = gr.CheckboxGroup(
|
187 |
-
label="Model sizes (in billions of parameters)",
|
188 |
-
choices=list(NUMERIC_INTERVALS.keys()),
|
189 |
-
value=list(NUMERIC_INTERVALS.keys()),
|
190 |
-
interactive=True,
|
191 |
-
elem_id="filter-columns-size",
|
192 |
-
)
|
193 |
-
|
194 |
-
leaderboard_table = gr.components.Dataframe(
|
195 |
-
value=leaderboard_df[
|
196 |
-
[c.name for c in fields(AutoEvalColumn) if c.never_hidden]
|
197 |
-
+ shown_columns.value
|
198 |
-
],
|
199 |
-
headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
200 |
-
datatype=TYPES,
|
201 |
-
elem_id="leaderboard-table",
|
202 |
-
interactive=False,
|
203 |
-
visible=True,
|
204 |
-
)
|
205 |
-
|
206 |
-
# Dummy leaderboard for handling the case when the user uses backspace key
|
207 |
-
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
208 |
-
value=original_df[COLS],
|
209 |
-
headers=COLS,
|
210 |
-
datatype=TYPES,
|
211 |
-
visible=False,
|
212 |
-
)
|
213 |
-
search_bar.submit(
|
214 |
-
update_table,
|
215 |
-
[
|
216 |
-
hidden_leaderboard_table_for_search,
|
217 |
-
shown_columns,
|
218 |
-
filter_columns_type,
|
219 |
-
filter_columns_precision,
|
220 |
-
filter_columns_size,
|
221 |
-
deleted_models_visibility,
|
222 |
-
search_bar,
|
223 |
-
],
|
224 |
-
leaderboard_table,
|
225 |
-
)
|
226 |
-
for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
|
227 |
-
selector.change(
|
228 |
-
update_table,
|
229 |
-
[
|
230 |
-
hidden_leaderboard_table_for_search,
|
231 |
-
shown_columns,
|
232 |
-
filter_columns_type,
|
233 |
-
filter_columns_precision,
|
234 |
-
filter_columns_size,
|
235 |
-
deleted_models_visibility,
|
236 |
-
search_bar,
|
237 |
-
],
|
238 |
-
leaderboard_table,
|
239 |
-
queue=True,
|
240 |
-
)
|
241 |
|
242 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
243 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
|
|
|
1 |
import gradio as gr
|
2 |
+
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
|
3 |
import pandas as pd
|
4 |
from apscheduler.schedulers.background import BackgroundScheduler
|
5 |
from huggingface_hub import snapshot_download
|
|
|
18 |
COLS,
|
19 |
EVAL_COLS,
|
20 |
EVAL_TYPES,
|
|
|
|
|
21 |
AutoEvalColumn,
|
22 |
ModelType,
|
23 |
fields,
|
|
|
32 |
def restart_space():
|
33 |
API.restart_space(repo_id=REPO_ID)
|
34 |
|
35 |
+
### Space initialisation
|
36 |
try:
|
37 |
print(EVAL_REQUESTS_PATH)
|
38 |
snapshot_download(
|
|
|
49 |
restart_space()
|
50 |
|
51 |
|
52 |
+
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
|
|
53 |
|
54 |
(
|
55 |
finished_eval_queue_df,
|
|
|
57 |
pending_eval_queue_df,
|
58 |
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
59 |
|
60 |
+
def init_leaderboard(dataframe):
    """Build the `Leaderboard` component shown in the benchmark tab.

    Args:
        dataframe: The leaderboard table to display.

    Returns:
        A `Leaderboard` configured with column selection, search and
        filter widgets derived from `AutoEvalColumn`.

    Raises:
        ValueError: If `dataframe` is None or reports itself as empty.
    """
    # Guard clauses: refuse to build a component with nothing to show.
    if dataframe is None:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    if dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # One datatype entry per declared column, in declaration order.
    column_types = [col.type for col in fields(AutoEvalColumn)]

    # Columns ticked by default, and columns the user may never untick.
    shown_by_default = [col.name for col in fields(AutoEvalColumn) if col.displayed_by_default]
    always_shown = [col.name for col in fields(AutoEvalColumn) if col.never_hidden]
    column_selector = SelectColumns(
        default_selection=shown_by_default,
        cant_deselect=always_shown,
        label="Select Columns to Display:",
    )

    searchable = [AutoEvalColumn.model.name, AutoEvalColumn.license.name]
    hidden = [col.name for col in fields(AutoEvalColumn) if col.hidden]

    # NOTE: a boolean filter on `still_on_hub` (and the matching
    # `bool_checkboxgroup_label` / `interactive` options) is intentionally
    # left disabled here.
    model_type_filter = ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types")
    precision_filter = ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision")
    params_filter = ColumnFilter(
        AutoEvalColumn.params.name,
        type="slider",
        min=0,
        max=2000,
        label="Select the number of parameters (M)",
    )

    return Leaderboard(
        value=dataframe,
        datatype=column_types,
        select_columns=column_selector,
        search_columns=searchable,
        hide_columns=hidden,
        filter_columns=[model_type_filter, precision_filter, params_filter],
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
|
92 |
demo = gr.Blocks(css=custom_css)
|
|
|
96 |
|
97 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
98 |
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
99 |
+
leaderboard = init_leaderboard(LEADERBOARD_DF)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
|
101 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
102 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
requirements.txt
CHANGED
@@ -15,4 +15,5 @@ transformers==4.35.2
|
|
15 |
tokenizers>=0.15.0
|
16 |
git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
|
17 |
accelerate==0.24.1
|
18 |
-
sentencepiece
|
|
|
|
15 |
tokenizers>=0.15.0
|
16 |
git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
|
17 |
accelerate==0.24.1
|
18 |
+
sentencepiece
|
19 |
+
gradio_leaderboard
|
src/display/utils.py
CHANGED
@@ -26,7 +26,7 @@ auto_eval_column_dict = []
|
|
26 |
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
27 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
28 |
#Scores
|
29 |
-
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("
|
30 |
for task in Tasks:
|
31 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
32 |
# Model information
|
@@ -91,10 +91,6 @@ class WeightType(Enum):
|
|
91 |
class Precision(Enum):
|
92 |
float16 = ModelDetails("float16")
|
93 |
bfloat16 = ModelDetails("bfloat16")
|
94 |
-
float32 = ModelDetails("float32")
|
95 |
-
#qt_8bit = ModelDetails("8bit")
|
96 |
-
#qt_4bit = ModelDetails("4bit")
|
97 |
-
#qt_GPTQ = ModelDetails("GPTQ")
|
98 |
Unknown = ModelDetails("?")
|
99 |
|
100 |
def from_str(precision):
|
@@ -102,34 +98,13 @@ class Precision(Enum):
|
|
102 |
return Precision.float16
|
103 |
if precision in ["torch.bfloat16", "bfloat16"]:
|
104 |
return Precision.bfloat16
|
105 |
-
if precision in ["float32"]:
|
106 |
-
return Precision.float32
|
107 |
-
#if precision in ["8bit"]:
|
108 |
-
# return Precision.qt_8bit
|
109 |
-
#if precision in ["4bit"]:
|
110 |
-
# return Precision.qt_4bit
|
111 |
-
#if precision in ["GPTQ", "None"]:
|
112 |
-
# return Precision.qt_GPTQ
|
113 |
return Precision.Unknown
|
114 |
|
115 |
# Column selection
|
116 |
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
117 |
-
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
|
118 |
-
COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
|
119 |
-
TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
|
120 |
|
121 |
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
122 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
123 |
|
124 |
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
125 |
|
126 |
-
NUMERIC_INTERVALS = {
|
127 |
-
"?": pd.Interval(-1, 0, closed="right"),
|
128 |
-
"~1.5": pd.Interval(0, 2, closed="right"),
|
129 |
-
"~3": pd.Interval(2, 4, closed="right"),
|
130 |
-
"~7": pd.Interval(4, 9, closed="right"),
|
131 |
-
"~13": pd.Interval(9, 20, closed="right"),
|
132 |
-
"~35": pd.Interval(20, 45, closed="right"),
|
133 |
-
"~60": pd.Interval(45, 70, closed="right"),
|
134 |
-
"70+": pd.Interval(70, 10000, closed="right"),
|
135 |
-
}
|
|
|
26 |
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
27 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
28 |
#Scores
|
29 |
+
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Rank", "number", True)])
|
30 |
for task in Tasks:
|
31 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
32 |
# Model information
|
|
|
91 |
class Precision(Enum):
|
92 |
float16 = ModelDetails("float16")
|
93 |
bfloat16 = ModelDetails("bfloat16")
|
|
|
|
|
|
|
|
|
94 |
Unknown = ModelDetails("?")
|
95 |
|
96 |
def from_str(precision):
|
|
|
98 |
return Precision.float16
|
99 |
if precision in ["torch.bfloat16", "bfloat16"]:
|
100 |
return Precision.bfloat16
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
return Precision.Unknown
|
102 |
|
103 |
# Column selection
|
104 |
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
|
|
|
|
|
|
105 |
|
106 |
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
107 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
108 |
|
109 |
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/envs.py
CHANGED
@@ -6,7 +6,7 @@ from huggingface_hub import HfApi
|
|
6 |
# ----------------------------------
|
7 |
TOKEN = os.environ.get("TOKEN") # A read/write token for your org
|
8 |
|
9 |
-
OWNER = "
|
10 |
# ----------------------------------
|
11 |
|
12 |
REPO_ID = f"{OWNER}/leaderboard"
|
|
|
6 |
# ----------------------------------
|
7 |
TOKEN = os.environ.get("TOKEN") # A read/write token for your org
|
8 |
|
9 |
+
OWNER = "yangheng" # Change to your org - don't forget to create a results and request dataset, with the correct format!
|
10 |
# ----------------------------------
|
11 |
|
12 |
REPO_ID = f"{OWNER}/leaderboard"
|
src/leaderboard/read_evals.py
CHANGED
@@ -60,6 +60,7 @@ class EvalResult:
|
|
60 |
still_on_hub, _, model_config = is_model_on_hub(
|
61 |
full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
|
62 |
)
|
|
|
63 |
architecture = "?"
|
64 |
if model_config is not None:
|
65 |
architectures = getattr(model_config, "architectures", None)
|
@@ -70,13 +71,15 @@ class EvalResult:
|
|
70 |
results = {}
|
71 |
for task in Tasks:
|
72 |
task = task.value
|
73 |
-
|
74 |
# We average all scores of a given metric (not all metrics are present in all files)
|
75 |
accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
|
76 |
if accs.size == 0 or any([acc is None for acc in accs]):
|
77 |
continue
|
78 |
-
|
79 |
-
|
|
|
|
|
|
|
80 |
results[task.benchmark] = mean_acc
|
81 |
|
82 |
return self(
|
@@ -93,8 +96,8 @@ class EvalResult:
|
|
93 |
|
94 |
def update_with_request_file(self, requests_path):
|
95 |
"""Finds the relevant request file for the current model and updates info with it"""
|
|
|
96 |
request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
|
97 |
-
|
98 |
try:
|
99 |
with open(request_file, "r") as f:
|
100 |
request = json.load(f)
|
@@ -107,9 +110,11 @@ class EvalResult:
|
|
107 |
except Exception:
|
108 |
print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
|
109 |
|
110 |
-
def to_dict(self):
|
111 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
112 |
-
average =
|
|
|
|
|
113 |
data_dict = {
|
114 |
"eval_name": self.eval_name, # not a column, just a save name,
|
115 |
AutoEvalColumn.precision.name: self.precision.value.name,
|
@@ -138,6 +143,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
|
|
138 |
requests_path,
|
139 |
f"{model_name}_eval_request_*.json",
|
140 |
)
|
|
|
141 |
request_files = glob.glob(request_files)
|
142 |
|
143 |
# Select correct request file (precision)
|
@@ -146,6 +152,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
|
|
146 |
for tmp_request_file in request_files:
|
147 |
with open(tmp_request_file, "r") as f:
|
148 |
req_content = json.load(f)
|
|
|
|
|
149 |
if (
|
150 |
req_content["status"] in ["FINISHED"]
|
151 |
and req_content["precision"] == precision.split(".")[-1]
|
@@ -186,9 +194,13 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
186 |
eval_results[eval_name] = eval_result
|
187 |
|
188 |
results = []
|
189 |
-
for v in eval_results.values():
|
|
|
|
|
|
|
|
|
190 |
try:
|
191 |
-
v.to_dict() # we test if the dict version is complete
|
192 |
results.append(v)
|
193 |
except KeyError: # not all eval values present
|
194 |
continue
|
|
|
60 |
still_on_hub, _, model_config = is_model_on_hub(
|
61 |
full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
|
62 |
)
|
63 |
+
print("Is model on hub? \n", _)
|
64 |
architecture = "?"
|
65 |
if model_config is not None:
|
66 |
architectures = getattr(model_config, "architectures", None)
|
|
|
71 |
results = {}
|
72 |
for task in Tasks:
|
73 |
task = task.value
|
|
|
74 |
# We average all scores of a given metric (not all metrics are present in all files)
|
75 |
accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
|
76 |
if accs.size == 0 or any([acc is None for acc in accs]):
|
77 |
continue
|
78 |
+
if task.benchmark == "mRNA":
|
79 |
+
# Keep RMSE at original value
|
80 |
+
mean_acc = np.mean(accs)
|
81 |
+
else:
|
82 |
+
mean_acc = np.mean(accs) * 100.0
|
83 |
results[task.benchmark] = mean_acc
|
84 |
|
85 |
return self(
|
|
|
96 |
|
97 |
def update_with_request_file(self, requests_path):
|
98 |
"""Finds the relevant request file for the current model and updates info with it"""
|
99 |
+
# print("Requests Path: ", requests_path)
|
100 |
request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
|
|
|
101 |
try:
|
102 |
with open(request_file, "r") as f:
|
103 |
request = json.load(f)
|
|
|
110 |
except Exception:
|
111 |
print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
|
112 |
|
113 |
+
def to_dict(self, rank):
|
114 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
115 |
+
average = rank
|
116 |
+
# average = sorted(average, reverse=True)
|
117 |
+
# rank = [rank+1 for rank, value in enumerate(average)]
|
118 |
data_dict = {
|
119 |
"eval_name": self.eval_name, # not a column, just a save name,
|
120 |
AutoEvalColumn.precision.name: self.precision.value.name,
|
|
|
143 |
requests_path,
|
144 |
f"{model_name}_eval_request_*.json",
|
145 |
)
|
146 |
+
# print("Request Files: ", request_files)
|
147 |
request_files = glob.glob(request_files)
|
148 |
|
149 |
# Select correct request file (precision)
|
|
|
152 |
for tmp_request_file in request_files:
|
153 |
with open(tmp_request_file, "r") as f:
|
154 |
req_content = json.load(f)
|
155 |
+
# print("Request File: ", tmp_request_file)
|
156 |
+
# print("Req Content: ", req_content)
|
157 |
if (
|
158 |
req_content["status"] in ["FINISHED"]
|
159 |
and req_content["precision"] == precision.split(".")[-1]
|
|
|
194 |
eval_results[eval_name] = eval_result
|
195 |
|
196 |
results = []
|
197 |
+
for result in eval_results.values():
|
198 |
+
result.average = np.mean(list(result.results.values()))
|
199 |
+
sorted_results = sorted(eval_results.values(), key=lambda r: r.average, reverse=True)
|
200 |
+
print(f"SORTED RESULTS HERE: \n{sorted_results}")
|
201 |
+
for i,v in enumerate(sorted_results):
|
202 |
try:
|
203 |
+
v.to_dict(i) # we test if the dict version is complete
|
204 |
results.append(v)
|
205 |
except KeyError: # not all eval values present
|
206 |
continue
|
src/populate.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1 |
import json
|
2 |
import os
|
3 |
-
|
4 |
import pandas as pd
|
5 |
|
|
|
6 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
7 |
from src.display.utils import AutoEvalColumn, EvalQueueColumn
|
8 |
from src.leaderboard.read_evals import get_raw_eval_results
|
@@ -11,15 +12,22 @@ from src.leaderboard.read_evals import get_raw_eval_results
|
|
11 |
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
12 |
"""Creates a dataframe from all the individual experiment results"""
|
13 |
raw_data = get_raw_eval_results(results_path, requests_path)
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
df = pd.DataFrame.from_records(all_data_json)
|
17 |
-
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
18 |
df = df[cols].round(decimals=2)
|
19 |
|
20 |
# filter out if any of the benchmarks have not been produced
|
21 |
df = df[has_no_nan_values(df, benchmark_cols)]
|
22 |
-
|
|
|
23 |
|
24 |
|
25 |
def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
@@ -55,4 +63,4 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
|
55 |
df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
|
56 |
df_running = pd.DataFrame.from_records(running_list, columns=cols)
|
57 |
df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
|
58 |
-
return df_finished[cols], df_running[cols], df_pending[cols]
|
|
|
1 |
import json
|
2 |
import os
|
3 |
+
import numpy as np
|
4 |
import pandas as pd
|
5 |
|
6 |
+
|
7 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
8 |
from src.display.utils import AutoEvalColumn, EvalQueueColumn
|
9 |
from src.leaderboard.read_evals import get_raw_eval_results
|
|
|
12 |
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Creates a dataframe from all the individual experiment results.

    Each result's mean score is stored on it as `average`; results are then
    ordered by that mean (highest first) and each one is converted with
    `to_dict(rank)`, where `rank` is its 1-based position in that ordering.

    Args:
        results_path: Passed through to `get_raw_eval_results`.
        requests_path: Passed through to `get_raw_eval_results`.
        cols: Columns to keep in the returned dataframe, in order.
        benchmark_cols: Columns that must be free of NaN for a row to be kept.

    Returns:
        The leaderboard dataframe restricted to `cols`, rounded to two
        decimals, without rows that miss a benchmark value.
    """
    raw_data = get_raw_eval_results(results_path, requests_path)
    for result in raw_data:
        result.average = np.mean(list(result.results.values()))

    sorted_results = sorted(raw_data, key=lambda r: r.average, reverse=True)
    print(sorted_results)

    # Rank over the *sorted* list so the rank never depends on the order in
    # which `get_raw_eval_results` happened to return the results.
    all_data_json = [v.to_dict(rank) for rank, v in enumerate(sorted_results, start=1)]

    df = pd.DataFrame.from_records(all_data_json)
    df = df[cols].round(decimals=2)

    # filter out if any of the benchmarks have not been produced
    df = df[has_no_nan_values(df, benchmark_cols)]
    print(df)
    return df
|
31 |
|
32 |
|
33 |
def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
|
|
63 |
df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
|
64 |
df_running = pd.DataFrame.from_records(running_list, columns=cols)
|
65 |
df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
|
66 |
+
return df_finished[cols], df_running[cols], df_pending[cols]
|