Clémentine committed
Commit: b1a1395
1 Parent(s): ccefec9

Refactor 2 - added plotting back

The plots only take the latest submissions into account; we have no way to go back to the eval date apart from loading info from the git commits of the results files.
Also updated speed with the Gradio concurrency limit param.
- app.py +24 -44
- src/display/formatting.py +3 -0
- src/display/utils.py +32 -8
- src/leaderboard/read_evals.py +5 -6
- src/populate.py +7 -6
- src/submission/check_validity.py +1 -1
- src/tools/plots.py +49 -120
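
The "speed" note above refers to the concurrency_limit=None arguments added to every .change() listener in app.py (see the diff below). A minimal, self-contained sketch of that Gradio pattern, assuming Gradio 4.x, where each event listener otherwise defaults to running one call at a time; the echo callback and component names here are illustrative, not taken from the repo:

# Minimal sketch of the concurrency_limit pattern used in app.py below.
# concurrency_limit=None lifts the per-event concurrency cap, so several
# table-refreshing events can run at once instead of queueing behind each other.
import gradio as gr


def echo(text: str) -> str:
    # Stand-in for the leaderboard's update_table callback.
    return text


with gr.Blocks() as demo:
    search = gr.Textbox(label="Search")
    result = gr.Textbox(label="Result")
    # None = no limit on concurrent runs of this event (Gradio's default is 1 per listener).
    search.change(echo, search, result, concurrency_limit=None)

if __name__ == "__main__":
    demo.launch()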
app.py CHANGED
@@ -31,18 +31,15 @@ from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 from src.tools.collections import update_collections
 from src.tools.plots import (
-    HUMAN_BASELINES,
     create_metric_plot_obj,
     create_plot_df,
     create_scores_df,
-    join_model_info_with_results,
 )
 
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
-
 try:
     snapshot_download(
         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
@@ -57,13 +54,11 @@ except Exception:
     restart_space()
 
 
-original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
 update_collections(original_df.copy())
 leaderboard_df = original_df.copy()
 
-
-# plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
-# to_be_dumped = f"models = {repr(models)}\n"
+plot_df = create_plot_df(create_scores_df(raw_data))
 
 (
     finished_eval_queue_df,
@@ -72,16 +67,6 @@ leaderboard_df = original_df.copy()
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
-# Basics
-#def change_tab(query_param: str):
-#    query_param = query_param.replace("'", '"')
-#    query_param = json.loads(query_param)
-#    if isinstance(query_param, dict) and "tab" in query_param and query_param["tab"] == "evaluation":
-#        return gr.Tabs.update(selected=1)
-#    else:
-#        return gr.Tabs.update(selected=0)
-
-
 # Searching and filtering
 def update_table(
     hidden_df: pd.DataFrame,
@@ -247,6 +232,7 @@ with demo:
                     search_bar,
                 ],
                 leaderboard_table,
+                concurrency_limit=None,
             )
             shown_columns.change(
                 update_table,
@@ -261,6 +247,7 @@ with demo:
                 ],
                 leaderboard_table,
                 queue=True,
+                concurrency_limit=None,
            )
             filter_columns_type.change(
                 update_table,
@@ -275,6 +262,7 @@ with demo:
                 ],
                 leaderboard_table,
                 queue=True,
+                concurrency_limit=None,
             )
             filter_columns_precision.change(
                 update_table,
@@ -289,6 +277,7 @@ with demo:
                 ],
                 leaderboard_table,
                 queue=True,
+                concurrency_limit=None,
             )
             filter_columns_size.change(
                 update_table,
@@ -303,6 +292,7 @@ with demo:
                 ],
                 leaderboard_table,
                 queue=True,
+                concurrency_limit=None,
             )
             deleted_models_visibility.change(
                 update_table,
@@ -317,27 +307,25 @@ with demo:
                 ],
                 leaderboard_table,
                 queue=True,
+                concurrency_limit=None,
             )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # title="Top Scores and Human Baseline Over Time",
-        # )
-        # gr.Plot(value=chart, interactive=False, width=500, height=500)
+        with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=4):
+            with gr.Row():
+                with gr.Column():
+                    chart = create_metric_plot_obj(
+                        plot_df,
+                        [AutoEvalColumn.average.name],
+                        title="Average of Top Scores and Human Baseline Over Time (from last update)",
+                    )
+                    gr.Plot(value=chart, min_width=500)
+                with gr.Column():
+                    chart = create_metric_plot_obj(
+                        plot_df,
+                        BENCHMARK_COLS,
+                        title="Top Scores and Human Baseline Over Time (from last update)",
+                    )
+                    gr.Plot(value=chart, min_width=500)
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
@@ -440,14 +428,6 @@ with demo:
             show_copy_button=True,
         )
 
-    #dummy = gr.Textbox(visible=False)
-    #demo.load(
-    #    change_tab,
-    #    dummy,
-    #    tabs,
-    #    js=get_window_url_params,
-    #)
-
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
src/display/formatting.py CHANGED
@@ -1,6 +1,9 @@
 import os
+from datetime import datetime, timezone
 
 from huggingface_hub import HfApi
+from huggingface_hub.hf_api import ModelInfo
+
 
 API = HfApi()
 
src/display/utils.py CHANGED
@@ -60,7 +60,7 @@ baseline_row = {
     AutoEvalColumn.model.name: "<p>Baseline</p>",
     AutoEvalColumn.revision.name: "N/A",
     AutoEvalColumn.precision.name: None,
-    AutoEvalColumn.average.name:
+    AutoEvalColumn.average.name: 31.0,
     AutoEvalColumn.arc.name: 25.0,
     AutoEvalColumn.hellaswag.name: 25.0,
     AutoEvalColumn.mmlu.name: 25.0,
@@ -72,19 +72,43 @@ baseline_row = {
     AutoEvalColumn.model_type.name: "",
 }
 
+# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
+# ARC human baseline is 0.80 (source: https://lab42.global/arc/)
+# HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
+# MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
+# TruthfulQA human baseline is 0.94(source: https://arxiv.org/pdf/2109.07958.pdf)
+# Drop: https://leaderboard.allenai.org/drop/submissions/public
+# Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
+# GSM8K: paper
+# Define the human baselines
+human_baseline_row = {
+    AutoEvalColumn.model.name: "<p>Human performance</p>",
+    AutoEvalColumn.revision.name: "N/A",
+    AutoEvalColumn.precision.name: None,
+    AutoEvalColumn.average.name: 92.75,
+    AutoEvalColumn.arc.name: 80.0,
+    AutoEvalColumn.hellaswag.name: 95.0,
+    AutoEvalColumn.mmlu.name: 89.8,
+    AutoEvalColumn.truthfulqa.name: 94.0,
+    AutoEvalColumn.winogrande.name: 94.0,
+    AutoEvalColumn.gsm8k.name: 100,
+    AutoEvalColumn.drop.name: 96.42,
+    AutoEvalColumn.dummy.name: "human_baseline",
+    AutoEvalColumn.model_type.name: "",
+}
 
 @dataclass
-class
+class ModelTypeDetails:
     name: str
     symbol: str # emoji
 
 
 class ModelType(Enum):
-    PT =
-    FT =
-    IFT =
-    RL =
-    Unknown =
+    PT = ModelTypeDetails(name="pretrained", symbol="🟢")
+    FT = ModelTypeDetails(name="fine-tuned", symbol="🔶")
+    IFT = ModelTypeDetails(name="instruction-tuned", symbol="⭕")
+    RL = ModelTypeDetails(name="RL-tuned", symbol="🟦")
+    Unknown = ModelTypeDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):
         return f"{self.value.symbol}{separator}{self.value.name}"
@@ -128,7 +152,7 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks
+BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
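
As a quick illustration of the new enum values above, the existing to_str helper renders the symbol plus the name, which is what the leaderboard displays next to each model. A hypothetical snippet, not part of the diff:

# Hypothetical usage of the ModelType enum defined in src/display/utils.py above.
# to_str() joins the emoji symbol and the human-readable name with a separator.
from src.display.utils import ModelType

print(ModelType.PT.to_str())     # -> "🟢 pretrained"
print(ModelType.FT.to_str("/"))  # -> "🔶/fine-tuned"
print(ModelType.RL.to_str())     # -> "🟦 RL-tuned"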
src/leaderboard/read_evals.py CHANGED
@@ -3,9 +3,9 @@ import json
 import math
 import os
 from dataclasses import dataclass
-from typing import Dict, List, Tuple
 
 import dateutil
+from datetime import datetime
 import numpy as np
 
 from src.display.formatting import make_clickable_model
@@ -61,8 +61,6 @@ class EvalResult:
         still_on_hub, error = is_model_on_hub(
             full_model, config.get("model_sha", "main"), trust_remote_code=True
         )
-        if not still_on_hub:
-            print(full_model, error)
 
         # Extract results available in this file (some results are split in several files)
         results = {}
@@ -100,7 +98,6 @@ class EvalResult:
             results=results,
             precision=precision, # todo model_type=, weight_type=
             revision=config.get("model_sha", ""),
-            date=config.get("submission_date", ""),
             still_on_hub=still_on_hub,
         )
 
@@ -114,6 +111,7 @@ class EvalResult:
             self.license = request.get("license", "?")
             self.likes = request.get("likes", 0)
             self.num_params = request.get("params", 0)
+            self.date = request.get("submitted_time", "")
         except Exception:
             print(f"Could not find request file for {self.org}/{self.model}")
 
@@ -162,7 +160,7 @@ def get_request_file_for_model(model_name, precision):
     return request_file
 
 
-def
+def get_raw_eval_results(results_path: str) -> list[EvalResult]:
     json_filepaths = []
 
     for root, _, files in os.walk(results_path):
@@ -196,7 +194,8 @@ def get_eval_results(results_path: str) -> List[EvalResult]:
     results = []
     for v in eval_results.values():
        try:
-
+            v.to_dict() # we test if the dict version is complete
+            results.append(v)
        except KeyError: # not all eval values present
            continue
 
src/populate.py CHANGED
@@ -6,21 +6,22 @@ import pandas as pd
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
 from src.leaderboard.filter_models import filter_models
-from src.leaderboard.read_evals import
+from src.leaderboard.read_evals import get_raw_eval_results
 
 
 def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-
-
-
+    raw_data = get_raw_eval_results(results_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+    all_data_json.append(baseline_row)
+    filter_models(all_data_json)
 
-    df = pd.DataFrame.from_records(
+    df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
+    return raw_data, df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
src/submission/check_validity.py CHANGED
@@ -55,7 +55,7 @@ def get_model_size(model_info: ModelInfo, precision: str):
     size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
     try:
         model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
+    except (AttributeError, TypeError ):
         try:
             size_match = re.search(size_pattern, model_info.modelId.lower())
             model_size = size_match.group(0)
src/tools/plots.py CHANGED
@@ -1,153 +1,84 @@
-import pickle
-from datetime import datetime, timezone
-from typing import Any, Dict, List, Tuple
-
 import pandas as pd
+import numpy as np
 import plotly.express as px
 from plotly.graph_objs import Figure
 
 from src.leaderboard.filter_models import FLAGGED_MODELS
+from src.display.utils import human_baseline_row as HUMAN_BASELINE, AutoEvalColumn, Tasks, Task, BENCHMARK_COLS
+from src.leaderboard.read_evals import EvalResult
 
-# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
-# ARC human baseline is 0.80 (source: https://lab42.global/arc/)
-# HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
-# MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
-# TruthfulQA human baseline is 0.94(source: https://arxiv.org/pdf/2109.07958.pdf)
-# Define the human baselines
-HUMAN_BASELINES = {
-    "Average ⬆️": 0.897 * 100,
-    "ARC": 0.80 * 100,
-    "HellaSwag": 0.95 * 100,
-    "MMLU": 0.898 * 100,
-    "TruthfulQA": 0.94 * 100,
-}
-
-
-def to_datetime(model_info: Tuple[str, Any]) -> datetime:
-    """
-    Converts the lastModified attribute of the object to datetime.
-
-    :param model_info: A tuple containing the name and object.
-        The object must have a lastModified attribute
-        with a string representing the date and time.
-    :return: A datetime object converted from the lastModified attribute of the input object.
-    """
-    name, obj = model_info
-    return datetime.strptime(obj.lastModified, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
 
 
-def
-    """
-    Integrates model information with the results DataFrame by matching 'Model sha'.
-    :param results_df: A DataFrame containing results information including 'Model sha' column.
-    :return: A DataFrame with updated 'Results Date' columns, which are synchronized with model information.
-    """
-
-    df = results_df.copy(deep=True)
-
-    # Filter out FLAGGED_MODELS to ensure graph is not skewed by mistakes
-    df = df[~df["model_name_for_query"].isin(FLAGGED_MODELS.keys())].reset_index(drop=True)
-
-    # load cache from disk
-    try:
-        with open("model_info_cache.pkl", "rb") as f:
-            model_info_cache = pickle.load(f)
-    except (EOFError, FileNotFoundError):
-        model_info_cache = {}
-
-    # Sort date strings using datetime objects as keys
-    sorted_dates = sorted(list(model_info_cache.items()), key=to_datetime, reverse=True)
-    df["Results Date"] = datetime.now().replace(tzinfo=timezone.utc)
-
-    # Define the date format string
-    date_format = "%Y-%m-%dT%H:%M:%S.%fZ"
-
-    # Iterate over sorted_dates and update the dataframe
-    for name, obj in sorted_dates:
-        # Convert the lastModified string to a datetime object
-        last_modified_datetime = datetime.strptime(obj.lastModified, date_format).replace(tzinfo=timezone.utc)
-
-        # Update the "Results Date" column where "Model sha" equals obj.sha
-        df.loc[df["Model sha"] == obj.sha, "Results Date"] = last_modified_datetime
-    return df
 
-
-
-    """
-    Generates a DataFrame containing the maximum scores until each result date.
-
-    :param results_df: A DataFrame containing result information including metric scores and result dates.
-    :return: A new DataFrame containing the maximum scores until each result date for every metric.
+def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
+    """
+    Generates a DataFrame containing the maximum scores until each date.
+
+    :param results_df: A DataFrame containing result information including metric scores and dates.
+    :return: A new DataFrame containing the maximum scores until each date for every metric.
     """
-    # Step 1: Ensure '
-    results_df
-    results_df.
+    # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
+    results_df = pd.DataFrame(raw_data)
+    #results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
+    results_df.sort_values(by="date", inplace=True)
 
     # Step 2: Initialize the scores dictionary
-    scores = {
-        "Average ⬆️": [],
-        "ARC": [],
-        "HellaSwag": [],
-        "MMLU": [],
-        "TruthfulQA": [],
-        "Result Date": [],
-        "Model Name": [],
-    }
+    scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}
 
     # Step 3: Iterate over the rows of the DataFrame and update the scores dictionary
-    for
-
-
-
-
-
+    for task in [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]:
+        current_max = 0
+        last_date = ""
+        column = task.col_name
+        for _, row in results_df.iterrows():
+            current_model = row["full_model"]
+            if current_model in FLAGGED_MODELS:
                 continue
-        if column == "Model Name":
-            scores[column].append(row["model_name_for_query"])
-            continue
-        current_max = scores[column][-1] if scores[column] else float("-inf")
-        scores[column].append(max(current_max, row[column]))
-
-
 
+            current_date = row["date"]
+            if task.benchmark == "Average":
+                current_score = np.mean(list(row["results"].values()))
+            else:
+                current_score = row["results"][task.benchmark]
+
+            if current_score > current_max:
+                if current_date == last_date and len(scores[column]) > 0:
+                    scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score}
+                else:
+                    scores[column].append({"model": current_model, "date": current_date, "score": current_score})
+                current_max = current_score
+                last_date = current_date
 
+    # Step 4: Return all dictionaries as DataFrames
+    return {k: pd.DataFrame(v) for k, v in scores.items()}
 
-
+
+def create_plot_df(scores_df: dict[str: pd.DataFrame]) -> pd.DataFrame:
     """
     Transforms the scores DataFrame into a new format suitable for plotting.
 
-    :param scores_df: A DataFrame containing metric scores and
+    :param scores_df: A DataFrame containing metric scores and dates.
     :return: A new DataFrame reshaped for plotting purposes.
     """
-    # Sample columns
-    cols = ["Average ⬆️", "ARC", "HellaSwag", "MMLU", "TruthfulQA"]
-
     # Initialize the list to store DataFrames
     dfs = []
 
     # Iterate over the cols and create a new DataFrame for each column
-    for col in
-        d = scores_df[
-        d["
-        d.rename(columns={col: "Metric Value"}, inplace=True)
+    for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
+        d = scores_df[col].reset_index(drop=True)
+        d["task"] = col
         dfs.append(d)
 
     # Concatenate all the created DataFrames
     concat_df = pd.concat(dfs, ignore_index=True)
 
-    # Sort values by '
-    concat_df.sort_values(by="
-    concat_df.reset_index(drop=True, inplace=True)
-
-    # Drop duplicates based on 'Metric Name' and 'Metric Value' and keep the first (earliest) occurrence
-    concat_df.drop_duplicates(subset=["Metric Name", "Metric Value"], keep="first", inplace=True)
-
+    # Sort values by 'date'
+    concat_df.sort_values(by="date", inplace=True)
     concat_df.reset_index(drop=True, inplace=True)
     return concat_df
 
 
 def create_metric_plot_obj(
-    df: pd.DataFrame, metrics:
+    df: pd.DataFrame, metrics: list[str], title: str
 ) -> Figure:
     """
     Create a Plotly figure object with lines representing different metrics
@@ -156,27 +87,25 @@ def create_metric_plot_obj(
     :param df: The DataFrame containing the metric values, names, and dates.
     :param metrics: A list of strings representing the names of the metrics
         to be included in the plot.
-    :param human_baselines: A dictionary where keys are metric names
-        and values are human baseline values for the metrics.
     :param title: A string representing the title of the plot.
     :return: A Plotly figure object with lines representing metrics and
        horizontal dotted lines representing human baselines.
    """
 
    # Filter the DataFrame based on the specified metrics
-    df = df[df["
+    df = df[df["task"].isin(metrics)]
 
    # Filter the human baselines based on the specified metrics
-    filtered_human_baselines = {k: v for k, v in
+    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
 
    # Create a line figure using plotly express with specified markers and custom data
    fig = px.line(
        df,
-        x="
-        y="
-        color="
+        x="date",
+        y="score",
+        color="task",
        markers=True,
-        custom_data=["
+        custom_data=["task", "score", "model"],
        title=title,
    )
 
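
Taken together, the plotting pipeline now runs directly off the raw eval results instead of the old pickle cache of model info. A rough sketch of the resulting data flow, mirroring the calls added in app.py above (the results path here is a placeholder, and COLS is assumed to live in src.display.utils next to BENCHMARK_COLS):

# Sketch of the data flow introduced by this commit (assumptions noted above).
from src.display.utils import COLS, BENCHMARK_COLS
from src.populate import get_leaderboard_df
from src.tools.plots import create_scores_df, create_plot_df, create_metric_plot_obj

# get_leaderboard_df now returns the raw EvalResult list alongside the display DataFrame.
raw_data, leaderboard_df = get_leaderboard_df("./eval-results", COLS, BENCHMARK_COLS)

# Best-score-so-far frames per benchmark (keyed by column name), then flattened for plotting.
plot_df = create_plot_df(create_scores_df(raw_data))

# One line per benchmark, plus dotted human-baseline reference lines.
figure = create_metric_plot_obj(plot_df, BENCHMARK_COLS, title="Top Scores and Human Baseline Over Time")
figure.show()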