clefourrier and Wauplin committed on
Commit b4ba8b7 • 1 Parent(s): 3ac217c

loading_from_contents (#766)


- init - cleaning the code base, plus adding the new system to load from contents (4fc38646dccf6d3719eaf48a8dfd05c4a032fad0)
- added collections back to main (8618a2a9da2186516ef4dec2dd87f14322de9719)
- rm (459932d6f2d58fe06ffd5392686f723a08c9b734)
- simplified env vars (23f614e2ea6cf7dcfb37912a464e2d8c24085b70)
- test 1 with webhooks (784d3edc7dc5f5f0439a082a4d0a1cf6376416f6)
- small modif (32ea1bc7cefef89f251e4de467b3d49579d60feb)
- trying with open link (0cb7d54ebfa0af3b1fb240a5cd2d043799379791)
- Update app.py (e3b01f36af4a62b3cc3ba1cd88e665ad496fb839)
- removing share true (3cc4e3e275d1561c7aaa647db593d33d90434f1f)
- Update app.py (52608b2305c0c499835dc0a9892e57b2fa4f61af)
- Update app.py (953dbe38df6163c16df1b40daa579c81c07f72db)
- the webhooks will download the model at each update, and demo.load will restart the viewer at each page refresh (388bfbdf61f906fb0574cf8477aaf19941548368)
- added plots back (294422eeb5e3bcfb489bdf41322bbc3c7cc1632c)
- fixed! (fa8d7663cb995885cb91746a89ce1a2b3ff7f7ca)
- replace HuggingFaceH4 by open-llm-leaderboard (2acf509d0df752206adf666c682823be1a99991f)
- rm dynamic file reference (b4f48ba26897f4c72d213355f91b21555be04da8)


Co-authored-by: Lucain Pouget <[email protected]>
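
The refresh flow described in the commit message above (a Hub webhook forces a re-download of the aggregated dataset whenever it changes, while demo.load re-reads it on every page refresh) can be summarised with the minimal sketch below. It is an illustration only, not this Space's exact code: the bare Dataframe widget and function names are placeholders, while WebhooksServer, add_webhook and Blocks.load are the same huggingface_hub / Gradio APIs used in the app.py diff that follows.

    # Minimal sketch of the webhook + demo.load refresh pattern (illustrative names).
    import datasets
    import gradio as gr
    from huggingface_hub import WebhooksServer, WebhookPayload

    AGGREGATED_REPO = "open-llm-leaderboard/contents"  # aggregated results dataset

    def read_leaderboard():
        # Served from the local cache, so it is cheap to call on every page load.
        return datasets.load_dataset(AGGREGATED_REPO, split="train").to_pandas()

    with gr.Blocks() as demo:
        table = gr.Dataframe(value=read_leaderboard())
        demo.load(fn=read_leaderboard, inputs=None, outputs=table)  # refresh on each visit

    server = WebhooksServer(ui=demo)

    @server.add_webhook
    async def on_dataset_update(payload: WebhookPayload) -> None:
        # Force the cached copy to be refreshed when the dataset repo is updated.
        if payload.repo.type == "dataset" and payload.event.action == "update":
            datasets.load_dataset(AGGREGATED_REPO, split="train", download_mode="force_redownload")

    server.launch()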

README.md CHANGED
@@ -8,14 +8,13 @@ sdk_version: 4.9.0
 app_file: app.py
 pinned: true
 license: apache-2.0
-duplicated_from: HuggingFaceH4/open_llm_leaderboard
 fullWidth: true
 startup_duration_timeout: 1h
 space_ci:
   private: true
   secrets:
     - HF_TOKEN
-    - H4_TOKEN
+    - WEBHOOK_SECRET
 tags:
 - leaderboard
 short_description: Track, rank and evaluate open LLMs and chatbots
app.py CHANGED
@@ -2,10 +2,9 @@ import os
 import logging
 import time
 import gradio as gr
-from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
+import datasets
+from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-from gradio_space_ci import enable_space_ci
 
 from src.display.about import (
     CITATION_BUTTON_LABEL,
@@ -30,32 +29,27 @@ from src.display.utils import (
 )
 from src.envs import (
     API,
-    DYNAMIC_INFO_FILE_PATH,
-    DYNAMIC_INFO_PATH,
-    DYNAMIC_INFO_REPO,
     EVAL_REQUESTS_PATH,
-    EVAL_RESULTS_PATH,
-    H4_TOKEN,
-    IS_PUBLIC,
+    AGGREGATED_REPO,
+    HF_TOKEN,
     QUEUE_REPO,
     REPO_ID,
-    RESULTS_REPO,
+    HF_HOME,
 )
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.scripts.update_all_request_files import update_dynamic_files
 from src.submission.submit import add_new_eval
-from src.tools.collections import update_collections
 from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
 
 # Configure logging
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 
-# Start ephemeral Spaces on PRs (see config in README.md)
-enable_space_ci()
 
+# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
+# This controls whether a full initialization should be performed.
+DO_FULL_INIT = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
 
 def restart_space():
-    API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
+    API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
 
 
 def time_diff_wrapper(func):
@@ -94,54 +88,90 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
         attempt += 1
     raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
 
+def get_latest_data_leaderboard():
+    leaderboard_dataset = datasets.load_dataset(
+        AGGREGATED_REPO,
+        "default",
+        split="train",
+        cache_dir=HF_HOME,
+        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
+        verification_mode="no_checks"
+    )
+
+    leaderboard_df = get_leaderboard_df(
+        leaderboard_dataset=leaderboard_dataset,
+        cols=COLS,
+        benchmark_cols=BENCHMARK_COLS,
+    )
+
+    return leaderboard_df
+
+def get_latest_data_queue():
+    eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+    return eval_queue_dfs
 
-def init_space(full_init: bool = True):
+def init_space():
     """Initializes the application space, loading only necessary data."""
-    if full_init:
+    if DO_FULL_INIT:
         # These downloads only occur on full initialization
         try:
             download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
-            download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
-            download_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)
         except Exception:
             restart_space()
 
-    # Always retrieve the leaderboard DataFrame
-    raw_data, original_df = get_leaderboard_df(
-        results_path=EVAL_RESULTS_PATH,
-        requests_path=EVAL_REQUESTS_PATH,
-        dynamic_path=DYNAMIC_INFO_FILE_PATH,
-        cols=COLS,
-        benchmark_cols=BENCHMARK_COLS,
-    )
-
-    if full_init:
-        # Collection update only happens on full initialization
-        update_collections(original_df)
-
-    leaderboard_df = original_df.copy()
+    # Always redownload the leaderboard DataFrame
+    leaderboard_df = get_latest_data_leaderboard()
 
     # Evaluation queue DataFrame retrieval is independent of initialization detail level
-    eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+    eval_queue_dfs = get_latest_data_queue()
 
-    return leaderboard_df, raw_data, original_df, eval_queue_dfs
+    return leaderboard_df, eval_queue_dfs
 
 
-# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
-# This controls whether a full initialization should be performed.
-do_full_init = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
-
 # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
 # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
-leaderboard_df, raw_data, original_df, eval_queue_dfs = init_space(full_init=do_full_init)
+leaderboard_df, eval_queue_dfs = init_space()
 finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
 
 
 # Data processing for plots now only on demand in the respective Gradio tab
 def load_and_create_plots():
-    plot_df = create_plot_df(create_scores_df(raw_data))
+    plot_df = create_plot_df(create_scores_df(leaderboard_df))
     return plot_df
 
+def init_leaderboard(dataframe):
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            ColumnFilter(
+                AutoEvalColumn.params.name,
+                type="slider",
+                min=0.01,
+                max=150,
+                label="Select the number of parameters (B)",
+            ),
+            ColumnFilter(
+                AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True
+            ),
+            ColumnFilter(
+                AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True
+            ),
+            ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
+            ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
+        ],
+        bool_checkboxgroup_label="Hide models",
+    )
+
 
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -150,37 +180,7 @@ with demo:
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = Leaderboard(
-                value=leaderboard_df,
-                datatype=[c.type for c in fields(AutoEvalColumn)],
-                select_columns=SelectColumns(
-                    default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-                    cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
-                    label="Select Columns to Display:",
-                ),
-                search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
-                hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-                filter_columns=[
-                    ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-                    ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-                    ColumnFilter(
-                        AutoEvalColumn.params.name,
-                        type="slider",
-                        min=0.01,
-                        max=150,
-                        label="Select the number of parameters (B)",
-                    ),
-                    ColumnFilter(
-                        AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True
-                    ),
-                    ColumnFilter(
-                        AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True
-                    ),
-                    ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
-                    ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
-                ],
-                bool_checkboxgroup_label="Hide models",
-            )
+            leaderboard = init_leaderboard(leaderboard_df)
 
        with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
            with gr.Row():
@@ -219,7 +219,6 @@ with demo:
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
                     model_type = gr.Dropdown(
                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                         label="Model type",
@@ -290,7 +289,6 @@ with demo:
                 base_model_name_textbox,
                 revision_name_textbox,
                 precision,
-                private,
                 weight_type,
                 model_type,
             ],
@@ -307,9 +305,61 @@ with demo:
             show_copy_button=True,
         )
 
-    scheduler = BackgroundScheduler()
-    scheduler.add_job(restart_space, "interval", hours=3)  # restarted every 3h
-    scheduler.add_job(update_dynamic_files, "interval", hours=2)  # launched every 2 hour
-    scheduler.start()
+    demo.load(fn=get_latest_data_leaderboard, inputs=None, outputs=[leaderboard])
+    demo.load(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
 
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40)
+
+# Start ephemeral Spaces on PRs (see config in README.md)
+from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
+
+def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
+    # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
+    # Compared to original, this one do not monkeypatch Gradio which allows us to define more webhooks.
+    # ht to Lucain!
+    if SPACE_ID is None:
+        print("Not in a Space: Space CI disabled.")
+        return WebhooksServer(ui=demo)
+
+    if IS_EPHEMERAL_SPACE:
+        print("In an ephemeral Space: Space CI disabled.")
+        return WebhooksServer(ui=demo)
+
+    card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
+    config = card.data.get("space_ci", {})
+    print(f"Enabling Space CI with config from README: {config}")
+
+    return configure_space_ci(
+        blocks=ui,
+        trusted_authors=config.get("trusted_authors"),
+        private=config.get("private", "auto"),
+        variables=config.get("variables", "auto"),
+        secrets=config.get("secrets"),
+        hardware=config.get("hardware"),
+        storage=config.get("storage"),
+    )
+
+# Create webhooks server (with CI url if in Space and not ephemeral)
+webhooks_server = enable_space_ci_and_return_server(ui=demo)
+
+# Add webhooks
+@webhooks_server.add_webhook
+async def update_leaderboard(payload: WebhookPayload) -> None:
+    """Redownloads the leaderboard dataset each time it updates"""
+    if payload.repo.type == "dataset" and payload.event.action == "update":
+        datasets.load_dataset(
+            AGGREGATED_REPO,
+            "default",
+            split="train",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
+            verification_mode="no_checks"
+        )
+
+@webhooks_server.add_webhook
+async def update_queue(payload: WebhookPayload) -> None:
+    """Redownloads the queue dataset each time it updates"""
+    if payload.repo.type == "dataset" and payload.event.action == "update":
+        download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
+
+webhooks_server.launch()
requirements.txt CHANGED
@@ -15,4 +15,4 @@ transformers==4.41.1
 tokenizers>=0.15.0
 gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/[email protected] # CI !!!
 gradio==4.20.0
-gradio_leaderboard==0.0.8
+gradio_leaderboard==0.0.9
src/display/about.py CHANGED
@@ -81,7 +81,7 @@ To get more information about quantization, see:
 - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
 
 ### Useful links
-- [Community resources](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174)
+- [Community resources](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/174)
 - [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03)
 
 ### Other cool leaderboards:
@@ -217,7 +217,7 @@ CITATION_BUTTON_TEXT = r"""
   title = {Open LLM Leaderboard},
   year = {2023},
   publisher = {Hugging Face},
-  howpublished = "\url{https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard}"
+  howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}"
 }
 @software{eval-harness,
   author = {Gao, Leo and
src/display/utils.py CHANGED
@@ -93,6 +93,7 @@ auto_eval_column_dict.append(
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
 auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
+auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
 
src/envs.py CHANGED
@@ -2,17 +2,11 @@ import os
 from huggingface_hub import HfApi
 
 # clone / pull the lmeh eval data
-H4_TOKEN = os.environ.get("H4_TOKEN", None)
+HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
-REPO_ID = "HuggingFaceH4/open_llm_leaderboard"
+REPO_ID = "open-llm-leaderboard/open_llm_leaderboard"
 QUEUE_REPO = "open-llm-leaderboard/requests"
-DYNAMIC_INFO_REPO = "open-llm-leaderboard/dynamic_model_information"
-RESULTS_REPO = "open-llm-leaderboard/results"
-
-PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
-PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
-
-IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
+AGGREGATED_REPO = "open-llm-leaderboard/contents"
 
 HF_HOME = os.getenv("HF_HOME", ".")
 
@@ -27,18 +21,10 @@ else:
     print("Write access confirmed for HF_HOME")
 
 EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(HF_HOME, "eval-results")
-DYNAMIC_INFO_PATH = os.path.join(HF_HOME, "dynamic-info")
-DYNAMIC_INFO_FILE_PATH = os.path.join(DYNAMIC_INFO_PATH, "model_infos.json")
-
-EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
-EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
-
-PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03"
 
 # Rate limit variables
 RATE_LIMIT_PERIOD = 7
 RATE_LIMIT_QUOTA = 5
 HAS_HIGHER_RATE_LIMIT = ["TheBloke"]
 
-API = HfApi(token=H4_TOKEN)
+API = HfApi(token=HF_TOKEN)
src/leaderboard/filter_models.py CHANGED
@@ -5,120 +5,120 @@ from src.display.utils import AutoEvalColumn
5
  # Models which have been flagged by users as being problematic for a reason or another
6
  # (Model name to forum discussion link)
7
  FLAGGED_MODELS = {
8
- "merged": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
9
- "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
10
- "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
11
- "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
12
- "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/236",
13
- "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/237",
14
- "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/215",
15
- "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
16
- "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
17
- "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
18
- "fblgit/una-xaberius-34b-v1beta": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/444",
19
- "jan-hq/trinity-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
20
- "rwitz2/go-bruins-v2.1.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
21
- "rwitz2/go-bruins-v2.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
22
- "GreenNode/GreenNodeLM-v3olet-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
23
- "GreenNode/GreenNodeLM-7B-v4leo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
24
- "GreenNode/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
25
- "viethq188/LeoScorpius-7B-Chat-DPO": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
26
- "GreenNode/GreenNodeLM-7B-v2leo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
27
- "janai-hq/trinity-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
28
- "ignos/LeoScorpius-GreenNode-Alpaca-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
29
- "fblgit/una-cybertron-7b-v3-OMA": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
30
- "mncai/mistral-7b-dpo-merge-v1.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
31
- "mncai/mistral-7b-dpo-v6": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
32
- "Toten5/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
33
- "GreenNode/GreenNodeLM-7B-v1olet": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
34
- "quantumaikr/quantum-dpo-v0.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
35
- "quantumaikr/quantum-v0.01": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
36
- "quantumaikr/quantum-trinity-v0.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
37
- "mncai/mistral-7b-dpo-v5": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
38
- "cookinai/BruinHermes": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
39
- "jan-ai/Pandora-10.7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
40
- "v1olet/v1olet_marcoroni-go-bruins-merge-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
41
- "v1olet/v1olet_merged_dpo_7B_v3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
42
- "rwitz2/pee": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
43
- "zyh3826 / GML-Mistral-merged-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/503",
44
- "dillfrescott/trinity-medium": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
45
- "udkai/Garrulus": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/526",
46
  "dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
47
- "eren23/slerp-test-turdus-beagle": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
48
- "abideen/NexoNimbus-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
49
- "alnrg2arg/test2_3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
50
- "nfaheem/Marcoroni-7b-DPO-Merge": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
51
- "CultriX/MergeTrix-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
52
- "liminerity/Blur-7b-v1.21": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
53
  # Merges not indicated
54
- "gagan3012/MetaModelv2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
55
- "gagan3012/MetaModelv3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
56
- "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
57
- "kyujinpy/Sakura-SOLAR-Instruct-DPO-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
58
- "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
59
- "kyujinpy/Sakura-SOLRCA-Instruct-DPO": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
60
- "fblgit/LUNA-SOLARkrautLM-Instruct": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
61
- "perlthoughts/Marcoroni-8x7B-v3-MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
62
- "rwitz/go-bruins-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
63
- "rwitz/go-bruins": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
64
- "Walmart-the-bag/Solar-10.7B-Cato": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
65
- "aqweteddy/mistral_tv-neural-marconroni": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
66
- "NExtNewChattingAI/shark_tank_ai_7_b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
67
- "Q-bert/MetaMath-Cybertron": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
68
- "OpenPipe/mistral-ft-optimized-1227": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
69
- "perlthoughts/Falkor-7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
70
- "v1olet/v1olet_merged_dpo_7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
71
- "Ba2han/BruinsV2-OpHermesNeu-11B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
72
- "DopeorNope/You_can_cry_Snowman-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
73
- "PistachioAlt/Synatra-MCS-7B-v0.3-RP-Slerp": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
74
- "Weyaxi/MetaMath-una-cybertron-v2-bf16-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
75
- "Weyaxi/OpenHermes-2.5-neural-chat-7b-v3-2-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
76
- "perlthoughts/Falkor-8x7B-MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
77
- "elinas/chronos007-70b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
78
- "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Linear": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
79
- "Weyaxi/MetaMath-neural-chat-7b-v3-2-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
80
- "diffnamehard/Mistral-CatMacaroni-slerp-uncensored-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
81
- "Weyaxi/neural-chat-7b-v3-1-OpenHermes-2.5-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
82
- "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
83
- "Walmart-the-bag/Misted-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
84
- "garage-bAInd/Camel-Platypus2-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
85
- "Weyaxi/OpenOrca-Zephyr-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
86
- "uukuguy/speechless-mistral-7b-dare-0.85": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
87
- "DopeorNope/SOLARC-M-10.7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
88
- "cloudyu/Mixtral_11Bx2_MoE_19B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
89
- "DopeorNope/SOLARC-MOE-10.7Bx6 ": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
90
- "DopeorNope/SOLARC-MOE-10.7Bx4": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
91
- "gagan3012/MetaModelv2 ": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
92
- "udkai/Turdus": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
93
- "kodonho/Solar-OrcaDPO-Solar-Instruct-SLERP": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
94
- "kodonho/SolarM-SakuraSolar-SLERP": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
95
- "Yhyu13/LMCocktail-10.7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
96
- "mlabonne/NeuralMarcoro14-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
97
- "Neuronovo/neuronovo-7B-v0.2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
98
- "ryandt/MusingCaterpillar": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
99
- "Neuronovo/neuronovo-7B-v0.3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
100
- "SanjiWatsuki/Lelantos-DPO-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
101
- "bardsai/jaskier-7b-dpo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
102
- "cookinai/OpenCM-14": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
103
- "bardsai/jaskier-7b-dpo-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
104
- "jan-hq/supermario-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
105
  # MoErges
106
- "cloudyu/Yi-34Bx2-MoE-60B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
107
- "cloudyu/Mixtral_34Bx2_MoE_60B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
108
- "gagan3012/MetaModel_moe": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
109
- "macadeliccc/SOLAR-math-2x10.7b-v0.2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
110
- "cloudyu/Mixtral_7Bx2_MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
111
- "macadeliccc/SOLAR-math-2x10.7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
112
- "macadeliccc/Orca-SOLAR-4x10.7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
113
- "macadeliccc/piccolo-8x7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
114
- "cloudyu/Mixtral_7Bx4_MOE_24B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
115
- "macadeliccc/laser-dolphin-mixtral-2x7b-dpo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
116
- "macadeliccc/polyglot-math-4x7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
117
  # Other - contamination mostly
118
- "DopeorNope/COKAL-v1-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/566",
119
- "CultriX/MistralTrix-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/556",
120
- "Contamination/contaminated_proof_7b_v1.0": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/664",
121
- "Contamination/contaminated_proof_7b_v1.0_safetensor": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/664",
122
  }
123
 
124
  # Models which have been requested by orgs to not be submitted on the leaderboard
@@ -167,6 +167,18 @@ def remove_forbidden_models(leaderboard_data: list[dict]):
167
  leaderboard_data.pop(ix)
168
  return leaderboard_data
169
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
  def filter_models_flags(leaderboard_data: list[dict]):
172
  leaderboard_data = remove_forbidden_models(leaderboard_data)
 
5
  # Models which have been flagged by users as being problematic for a reason or another
6
  # (Model name to forum discussion link)
7
  FLAGGED_MODELS = {
8
+ "merged": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
9
+ "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/202",
10
+ "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/207",
11
+ "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/213",
12
+ "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/236",
13
+ "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/237",
14
+ "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/215",
15
+ "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
16
+ "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
17
+ "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
18
+ "fblgit/una-xaberius-34b-v1beta": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/444",
19
+ "jan-hq/trinity-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
20
+ "rwitz2/go-bruins-v2.1.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
21
+ "rwitz2/go-bruins-v2.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
22
+ "GreenNode/GreenNodeLM-v3olet-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
23
+ "GreenNode/GreenNodeLM-7B-v4leo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
24
+ "GreenNode/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
25
+ "viethq188/LeoScorpius-7B-Chat-DPO": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
26
+ "GreenNode/GreenNodeLM-7B-v2leo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
27
+ "janai-hq/trinity-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
28
+ "ignos/LeoScorpius-GreenNode-Alpaca-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
29
+ "fblgit/una-cybertron-7b-v3-OMA": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
30
+ "mncai/mistral-7b-dpo-merge-v1.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
31
+ "mncai/mistral-7b-dpo-v6": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
32
+ "Toten5/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
33
+ "GreenNode/GreenNodeLM-7B-v1olet": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
34
+ "quantumaikr/quantum-dpo-v0.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
35
+ "quantumaikr/quantum-v0.01": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
36
+ "quantumaikr/quantum-trinity-v0.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
37
+ "mncai/mistral-7b-dpo-v5": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
38
+ "cookinai/BruinHermes": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
39
+ "jan-ai/Pandora-10.7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
40
+ "v1olet/v1olet_marcoroni-go-bruins-merge-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
41
+ "v1olet/v1olet_merged_dpo_7B_v3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
42
+ "rwitz2/pee": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
43
+ "zyh3826 / GML-Mistral-merged-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/503",
44
+ "dillfrescott/trinity-medium": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
45
+ "udkai/Garrulus": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/526",
46
  "dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
47
+ "eren23/slerp-test-turdus-beagle": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
48
+ "abideen/NexoNimbus-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
49
+ "alnrg2arg/test2_3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
50
+ "nfaheem/Marcoroni-7b-DPO-Merge": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
51
+ "CultriX/MergeTrix-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
52
+ "liminerity/Blur-7b-v1.21": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
53
  # Merges not indicated
54
+ "gagan3012/MetaModelv2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
55
+ "gagan3012/MetaModelv3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
56
+ "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
57
+ "kyujinpy/Sakura-SOLAR-Instruct-DPO-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
58
+ "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
59
+ "kyujinpy/Sakura-SOLRCA-Instruct-DPO": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
60
+ "fblgit/LUNA-SOLARkrautLM-Instruct": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
61
+ "perlthoughts/Marcoroni-8x7B-v3-MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
62
+ "rwitz/go-bruins-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
63
+ "rwitz/go-bruins": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
64
+ "Walmart-the-bag/Solar-10.7B-Cato": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
65
+ "aqweteddy/mistral_tv-neural-marconroni": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
66
+ "NExtNewChattingAI/shark_tank_ai_7_b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
67
+ "Q-bert/MetaMath-Cybertron": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
68
+ "OpenPipe/mistral-ft-optimized-1227": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
69
+ "perlthoughts/Falkor-7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
70
+ "v1olet/v1olet_merged_dpo_7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
71
+ "Ba2han/BruinsV2-OpHermesNeu-11B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
72
+ "DopeorNope/You_can_cry_Snowman-13B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
73
+ "PistachioAlt/Synatra-MCS-7B-v0.3-RP-Slerp": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
74
+ "Weyaxi/MetaMath-una-cybertron-v2-bf16-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
75
+ "Weyaxi/OpenHermes-2.5-neural-chat-7b-v3-2-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
76
+ "perlthoughts/Falkor-8x7B-MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
77
+ "elinas/chronos007-70b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
78
+ "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Linear": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
79
+ "Weyaxi/MetaMath-neural-chat-7b-v3-2-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
80
+ "diffnamehard/Mistral-CatMacaroni-slerp-uncensored-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
81
+ "Weyaxi/neural-chat-7b-v3-1-OpenHermes-2.5-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
82
+ "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
83
+ "Walmart-the-bag/Misted-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
84
+ "garage-bAInd/Camel-Platypus2-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
85
+ "Weyaxi/OpenOrca-Zephyr-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
86
+ "uukuguy/speechless-mistral-7b-dare-0.85": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
87
+ "DopeorNope/SOLARC-M-10.7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
88
+ "cloudyu/Mixtral_11Bx2_MoE_19B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
89
+ "DopeorNope/SOLARC-MOE-10.7Bx6 ": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
90
+ "DopeorNope/SOLARC-MOE-10.7Bx4": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
91
+ "gagan3012/MetaModelv2 ": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
92
+ "udkai/Turdus": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
93
+ "kodonho/Solar-OrcaDPO-Solar-Instruct-SLERP": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
94
+ "kodonho/SolarM-SakuraSolar-SLERP": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
95
+ "Yhyu13/LMCocktail-10.7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
96
+ "mlabonne/NeuralMarcoro14-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
97
+ "Neuronovo/neuronovo-7B-v0.2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
98
+ "ryandt/MusingCaterpillar": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
99
+ "Neuronovo/neuronovo-7B-v0.3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
100
+ "SanjiWatsuki/Lelantos-DPO-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
101
+ "bardsai/jaskier-7b-dpo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
102
+ "cookinai/OpenCM-14": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
103
+ "bardsai/jaskier-7b-dpo-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
104
+ "jan-hq/supermario-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
105
  # MoErges
106
+ "cloudyu/Yi-34Bx2-MoE-60B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
107
+ "cloudyu/Mixtral_34Bx2_MoE_60B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
108
+ "gagan3012/MetaModel_moe": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
109
+ "macadeliccc/SOLAR-math-2x10.7b-v0.2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
110
+ "cloudyu/Mixtral_7Bx2_MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
111
+ "macadeliccc/SOLAR-math-2x10.7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
112
+ "macadeliccc/Orca-SOLAR-4x10.7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
113
+ "macadeliccc/piccolo-8x7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
114
+ "cloudyu/Mixtral_7Bx4_MOE_24B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
115
+ "macadeliccc/laser-dolphin-mixtral-2x7b-dpo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
116
+ "macadeliccc/polyglot-math-4x7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
117
  # Other - contamination mostly
118
+ "DopeorNope/COKAL-v1-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/566",
119
+ "CultriX/MistralTrix-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/556",
120
+ "Contamination/contaminated_proof_7b_v1.0": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/664",
121
+ "Contamination/contaminated_proof_7b_v1.0_safetensor": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/664",
122
  }
123
 
124
  # Models which have been requested by orgs to not be submitted on the leaderboard
 
167
  leaderboard_data.pop(ix)
168
  return leaderboard_data
169
 
170
+ """
171
+ def remove_forbidden_models(leaderboard_data):
172
+ #Removes models from the leaderboard based on the DO_NOT_SUBMIT list.
173
+ indices_to_remove = []
174
+ for ix, row in leaderboard_data.iterrows():
175
+ if row[AutoEvalColumn.fullname.name] in DO_NOT_SUBMIT_MODELS:
176
+ indices_to_remove.append(ix)
177
+
178
+ # Remove the models from the list
179
+ return leaderboard_data.drop(indices_to_remove)
180
+ """
181
+
182
 
183
  def filter_models_flags(leaderboard_data: list[dict]):
184
  leaderboard_data = remove_forbidden_models(leaderboard_data)
src/leaderboard/read_evals.py DELETED
@@ -1,261 +0,0 @@
1
- import json
2
- from pathlib import Path
3
- from json import JSONDecodeError
4
- import logging
5
- import math
6
-
7
- from dataclasses import dataclass, field
8
- from typing import Optional, Dict, List
9
-
10
- from tqdm import tqdm
11
- from tqdm.contrib.logging import logging_redirect_tqdm
12
-
13
- import numpy as np
14
-
15
- from src.display.formatting import make_clickable_model
16
- from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
17
-
18
- # Configure logging
19
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
20
-
21
-
22
- @dataclass
23
- class EvalResult:
24
- # Also see src.display.utils.AutoEvalColumn for what will be displayed.
25
- eval_name: str # org_model_precision (uid)
26
- full_model: str # org/model (path on hub)
27
- org: Optional[str]
28
- model: str
29
- revision: str # commit hash, "" if main
30
- results: Dict[str, float]
31
- precision: Precision = Precision.Unknown
32
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
33
- weight_type: WeightType = WeightType.Original
34
- architecture: str = "Unknown" # From config file
35
- license: str = "?"
36
- likes: int = 0
37
- num_params: int = 0
38
- date: str = "" # submission date of request file
39
- still_on_hub: bool = True
40
- is_merge: bool = False
41
- not_flagged: bool = False
42
- status: str = "FINISHED"
43
- # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
44
- tags: List[str] = field(default_factory=list)
45
-
46
- @classmethod
47
- def init_from_json_file(cls, json_filepath: str) -> "EvalResult":
48
- with open(json_filepath, "r") as fp:
49
- data = json.load(fp)
50
-
51
- config = data.get("config_general", {})
52
- precision = Precision.from_str(config.get("model_dtype", "unknown"))
53
- org_and_model = config.get("model_name", "").split("/", 1)
54
- org = org_and_model[0] if len(org_and_model) > 1 else None
55
- model = org_and_model[-1]
56
- if len(org_and_model) == 1:
57
- org = None
58
- model = org_and_model[0]
59
- result_key = f"{model}_{precision.value.name}"
60
- else:
61
- org = org_and_model[0]
62
- model = org_and_model[1]
63
- result_key = f"{org}_{model}_{precision.value.name}"
64
- full_model = "/".join(org_and_model)
65
-
66
- results = cls.extract_results(data) # Properly call the method to extract results
67
-
68
- return cls(
69
- eval_name=result_key,
70
- full_model=full_model,
71
- org=org,
72
- model=model,
73
- results=results,
74
- precision=precision,
75
- revision=config.get("model_sha", ""),
76
- )
77
-
78
- @staticmethod
79
- def extract_results(data: Dict) -> Dict[str, float]:
80
- """
81
- Extract and process benchmark results from a given dict.
82
-
83
- Parameters:
84
- - data (Dict): A dictionary containing benchmark data. This dictionary must
85
- include 'versions' and 'results' keys with respective sub-data.
86
-
87
- Returns:
88
- - Dict[str, float]: A dictionary where keys are benchmark names and values
89
- are the processed average scores as percentages.
90
-
91
- Notes:
92
- - The method specifically checks for certain benchmark names to skip outdated entries.
93
- - Handles NaN values by setting the corresponding benchmark result to 0.0.
94
- - Averages scores across metrics for benchmarks found in the data, in a percentage format.
95
- """
96
- results = {}
97
- for task in Tasks:
98
- task = task.value
99
- # We skip old mmlu entries
100
- if task.benchmark == "hendrycksTest":
101
- for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
102
- if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
103
- continue
104
-
105
- # Some benchamrk values are NaNs, mostly truthfulQA
106
- # Would be more optimal (without the whole dict itertion) if benchmark name was same as key in results
107
-            # e.g. not harness|truthfulqa:mc|0 but truthfulqa:mc
-            for k, v in data["results"].items():
-                if task.benchmark in k:
-                    if math.isnan(float(v[task.metric])):
-                        results[task.benchmark] = 0.0
-                        continue
-
-            # We average all scores of a given metric (mostly for mmlu)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        return results
-
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it."""
-        try:
-            request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-            if request_file is None:
-                logging.warning(f"No request file for {self.org}/{self.model}")
-                self.status = "FAILED"
-                return
-
-            with open(request_file, "r") as f:
-                request = json.load(f)
-
-            self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.num_params = int(request.get("params", 0))  # Ensuring type safety
-            self.date = request.get("submitted_time", "")
-            self.architecture = request.get("architectures", "Unknown")
-            self.status = request.get("status", "FAILED")
-
-        except FileNotFoundError:
-            self.status = "FAILED"
-            logging.error(f"Request file: {request_file} not found for {self.org}/{self.model}")
-        except JSONDecodeError:
-            self.status = "FAILED"
-            logging.error(f"Error decoding JSON from the request file for {self.org}/{self.model}")
-        except KeyError as e:
-            self.status = "FAILED"
-            logging.error(f"Key error {e} in processing request file for {self.org}/{self.model}")
-        except Exception as e:  # Catch-all for any other unexpected exceptions
-            self.status = "FAILED"
-            logging.error(f"Unexpected error {e} for {self.org}/{self.model}")
-
-    def update_with_dynamic_file_dict(self, file_dict):
-        """Update object attributes based on the provided dictionary, with error handling for missing keys and type validation."""
-        # Default values set for optional or potentially missing keys.
-        self.license = file_dict.get("license", "?")
-        self.likes = int(file_dict.get("likes", 0))  # Ensure likes is treated as an integer
-        self.still_on_hub = file_dict.get("still_on_hub", False)  # Default to False if key is missing
-        self.tags = file_dict.get("tags", [])
-
-        # Calculate `flagged` only if 'tags' is not empty and avoid calculating each time
-        self.not_flagged = not (any("flagged" in tag for tag in self.tags))
-
-    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-        data_dict = {
-            "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.fullname.name: self.full_model,
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-            AutoEvalColumn.merged.name: not ("merge" in self.tags if self.tags else False),
-            AutoEvalColumn.moe.name: not (
-                ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower()
-            ),
-            AutoEvalColumn.not_flagged.name: self.not_flagged,
-        }
-
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
-        return data_dict
-
-
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    requests_path = Path(requests_path)
-    pattern = f"{model_name}_eval_request_*.json"
-
-    # Using pathlib to find files matching the pattern
-    request_files = list(requests_path.glob(pattern))
-
-    # Sort the files by name in descending order to mimic 'reverse=True'
-    request_files.sort(reverse=True)
-
-    # Select the correct request file based on 'status' and 'precision'
-    request_file = None
-    for request_file in request_files:
-        with request_file.open("r") as f:
-            req_content = json.load(f)
-            if req_content["status"] == "FINISHED" and req_content["precision"] == precision.split(".")[-1]:
-                request_file = str(request_file)
-
-    # Return empty string if no file found that matches criteria
-    return request_file
-
-
-def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
-    with open(dynamic_path) as f:
-        dynamic_data = json.load(f)
-
-    results_path = Path(results_path)
-    model_files = list(results_path.rglob("results_*.json"))
-    model_files.sort(key=lambda file: parse_datetime(file.stem.removeprefix("results_")))
-
-    eval_results = {}
-    # Wrap model_files iteration with tqdm for progress display
-    for model_result_filepath in tqdm(model_files, desc="Processing model files"):
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        with logging_redirect_tqdm():
-            eval_result.update_with_request_file(requests_path)
-
-        if eval_result.full_model in dynamic_data:
-            eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
-            # Hardcoding because of gating problem
-            if any([org in eval_result.full_model for org in ["meta-llama/", "google/", "tiiuae/"]]):
-                eval_result.still_on_hub = True
-
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-
-    results = []
-    for k, v in eval_results.items():
-        try:
-            if v.status == "FINISHED":
-                v.to_dict()  # we test if the dict version is complete
-                results.append(v)
-        except KeyError as e:
-            logging.error(f"Error while checking model {k} {v.date} json, no key: {e}")  # not all eval values present
-            continue
-
-    return results
src/populate.py CHANGED
@@ -1,9 +1,9 @@
 import pathlib
 import pandas as pd
+from datasets import Dataset
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
 from src.leaderboard.filter_models import filter_models_flags
-from src.leaderboard.read_evals import get_raw_eval_results
 from src.display.utils import load_json_data


@@ -39,14 +39,15 @@ def get_evaluation_queue_df(save_path, cols):
     return tuple(pd.DataFrame(status_dfs[status], columns=cols) for status in ["FINISHED", "RUNNING", "PENDING"])


-def get_leaderboard_df(results_path, requests_path, dynamic_path, cols, benchmark_cols):
+def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list, benchmark_cols: list):
     """Retrieve and process leaderboard data."""
-    raw_data = get_raw_eval_results(results_path, requests_path, dynamic_path)
-    all_data_json = [model.to_dict() for model in raw_data] + [baseline_row]
-    filter_models_flags(all_data_json)
+    all_data_json = leaderboard_dataset.to_dict()
+    num_items = leaderboard_dataset.num_rows
+    all_data_json_list = [{k: all_data_json[k][ix] for k in all_data_json.keys()} for ix in range(num_items)]
+    filter_models_flags(all_data_json_list)

-    df = pd.DataFrame.from_records(all_data_json)
+    df = pd.DataFrame.from_records(all_data_json_list)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
     df = df[has_no_nan_values(df, benchmark_cols)]
-    return raw_data, df
+    return df
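With this change, `get_leaderboard_df` no longer parses raw result files; it consumes a pre-aggregated `datasets.Dataset` with the leaderboard contents. A minimal sketch of how it could be fed, assuming a hypothetical contents dataset repo id and the `COLS`/`BENCHMARK_COLS` constants (neither is shown in this diff):

from datasets import load_dataset

from src.display.utils import BENCHMARK_COLS, COLS  # assumed constant names
from src.populate import get_leaderboard_df

# Hypothetical repo id; the Space loads whichever contents dataset app.py points to.
leaderboard_dataset = load_dataset("open-llm-leaderboard/contents", split="train")
leaderboard_df = get_leaderboard_df(leaderboard_dataset, COLS, BENCHMARK_COLS)
print(leaderboard_df.head())

Note that `Dataset.to_dict()` is column-oriented, which is why the new function rebuilds a row-oriented list before handing it to `filter_models_flags` and `pandas.DataFrame.from_records`.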
src/scripts/update_all_request_files.py DELETED
@@ -1,129 +0,0 @@
-import json
-import os
-import time
-
-from huggingface_hub import snapshot_download
-
-from src.envs import API, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_REPO, EVAL_REQUESTS_PATH, H4_TOKEN
-from src.submission.check_validity import check_model_card, get_model_tags, is_model_on_hub
-
-
-def update_one_model(model_id, data, models_on_the_hub):
-    # Model no longer on the hub at all
-    if model_id not in models_on_the_hub:
-        data["still_on_hub"] = False
-        data["likes"] = 0
-        data["downloads"] = 0
-        data["created_at"] = ""
-        data["tags"] = []
-        return data
-
-    # Grabbing model parameters
-    model_cfg = models_on_the_hub[model_id]
-    data["likes"] = model_cfg.likes
-    data["downloads"] = model_cfg.downloads
-    data["created_at"] = str(model_cfg.created_at)
-    data["license"] = model_cfg.card_data.license if model_cfg.card_data is not None else ""
-
-    # Grabbing model details
-    model_name = model_id
-    if model_cfg.card_data is not None and model_cfg.card_data.base_model is not None:
-        if isinstance(model_cfg.card_data.base_model, str):
-            model_name = model_cfg.card_data.base_model  # for adapters, we look at the parent model
-    still_on_hub, _, _ = is_model_on_hub(
-        model_name=model_name,
-        revision=data.get("revision"),
-        trust_remote_code=True,
-        test_tokenizer=False,
-        token=H4_TOKEN,
-    )
-    # If the model doesn't have a model card or a license, we consider it's deleted
-    if still_on_hub:
-        try:
-            status, _, model_card = check_model_card(model_id)
-            if status is False:
-                still_on_hub = False
-        except Exception:
-            model_card = None
-            still_on_hub = False
-    data["still_on_hub"] = still_on_hub
-
-    tags = get_model_tags(model_card, model_id) if still_on_hub else []
-
-    data["tags"] = tags
-    return data
-
-
-def update_models(file_path, models_on_the_hub):
-    """
-    Search through all JSON files in the specified root folder and its subfolders,
-    and update the likes key in JSON dict from value of input dict
-    """
-    seen_models = []
-    with open(file_path, "r") as f:
-        model_infos = json.load(f)
-        for model_id in model_infos.keys():
-            seen_models.append(model_id)
-            model_infos[model_id] = update_one_model(
-                model_id=model_id, data=model_infos[model_id], models_on_the_hub=models_on_the_hub
-            )
-
-    # If new requests files have been created since we started all this
-    # we grab them
-    all_models = []
-    try:
-        for ix, (root, _, files) in enumerate(os.walk(EVAL_REQUESTS_PATH)):
-            if ix == 0:
-                continue
-            for file in files:
-                if "eval_request" in file:
-                    path = root.split("/")[-1] + "/" + file.split("_eval_request")[0]
-                    all_models.append(path)
-    except Exception as e:
-        print(e)
-        pass
-
-    for model_id in all_models:
-        if model_id not in seen_models:
-            model_infos[model_id] = update_one_model(model_id=model_id, data={}, models_on_the_hub=models_on_the_hub)
-
-    with open(file_path, "w") as f:
-        json.dump(model_infos, f, indent=2)
-
-
-def update_dynamic_files():
-    """This will only update metadata for models already linked in the repo, not add missing ones."""
-    snapshot_download(
-        repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-    )
-
-    print("UPDATE_DYNAMIC: Loaded snapshot")
-    # Get models
-    start = time.time()
-
-    models = list(
-        API.list_models(
-            # filter=ModelFilter(task="text-generation"),
-            full=False,
-            cardData=True,
-            fetch_config=True,
-        )
-    )
-    id_to_model = {model.id: model for model in models}
-
-    print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")
-
-    start = time.time()
-
-    update_models(DYNAMIC_INFO_FILE_PATH, id_to_model)
-
-    print(f"UPDATE_DYNAMIC: updated in {time.time() - start:.2f} seconds")
-
-    API.upload_file(
-        path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
-        path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
-        repo_id=DYNAMIC_INFO_REPO,
-        repo_type="dataset",
-        commit_message="Daily request file update.",
-    )
-    print("UPDATE_DYNAMIC: pushed to hub")
src/submission/check_validity.py CHANGED
@@ -13,7 +13,7 @@ from src.envs import HAS_HIGHER_RATE_LIMIT


 # ht to @Wauplin, thank you for the snippet!
-# See https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/317
+# See https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/317
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     # Returns operation status, and error message
     try:
src/submission/submit.py CHANGED
@@ -2,16 +2,11 @@ import json
 import os
 from datetime import datetime, timezone

-from huggingface_hub import snapshot_download
-
 from src.display.formatting import styled_error, styled_message, styled_warning
 from src.envs import (
     API,
-    DYNAMIC_INFO_FILE_PATH,
-    DYNAMIC_INFO_PATH,
-    DYNAMIC_INFO_REPO,
     EVAL_REQUESTS_PATH,
-    H4_TOKEN,
+    HF_TOKEN,
     QUEUE_REPO,
     RATE_LIMIT_PERIOD,
     RATE_LIMIT_QUOTA,
@@ -35,7 +30,6 @@ def add_new_eval(
     base_model: str,
     revision: str,
     precision: str,
-    private: bool,
     weight_type: str,
     model_type: str,
 ):
@@ -80,7 +74,7 @@
     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
         base_model_on_hub, error, _ = is_model_on_hub(
-            model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True
+            model_name=base_model, revision=revision, token=HF_TOKEN, test_tokenizer=True
         )
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
@@ -126,7 +120,6 @@
         "model": model,
         "base_model": base_model,
         "revision": model_info.sha,  # force to use the exact model commit
-        "private": private,
         "precision": precision,
         "params": model_size,
         "architectures": architecture,
@@ -154,7 +147,7 @@
     print("Creating eval file")
    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"

     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
@@ -168,26 +161,6 @@
         commit_message=f"Add {model} to eval queue",
     )

-    # We want to grab the latest version of the submission file to not accidentally overwrite it
-    snapshot_download(
-        repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-    )
-
-    with open(DYNAMIC_INFO_FILE_PATH) as f:
-        all_supplementary_info = json.load(f)
-
-    all_supplementary_info[model] = supplementary_info
-    with open(DYNAMIC_INFO_FILE_PATH, "w") as f:
-        json.dump(all_supplementary_info, f, indent=2)
-
-    API.upload_file(
-        path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
-        path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
-        repo_id=DYNAMIC_INFO_REPO,
-        repo_type="dataset",
-        commit_message=f"Add {model} to dynamic info queue",
-    )
-
     # Remove the local file
     os.remove(out_path)

src/tools/collections.py DELETED
@@ -1,76 +0,0 @@
-import pandas as pd
-from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
-from huggingface_hub.utils._errors import HfHubHTTPError
-from pandas import DataFrame
-
-from src.display.utils import AutoEvalColumn, ModelType
-from src.envs import H4_TOKEN, PATH_TO_COLLECTION
-
-# Specific intervals for the collections
-intervals = {
-    "1B": pd.Interval(0, 1.5, closed="right"),
-    "3B": pd.Interval(2.5, 3.5, closed="neither"),
-    "7B": pd.Interval(6, 8, closed="neither"),
-    "13B": pd.Interval(10, 14, closed="neither"),
-    "30B": pd.Interval(25, 35, closed="neither"),
-    "65B": pd.Interval(60, 70, closed="neither"),
-}
-
-
-def _filter_by_type_and_size(df, model_type, size_interval):
-    """Filter DataFrame by model type and parameter size interval."""
-    type_emoji = model_type.value.symbol[0]
-    filtered_df = df[df[AutoEvalColumn.model_type_symbol.name] == type_emoji]
-    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
-    mask = params_column.apply(lambda x: x in size_interval)
-    return filtered_df.loc[mask]
-
-
-def _add_models_to_collection(collection, models, model_type, size):
-    """Add best models to the collection and update positions."""
-    cur_len_collection = len(collection.items)
-    for ix, model in enumerate(models, start=1):
-        try:
-            collection = add_collection_item(
-                PATH_TO_COLLECTION,
-                item_id=model,
-                item_type="model",
-                exists_ok=True,
-                note=f"Best {model_type.to_str(' ')} model of around {size} on the leaderboard today!",
-                token=H4_TOKEN,
-            )
-            # Ensure position is correct if item was added
-            if len(collection.items) > cur_len_collection:
-                item_object_id = collection.items[-1].item_object_id
-                update_collection_item(collection_slug=PATH_TO_COLLECTION, item_object_id=item_object_id, position=ix)
-                cur_len_collection = len(collection.items)
-            break  # assuming we only add the top model
-        except HfHubHTTPError:
-            continue
-
-
-def update_collections(df: DataFrame):
-    """Update collections by filtering and adding the best models."""
-    collection = get_collection(collection_slug=PATH_TO_COLLECTION, token=H4_TOKEN)
-    cur_best_models = []
-
-    for model_type in ModelType:
-        if not model_type.value.name:
-            continue
-        for size, interval in intervals.items():
-            filtered_df = _filter_by_type_and_size(df, model_type, interval)
-            best_models = list(
-                filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)[AutoEvalColumn.fullname.name][:10]
-            )
-            print(model_type.value.symbol, size, best_models)
-            _add_models_to_collection(collection, best_models, model_type, size)
-            cur_best_models.extend(best_models)
-
-    # Cleanup
-    existing_models = {item.item_id for item in collection.items}
-    to_remove = existing_models - set(cur_best_models)
-    for item_id in to_remove:
-        try:
-            delete_collection_item(collection_slug=PATH_TO_COLLECTION, item_object_id=item_id, token=H4_TOKEN)
-        except HfHubHTTPError:
-            continue
src/{scripts β†’ tools}/create_request_file.py RENAMED
File without changes
src/tools/model_backlinks.py CHANGED
@@ -630,7 +630,7 @@ models = [
     "WizardLM/WizardMath-7B-V1.0",
     "Norquinal/llama-2-7b-claude-chat",
     "TheTravellingEngineer/llama2-7b-chat-hf-dpo",
-    "HuggingFaceH4/starchat-beta",
+    "open-llm-leaderboard/starchat-beta",
     "joehuangx/spatial-vicuna-7b-v1.5-LoRA",
     "conceptofmind/LLongMA-2-13b-16k",
     "tianyil1/denas-llama2",
@@ -1039,7 +1039,7 @@ models = [
     "bhenrym14/airoboros-33b-gpt4-1.4.1-PI-8192-fp16",
     "EleutherAI/gpt-neo-2.7B",
     "danielhanchen/open_llama_3b_600bt_preview",
-    "HuggingFaceH4/starchat-alpha",
+    "open-llm-leaderboard/starchat-alpha",
     "pythainlp/wangchanglm-7.5B-sft-en-sharded",
     "beaugogh/pythia-1.4b-deduped-sharegpt",
     "HWERI/pythia-1.4b-deduped-sharegpt",
src/tools/plots.py CHANGED
@@ -6,10 +6,9 @@ from plotly.graph_objs import Figure
 from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
 from src.display.utils import human_baseline_row as HUMAN_BASELINE
 from src.leaderboard.filter_models import FLAGGED_MODELS
-from src.leaderboard.read_evals import EvalResult


-def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
+def create_scores_df(results_df: list[dict]) -> pd.DataFrame:
     """
     Generates a DataFrame containing the maximum scores until each date.

@@ -17,8 +16,7 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
     :return: A new DataFrame containing the maximum scores until each date for every metric.
     """
     # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
-    results_df = pd.DataFrame(raw_data)
-    # results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
+    results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
     results_df.sort_values(by="date", inplace=True)

     # Step 2: Initialize the scores dictionary
@@ -30,22 +28,18 @@
         last_date = ""
         column = task.col_name
         for _, row in results_df.iterrows():
-            current_model = row["full_model"]
+            current_model = row[AutoEvalColumn.fullname.name]
             # We ignore models that are flagged/no longer on the hub/not finished
             to_ignore = (
-                not row["still_on_hub"]
-                or not row["not_flagged"]
+                not row[AutoEvalColumn.still_on_hub.name]
+                or not row[AutoEvalColumn.not_flagged.name]
                 or current_model in FLAGGED_MODELS
-                or row["status"] != "FINISHED"
             )
             if to_ignore:
                 continue

-            current_date = row["date"]
-            if task.benchmark == "Average":
-                current_score = np.mean(list(row["results"].values()))
-            else:
-                current_score = row["results"][task.benchmark]
+            current_date = row[AutoEvalColumn.date.name]
+            current_score = row[task.col_name]

             if current_score > current_max:
                 if current_date == last_date and len(scores[column]) > 0:
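`create_scores_df` now operates on the display DataFrame instead of `EvalResult` objects, so it can be chained with `create_plot_df` straight from the leaderboard table. A minimal sketch of that pairing, reusing the `leaderboard_df` from the populate.py sketch above (the exact call site is not part of this diff):

from src.tools.plots import create_plot_df, create_scores_df

# leaderboard_df: the DataFrame returned by get_leaderboard_df (see the earlier sketch);
# it must include the date, fullname, still_on_hub and not_flagged columns for the filtering above.
plot_df = create_plot_df(create_scores_df(leaderboard_df))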