Clémentine committed on
Commit
9833cdb
·
1 Parent(s): d084b26

Simplified leaderboard v0

Browse files
app.py CHANGED
@@ -25,16 +25,9 @@ from src.display.utils import (
25
  WeightType,
26
  Precision
27
  )
28
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
29
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
30
  from src.submission.submit import add_new_eval
31
- from src.submission.check_validity import already_submitted_models
32
- from src.tools.collections import update_collections
33
- from src.tools.plots import (
34
- create_metric_plot_obj,
35
- create_plot_df,
36
- create_scores_df,
37
- )
38
 
39
 
40
  def restart_space():
@@ -57,11 +50,8 @@ except Exception:
57
 
58
 
59
  raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
60
- update_collections(original_df.copy())
61
  leaderboard_df = original_df.copy()
62
 
63
- plot_df = create_plot_df(create_scores_df(raw_data))
64
-
65
  (
66
  finished_eval_queue_df,
67
  running_eval_queue_df,
@@ -251,22 +241,6 @@ with demo:
251
  queue=True,
252
  )
253
 
254
- with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=4):
255
- with gr.Row():
256
- with gr.Column():
257
- chart = create_metric_plot_obj(
258
- plot_df,
259
- [AutoEvalColumn.average.name],
260
- title="Average of Top Scores and Human Baseline Over Time (from last update)",
261
- )
262
- gr.Plot(value=chart, min_width=500)
263
- with gr.Column():
264
- chart = create_metric_plot_obj(
265
- plot_df,
266
- BENCHMARK_COLS,
267
- title="Top Scores and Human Baseline Over Time (from last update)",
268
- )
269
- gr.Plot(value=chart, min_width=500)
270
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
271
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
272
 
@@ -317,7 +291,6 @@ with demo:
317
  with gr.Column():
318
  model_name_textbox = gr.Textbox(label="Model name")
319
  revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
320
- private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
321
  model_type = gr.Dropdown(
322
  choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
323
  label="Model type",
@@ -352,7 +325,6 @@ with demo:
352
  base_model_name_textbox,
353
  revision_name_textbox,
354
  precision,
355
- private,
356
  weight_type,
357
  model_type,
358
  ],
 
25
  WeightType,
26
  Precision
27
  )
28
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO
29
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
30
  from src.submission.submit import add_new_eval
 
 
 
 
 
 
 
31
 
32
 
33
  def restart_space():
 
50
 
51
 
52
  raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
53
  leaderboard_df = original_df.copy()
54
 
 
 
55
  (
56
  finished_eval_queue_df,
57
  running_eval_queue_df,
 
241
  queue=True,
242
  )
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
245
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
246
 
 
291
  with gr.Column():
292
  model_name_textbox = gr.Textbox(label="Model name")
293
  revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
 
294
  model_type = gr.Dropdown(
295
  choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
296
  label="Model type",
 
325
  base_model_name_textbox,
326
  revision_name_textbox,
327
  precision,
 
328
  weight_type,
329
  model_type,
330
  ],
requirements.txt CHANGED
@@ -5,15 +5,11 @@ datasets==2.14.5
5
  gradio==4.4.0
6
  gradio_client==0.7.0
7
  huggingface-hub>=0.18.0
8
- markdown-it-py==2.2.0
9
- MarkupSafe==2.1.2
10
  matplotlib==3.7.1
11
  numpy==1.24.2
12
  pandas==2.0.0
13
- plotly==5.14.1
14
  python-dateutil==2.8.2
15
  requests==2.28.2
16
- semantic-version==2.10.0
17
  tqdm==4.65.0
18
  transformers==4.35.2
19
  tokenizers>=0.15.0
 
5
  gradio==4.4.0
6
  gradio_client==0.7.0
7
  huggingface-hub>=0.18.0
 
 
8
  matplotlib==3.7.1
9
  numpy==1.24.2
10
  pandas==2.0.0
 
11
  python-dateutil==2.8.2
12
  requests==2.28.2
 
13
  tqdm==4.65.0
14
  transformers==4.35.2
15
  tokenizers>=0.15.0
src/display/about.py CHANGED
@@ -1,85 +1,34 @@
1
  from src.display.utils import ModelType
2
 
3
- TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
 
4
 
 
5
  INTRODUCTION_TEXT = """
6
- 📐 The 🤗 Open LLM Leaderboard aims to track, rank and evaluate open LLMs and chatbots.
7
-
8
- 🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
9
- The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
10
  """
11
 
 
12
  LLM_BENCHMARKS_TEXT = f"""
13
- Useful links: [FAQ](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/179), [Community resources](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174), [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03).
14
-
15
- # Context
16
- With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
17
-
18
- ## Icons
19
- - {ModelType.PT.to_str(" : ")} model: new, base models, trained on a given corpora
20
- - {ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
21
- Specific fine-tune subcategories (more adapted to chat):
22
- - {ModelType.IFT.to_str(" : ")} model: instruction fine-tunes, which are model fine-tuned specifically on datasets of task instruction
23
- - {ModelType.RL.to_str(" : ")} model: reinforcement fine-tunes, which usually change the model loss a bit with an added policy.
24
- If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
25
-
26
- "Flagged" indicates that this model has been flagged by the community, and should probably be ignored! Clicking the link will redirect you to the discussion about the model.
27
- (For ex, the model was trained on the evaluation data, and is therefore cheating on the leaderboard.)
28
-
29
  ## How it works
30
 
31
- 📈 We evaluate models on 7 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
32
-
33
- - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
34
- - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
35
- - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
36
- - <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model's propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is actually a minima a 6-shots task, as it is prepended by 6 examples systematically, even when launched using 0 for the number of few-shot examples.
37
- - <a href="https://arxiv.org/abs/1907.10641" target="_blank"> Winogrande </a> (5-shot) - an adversarial and difficult Winograd benchmark at scale, for commonsense reasoning.
38
- - <a href="https://arxiv.org/abs/2110.14168" target="_blank"> GSM8k </a> (5-shot) - diverse grade school math word problems to measure a model's ability to solve multi-step mathematical reasoning problems.
39
- - <a href="https://arxiv.org/abs/1903.00161" target="_blank"> DROP </a> (3-shot) - English reading comprehension benchmark requiring Discrete Reasoning Over the content of Paragraphs.
40
-
41
- For all these evaluations, a higher score is a better score.
42
- We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
43
-
44
- ## Details and logs
45
- You can find:
46
- - detailed numerical results in the `results` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/results
47
- - details on the input/outputs for the models in the `details` of each model, that you can access by clicking the 📄 emoji after the model name
48
- - community queries and running status in the `requests` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/requests
49
-
50
  ## Reproducibility
51
- To reproduce our results, here is the commands you can run, using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463) of the Eleuther AI Harness:
52
- `python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
53
- ` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=2 --output_path=<output_path>`
54
-
55
- The total batch size we get for models which fit on one A100 node is 16 (8 GPUs * 2). If you don't use parallelism, adapt your batch size to fit.
56
- *You can expect results to vary slightly for different batch sizes because of padding.*
57
-
58
- The tasks and few shots parameters are:
59
- - ARC: 25-shot, *arc-challenge* (`acc_norm`)
60
- - HellaSwag: 10-shot, *hellaswag* (`acc_norm`)
61
- - TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
62
- - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
63
- - Winogrande: 5-shot, *winogrande* (`acc`)
64
- - GSM8k: 5-shot, *gsm8k* (`acc`)
65
- - DROP: 3-shot, *drop* (`f1`)
66
-
67
- Side note on the baseline scores:
68
- - for log-likelihood evaluation, we select the random baseline
69
- - for DROP, we select the best submission score according to [their leaderboard](https://leaderboard.allenai.org/drop/submissions/public) when the paper came out (NAQANet score)
70
- - for GSM8K, we select the score obtained in the paper after finetuning a 6B model on the full GSM8K training set for 50 epochs
71
 
72
  ## Quantization
73
  To get more information about quantization, see:
74
  - 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
75
  - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
 
 
 
 
 
 
 
 
76
  """
77
 
78
  EVALUATION_QUEUE_TEXT = """
79
- # Evaluation Queue for the 🤗 Open LLM Leaderboard
80
-
81
- Models added here will be automatically evaluated on the 🤗 cluster.
82
-
83
  ## Some good practices before submitting a model
84
 
85
  ### 1) Make sure you can load your model and tokenizer using AutoClasses:
@@ -111,109 +60,4 @@ If everything is done, check you can launch the EleutherAIHarness on your model
111
 
112
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
113
  CITATION_BUTTON_TEXT = r"""
114
- @misc{open-llm-leaderboard,
115
- author = {Edward Beeching and Clémentine Fourrier and Nathan Habib and Sheon Han and Nathan Lambert and Nazneen Rajani and Omar Sanseviero and Lewis Tunstall and Thomas Wolf},
116
- title = {Open LLM Leaderboard},
117
- year = {2023},
118
- publisher = {Hugging Face},
119
- howpublished = "\url{https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard}"
120
- }
121
- @software{eval-harness,
122
- author = {Gao, Leo and
123
- Tow, Jonathan and
124
- Biderman, Stella and
125
- Black, Sid and
126
- DiPofi, Anthony and
127
- Foster, Charles and
128
- Golding, Laurence and
129
- Hsu, Jeffrey and
130
- McDonell, Kyle and
131
- Muennighoff, Niklas and
132
- Phang, Jason and
133
- Reynolds, Laria and
134
- Tang, Eric and
135
- Thite, Anish and
136
- Wang, Ben and
137
- Wang, Kevin and
138
- Zou, Andy},
139
- title = {A framework for few-shot language model evaluation},
140
- month = sep,
141
- year = 2021,
142
- publisher = {Zenodo},
143
- version = {v0.0.1},
144
- doi = {10.5281/zenodo.5371628},
145
- url = {https://doi.org/10.5281/zenodo.5371628}
146
- }
147
- @misc{clark2018think,
148
- title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
149
- author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
150
- year={2018},
151
- eprint={1803.05457},
152
- archivePrefix={arXiv},
153
- primaryClass={cs.AI}
154
- }
155
- @misc{zellers2019hellaswag,
156
- title={HellaSwag: Can a Machine Really Finish Your Sentence?},
157
- author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
158
- year={2019},
159
- eprint={1905.07830},
160
- archivePrefix={arXiv},
161
- primaryClass={cs.CL}
162
- }
163
- @misc{hendrycks2021measuring,
164
- title={Measuring Massive Multitask Language Understanding},
165
- author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
166
- year={2021},
167
- eprint={2009.03300},
168
- archivePrefix={arXiv},
169
- primaryClass={cs.CY}
170
- }
171
- @misc{lin2022truthfulqa,
172
- title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
173
- author={Stephanie Lin and Jacob Hilton and Owain Evans},
174
- year={2022},
175
- eprint={2109.07958},
176
- archivePrefix={arXiv},
177
- primaryClass={cs.CL}
178
- }
179
- @misc{DBLP:journals/corr/abs-1907-10641,
180
- title={{WINOGRANDE:} An Adversarial Winograd Schema Challenge at Scale},
181
- author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
182
- year={2019},
183
- eprint={1907.10641},
184
- archivePrefix={arXiv},
185
- primaryClass={cs.CL}
186
- }
187
- @misc{DBLP:journals/corr/abs-2110-14168,
188
- title={Training Verifiers to Solve Math Word Problems},
189
- author={Karl Cobbe and
190
- Vineet Kosaraju and
191
- Mohammad Bavarian and
192
- Mark Chen and
193
- Heewoo Jun and
194
- Lukasz Kaiser and
195
- Matthias Plappert and
196
- Jerry Tworek and
197
- Jacob Hilton and
198
- Reiichiro Nakano and
199
- Christopher Hesse and
200
- John Schulman},
201
- year={2021},
202
- eprint={2110.14168},
203
- archivePrefix={arXiv},
204
- primaryClass={cs.CL}
205
- }
206
- @misc{DBLP:journals/corr/abs-1903-00161,
207
- title={{DROP:} {A} Reading Comprehension Benchmark Requiring Discrete Reasoning
208
- Over Paragraphs},
209
- author={Dheeru Dua and
210
- Yizhong Wang and
211
- Pradeep Dasigi and
212
- Gabriel Stanovsky and
213
- Sameer Singh and
214
- Matt Gardner},
215
- year={2019},
216
- eprinttype={arXiv},
217
- eprint={1903.00161},
218
- primaryClass={cs.CL}
219
- }"""
 
1
  from src.display.utils import ModelType
2
 
3
+ # To complete, what is your leaderboard name
4
+ TITLE = """<h1 align="center" id="space-title">Leaderboard</h1>"""
5
 
6
+ # to complete - what does your leaderboard evaluate
7
  INTRODUCTION_TEXT = """
 
 
 
 
8
  """
9
 
10
+ # to complete - which evaluations are you running? how can people reproduce what you have?
11
  LLM_BENCHMARKS_TEXT = f"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  ## How it works
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  ## Reproducibility
15
+ To reproduce our results, here are the commands you can run:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  ## Quantization
18
  To get more information about quantization, see:
19
  - 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
20
  - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
21
+
22
+ ## Model types
23
+ - {ModelType.PT.to_str(" : ")} model: new, base models, trained on a given corpus
24
+ - {ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
25
+ Specific fine-tune subcategories (more adapted to chat):
26
+ - {ModelType.IFT.to_str(" : ")} model: instruction fine-tunes, which are models fine-tuned specifically on datasets of task instructions
27
+ - {ModelType.RL.to_str(" : ")} model: reinforcement fine-tunes, which usually change the model loss a bit with an added policy.
28
+ If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
29
  """
30
 
31
  EVALUATION_QUEUE_TEXT = """
 
 
 
 
32
  ## Some good practices before submitting a model
33
 
34
  ### 1) Make sure you can load your model and tokenizer using AutoClasses:
 
60
 
61
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
62
  CITATION_BUTTON_TEXT = r"""
63
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/display/formatting.py CHANGED
@@ -13,11 +13,7 @@ def model_hyperlink(link, model_name):
13
 
14
  def make_clickable_model(model_name):
15
  link = f"https://huggingface.co/{model_name}"
16
-
17
- details_model_name = model_name.replace("/", "__")
18
- details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
19
-
20
- return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
21
 
22
 
23
  def styled_error(error):
 
13
 
14
  def make_clickable_model(model_name):
15
  link = f"https://huggingface.co/{model_name}"
16
+ return model_hyperlink(link, model_name)
 
 
 
 
17
 
18
 
19
  def styled_error(error):
src/display/utils.py CHANGED
@@ -13,14 +13,10 @@ class Task:
13
  metric: str
14
  col_name: str
15
 
 
16
  class Tasks(Enum):
17
- arc = Task("arc:challenge", "acc_norm", "ARC")
18
- hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
19
- mmlu = Task("hendrycksTest", "acc", "MMLU")
20
- truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
21
- winogrande = Task("winogrande", "acc", "Winogrande")
22
- gsm8k = Task("gsm8k", "acc", "GSM8K")
23
- drop = Task("drop", "f1", "DROP")
24
 
25
  # These classes are for user facing column names,
26
  # to avoid having to change them all around the code
@@ -67,44 +63,20 @@ class EvalQueueColumn: # Queue column
67
  weight_type = ColumnContent("weight_type", "str", "Original")
68
  status = ColumnContent("status", "str", True)
69
 
70
-
71
  baseline_row = {
72
  AutoEvalColumn.model.name: "<p>Baseline</p>",
73
  AutoEvalColumn.revision.name: "N/A",
74
  AutoEvalColumn.precision.name: None,
75
- AutoEvalColumn.average.name: 31.0,
76
- AutoEvalColumn.arc.name: 25.0,
77
- AutoEvalColumn.hellaswag.name: 25.0,
78
- AutoEvalColumn.mmlu.name: 25.0,
79
- AutoEvalColumn.truthfulqa.name: 25.0,
80
- AutoEvalColumn.winogrande.name: 50.0,
81
- AutoEvalColumn.gsm8k.name: 0.21,
82
- AutoEvalColumn.drop.name: 0.47,
83
  AutoEvalColumn.dummy.name: "baseline",
84
  AutoEvalColumn.model_type.name: "",
85
  }
86
 
87
- # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
88
- # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
89
- # HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
90
- # MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
91
- # TruthfulQA human baseline is 0.94(source: https://arxiv.org/pdf/2109.07958.pdf)
92
- # Drop: https://leaderboard.allenai.org/drop/submissions/public
93
- # Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
94
- # GSM8K: paper
95
- # Define the human baselines
96
  human_baseline_row = {
97
  AutoEvalColumn.model.name: "<p>Human performance</p>",
98
  AutoEvalColumn.revision.name: "N/A",
99
  AutoEvalColumn.precision.name: None,
100
- AutoEvalColumn.average.name: 92.75,
101
- AutoEvalColumn.arc.name: 80.0,
102
- AutoEvalColumn.hellaswag.name: 95.0,
103
- AutoEvalColumn.mmlu.name: 89.8,
104
- AutoEvalColumn.truthfulqa.name: 94.0,
105
- AutoEvalColumn.winogrande.name: 94.0,
106
- AutoEvalColumn.gsm8k.name: 100,
107
- AutoEvalColumn.drop.name: 96.42,
108
  AutoEvalColumn.dummy.name: "human_baseline",
109
  AutoEvalColumn.model_type.name: "",
110
  }
@@ -112,7 +84,8 @@ human_baseline_row = {
112
  @dataclass
113
  class ModelDetails:
114
  name: str
115
- symbol: str = "" # emoji, only for the model type
 
116
 
117
 
118
  class ModelType(Enum):
@@ -162,9 +135,6 @@ class Precision(Enum):
162
  if precision in ["GPTQ", "None"]:
163
  return Precision.qt_GPTQ
164
  return Precision.Unknown
165
-
166
-
167
-
168
 
169
  # Column selection
170
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
13
  metric: str
14
  col_name: str
15
 
16
+ # Init: to update with your specific keys
17
  class Tasks(Enum):
18
+ task0 = Task("Key in the harness", "metric in the harness", "display name")
19
+ task1 = Task("Key in the harness", "metric in the harness", "display name")
 
 
 
 
 
20
 
21
  # These classes are for user facing column names,
22
  # to avoid having to change them all around the code
 
63
  weight_type = ColumnContent("weight_type", "str", "Original")
64
  status = ColumnContent("status", "str", True)
65
 
 
66
  baseline_row = {
67
  AutoEvalColumn.model.name: "<p>Baseline</p>",
68
  AutoEvalColumn.revision.name: "N/A",
69
  AutoEvalColumn.precision.name: None,
70
+ AutoEvalColumn.average.name: 0,
 
 
 
 
 
 
 
71
  AutoEvalColumn.dummy.name: "baseline",
72
  AutoEvalColumn.model_type.name: "",
73
  }
74
 
 
 
 
 
 
 
 
 
 
75
  human_baseline_row = {
76
  AutoEvalColumn.model.name: "<p>Human performance</p>",
77
  AutoEvalColumn.revision.name: "N/A",
78
  AutoEvalColumn.precision.name: None,
79
+ AutoEvalColumn.average.name: 0,
 
 
 
 
 
 
 
80
  AutoEvalColumn.dummy.name: "human_baseline",
81
  AutoEvalColumn.model_type.name: "",
82
  }
 
84
  @dataclass
85
  class ModelDetails:
86
  name: str
87
+ display_name: str = ""
88
+ symbol: str = "" # emoji
89
 
90
 
91
  class ModelType(Enum):
 
135
  if precision in ["GPTQ", "None"]:
136
  return Precision.qt_GPTQ
137
  return Precision.Unknown
 
 
 
138
 
139
  # Column selection
140
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
src/envs.py CHANGED
@@ -5,28 +5,15 @@ from huggingface_hub import HfApi
5
  # clone / pull the lmeh eval data
6
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
7
 
8
- REPO_ID = "HuggingFaceH4/open_llm_leaderboard"
9
- QUEUE_REPO = "open-llm-leaderboard/requests"
10
- RESULTS_REPO = "open-llm-leaderboard/results"
11
-
12
- PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
13
- PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
14
-
15
- IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
16
 
17
  CACHE_PATH=os.getenv("HF_HOME", ".")
18
 
 
19
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
20
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
21
 
22
- EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
23
- EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
24
-
25
- PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03"
26
-
27
- # Rate limit variables
28
- RATE_LIMIT_PERIOD = 7
29
- RATE_LIMIT_QUOTA = 5
30
- HAS_HIGHER_RATE_LIMIT = ["TheBloke"]
31
-
32
  API = HfApi(token=H4_TOKEN)
 
5
  # clone / pull the lmeh eval data
6
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
7
 
8
+ OWNER = "clefourrier"
9
+ REPO_ID = f"{OWNER}/leaderboard"
10
+ QUEUE_REPO = f"{OWNER}/requests"
11
+ RESULTS_REPO = f"{OWNER}/results"
 
 
 
 
12
 
13
  CACHE_PATH=os.getenv("HF_HOME", ".")
14
 
15
+ # Local caches
16
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
17
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
18
 
 
 
 
 
 
 
 
 
 
 
19
  API = HfApi(token=H4_TOKEN)
src/leaderboard/read_evals.py CHANGED
@@ -72,23 +72,8 @@ class EvalResult:
72
  results = {}
73
  for task in Tasks:
74
  task = task.value
75
- # We skip old mmlu entries
76
- wrong_mmlu_version = False
77
- if task.benchmark == "hendrycksTest":
78
- for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
79
- if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
80
- wrong_mmlu_version = True
81
-
82
- if wrong_mmlu_version:
83
- continue
84
-
85
- # Some truthfulQA values are NaNs
86
- if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
87
- if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
88
- results[task.benchmark] = 0.0
89
- continue
90
 
91
- # We average all scores of a given metric (mostly for mmlu)
92
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
93
  if accs.size == 0 or any([acc is None for acc in accs]):
94
  continue
 
72
  results = {}
73
  for task in Tasks:
74
  task = task.value
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
+ # We average all scores of a given metric
77
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
78
  if accs.size == 0 or any([acc is None for acc in accs]):
79
  continue
src/submission/check_validity.py CHANGED
@@ -10,13 +10,8 @@ from huggingface_hub.hf_api import ModelInfo
10
  from transformers import AutoConfig
11
  from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config
12
 
13
- from src.envs import HAS_HIGHER_RATE_LIMIT
14
-
15
-
16
- # ht to @Wauplin, thank you for the snippet!
17
- # See https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/317
18
  def check_model_card(repo_id: str) -> tuple[bool, str]:
19
- # Returns operation status, and error message
20
  try:
21
  card = ModelCard.load(repo_id)
22
  except huggingface_hub.utils.EntryNotFoundError:
@@ -38,6 +33,7 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
38
 
39
 
40
  def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
 
41
  try:
42
  config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
43
  if test_tokenizer:
@@ -69,47 +65,20 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
69
 
70
 
71
  def get_model_size(model_info: ModelInfo, precision: str):
72
- size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
73
  try:
74
  model_size = round(model_info.safetensors["total"] / 1e9, 3)
75
- except (AttributeError, TypeError ):
76
- try:
77
- size_match = re.search(size_pattern, model_info.modelId.lower())
78
- model_size = size_match.group(0)
79
- model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
80
- except AttributeError:
81
- return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
82
 
83
  size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
84
  model_size = size_factor * model_size
85
  return model_size
86
 
87
  def get_model_arch(model_info: ModelInfo):
 
88
  return model_info.config.get("architectures", "Unknown")
89
 
90
- def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
91
- if org_or_user not in users_to_submission_dates:
92
- return True, ""
93
- submission_dates = sorted(users_to_submission_dates[org_or_user])
94
-
95
- time_limit = (datetime.now(timezone.utc) - timedelta(days=rate_limit_period)).strftime("%Y-%m-%dT%H:%M:%SZ")
96
- submissions_after_timelimit = [d for d in submission_dates if d > time_limit]
97
-
98
- num_models_submitted_in_period = len(submissions_after_timelimit)
99
- if org_or_user in HAS_HIGHER_RATE_LIMIT:
100
- rate_limit_quota = 2 * rate_limit_quota
101
-
102
- if num_models_submitted_in_period > rate_limit_quota:
103
- error_msg = f"Organisation or user `{org_or_user}`"
104
- error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
105
- error_msg += f"in the last {rate_limit_period} days.\n"
106
- error_msg += (
107
- "Please wait a couple of days before resubmitting, so that everybody can enjoy using the leaderboard 🤗"
108
- )
109
- return False, error_msg
110
- return True, ""
111
-
112
-
113
  def already_submitted_models(requested_models_dir: str) -> set[str]:
114
  depth = 1
115
  file_names = []
 
10
  from transformers import AutoConfig
11
  from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config
12
 
 
 
 
 
 
13
  def check_model_card(repo_id: str) -> tuple[bool, str]:
14
+ """Checks if the model card and license exist and have been filled"""
15
  try:
16
  card = ModelCard.load(repo_id)
17
  except huggingface_hub.utils.EntryNotFoundError:
 
33
 
34
 
35
  def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
36
+ """Makes sure the model is on the hub, and uses a valid configuration (in the latest transformers version)"""
37
  try:
38
  config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
39
  if test_tokenizer:
 
65
 
66
 
67
  def get_model_size(model_info: ModelInfo, precision: str):
68
+ """Gets the model size from the safetensors metadata (total / 1e9), or 0 if that metadata is unavailable."""
69
  try:
70
  model_size = round(model_info.safetensors["total"] / 1e9, 3)
71
+ except (AttributeError, TypeError):
72
+ return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
 
 
 
 
 
73
 
74
  size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
75
  model_size = size_factor * model_size
76
  return model_size
77
 
78
  def get_model_arch(model_info: ModelInfo):
79
+ """Gets the model architecture from the configuration"""
80
  return model_info.config.get("architectures", "Unknown")
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  def already_submitted_models(requested_models_dir: str) -> set[str]:
83
  depth = 1
84
  file_names = []
src/submission/submit.py CHANGED
@@ -3,14 +3,12 @@ import os
3
  from datetime import datetime, timezone
4
 
5
  from src.display.formatting import styled_error, styled_message, styled_warning
6
- from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
7
- from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
8
  from src.submission.check_validity import (
9
  already_submitted_models,
10
  check_model_card,
11
  get_model_size,
12
  is_model_on_hub,
13
- user_submission_permission,
14
  )
15
 
16
  REQUESTED_MODELS = None
@@ -21,7 +19,6 @@ def add_new_eval(
21
  base_model: str,
22
  revision: str,
23
  precision: str,
24
- private: bool,
25
  weight_type: str,
26
  model_type: str,
27
  ):
@@ -42,18 +39,6 @@ def add_new_eval(
42
  if model_type is None or model_type == "":
43
  return styled_error("Please select a model type.")
44
 
45
- # Is the user rate limited?
46
- if user_name != "":
47
- user_can_submit, error_msg = user_submission_permission(
48
- user_name, USERS_TO_SUBMISSION_DATES, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
49
- )
50
- if not user_can_submit:
51
- return styled_error(error_msg)
52
-
53
- # Did the model authors forbid its submission to the leaderboard?
54
- if model in DO_NOT_SUBMIT_MODELS or base_model in DO_NOT_SUBMIT_MODELS:
55
- return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")
56
-
57
  # Does the model actually exist?
58
  if revision == "":
59
  revision = "main"
@@ -94,7 +79,6 @@ def add_new_eval(
94
  "model": model,
95
  "base_model": base_model,
96
  "revision": revision,
97
- "private": private,
98
  "precision": precision,
99
  "weight_type": weight_type,
100
  "status": "PENDING",
@@ -112,7 +96,7 @@ def add_new_eval(
112
  print("Creating eval file")
113
  OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
114
  os.makedirs(OUT_DIR, exist_ok=True)
115
- out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
116
 
117
  with open(out_path, "w") as f:
118
  f.write(json.dumps(eval_entry))
 
3
  from datetime import datetime, timezone
4
 
5
  from src.display.formatting import styled_error, styled_message, styled_warning
6
+ from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO
 
7
  from src.submission.check_validity import (
8
  already_submitted_models,
9
  check_model_card,
10
  get_model_size,
11
  is_model_on_hub,
 
12
  )
13
 
14
  REQUESTED_MODELS = None
 
19
  base_model: str,
20
  revision: str,
21
  precision: str,
 
22
  weight_type: str,
23
  model_type: str,
24
  ):
 
39
  if model_type is None or model_type == "":
40
  return styled_error("Please select a model type.")
41
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  # Does the model actually exist?
43
  if revision == "":
44
  revision = "main"
 
79
  "model": model,
80
  "base_model": base_model,
81
  "revision": revision,
 
82
  "precision": precision,
83
  "weight_type": weight_type,
84
  "status": "PENDING",
 
96
  print("Creating eval file")
97
  OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
98
  os.makedirs(OUT_DIR, exist_ok=True)
99
+ out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
100
 
101
  with open(out_path, "w") as f:
102
  f.write(json.dumps(eval_entry))