Clémentine committed on
Commit
728a44a
·
1 Parent(s): 5d28865
app.py CHANGED
@@ -8,20 +8,15 @@ import numpy as np
8
  import pandas as pd
9
  from apscheduler.schedulers.background import BackgroundScheduler
10
  from huggingface_hub import HfApi
11
- from transformers import AutoConfig
12
 
13
- from src.auto_leaderboard.get_model_metadata import apply_metadata
14
  from src.assets.text_content import *
15
  from src.elo_leaderboard.load_results import get_elo_plots, get_elo_results_dicts
16
- from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
17
- from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
18
- from src.assets.css_html_js import custom_css, get_window_url_params
19
- from src.utils_display import AutoEvalColumn, EvalQueueColumn, EloEvalColumn, fields, styled_error, styled_warning, styled_message
20
  from src.init import load_all_info_from_hub
21
 
22
  # clone / pull the lmeh eval data
23
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
24
- LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
25
  HUMAN_EVAL_REPO = "HuggingFaceH4/scale-human-eval"
26
  GPT_4_EVAL_REPO = "HuggingFaceH4/open_llm_leaderboard_oai_evals"
27
  IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
@@ -37,21 +32,7 @@ def restart_space():
37
  repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
38
  )
39
 
40
- auto_eval_repo, human_eval_repo, gpt_4_eval_repo, requested_models = load_all_info_from_hub(LMEH_REPO, HUMAN_EVAL_REPO, GPT_4_EVAL_REPO)
41
-
42
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
43
- TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
44
- COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
45
- TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
46
-
47
- if not IS_PUBLIC:
48
- COLS.insert(2, AutoEvalColumn.is_8bit.name)
49
- TYPES.insert(2, AutoEvalColumn.is_8bit.type)
50
-
51
- EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
52
- EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
53
-
54
- BENCHMARK_COLS = [c.name for c in [AutoEvalColumn.arc, AutoEvalColumn.hellaswag, AutoEvalColumn.mmlu, AutoEvalColumn.truthfulqa]]
55
 
56
  ELO_COLS = [c.name for c in fields(EloEvalColumn)]
57
  ELO_TYPES = [c.type for c in fields(EloEvalColumn)]
@@ -66,78 +47,6 @@ def has_nan_values(df, columns):
66
  return df[columns].isna().any(axis=1)
67
 
68
 
69
- def get_leaderboard_df():
70
- if auto_eval_repo:
71
- print("Pulling evaluation results for the leaderboard.")
72
- auto_eval_repo.git_pull()
73
-
74
- all_data = get_eval_results_dicts(IS_PUBLIC)
75
-
76
- if not IS_PUBLIC:
77
- all_data.append(gpt4_values)
78
- all_data.append(gpt35_values)
79
-
80
- all_data.append(baseline)
81
- apply_metadata(all_data) # Populate model type based on known hardcoded values in `metadata.py`
82
-
83
- df = pd.DataFrame.from_records(all_data)
84
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
85
- df = df[COLS]
86
-
87
- # filter out if any of the benchmarks have not been produced
88
- df = df[has_no_nan_values(df, BENCHMARK_COLS)]
89
- return df
90
-
91
-
92
- def get_evaluation_queue_df():
93
- # todo @saylortwift: replace the repo by the one you created for the eval queue
94
- if auto_eval_repo:
95
- print("Pulling changes for the evaluation queue.")
96
- auto_eval_repo.git_pull()
97
-
98
- entries = [
99
- entry
100
- for entry in os.listdir(EVAL_REQUESTS_PATH)
101
- if not entry.startswith(".")
102
- ]
103
- all_evals = []
104
-
105
- for entry in entries:
106
- if ".json" in entry:
107
- file_path = os.path.join(EVAL_REQUESTS_PATH, entry)
108
- with open(file_path) as fp:
109
- data = json.load(fp)
110
-
111
- data["# params"] = "unknown"
112
- data["model"] = make_clickable_model(data["model"])
113
- data["revision"] = data.get("revision", "main")
114
-
115
- all_evals.append(data)
116
- else:
117
- # this is a folder
118
- sub_entries = [
119
- e
120
- for e in os.listdir(f"{EVAL_REQUESTS_PATH}/{entry}")
121
- if not e.startswith(".")
122
- ]
123
- for sub_entry in sub_entries:
124
- file_path = os.path.join(EVAL_REQUESTS_PATH, entry, sub_entry)
125
- with open(file_path) as fp:
126
- data = json.load(fp)
127
-
128
- # data["# params"] = get_n_params(data["model"])
129
- data["model"] = make_clickable_model(data["model"])
130
- all_evals.append(data)
131
-
132
- pending_list = [e for e in all_evals if e["status"] == "PENDING"]
133
- running_list = [e for e in all_evals if e["status"] == "RUNNING"]
134
- finished_list = [e for e in all_evals if e["status"] == "FINISHED"]
135
- df_pending = pd.DataFrame.from_records(pending_list)
136
- df_running = pd.DataFrame.from_records(running_list)
137
- df_finished = pd.DataFrame.from_records(finished_list)
138
- return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
139
-
140
-
141
  def get_elo_leaderboard(df_instruct, df_code_instruct, tie_allowed=False):
142
  if human_eval_repo:
143
  print("Pulling human_eval_repo changes")
@@ -173,14 +82,6 @@ def get_elo_elements():
173
  plot_4,
174
  )
175
 
176
-
177
- original_df = get_leaderboard_df()
178
- leaderboard_df = original_df.copy()
179
- (
180
- finished_eval_queue_df,
181
- running_eval_queue_df,
182
- pending_eval_queue_df,
183
- ) = get_evaluation_queue_df()
184
  (
185
  elo_leaderboard,
186
  elo_leaderboard_with_tie_allowed,
@@ -191,309 +92,46 @@ leaderboard_df = original_df.copy()
191
  ) = get_elo_elements()
192
 
193
 
194
- def is_model_on_hub(model_name, revision) -> bool:
195
- try:
196
- AutoConfig.from_pretrained(model_name, revision=revision)
197
- return True, None
198
-
199
- except ValueError as e:
200
- return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard."
201
-
202
- except Exception as e:
203
- print("Could not get the model config from the hub.: \n", e)
204
- return False, "was not found on hub!"
205
-
206
-
207
- def add_new_eval(
208
- model: str,
209
- base_model: str,
210
- revision: str,
211
- is_8_bit_eval: bool,
212
- private: bool,
213
- is_delta_weight: bool,
214
- ):
215
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
216
-
217
- # check the model actually exists before adding the eval
218
- if revision == "":
219
- revision = "main"
220
-
221
- if is_delta_weight:
222
- base_model_on_hub, error = is_model_on_hub(base_model, revision)
223
- if not base_model_on_hub:
224
- return styled_error(f'Base model "{base_model}" {error}')
225
-
226
- model_on_hub, error = is_model_on_hub(model, revision)
227
- if not model_on_hub:
228
- return styled_error(f'Model "{model}" {error}')
229
-
230
- print("adding new eval")
231
-
232
- eval_entry = {
233
- "model": model,
234
- "base_model": base_model,
235
- "revision": revision,
236
- "private": private,
237
- "8bit_eval": is_8_bit_eval,
238
- "is_delta_weight": is_delta_weight,
239
- "status": "PENDING",
240
- "submitted_time": current_time,
241
- }
242
-
243
- user_name = ""
244
- model_path = model
245
- if "/" in model:
246
- user_name = model.split("/")[0]
247
- model_path = model.split("/")[1]
248
-
249
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
250
- os.makedirs(OUT_DIR, exist_ok=True)
251
- out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
252
-
253
- # Check for duplicate submission
254
- if out_path.split("eval_requests/")[1].lower() in requested_models:
255
- return styled_warning("This model has been already submitted.")
256
-
257
- with open(out_path, "w") as f:
258
- f.write(json.dumps(eval_entry))
259
-
260
- api.upload_file(
261
- path_or_fileobj=out_path,
262
- path_in_repo=out_path,
263
- repo_id=LMEH_REPO,
264
- token=H4_TOKEN,
265
- repo_type="dataset",
266
- )
267
-
268
- return styled_message("Your request has been submitted to the evaluation queue!")
269
-
270
-
271
- def refresh():
272
- leaderboard_df = get_leaderboard_df()
273
- (
274
- finished_eval_queue_df,
275
- running_eval_queue_df,
276
- pending_eval_queue_df,
277
- ) = get_evaluation_queue_df()
278
- return (
279
- leaderboard_df,
280
- finished_eval_queue_df,
281
- running_eval_queue_df,
282
- pending_eval_queue_df,
283
- )
284
-
285
-
286
- def search_table(df, query):
287
- filtered_df = df[df[AutoEvalColumn.dummy.name].str.contains(query, case=False)]
288
- return filtered_df
289
-
290
-
291
- def change_tab(query_param):
292
- query_param = query_param.replace("'", '"')
293
- query_param = json.loads(query_param)
294
-
295
- if (
296
- isinstance(query_param, dict)
297
- and "tab" in query_param
298
- and query_param["tab"] == "evaluation"
299
- ):
300
- return gr.Tabs.update(selected=1)
301
- else:
302
- return gr.Tabs.update(selected=0)
303
-
304
-
305
  demo = gr.Blocks(css=custom_css)
306
  with demo:
307
  gr.HTML(TITLE)
308
  with gr.Row():
309
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
310
 
311
- with gr.Row():
312
- with gr.Column():
313
- with gr.Accordion("📙 Citation", open=False):
314
- citation_button = gr.Textbox(
315
- value=CITATION_BUTTON_TEXT,
316
- label=CITATION_BUTTON_LABEL,
317
- elem_id="citation-button",
318
- ).style(show_copy_button=True)
319
- with gr.Column():
320
- with gr.Accordion("✨ CHANGELOG", open=False):
321
- changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
 
323
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
324
- with gr.TabItem("📊 LLM Benchmarks", elem_id="llm-benchmark-tab-table", id=0):
325
- with gr.Column():
326
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
327
- with gr.Box(elem_id="search-bar-table-box"):
328
- search_bar = gr.Textbox(
329
- placeholder="🔍 Search your model and press ENTER...",
330
- show_label=False,
331
- elem_id="search-bar",
332
- )
333
- with gr.Tabs(elem_classes="tab-buttons"):
334
- with gr.TabItem("Light View"):
335
- leaderboard_table_lite = gr.components.Dataframe(
336
- value=leaderboard_df[COLS_LITE],
337
- headers=COLS_LITE,
338
- datatype=TYPES_LITE,
339
- max_rows=None,
340
- elem_id="leaderboard-table-lite",
341
- )
342
- with gr.TabItem("Extended Model View"):
343
- leaderboard_table = gr.components.Dataframe(
344
- value=leaderboard_df,
345
- headers=COLS,
346
- datatype=TYPES,
347
- max_rows=None,
348
- elem_id="leaderboard-table",
349
- )
350
-
351
- # Dummy leaderboard for handling the case when the user uses backspace key
352
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
353
- value=original_df,
354
- headers=COLS,
355
- datatype=TYPES,
356
- max_rows=None,
357
- visible=False,
358
- )
359
- search_bar.submit(
360
- search_table,
361
- [hidden_leaderboard_table_for_search, search_bar],
362
- leaderboard_table,
363
- )
364
-
365
- # Dummy leaderboard for handling the case when the user uses backspace key
366
- hidden_leaderboard_table_for_search_lite = gr.components.Dataframe(
367
- value=original_df[COLS_LITE],
368
- headers=COLS_LITE,
369
- datatype=TYPES_LITE,
370
- max_rows=None,
371
- visible=False,
372
- )
373
- search_bar.submit(
374
- search_table,
375
- [hidden_leaderboard_table_for_search_lite, search_bar],
376
- leaderboard_table_lite,
377
- )
378
-
379
- with gr.Row():
380
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
381
-
382
- with gr.Accordion("✅ Finished Evaluations", open=False):
383
- with gr.Row():
384
- finished_eval_table = gr.components.Dataframe(
385
- value=finished_eval_queue_df,
386
- headers=EVAL_COLS,
387
- datatype=EVAL_TYPES,
388
- max_rows=5,
389
- )
390
- with gr.Accordion("🔄 Running Evaluation Queue", open=False):
391
- with gr.Row():
392
- running_eval_table = gr.components.Dataframe(
393
- value=running_eval_queue_df,
394
- headers=EVAL_COLS,
395
- datatype=EVAL_TYPES,
396
- max_rows=5,
397
- )
398
-
399
- with gr.Accordion("⏳ Pending Evaluation Queue", open=False):
400
- with gr.Row():
401
- pending_eval_table = gr.components.Dataframe(
402
- value=pending_eval_queue_df,
403
- headers=EVAL_COLS,
404
- datatype=EVAL_TYPES,
405
- max_rows=5,
406
- )
407
-
408
- with gr.Row():
409
- refresh_button = gr.Button("Refresh")
410
- refresh_button.click(
411
- refresh,
412
- inputs=[],
413
- outputs=[
414
- leaderboard_table,
415
- finished_eval_table,
416
- running_eval_table,
417
- pending_eval_table,
418
- ],
419
- )
420
- with gr.Accordion("Submit a new model for evaluation"):
421
- with gr.Row():
422
- with gr.Column():
423
- model_name_textbox = gr.Textbox(label="Model name")
424
- revision_name_textbox = gr.Textbox(
425
- label="revision", placeholder="main"
426
- )
427
-
428
- with gr.Column():
429
- is_8bit_toggle = gr.Checkbox(
430
- False, label="8 bit eval", visible=not IS_PUBLIC
431
- )
432
- private = gr.Checkbox(
433
- False, label="Private", visible=not IS_PUBLIC
434
- )
435
- is_delta_weight = gr.Checkbox(False, label="Delta weights")
436
- base_model_name_textbox = gr.Textbox(
437
- label="base model (for delta)"
438
- )
439
-
440
- submit_button = gr.Button("Submit Eval")
441
- submission_result = gr.Markdown()
442
- submit_button.click(
443
- add_new_eval,
444
- [
445
- model_name_textbox,
446
- base_model_name_textbox,
447
- revision_name_textbox,
448
- is_8bit_toggle,
449
- private,
450
- is_delta_weight,
451
- ],
452
- submission_result,
453
- )
454
- with gr.TabItem(
455
- "🧑‍⚖️ Human & GPT-4 Evaluations 🤖", elem_id="human-gpt-tab-table", id=1
456
- ):
457
- with gr.Row():
458
- with gr.Column(scale=2):
459
- gr.Markdown(HUMAN_GPT_EVAL_TEXT, elem_classes="markdown-text")
460
- with gr.Column(scale=1):
461
- gr.Image(
462
- "src/assets/scale-hf-logo.png", elem_id="scale-logo", show_label=False
463
- )
464
- gr.Markdown("## No tie allowed")
465
- elo_leaderboard_table = gr.components.Dataframe(
466
- value=elo_leaderboard,
467
- headers=ELO_COLS,
468
- datatype=ELO_TYPES,
469
- max_rows=5,
470
- )
471
-
472
- gr.Markdown("## Tie allowed*")
473
- elo_leaderboard_table_with_tie_allowed = gr.components.Dataframe(
474
- value=elo_leaderboard_with_tie_allowed,
475
- headers=ELO_COLS,
476
- datatype=ELO_TYPES,
477
- max_rows=5,
478
- )
479
-
480
- gr.Markdown(
481
- "\* Results when the scores of 4 and 5 were treated as ties.",
482
- elem_classes="markdown-text",
483
- )
484
-
485
- gr.Markdown(
486
- "Let us know in [this discussion](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/65) which models we should add!",
487
- elem_id="models-to-add-text",
488
- )
489
-
490
- dummy = gr.Textbox(visible=False)
491
- demo.load(
492
- change_tab,
493
- dummy,
494
- tabs,
495
- _js=get_window_url_params,
496
- )
497
  if ADD_PLOTS:
498
  with gr.Box():
499
  visualization_title = gr.HTML(VISUALIZATION_TITLE)
@@ -512,6 +150,20 @@ with demo:
512
  gr.Markdown(f"#### Figure 4: {PLOT_4_TITLE}")
513
  plot_4 = gr.Plot(plot_4, show_label=False)
514
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
515
  scheduler = BackgroundScheduler()
516
  scheduler.add_job(restart_space, "interval", seconds=3600)
517
  scheduler.start()
 
8
  import pandas as pd
9
  from apscheduler.schedulers.background import BackgroundScheduler
10
  from huggingface_hub import HfApi
 
11
 
 
12
  from src.assets.text_content import *
13
  from src.elo_leaderboard.load_results import get_elo_plots, get_elo_results_dicts
14
+ from src.assets.css_html_js import custom_css, get_window_url_params # left in case you need them
15
+ from src.utils_display import EloEvalColumn, fields, styled_error, styled_warning, styled_message
 
 
16
  from src.init import load_all_info_from_hub
17
 
18
  # clone / pull the lmeh eval data
19
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
 
20
  HUMAN_EVAL_REPO = "HuggingFaceH4/scale-human-eval"
21
  GPT_4_EVAL_REPO = "HuggingFaceH4/open_llm_leaderboard_oai_evals"
22
  IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
 
32
  repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
33
  )
34
 
35
+ human_eval_repo, gpt_4_eval_repo = load_all_info_from_hub(HUMAN_EVAL_REPO, GPT_4_EVAL_REPO)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  ELO_COLS = [c.name for c in fields(EloEvalColumn)]
38
  ELO_TYPES = [c.type for c in fields(EloEvalColumn)]
 
47
  return df[columns].isna().any(axis=1)
48
 
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def get_elo_leaderboard(df_instruct, df_code_instruct, tie_allowed=False):
51
  if human_eval_repo:
52
  print("Pulling human_eval_repo changes")
 
82
  plot_4,
83
  )
84
 
 
 
 
 
 
 
 
 
85
  (
86
  elo_leaderboard,
87
  elo_leaderboard_with_tie_allowed,
 
92
  ) = get_elo_elements()
93
 
94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  demo = gr.Blocks(css=custom_css)
96
  with demo:
97
  gr.HTML(TITLE)
98
  with gr.Row():
99
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
100
 
101
+ with gr.Column():
102
+ with gr.Row():
103
+ with gr.Column(scale=2):
104
+ gr.Markdown(HUMAN_GPT_EVAL_TEXT, elem_classes="markdown-text")
105
+ with gr.Column(scale=1):
106
+ gr.Image(
107
+ "src/assets/scale-hf-logo.png", elem_id="scale-logo", show_label=False
108
+ )
109
+ gr.Markdown("## No tie allowed")
110
+ elo_leaderboard_table = gr.components.Dataframe(
111
+ value=elo_leaderboard,
112
+ headers=ELO_COLS,
113
+ datatype=ELO_TYPES,
114
+ max_rows=5,
115
+ )
116
+
117
+ gr.Markdown("## Tie allowed*")
118
+ elo_leaderboard_table_with_tie_allowed = gr.components.Dataframe(
119
+ value=elo_leaderboard_with_tie_allowed,
120
+ headers=ELO_COLS,
121
+ datatype=ELO_TYPES,
122
+ max_rows=5,
123
+ )
124
+
125
+ gr.Markdown(
126
+ "\* Results when the scores of 4 and 5 were treated as ties.",
127
+ elem_classes="markdown-text",
128
+ )
129
+
130
+ gr.Markdown(
131
+ "Let us know in [this discussion](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/65) which models we should add!",
132
+ elem_id="models-to-add-text",
133
+ )
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  if ADD_PLOTS:
136
  with gr.Box():
137
  visualization_title = gr.HTML(VISUALIZATION_TITLE)
 
150
  gr.Markdown(f"#### Figure 4: {PLOT_4_TITLE}")
151
  plot_4 = gr.Plot(plot_4, show_label=False)
152
 
153
+ with gr.Row():
154
+ with gr.Column():
155
+ with gr.Accordion("📙 Citation", open=False):
156
+ citation_button = gr.Textbox(
157
+ value=CITATION_BUTTON_TEXT,
158
+ label=CITATION_BUTTON_LABEL,
159
+ elem_id="citation-button",
160
+ ).style(show_copy_button=True)
161
+ with gr.Column():
162
+ with gr.Accordion("✨ CHANGELOG", open=False):
163
+ changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")
164
+
165
+
166
+
167
  scheduler = BackgroundScheduler()
168
  scheduler.add_job(restart_space, "interval", seconds=3600)
169
  scheduler.start()
src/assets/hardcoded_evals.py DELETED
@@ -1,38 +0,0 @@
1
- from src.utils_display import AutoEvalColumn, model_hyperlink
2
-
3
- gpt4_values = {
4
- AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
5
- AutoEvalColumn.revision.name: "tech report",
6
- AutoEvalColumn.is_8bit.name: None,
7
- AutoEvalColumn.average.name: 84.3,
8
- AutoEvalColumn.arc.name: 96.3,
9
- AutoEvalColumn.hellaswag.name: 95.3,
10
- AutoEvalColumn.mmlu.name: 86.4,
11
- AutoEvalColumn.truthfulqa.name: 59.0,
12
- AutoEvalColumn.dummy.name: "GPT-4",
13
- }
14
-
15
- gpt35_values = {
16
- AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt3.5"),
17
- AutoEvalColumn.revision.name: "tech report",
18
- AutoEvalColumn.is_8bit.name: None,
19
- AutoEvalColumn.average.name: 71.9,
20
- AutoEvalColumn.arc.name: 85.2,
21
- AutoEvalColumn.hellaswag.name: 85.5,
22
- AutoEvalColumn.mmlu.name: 70.0,
23
- AutoEvalColumn.truthfulqa.name: 47.0,
24
- AutoEvalColumn.dummy.name: "GPT-3.5",
25
- }
26
-
27
- baseline = {
28
- AutoEvalColumn.model.name: "<p>Baseline</p>",
29
- AutoEvalColumn.revision.name: "N/A",
30
- AutoEvalColumn.is_8bit.name: None,
31
- AutoEvalColumn.average.name: 25.0,
32
- AutoEvalColumn.arc.name: 25.0,
33
- AutoEvalColumn.hellaswag.name: 25.0,
34
- AutoEvalColumn.mmlu.name: 25.0,
35
- AutoEvalColumn.truthfulqa.name: 25.0,
36
- AutoEvalColumn.dummy.name: "baseline",
37
- }
38
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/assets/text_content.py CHANGED
@@ -54,24 +54,12 @@ CHANGELOG_TEXT = f"""
54
  - Release the leaderboard to public
55
  """
56
 
57
- TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
58
 
59
  INTRODUCTION_TEXT = f"""
60
  📐 With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art. The 🤗 Open LLM Leaderboard aims to track, rank and evaluate LLMs and chatbots as they are released.
61
 
62
- 🤗 A key advantage of this leaderboard is that anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.
63
-
64
- 📈 In the **first tab (LLM Benchmarks)**, we evaluate models on 4 key benchmarks from the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks. In the **second tab (Human & GPT Evaluations)**, the evaluations are performed by having humans and GPT-4 compare completions from a set of popular open-source language models (LLMs) on a secret set of instruction prompts.
65
- """
66
-
67
- LLM_BENCHMARKS_TEXT = f"""
68
- Evaluation is performed against 4 popular benchmarks:
69
- - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
70
- - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
71
- - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
72
- - <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model’s propensity to reproduce falsehoods commonly found online.
73
-
74
- We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
75
  """
76
 
77
  HUMAN_GPT_EVAL_TEXT = f"""
@@ -83,10 +71,6 @@ For more information on the calibration and initiation of these measurements, pl
83
  """
84
 
85
 
86
- EVALUATION_QUEUE_TEXT = f"""
87
- # Evaluation Queue for the 🤗 Open LLM Leaderboard, these models will be automatically evaluated on the 🤗 cluster
88
- """
89
-
90
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
91
  CITATION_BUTTON_TEXT = r"""@misc{open-llm-leaderboard,
92
  author = {Edward Beeching, Sheon Han, Nathan Lambert, Nazneen Rajani, Omar Sanseviero, Lewis Tunstall, Thomas Wolf},
@@ -121,38 +105,6 @@ CITATION_BUTTON_TEXT = r"""@misc{open-llm-leaderboard,
121
  version = {v0.0.1},
122
  doi = {10.5281/zenodo.5371628},
123
  url = {https://doi.org/10.5281/zenodo.5371628}
124
- }
125
- @misc{clark2018think,
126
- title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
127
- author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
128
- year={2018},
129
- eprint={1803.05457},
130
- archivePrefix={arXiv},
131
- primaryClass={cs.AI}
132
- }
133
- @misc{zellers2019hellaswag,
134
- title={HellaSwag: Can a Machine Really Finish Your Sentence?},
135
- author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
136
- year={2019},
137
- eprint={1905.07830},
138
- archivePrefix={arXiv},
139
- primaryClass={cs.CL}
140
- }
141
- @misc{hendrycks2021measuring,
142
- title={Measuring Massive Multitask Language Understanding},
143
- author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
144
- year={2021},
145
- eprint={2009.03300},
146
- archivePrefix={arXiv},
147
- primaryClass={cs.CY}
148
- }
149
- @misc{lin2022truthfulqa,
150
- title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
151
- author={Stephanie Lin and Jacob Hilton and Owain Evans},
152
- year={2022},
153
- eprint={2109.07958},
154
- archivePrefix={arXiv},
155
- primaryClass={cs.CL}
156
  }"""
157
 
158
  VISUALIZATION_TITLE = """<h1 align="center" id="space-title">📊 Visualizations</h1>"""
 
54
  - Release the leaderboard to public
55
  """
56
 
57
+ TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard (Humans and GPT4 evaluations) </h1>"""
58
 
59
  INTRODUCTION_TEXT = f"""
60
  📐 With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art. The 🤗 Open LLM Leaderboard aims to track, rank and evaluate LLMs and chatbots as they are released.
61
 
62
+ 📈 Here, the evaluations are performed by having humans and GPT-4 compare completions from a set of popular open-source language models (LLMs) on a secret set of instruction prompts.
 
 
 
 
 
 
 
 
 
 
 
 
63
  """
64
 
65
  HUMAN_GPT_EVAL_TEXT = f"""
 
71
  """
72
 
73
 
 
 
 
 
74
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
75
  CITATION_BUTTON_TEXT = r"""@misc{open-llm-leaderboard,
76
  author = {Edward Beeching, Sheon Han, Nathan Lambert, Nazneen Rajani, Omar Sanseviero, Lewis Tunstall, Thomas Wolf},
 
105
  version = {v0.0.1},
106
  doi = {10.5281/zenodo.5371628},
107
  url = {https://doi.org/10.5281/zenodo.5371628}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  }"""
109
 
110
  VISUALIZATION_TITLE = """<h1 align="center" id="space-title">📊 Visualizations</h1>"""
src/auto_leaderboard/get_model_metadata.py DELETED
@@ -1,56 +0,0 @@
1
- import re
2
- from typing import List
3
-
4
- from src.utils_display import AutoEvalColumn
5
- from src.auto_leaderboard.model_metadata_type import get_model_type
6
-
7
- from huggingface_hub import HfApi
8
- import huggingface_hub
9
- api = HfApi()
10
-
11
-
12
- def get_model_infos_from_hub(leaderboard_data: List[dict]):
13
- for model_data in leaderboard_data:
14
- model_name = model_data["model_name_for_query"]
15
- try:
16
- model_info = api.model_info(model_name)
17
- except huggingface_hub.utils._errors.RepositoryNotFoundError:
18
- model_data[AutoEvalColumn.license.name] = None
19
- model_data[AutoEvalColumn.likes.name] = None
20
- model_data[AutoEvalColumn.params.name] = None
21
- continue
22
-
23
- model_data[AutoEvalColumn.license.name] = get_model_license(model_info)
24
- model_data[AutoEvalColumn.likes.name] = get_model_likes(model_info)
25
- model_data[AutoEvalColumn.params.name] = get_model_size(model_name, model_info)
26
-
27
-
28
- def get_model_license(model_info):
29
- try:
30
- return model_info.cardData["license"]
31
- except Exception:
32
- return None
33
-
34
- def get_model_likes(model_info):
35
- return model_info.likes
36
-
37
- size_pattern = re.compile(r"\d+(b|m)")
38
-
39
- def get_model_size(model_name, model_info):
40
- # In billions
41
- try:
42
- return round(model_info.safetensors["total"] / 1e9, 3)
43
- except AttributeError:
44
- #print(f"Repository {model_id} does not have safetensors weights")
45
- pass
46
- try:
47
- size_match = re.search(size_pattern, model_name.lower())
48
- size = size_match.group(0)
49
- return round(int(size[:-1]) if size[-1] == "b" else int(size[:-1]) / 1e3, 3)
50
- except AttributeError:
51
- return None
52
-
53
-
54
- def apply_metadata(leaderboard_data: List[dict]):
55
- get_model_type(leaderboard_data)
56
- get_model_infos_from_hub(leaderboard_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/auto_leaderboard/load_results.py DELETED
@@ -1,116 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import glob
4
- import json
5
- from typing import Dict, List, Tuple
6
-
7
- from src.utils_display import AutoEvalColumn, make_clickable_model
8
- import numpy as np
9
-
10
- # clone / pull the lmeh eval data
11
- METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
12
- BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
13
- BENCH_TO_NAME = {
14
- "arc_challenge": AutoEvalColumn.arc.name,
15
- "hellaswag": AutoEvalColumn.hellaswag.name,
16
- "hendrycks": AutoEvalColumn.mmlu.name,
17
- "truthfulqa_mc": AutoEvalColumn.truthfulqa.name,
18
- }
19
-
20
-
21
- @dataclass
22
- class EvalResult:
23
- eval_name: str
24
- org: str
25
- model: str
26
- revision: str
27
- is_8bit: bool
28
- results: dict
29
-
30
- def to_dict(self):
31
- if self.org is not None:
32
- base_model = f"{self.org}/{self.model}"
33
- else:
34
- base_model = f"{self.model}"
35
- data_dict = {}
36
-
37
- data_dict["eval_name"] = self.eval_name # not a column, just a save name
38
- data_dict[AutoEvalColumn.is_8bit.name] = self.is_8bit
39
- data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
40
- data_dict[AutoEvalColumn.dummy.name] = base_model
41
- data_dict[AutoEvalColumn.revision.name] = self.revision
42
- data_dict[AutoEvalColumn.average.name] = round(
43
- sum([v for k, v in self.results.items()]) / 4.0, 1
44
- )
45
-
46
- for benchmark in BENCHMARKS:
47
- if not benchmark in self.results.keys():
48
- self.results[benchmark] = None
49
-
50
- for k, v in BENCH_TO_NAME.items():
51
- data_dict[v] = self.results[k]
52
-
53
- return data_dict
54
-
55
-
56
- def parse_eval_result(json_filepath: str) -> Tuple[str, dict]:
57
- with open(json_filepath) as fp:
58
- data = json.load(fp)
59
-
60
- path_split = json_filepath.split("/")
61
- org = None
62
- model = path_split[-4]
63
- is_8bit = path_split[-2] == "8bit"
64
- revision = path_split[-3]
65
- if len(path_split) == 7:
66
- # handles gpt2 type models that don't have an org
67
- result_key = f"{model}_{revision}_{is_8bit}"
68
- else:
69
- org = path_split[-5]
70
- result_key = f"{org}_{model}_{revision}_{is_8bit}"
71
-
72
- eval_result = None
73
- for benchmark, metric in zip(BENCHMARKS, METRICS):
74
- if benchmark in json_filepath:
75
- accs = np.array([v[metric] for v in data["results"].values()])
76
- mean_acc = round(np.mean(accs) * 100.0, 1)
77
- eval_result = EvalResult(
78
- result_key, org, model, revision, is_8bit, {benchmark: mean_acc}
79
- )
80
-
81
- return result_key, eval_result
82
-
83
-
84
- def get_eval_results(is_public) -> List[EvalResult]:
85
- json_filepaths = glob.glob(
86
- "auto_evals/eval_results/public/**/16bit/*.json", recursive=True
87
- )
88
- if not is_public:
89
- json_filepaths += glob.glob(
90
- "auto_evals/eval_results/private/**/*.json", recursive=True
91
- )
92
- json_filepaths += glob.glob(
93
- "auto_evals/eval_results/private/**/*.json", recursive=True
94
- )
95
- # include the 8bit evals of public models
96
- json_filepaths += glob.glob(
97
- "auto_evals/eval_results/public/**/8bit/*.json", recursive=True
98
- )
99
- eval_results = {}
100
-
101
- for json_filepath in json_filepaths:
102
- result_key, eval_result = parse_eval_result(json_filepath)
103
- if result_key in eval_results.keys():
104
- eval_results[result_key].results.update(eval_result.results)
105
- else:
106
- eval_results[result_key] = eval_result
107
-
108
- eval_results = [v for v in eval_results.values()]
109
-
110
- return eval_results
111
-
112
-
113
- def get_eval_results_dicts(is_public=True) -> List[Dict]:
114
- eval_results = get_eval_results(is_public)
115
-
116
- return [e.to_dict() for e in eval_results]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/auto_leaderboard/model_metadata_type.py DELETED
@@ -1,163 +0,0 @@
1
- from enum import Enum
2
- from typing import Dict, List
3
-
4
- class ModelType(Enum):
5
- PT = "pretrained"
6
- SFT = "finetuned"
7
- RL = "with RL"
8
-
9
-
10
- TYPE_METADATA: Dict[str, ModelType] = {
11
- "aisquared/dlite-v1-355m": ModelType.SFT,
12
- "aisquared/dlite-v2-774m": ModelType.SFT,
13
- "aisquared/dlite-v2-1_5b": ModelType.SFT,
14
- "TheBloke/wizardLM-7B-HF": ModelType.SFT,
15
- "TheBloke/dromedary-65b-lora-HF": ModelType.SFT,
16
- "TheBloke/vicuna-13B-1.1-HF": ModelType.SFT,
17
- "TheBloke/Wizard-Vicuna-13B-Uncensored-HF": ModelType.SFT,
18
- "wordcab/llama-natural-instructions-13b": ModelType.SFT,
19
- "JosephusCheung/Guanaco": ModelType.SFT,
20
- "AlekseyKorshuk/vicuna-7b": ModelType.SFT,
21
- "AlekseyKorshuk/chatml-pyg-v1": ModelType.SFT,
22
- "concedo/OPT-19M-ChatSalad": ModelType.SFT,
23
- "digitous/Javalion-R": ModelType.SFT,
24
- "digitous/Alpacino30b": ModelType.SFT,
25
- "digitous/Javelin-GPTJ": ModelType.SFT,
26
- "anton-l/gpt-j-tiny-random": ModelType.SFT,
27
- "IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1": ModelType.SFT,
28
- "gpt2-medium": ModelType.PT,
29
- "PygmalionAI/pygmalion-6b": ModelType.SFT,
30
- "medalpaca/medalpaca-7b": ModelType.SFT,
31
- "medalpaca/medalpaca-13b": ModelType.SFT,
32
- "chavinlo/alpaca-13b": ModelType.SFT,
33
- "chavinlo/alpaca-native": ModelType.SFT,
34
- "chavinlo/gpt4-x-alpaca": ModelType.SFT,
35
- "hakurei/lotus-12B": ModelType.SFT,
36
- "amazon/LightGPT": ModelType.SFT,
37
- "shibing624/chinese-llama-plus-13b-hf": ModelType.SFT,
38
- "mosaicml/mpt-7b": ModelType.PT,
39
- "PSanni/Deer-3b": ModelType.SFT,
40
- "bigscience/bloom-1b1": ModelType.PT,
41
- "MetaIX/GPT4-X-Alpasta-30b": ModelType.SFT,
42
- "EleutherAI/gpt-neox-20b": ModelType.PT,
43
- "EleutherAI/gpt-j-6b": ModelType.PT,
44
- "roneneldan/TinyStories-28M": ModelType.SFT,
45
- "lmsys/vicuna-13b-delta-v1.1": ModelType.SFT,
46
- "lmsys/vicuna-7b-delta-v1.1": ModelType.SFT,
47
- "abhiramtirumala/DialoGPT-sarcastic-medium": ModelType.SFT,
48
- "pillowtalks-ai/delta13b": ModelType.SFT,
49
- "bigcode/starcoderplus": ModelType.SFT,
50
- "microsoft/DialoGPT-large": ModelType.SFT,
51
- "microsoft/CodeGPT-small-py": ModelType.SFT,
52
- "Pirr/pythia-13b-deduped-green_devil": ModelType.SFT,
53
- "Aeala/GPT4-x-AlpacaDente2-30b": ModelType.SFT,
54
- "Aeala/VicUnlocked-alpaca-30b": ModelType.SFT,
55
- "dvruette/llama-13b-pretrained-sft-epoch-2": ModelType.SFT,
56
- "dvruette/oasst-gpt-neox-20b-1000-steps": ModelType.SFT,
57
- "openlm-research/open_llama_3b_350bt_preview": ModelType.PT,
58
- "openlm-research/open_llama_7b_700bt_preview": ModelType.PT,
59
- "openlm-research/open_llama_7b": ModelType.PT,
60
- "openlm-research/open_llama_3b": ModelType.PT,
61
- "openlm-research/open_llama_7b_400bt_preview": ModelType.PT,
62
- "PocketDoc/Dans-PileOfSets-Mk1-llama-13b-merged": ModelType.SFT,
63
- "GeorgiaTechResearchInstitute/galactica-6.7b-evol-instruct-70k": ModelType.SFT,
64
- "databricks/dolly-v2-7b": ModelType.SFT,
65
- "databricks/dolly-v2-3b": ModelType.SFT,
66
- "databricks/dolly-v2-12b": ModelType.SFT,
67
- "pinkmanlove/llama-65b-hf": ModelType.SFT,
68
- "Rachneet/gpt2-xl-alpaca": ModelType.SFT,
69
- "Locutusque/gpt2-conversational-or-qa": ModelType.SFT,
70
- "NbAiLab/nb-gpt-j-6B-alpaca": ModelType.SFT,
71
- "Fredithefish/ScarletPajama-3B-HF": ModelType.SFT,
72
- "eachadea/vicuna-7b-1.1": ModelType.SFT,
73
- "eachadea/vicuna-13b": ModelType.SFT,
74
- "openaccess-ai-collective/wizard-mega-13b": ModelType.SFT,
75
- "openaccess-ai-collective/manticore-13b": ModelType.SFT,
76
- "openaccess-ai-collective/manticore-30b-chat-pyg-alpha": ModelType.SFT,
77
- "openaccess-ai-collective/minotaur-13b": ModelType.SFT,
78
- "lamini/instruct-tuned-3b": ModelType.SFT,
79
- "pythainlp/wangchanglm-7.5B-sft-enth": ModelType.SFT,
80
- "pythainlp/wangchanglm-7.5B-sft-en-sharded": ModelType.SFT,
81
- "stabilityai/stablelm-tuned-alpha-7b": ModelType.SFT,
82
- "CalderaAI/30B-Lazarus": ModelType.SFT,
83
- "KoboldAI/OPT-13B-Nerybus-Mix": ModelType.SFT,
84
- "distilgpt2": ModelType.PT,
85
- "wahaha1987/llama_7b_sharegpt94k_fastchat": ModelType.SFT,
86
- "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5": ModelType.SFT,
87
- "junelee/wizard-vicuna-13b": ModelType.SFT,
88
- "BreadAi/StoryPy": ModelType.SFT,
89
- "togethercomputer/RedPajama-INCITE-Base-3B-v1": ModelType.PT,
90
- "togethercomputer/RedPajama-INCITE-Base-7B-v0.1": ModelType.PT,
91
- "Writer/camel-5b-hf": ModelType.SFT,
92
- "Writer/palmyra-base": ModelType.PT,
93
- "MBZUAI/lamini-neo-125m": ModelType.SFT,
94
- "TehVenom/DiffMerge_Pygmalion_Main-onto-V8P4": ModelType.SFT,
95
- "vicgalle/gpt2-alpaca-gpt4": ModelType.SFT,
96
- "facebook/opt-350m": ModelType.PT,
97
- "facebook/opt-125m": ModelType.PT,
98
- "facebook/opt-13b": ModelType.PT,
99
- "facebook/opt-1.3b": ModelType.PT,
100
- "facebook/opt-66b": ModelType.PT,
101
- "facebook/galactica-120b": ModelType.PT,
102
- "Abe13/jgpt2-v1": ModelType.SFT,
103
- "gpt2-xl": ModelType.PT,
104
- "HuggingFaceH4/stable-vicuna-13b-2904": ModelType.RL,
105
- "HuggingFaceH4/llama-7b-ift-alpaca": ModelType.SFT,
106
- "HuggingFaceH4/starchat-alpha": ModelType.SFT,
107
- "HuggingFaceH4/starchat-beta": ModelType.SFT,
108
- "ausboss/Llama30B-SuperHOT": ModelType.SFT,
109
- "ausboss/llama-13b-supercot": ModelType.SFT,
110
- "ausboss/llama-30b-supercot": ModelType.SFT,
111
- "Neko-Institute-of-Science/metharme-7b": ModelType.SFT,
112
- "SebastianSchramm/Cerebras-GPT-111M-instruction": ModelType.SFT,
113
- "victor123/WizardLM-13B-1.0": ModelType.SFT,
114
- "AlpinDale/pygmalion-instruct": ModelType.SFT,
115
- "tiiuae/falcon-7b-instruct": ModelType.SFT,
116
- "tiiuae/falcon-40b-instruct": ModelType.SFT,
117
- "tiiuae/falcon-40b": ModelType.PT,
118
- "tiiuae/falcon-7b": ModelType.PT,
119
- "cyl/awsome-llama": ModelType.SFT,
120
- "xzuyn/Alpacino-SuperCOT-13B": ModelType.SFT,
121
- "xzuyn/MedicWizard-7B": ModelType.SFT,
122
- "beomi/KoAlpaca-Polyglot-5.8B": ModelType.SFT,
123
- "chainyo/alpaca-lora-7b": ModelType.SFT,
124
- "Salesforce/codegen-16B-nl": ModelType.PT,
125
- "Salesforce/codegen-16B-multi": ModelType.SFT,
126
- "ai-forever/rugpt3large_based_on_gpt2": ModelType.SFT,
127
- "gpt2-large": ModelType.PT,
128
- "huggingface/llama-13b": ModelType.PT,
129
- "huggingface/llama-7b": ModelType.PT,
130
- "huggingface/llama-65b": ModelType.PT,
131
- "huggingface/llama-30b": ModelType.PT,
132
- "jondurbin/airoboros-7b": ModelType.SFT,
133
- "jondurbin/airoboros-13b": ModelType.SFT,
134
- "cerebras/Cerebras-GPT-1.3B": ModelType.PT,
135
- "cerebras/Cerebras-GPT-111M": ModelType.PT,
136
- "NousResearch/Nous-Hermes-13b": ModelType.SFT,
137
- "project-baize/baize-v2-7b": ModelType.SFT,
138
- "project-baize/baize-v2-13b": ModelType.SFT,
139
- "LLMs/AlpacaGPT4-7B-elina": ModelType.SFT,
140
- "LLMs/Vicuna-EvolInstruct-13B": ModelType.SFT,
141
- "huggingtweets/jerma985": ModelType.SFT,
142
- "huggyllama/llama-65b": ModelType.PT,
143
- "WizardLM/WizardLM-13B-1.0": ModelType.SFT,
144
- "gpt2": ModelType.PT,
145
- "alessandropalla/instruct_gpt2": ModelType.SFT,
146
- "MayaPH/FinOPT-Lincoln": ModelType.SFT,
147
- "MayaPH/FinOPT-Franklin": ModelType.SFT,
148
- "timdettmers/guanaco-33b-merged": ModelType.SFT,
149
- "timdettmers/guanaco-65b-merged": ModelType.SFT,
150
- "elinas/llama-30b-hf-transformers-4.29": ModelType.SFT,
151
- "elinas/chronos-33b": ModelType.SFT,
152
- "nmitchko/medguanaco-65b-GPTQ": ModelType.SFT,
153
- "xhyi/PT_GPTNEO350_ATG": ModelType.SFT,
154
- "h2oai/h2ogpt-oasst1-512-20b": ModelType.SFT,
155
- "h2oai/h2ogpt-gm-oasst1-en-1024-12b": ModelType.SFT,
156
- "nomic-ai/gpt4all-13b-snoozy": ModelType.SFT,
157
- "nomic-ai/gpt4all-j": ModelType.SFT,
158
- }
159
-
160
-
161
- def get_model_type(leaderboard_data: List[dict]):
162
- for model_data in leaderboard_data:
163
- model_data["Type"] = TYPE_METADATA.get(model_data["model_name_for_query"], "N/A")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/elo_leaderboard/load_results.py CHANGED
@@ -143,6 +143,7 @@ def get_elo_results(df_instruct, df_code_instruct, tie_allowed):
143
  "gpt_4_evals/data/",
144
  split="train",
145
  revision="e007baaf6e505731c08a0bc1a833a1f8f8cb8846",
 
146
  ).to_pandas()
147
 
148
  dfs = [df_instruct, df_code_instruct, df_all]
 
143
  "gpt_4_evals/data/",
144
  split="train",
145
  revision="e007baaf6e505731c08a0bc1a833a1f8f8cb8846",
146
+
147
  ).to_pandas()
148
 
149
  dfs = [df_instruct, df_code_instruct, df_all]
src/init.py CHANGED
@@ -15,27 +15,7 @@ def get_all_requested_models(requested_models_dir):
15
 
16
  return set([file_name.lower().split("eval_requests/")[1] for file_name in file_names])
17
 
18
- def load_all_info_from_hub(LMEH_REPO, HUMAN_EVAL_REPO, GPT_4_EVAL_REPO):
19
- auto_eval_repo = None
20
- requested_models = None
21
- if H4_TOKEN:
22
- print("Pulling evaluation requests and results.")
23
- # try:
24
- # shutil.rmtree("./auto_evals/")
25
- # except:
26
- # pass
27
-
28
- auto_eval_repo = Repository(
29
- local_dir="./auto_evals/",
30
- clone_from=LMEH_REPO,
31
- use_auth_token=H4_TOKEN,
32
- repo_type="dataset",
33
- )
34
- auto_eval_repo.git_pull()
35
-
36
- requested_models_dir = "./auto_evals/eval_requests"
37
- requested_models = get_all_requested_models(requested_models_dir)
38
-
39
  human_eval_repo = None
40
  if H4_TOKEN and not os.path.isdir("./human_evals"):
41
  print("Pulling human evaluation repo")
@@ -58,7 +38,7 @@ def load_all_info_from_hub(LMEH_REPO, HUMAN_EVAL_REPO, GPT_4_EVAL_REPO):
58
  )
59
  gpt_4_eval_repo.git_pull()
60
 
61
- return auto_eval_repo, human_eval_repo, gpt_4_eval_repo, requested_models
62
 
63
 
64
  #def load_results(model, benchmark, metric):
 
15
 
16
  return set([file_name.lower().split("eval_requests/")[1] for file_name in file_names])
17
 
18
+ def load_all_info_from_hub(HUMAN_EVAL_REPO, GPT_4_EVAL_REPO):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  human_eval_repo = None
20
  if H4_TOKEN and not os.path.isdir("./human_evals"):
21
  print("Pulling human evaluation repo")
 
38
  )
39
  gpt_4_eval_repo.git_pull()
40
 
41
+ return human_eval_repo, gpt_4_eval_repo
42
 
43
 
44
  #def load_results(model, benchmark, metric):
src/utils_display.py CHANGED
@@ -12,22 +12,6 @@ class ColumnContent:
12
  def fields(raw_class):
13
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
14
 
15
- @dataclass(frozen=True)
16
- class AutoEvalColumn: # Auto evals column
17
- model = ColumnContent("Model", "markdown", True)
18
- revision = ColumnContent("Revision", "str", True, True)
19
- model_type = ColumnContent("Type", "bool", False)
20
- is_8bit = ColumnContent("8bit", "bool", False, True)
21
- license = ColumnContent("Hub License", "str", False)
22
- params = ColumnContent("#Params (B)", "number", False)
23
- likes = ColumnContent("Hub ❤️", "number", False)
24
- average = ColumnContent("Average ⬆️", "number", True)
25
- arc = ColumnContent("ARC (25-s) ⬆️", "number", True)
26
- hellaswag = ColumnContent("HellaSwag (10-s) ⬆️", "number", True)
27
- mmlu = ColumnContent("MMLU (5-s) ⬆️", "number", True)
28
- truthfulqa = ColumnContent("TruthfulQA (MC) (0-s) ⬆️", "number", True)
29
- dummy = ColumnContent("model_name_for_query", "str", True) # dummy col to implement search bar (hidden by custom CSS)
30
-
31
  @dataclass(frozen=True)
32
  class EloEvalColumn: # Elo evals column
33
  model = ColumnContent("Model", "markdown", True)
@@ -36,16 +20,6 @@ class EloEvalColumn: # Elo evals column
36
  human_instruct = ColumnContent("Human (instruct)", "number", True)
37
  human_code_instruct = ColumnContent("Human (code-instruct)", "number", True)
38
 
39
-
40
- @dataclass(frozen=True)
41
- class EvalQueueColumn: # Queue column
42
- model = ColumnContent("model", "markdown", True)
43
- revision = ColumnContent("revision", "str", True)
44
- private = ColumnContent("private", "bool", True)
45
- is_8bit = ColumnContent("8bit_eval", "bool", True)
46
- has_delta_weight = ColumnContent("is_delta_weight", "bool", True)
47
- status = ColumnContent("status", "str", True)
48
-
49
  LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]
50
 
51
 
 
12
  def fields(raw_class):
13
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  @dataclass(frozen=True)
16
  class EloEvalColumn: # Elo evals column
17
  model = ColumnContent("Model", "markdown", True)
 
20
  human_instruct = ColumnContent("Human (instruct)", "number", True)
21
  human_code_instruct = ColumnContent("Human (code-instruct)", "number", True)
22
 
 
 
 
 
 
 
 
 
 
 
23
  LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]
24
 
25