junkim100 committed
Commit 1093702 · 1 Parent(s): 57dfc04

Fixed Average Error

app.py CHANGED
@@ -265,7 +265,7 @@ with demo:
     )
     # Check query parameter once at startup and update search bar + hidden component
     demo.load(load_query, inputs=[], outputs=[search_bar, hidden_search_bar])
-
+
     for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size]:
         selector.change(
             update_table,
eval-queue/.gitattributes CHANGED
@@ -53,3 +53,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.jpg filter=lfs diff=lfs merge=lfs -text
 *.jpeg filter=lfs diff=lfs merge=lfs -text
 *.webp filter=lfs diff=lfs merge=lfs -text
+# Video files - compressed
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.webm filter=lfs diff=lfs merge=lfs -text
eval-results/.gitattributes CHANGED
@@ -53,3 +53,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.jpg filter=lfs diff=lfs merge=lfs -text
 *.jpeg filter=lfs diff=lfs merge=lfs -text
 *.webp filter=lfs diff=lfs merge=lfs -text
+# Video files - compressed
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.webm filter=lfs diff=lfs merge=lfs -text
eval-results/01-ai/Yi-1.5-9B-32K/result_2024_07_30 20:36:30.json CHANGED
@@ -365,12 +365,6 @@
             "mc1_stderr": 0.015945068581236614,
             "mc2": 0.4670848140389129,
             "mc2_stderr": 0.01585178282587417
-        },
-        "harness|commongen_v2|2": {
-            "acc": 0.47107438016528924,
-            "acc_stderr": 0.017161563949916348,
-            "acc_norm": 0.5171192443919717,
-            "acc_norm_stderr": 0.017180275246085626
         }
     },
     "versions": {
@@ -434,8 +428,7 @@
         "harness|mmlu_professional_law|5": 1,
         "harness|mmlu_high_school_us_history|5": 1,
         "harness|mmlu_high_school_european_history|5": 1,
-        "harness|truthfulqa_mc|0": 0,
-        "harness|commongen_v2|2": 1
+        "harness|truthfulqa_mc|0": 0
     },
     "config_general": {
         "model_name": "01-ai/Yi-1.5-9B-32K",
eval-results/BioMistral/BioMistral-7B/BioMistral_BioMistral-7B_result_2024-05-30 01_33_58.json CHANGED
@@ -1,10 +1,10 @@
 {
     "results": {
         "daily": {
-            "daily": 10
+            "daily": 9
         },
         "quarterly": {
-            "quarterly": 10
+            "quarterly": 9
         },
         "harness|arc_challenge|25": {
             "acc": 0.257679180887372,
@@ -365,12 +365,6 @@
             "mc1_stderr": 0.016150201321323002,
             "mc2": 0.4721418472000992,
             "mc2_stderr": 0.01626625866283201
-        },
-        "harness|commongen_v2|2": {
-            "acc": 0.27863046044864226,
-            "acc_stderr": 0.01541373949434568,
-            "acc_norm": 0.3825265643447462,
-            "acc_norm_stderr": 0.016709165387228803
         }
     },
     "versions": {
@@ -434,8 +428,7 @@
         "harness|mmlu_professional_law|5": 1,
         "harness|mmlu_high_school_us_history|5": 1,
         "harness|mmlu_high_school_european_history|5": 1,
-        "harness|truthfulqa_mc|0": 0,
-        "harness|commongen_v2|2": 1
+        "harness|truthfulqa_mc|0": 0
     },
     "config_general": {
         "model_name": "BioMistral/BioMistral-7B",
eval-results/EleutherAI/polyglot-ko-1.3b/EleutherAI_polyglot-ko-1.3b_result_2023-09-24 15_21_38.json CHANGED
@@ -1,10 +1,10 @@
 {
     "results": {
         "daily": {
-            "daily": 11
+            "daily": 10
         },
         "quarterly": {
-            "quarterly": 11
+            "quarterly": 10
         },
         "harness|arc_challenge|25": {
             "acc": 0.2235494880546075,
@@ -365,12 +365,6 @@
             "mc1_stderr": 0.015176985027707682,
             "mc2": 0.4116568832959107,
             "mc2_stderr": 0.015044504977529799
-        },
-        "harness|commongen_v2|2": {
-            "acc": 0.27744982290436837,
-            "acc_stderr": 0.015393630236605975,
-            "acc_norm": 0.3400236127508855,
-            "acc_norm_stderr": 0.016286717220737674
         }
     },
     "versions": {
@@ -434,8 +428,7 @@
         "harness|mmlu_professional_law|5": 1,
         "harness|mmlu_high_school_us_history|5": 1,
         "harness|mmlu_high_school_european_history|5": 1,
-        "harness|truthfulqa_mc|0": 0,
-        "harness|commongen_v2|2": 1
+        "harness|truthfulqa_mc|0": 0
     },
     "config_general": {
         "model_name": "EleutherAI/polyglot-ko-1.3b",
eval-results/HuggingFaceH4/zephyr-7b-beta/result.json CHANGED
@@ -365,12 +365,6 @@
             "mc1_stderr": 0.01648214881024147,
             "mc2": 0.5171680571717291,
             "mc2_stderr": 0.01606077987901482
-        },
-        "harness|commongen_v2|2": {
-            "acc": 0.39787485242030696,
-            "acc_stderr": 0.01682795905473339,
-            "acc_norm": 0.4014167650531287,
-            "acc_norm_stderr": 0.01685290785872906
         }
     },
     "versions": {
@@ -434,8 +428,7 @@
         "harness|mmlu_professional_law|5": 1,
         "harness|mmlu_high_school_us_history|5": 1,
         "harness|mmlu_high_school_european_history|5": 1,
-        "harness|truthfulqa_mc|0": 0,
-        "harness|commongen_v2|2": 1
+        "harness|truthfulqa_mc|0": 0
     },
     "config_general": {
         "model_name": "HuggingFaceH4/zephyr-7b-beta",
eval-results/nlpai-lab/KULLM3/result.json CHANGED
@@ -365,12 +365,6 @@
             "mc1_stderr": 0.016542412809494877,
             "mc2": 0.49995145184296846,
             "mc2_stderr": 0.015887726098900913
-        },
-        "harness|commongen_v2|2": {
-            "acc": 0.564344746162928,
-            "acc_stderr": 0.017047415229476316,
-            "acc_norm": 0.6068476977567887,
-            "acc_norm_stderr": 0.016793262801287068
         }
     },
     "versions": {
@@ -434,8 +428,7 @@
         "harness|mmlu_professional_law|5": 1,
         "harness|mmlu_high_school_us_history|5": 1,
         "harness|mmlu_high_school_european_history|5": 1,
-        "harness|truthfulqa_mc|0": 0,
-        "harness|commongen_v2|2": 1
+        "harness|truthfulqa_mc|0": 0
     },
     "config_general": {
         "model_name": "nlpai-lab/KULLM3",
eval-results/x2bee/POLAR-14B-DPO-v1.3/result.json CHANGED
@@ -365,12 +365,6 @@
             "mc1_stderr": 0.01687480500145318,
             "mc2": 0.7522925779273922,
             "mc2_stderr": 0.014568927682929578
-        },
-        "harness|commongen_v2|2": {
-            "acc": 0.45218417945690675,
-            "acc_stderr": 0.017111567130916785,
-            "acc_norm": 0.45454545454545453,
-            "acc_norm_stderr": 0.017119172208061504
         }
     },
     "versions": {
@@ -434,8 +428,7 @@
         "harness|mmlu_professional_law|5": 1,
         "harness|mmlu_high_school_us_history|5": 1,
         "harness|mmlu_high_school_european_history|5": 1,
-        "harness|truthfulqa_mc|0": 0,
-        "harness|commongen_v2|2": 1
+        "harness|truthfulqa_mc|0": 0
     },
     "config_general": {
         "model_name": "x2bee/POLAR-14B-DPO-v1.3",
eval-results/x2bee/POLAR-14B-DPO-v1.4/result.json CHANGED
@@ -365,12 +365,6 @@
             "mc1_stderr": 0.01746379386716811,
             "mc2": NaN,
             "mc2_stderr": NaN
-        },
-        "harness|commongen_v2|2": {
-            "acc": 0.44037780401416765,
-            "acc_stderr": 0.01706769977431298,
-            "acc_norm": 0.44510035419126326,
-            "acc_norm_stderr": 0.01708641743100547
         }
     },
     "versions": {
@@ -434,8 +428,7 @@
         "harness|mmlu_professional_law|5": 1,
         "harness|mmlu_high_school_us_history|5": 1,
         "harness|mmlu_high_school_european_history|5": 1,
-        "harness|truthfulqa_mc|0": 0,
-        "harness|commongen_v2|2": 1
+        "harness|truthfulqa_mc|0": 0
     },
     "config_general": {
         "model_name": "x2bee/POLAR-14B-DPO-v1.4",
eval-results/x2bee/POLAR-14B-HES-DPO-v1.5/result.json CHANGED
@@ -365,12 +365,6 @@
             "mc1_stderr": 0.0165424128094949,
             "mc2": 0.7515104740134964,
             "mc2_stderr": 0.014200593490054807
-        },
-        "harness|commongen_v2|2": {
-            "acc": 0.5147579693034239,
-            "acc_stderr": 0.01718286443499856,
-            "acc_norm": 0.526564344746163,
-            "acc_norm_stderr": 0.017166075717577747
         }
     },
     "versions": {
@@ -434,8 +428,7 @@
         "harness|mmlu_professional_law|5": 1,
         "harness|mmlu_high_school_us_history|5": 1,
         "harness|mmlu_high_school_european_history|5": 1,
-        "harness|truthfulqa_mc|0": 0,
-        "harness|commongen_v2|2": 1
+        "harness|truthfulqa_mc|0": 0
     },
     "config_general": {
         "model_name": "x2bee/POLAR-14B-HES-DPO-v1.5",
eval-results/x2bee/POLAR-14B-SON-SFT-v0.1/result.json CHANGED
@@ -365,12 +365,6 @@
             "mc1_stderr": 0.017106588140700332,
             "mc2": 0.7254831072808595,
             "mc2_stderr": 0.014162522228042162
-        },
-        "harness|commongen_v2|2": {
-            "acc": 0.5926800472255017,
-            "acc_stderr": 0.01689245669519127,
-            "acc_norm": 0.6269185360094451,
-            "acc_norm_stderr": 0.016627318275137453
         }
     },
     "versions": {
@@ -434,8 +428,7 @@
         "harness|mmlu_professional_law|5": 1,
         "harness|mmlu_high_school_us_history|5": 1,
         "harness|mmlu_high_school_european_history|5": 1,
-        "harness|truthfulqa_mc|0": 0,
-        "harness|commongen_v2|2": 1
+        "harness|truthfulqa_mc|0": 0
     },
     "config_general": {
         "model_name": "x2bee/POLAR-14B-SON-SFT-v0.1",
eval-results/x2bee/POLAR-14B-v0.2/result.json CHANGED
@@ -365,12 +365,6 @@
             "mc1_stderr": 0.01563813566777552,
             "mc2": 0.8107575910195236,
             "mc2_stderr": 0.013335029489665237
-        },
-        "harness|commongen_v2|2": {
-            "acc": 0.525383707201889,
-            "acc_stderr": 0.017168187201429253,
-            "acc_norm": 0.5442739079102715,
-            "acc_norm_stderr": 0.017122829143292655
         }
     },
     "versions": {
@@ -434,8 +428,7 @@
         "harness|mmlu_professional_law|5": 1,
         "harness|mmlu_high_school_us_history|5": 1,
         "harness|mmlu_high_school_european_history|5": 1,
-        "harness|truthfulqa_mc|0": 0,
-        "harness|commongen_v2|2": 1
+        "harness|truthfulqa_mc|0": 0
     },
     "config_general": {
         "model_name": "x2bee/POLAR-14B-v0.2",
eval-results/x2bee/POLAR-14B-v0.5/result.json CHANGED
@@ -365,12 +365,6 @@
             "mc1_stderr": 0.014421468452506978,
             "mc2": 0.8572574997405501,
             "mc2_stderr": 0.01200311225898601
-        },
-        "harness|commongen_v2|2": {
-            "acc": 0.5159386068476978,
-            "acc_stderr": 0.017181617837190195,
-            "acc_norm": 0.5301062573789846,
-            "acc_norm_stderr": 0.01715916359017022
         }
     },
     "versions": {
@@ -434,8 +428,7 @@
         "harness|mmlu_professional_law|5": 1,
         "harness|mmlu_high_school_us_history|5": 1,
         "harness|mmlu_high_school_european_history|5": 1,
-        "harness|truthfulqa_mc|0": 0,
-        "harness|commongen_v2|2": 1
+        "harness|truthfulqa_mc|0": 0
    },
    "config_general": {
        "model_name": "x2bee/POLAR-14B-v0.5",
src/__pycache__/populate.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/populate.cpython-310.pyc and b/src/__pycache__/populate.cpython-310.pyc differ
 
src/display/__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/src/display/__pycache__/utils.cpython-310.pyc and b/src/display/__pycache__/utils.cpython-310.pyc differ
 
src/display/utils.py CHANGED
@@ -3,8 +3,11 @@ from enum import Enum
 
 import pandas as pd
 
+
 def fields(raw_class):
-    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+    return [
+        v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"
+    ]
 
 
 @dataclass
@@ -13,6 +16,7 @@ class Task:
     metric: str
     col_name: str
 
+
 class Tasks(Enum):
     arc = Task("arc_challenge", "acc_norm", "ARC")
     hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
@@ -20,12 +24,13 @@ class Tasks(Enum):
     truthfulqa = Task("truthfulqa_mc", "mc2", "TruthfulQA")
     # winogrande = Task("winogrande", "acc_norm", "Winogrande")
     # gsm8k = Task("gsm8k", "acc_norm", "GSM8k")
-    commongen_v2 = Task("commongen_v2", "acc_norm", "CommonGen V2")
+    # commongen_v2 = Task("commongen_v2", "acc_norm", "CommonGen V2")
     # eqBench = Task("eq_bench", "acc_norm", "EQ Bench")
     # instFollow = Task("inst_follow", "acc_norm", "InstFollow")
     # harmlessness = Task("harmlessness", "acc_norm", "Harmlessness")
     # helpfulness = Task("helpfulness", "acc_norm", "Helpfulness")
 
+
 class Ranks(Enum):
     daily = Task("daily", "daily", "Daily Rank")
     quarterly = Task("quarterly", "quarterly", "Quarterly Rank")
@@ -43,31 +48,84 @@ class ColumnContent:
     never_hidden: bool = False
     dummy: bool = False
 
+
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(
+    [
+        "model_type_symbol",
+        ColumnContent,
+        ColumnContent("T", "str", True, never_hidden=True),
+    ]
+)
+auto_eval_column_dict.append(
+    [
+        "model",
+        ColumnContent,
+        ColumnContent("Model", "markdown", True, never_hidden=True),
+    ]
+)
 # Ranks
-auto_eval_column_dict.append(["daily", ColumnContent, ColumnContent("Daily Rank", "number", True)])
-auto_eval_column_dict.append(["quarterly", ColumnContent, ColumnContent("Quarterly Rank", "number", True)])
+auto_eval_column_dict.append(
+    ["daily", ColumnContent, ColumnContent("Daily Rank", "number", True)]
+)
+auto_eval_column_dict.append(
+    ["quarterly", ColumnContent, ColumnContent("Quarterly Rank", "number", True)]
+)
 # Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_dict.append(
+    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)]
+)
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append(
+        [task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]
+    )
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["merged", ColumnContent, ColumnContent("Merged", "bool", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, False)])
+auto_eval_column_dict.append(
+    ["model_type", ColumnContent, ColumnContent("Type", "str", False)]
+)
+auto_eval_column_dict.append(
+    ["architecture", ColumnContent, ColumnContent("Architecture", "str", False)]
+)
+auto_eval_column_dict.append(
+    ["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)]
+)
+auto_eval_column_dict.append(
+    ["precision", ColumnContent, ColumnContent("Precision", "str", False)]
+)
+auto_eval_column_dict.append(
+    ["merged", ColumnContent, ColumnContent("Merged", "bool", False)]
+)
+auto_eval_column_dict.append(
+    ["license", ColumnContent, ColumnContent("Hub License", "str", False)]
+)
+auto_eval_column_dict.append(
+    ["params", ColumnContent, ColumnContent("#Params (B)", "number", False)]
+)
+auto_eval_column_dict.append(
+    ["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)]
+)
+auto_eval_column_dict.append(
+    [
+        "still_on_hub",
+        ColumnContent,
+        ColumnContent("Available on the hub", "bool", False),
+    ]
+)
+auto_eval_column_dict.append(
+    ["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)]
+)
+auto_eval_column_dict.append(
+    ["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, False)]
+)
 # Dummy column for the search bar (hidden by the custom CSS)
-auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
+auto_eval_column_dict.append(
+    [
+        "dummy",
+        ColumnContent,
+        ColumnContent("model_name_for_query", "str", False, dummy=True),
+    ]
+)
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -82,15 +140,17 @@ class EvalQueueColumn: # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
+
 # Define the human baselines
 human_baseline_row = {
     AutoEvalColumn.model.name: "<p>Human performance</p>",
 }
 
+
 @dataclass
 class ModelDetails:
     name: str
-    symbol: str = "" # emoji, only for the model type
+    symbol: str = ""  # emoji, only for the model type
 
 
 class ModelType(Enum):
@@ -115,11 +175,13 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     # bfloat16 = ModelDetails("bfloat16")
@@ -138,15 +200,17 @@ class Precision(Enum):
         if precision in ["GPTQ", "None"]:
            return Precision.qt_GPTQ
        return Precision.Unknown
-
-
 
 
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
-COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
-TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+COLS_LITE = [
+    c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden
+]
+TYPES_LITE = [
+    c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden
+]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
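Aside from the black-style re-wrapping, the substantive change in src/display/utils.py is that the commongen_v2 member of the Tasks enum is commented out. Because AutoEvalColumn is built dynamically from Tasks via make_dataclass, removing the enum member also removes the CommonGen V2 column, and its contribution to the average, everywhere downstream. A self-contained sketch of that pattern, using simplified names rather than the project's full column definitions:

from dataclasses import dataclass, fields as dc_fields, make_dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


class Tasks(Enum):
    # Commenting a member out here drops its column from the leaderboard.
    arc = Task("arc_challenge", "acc_norm", "ARC")
    truthfulqa = Task("truthfulqa_mc", "mc2", "TruthfulQA")
    # commongen_v2 = Task("commongen_v2", "acc_norm", "CommonGen V2")


@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool


columns = [
    (task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True))
    for task in Tasks
]
AutoEvalColumn = make_dataclass("AutoEvalColumn", columns, frozen=True)

print([f.name for f in dc_fields(AutoEvalColumn)])  # ['arc', 'truthfulqa']; no commongen_v2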
src/populate.py CHANGED
@@ -9,7 +9,9 @@ from src.leaderboard.filter_models import filter_models
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(
+    results_path: str, requests_path: str, cols: list, benchmark_cols: list
+) -> pd.DataFrame:
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
     # all_data_json.append(baseline_row)
@@ -49,7 +51,9 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             all_evals.append(data)
         elif ".md" not in entry:
             # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
+            sub_entries = [
+                e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")
+            ]
             for sub_entry in sub_entries:
                 file_path = os.path.join(save_path, entry, sub_entry)
                 with open(file_path) as fp:
@@ -61,7 +65,11 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
 
     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
+    finished_list = [
+        e
+        for e in all_evals
+        if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"
+    ]
     failed_list = [e for e in all_evals if e["status"] == "FAILED"]
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
     df_running = pd.DataFrame.from_records(running_list, columns=cols)
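The src/populate.py changes are pure re-wrapping of get_leaderboard_df and get_evaluation_queue_df; the behavioural fix comes from the Tasks change above, which shrinks the benchmark columns that feed the average. A hedged usage sketch of the call site, assuming it mirrors the typical app.py wiring (the paths below are placeholders, and the final sanity check only holds if the average is a plain mean of the benchmark columns, which this diff does not show):

from src.display.utils import COLS, AutoEvalColumn, Tasks
from src.populate import get_leaderboard_df

EVAL_RESULTS_PATH = "eval-results"   # placeholder
EVAL_REQUESTS_PATH = "eval-queue"    # placeholder

benchmark_cols = [task.value.col_name for task in Tasks]  # CommonGen V2 no longer listed

leaderboard_df = get_leaderboard_df(
    EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, benchmark_cols
)

# If the average is the plain mean of the benchmark columns, this should print ~0.
recomputed = leaderboard_df[benchmark_cols].mean(axis=1)
print((leaderboard_df[AutoEvalColumn.average.name] - recomputed).abs().max())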