Spaces:
Runtime error
Runtime error
Fixed Average Error
Browse files- app.py +1 -1
- eval-queue/.gitattributes +3 -0
- eval-results/.gitattributes +3 -0
- eval-results/01-ai/Yi-1.5-9B-32K/result_2024_07_30 20:36:30.json +1 -8
- eval-results/BioMistral/BioMistral-7B/BioMistral_BioMistral-7B_result_2024-05-30 01_33_58.json +3 -10
- eval-results/EleutherAI/polyglot-ko-1.3b/EleutherAI_polyglot-ko-1.3b_result_2023-09-24 15_21_38.json +3 -10
- eval-results/HuggingFaceH4/zephyr-7b-beta/result.json +1 -8
- eval-results/nlpai-lab/KULLM3/result.json +1 -8
- eval-results/x2bee/POLAR-14B-DPO-v1.3/result.json +1 -8
- eval-results/x2bee/POLAR-14B-DPO-v1.4/result.json +1 -8
- eval-results/x2bee/POLAR-14B-HES-DPO-v1.5/result.json +1 -8
- eval-results/x2bee/POLAR-14B-SON-SFT-v0.1/result.json +1 -8
- eval-results/x2bee/POLAR-14B-v0.2/result.json +1 -8
- eval-results/x2bee/POLAR-14B-v0.5/result.json +1 -8
- src/__pycache__/populate.cpython-310.pyc +0 -0
- src/display/__pycache__/utils.cpython-310.pyc +0 -0
- src/display/utils.py +89 -25
- src/populate.py +11 -3
app.py
CHANGED
@@ -265,7 +265,7 @@ with demo:
|
|
265 |
)
|
266 |
# Check query parameter once at startup and update search bar + hidden component
|
267 |
demo.load(load_query, inputs=[], outputs=[search_bar, hidden_search_bar])
|
268 |
-
|
269 |
for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size]:
|
270 |
selector.change(
|
271 |
update_table,
|
|
|
265 |
)
|
266 |
# Check query parameter once at startup and update search bar + hidden component
|
267 |
demo.load(load_query, inputs=[], outputs=[search_bar, hidden_search_bar])
|
268 |
+
|
269 |
for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size]:
|
270 |
selector.change(
|
271 |
update_table,
|
eval-queue/.gitattributes
CHANGED
@@ -53,3 +53,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
53 |
*.jpg filter=lfs diff=lfs merge=lfs -text
|
54 |
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
55 |
*.webp filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
53 |
*.jpg filter=lfs diff=lfs merge=lfs -text
|
54 |
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
55 |
*.webp filter=lfs diff=lfs merge=lfs -text
|
56 |
+
# Video files - compressed
|
57 |
+
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
58 |
+
*.webm filter=lfs diff=lfs merge=lfs -text
|
eval-results/.gitattributes
CHANGED
@@ -53,3 +53,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
53 |
*.jpg filter=lfs diff=lfs merge=lfs -text
|
54 |
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
55 |
*.webp filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
53 |
*.jpg filter=lfs diff=lfs merge=lfs -text
|
54 |
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
55 |
*.webp filter=lfs diff=lfs merge=lfs -text
|
56 |
+
# Video files - compressed
|
57 |
+
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
58 |
+
*.webm filter=lfs diff=lfs merge=lfs -text
|
eval-results/01-ai/Yi-1.5-9B-32K/result_2024_07_30 20:36:30.json
CHANGED
@@ -365,12 +365,6 @@
|
|
365 |
"mc1_stderr": 0.015945068581236614,
|
366 |
"mc2": 0.4670848140389129,
|
367 |
"mc2_stderr": 0.01585178282587417
|
368 |
-
},
|
369 |
-
"harness|commongen_v2|2": {
|
370 |
-
"acc": 0.47107438016528924,
|
371 |
-
"acc_stderr": 0.017161563949916348,
|
372 |
-
"acc_norm": 0.5171192443919717,
|
373 |
-
"acc_norm_stderr": 0.017180275246085626
|
374 |
}
|
375 |
},
|
376 |
"versions": {
|
@@ -434,8 +428,7 @@
|
|
434 |
"harness|mmlu_professional_law|5": 1,
|
435 |
"harness|mmlu_high_school_us_history|5": 1,
|
436 |
"harness|mmlu_high_school_european_history|5": 1,
|
437 |
-
"harness|truthfulqa_mc|0": 0
|
438 |
-
"harness|commongen_v2|2": 1
|
439 |
},
|
440 |
"config_general": {
|
441 |
"model_name": "01-ai/Yi-1.5-9B-32K",
|
|
|
365 |
"mc1_stderr": 0.015945068581236614,
|
366 |
"mc2": 0.4670848140389129,
|
367 |
"mc2_stderr": 0.01585178282587417
|
|
|
|
|
|
|
|
|
|
|
|
|
368 |
}
|
369 |
},
|
370 |
"versions": {
|
|
|
428 |
"harness|mmlu_professional_law|5": 1,
|
429 |
"harness|mmlu_high_school_us_history|5": 1,
|
430 |
"harness|mmlu_high_school_european_history|5": 1,
|
431 |
+
"harness|truthfulqa_mc|0": 0
|
|
|
432 |
},
|
433 |
"config_general": {
|
434 |
"model_name": "01-ai/Yi-1.5-9B-32K",
|
eval-results/BioMistral/BioMistral-7B/BioMistral_BioMistral-7B_result_2024-05-30 01_33_58.json
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
"daily": {
|
4 |
-
"daily":
|
5 |
},
|
6 |
"quarterly": {
|
7 |
-
"quarterly":
|
8 |
},
|
9 |
"harness|arc_challenge|25": {
|
10 |
"acc": 0.257679180887372,
|
@@ -365,12 +365,6 @@
|
|
365 |
"mc1_stderr": 0.016150201321323002,
|
366 |
"mc2": 0.4721418472000992,
|
367 |
"mc2_stderr": 0.01626625866283201
|
368 |
-
},
|
369 |
-
"harness|commongen_v2|2": {
|
370 |
-
"acc": 0.27863046044864226,
|
371 |
-
"acc_stderr": 0.01541373949434568,
|
372 |
-
"acc_norm": 0.3825265643447462,
|
373 |
-
"acc_norm_stderr": 0.016709165387228803
|
374 |
}
|
375 |
},
|
376 |
"versions": {
|
@@ -434,8 +428,7 @@
|
|
434 |
"harness|mmlu_professional_law|5": 1,
|
435 |
"harness|mmlu_high_school_us_history|5": 1,
|
436 |
"harness|mmlu_high_school_european_history|5": 1,
|
437 |
-
"harness|truthfulqa_mc|0": 0
|
438 |
-
"harness|commongen_v2|2": 1
|
439 |
},
|
440 |
"config_general": {
|
441 |
"model_name": "BioMistral/BioMistral-7B",
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
"daily": {
|
4 |
+
"daily": 9
|
5 |
},
|
6 |
"quarterly": {
|
7 |
+
"quarterly": 9
|
8 |
},
|
9 |
"harness|arc_challenge|25": {
|
10 |
"acc": 0.257679180887372,
|
|
|
365 |
"mc1_stderr": 0.016150201321323002,
|
366 |
"mc2": 0.4721418472000992,
|
367 |
"mc2_stderr": 0.01626625866283201
|
|
|
|
|
|
|
|
|
|
|
|
|
368 |
}
|
369 |
},
|
370 |
"versions": {
|
|
|
428 |
"harness|mmlu_professional_law|5": 1,
|
429 |
"harness|mmlu_high_school_us_history|5": 1,
|
430 |
"harness|mmlu_high_school_european_history|5": 1,
|
431 |
+
"harness|truthfulqa_mc|0": 0
|
|
|
432 |
},
|
433 |
"config_general": {
|
434 |
"model_name": "BioMistral/BioMistral-7B",
|
eval-results/EleutherAI/polyglot-ko-1.3b/EleutherAI_polyglot-ko-1.3b_result_2023-09-24 15_21_38.json
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
"daily": {
|
4 |
-
"daily":
|
5 |
},
|
6 |
"quarterly": {
|
7 |
-
"quarterly":
|
8 |
},
|
9 |
"harness|arc_challenge|25": {
|
10 |
"acc": 0.2235494880546075,
|
@@ -365,12 +365,6 @@
|
|
365 |
"mc1_stderr": 0.015176985027707682,
|
366 |
"mc2": 0.4116568832959107,
|
367 |
"mc2_stderr": 0.015044504977529799
|
368 |
-
},
|
369 |
-
"harness|commongen_v2|2": {
|
370 |
-
"acc": 0.27744982290436837,
|
371 |
-
"acc_stderr": 0.015393630236605975,
|
372 |
-
"acc_norm": 0.3400236127508855,
|
373 |
-
"acc_norm_stderr": 0.016286717220737674
|
374 |
}
|
375 |
},
|
376 |
"versions": {
|
@@ -434,8 +428,7 @@
|
|
434 |
"harness|mmlu_professional_law|5": 1,
|
435 |
"harness|mmlu_high_school_us_history|5": 1,
|
436 |
"harness|mmlu_high_school_european_history|5": 1,
|
437 |
-
"harness|truthfulqa_mc|0": 0
|
438 |
-
"harness|commongen_v2|2": 1
|
439 |
},
|
440 |
"config_general": {
|
441 |
"model_name": "EleutherAI/polyglot-ko-1.3b",
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
"daily": {
|
4 |
+
"daily": 10
|
5 |
},
|
6 |
"quarterly": {
|
7 |
+
"quarterly": 10
|
8 |
},
|
9 |
"harness|arc_challenge|25": {
|
10 |
"acc": 0.2235494880546075,
|
|
|
365 |
"mc1_stderr": 0.015176985027707682,
|
366 |
"mc2": 0.4116568832959107,
|
367 |
"mc2_stderr": 0.015044504977529799
|
|
|
|
|
|
|
|
|
|
|
|
|
368 |
}
|
369 |
},
|
370 |
"versions": {
|
|
|
428 |
"harness|mmlu_professional_law|5": 1,
|
429 |
"harness|mmlu_high_school_us_history|5": 1,
|
430 |
"harness|mmlu_high_school_european_history|5": 1,
|
431 |
+
"harness|truthfulqa_mc|0": 0
|
|
|
432 |
},
|
433 |
"config_general": {
|
434 |
"model_name": "EleutherAI/polyglot-ko-1.3b",
|
eval-results/HuggingFaceH4/zephyr-7b-beta/result.json
CHANGED
@@ -365,12 +365,6 @@
|
|
365 |
"mc1_stderr": 0.01648214881024147,
|
366 |
"mc2": 0.5171680571717291,
|
367 |
"mc2_stderr": 0.01606077987901482
|
368 |
-
},
|
369 |
-
"harness|commongen_v2|2": {
|
370 |
-
"acc": 0.39787485242030696,
|
371 |
-
"acc_stderr": 0.01682795905473339,
|
372 |
-
"acc_norm": 0.4014167650531287,
|
373 |
-
"acc_norm_stderr": 0.01685290785872906
|
374 |
}
|
375 |
},
|
376 |
"versions": {
|
@@ -434,8 +428,7 @@
|
|
434 |
"harness|mmlu_professional_law|5": 1,
|
435 |
"harness|mmlu_high_school_us_history|5": 1,
|
436 |
"harness|mmlu_high_school_european_history|5": 1,
|
437 |
-
"harness|truthfulqa_mc|0": 0
|
438 |
-
"harness|commongen_v2|2": 1
|
439 |
},
|
440 |
"config_general": {
|
441 |
"model_name": "HuggingFaceH4/zephyr-7b-beta",
|
|
|
365 |
"mc1_stderr": 0.01648214881024147,
|
366 |
"mc2": 0.5171680571717291,
|
367 |
"mc2_stderr": 0.01606077987901482
|
|
|
|
|
|
|
|
|
|
|
|
|
368 |
}
|
369 |
},
|
370 |
"versions": {
|
|
|
428 |
"harness|mmlu_professional_law|5": 1,
|
429 |
"harness|mmlu_high_school_us_history|5": 1,
|
430 |
"harness|mmlu_high_school_european_history|5": 1,
|
431 |
+
"harness|truthfulqa_mc|0": 0
|
|
|
432 |
},
|
433 |
"config_general": {
|
434 |
"model_name": "HuggingFaceH4/zephyr-7b-beta",
|
eval-results/nlpai-lab/KULLM3/result.json
CHANGED
@@ -365,12 +365,6 @@
|
|
365 |
"mc1_stderr": 0.016542412809494877,
|
366 |
"mc2": 0.49995145184296846,
|
367 |
"mc2_stderr": 0.015887726098900913
|
368 |
-
},
|
369 |
-
"harness|commongen_v2|2": {
|
370 |
-
"acc": 0.564344746162928,
|
371 |
-
"acc_stderr": 0.017047415229476316,
|
372 |
-
"acc_norm": 0.6068476977567887,
|
373 |
-
"acc_norm_stderr": 0.016793262801287068
|
374 |
}
|
375 |
},
|
376 |
"versions": {
|
@@ -434,8 +428,7 @@
|
|
434 |
"harness|mmlu_professional_law|5": 1,
|
435 |
"harness|mmlu_high_school_us_history|5": 1,
|
436 |
"harness|mmlu_high_school_european_history|5": 1,
|
437 |
-
"harness|truthfulqa_mc|0": 0
|
438 |
-
"harness|commongen_v2|2": 1
|
439 |
},
|
440 |
"config_general": {
|
441 |
"model_name": "nlpai-lab/KULLM3",
|
|
|
365 |
"mc1_stderr": 0.016542412809494877,
|
366 |
"mc2": 0.49995145184296846,
|
367 |
"mc2_stderr": 0.015887726098900913
|
|
|
|
|
|
|
|
|
|
|
|
|
368 |
}
|
369 |
},
|
370 |
"versions": {
|
|
|
428 |
"harness|mmlu_professional_law|5": 1,
|
429 |
"harness|mmlu_high_school_us_history|5": 1,
|
430 |
"harness|mmlu_high_school_european_history|5": 1,
|
431 |
+
"harness|truthfulqa_mc|0": 0
|
|
|
432 |
},
|
433 |
"config_general": {
|
434 |
"model_name": "nlpai-lab/KULLM3",
|
eval-results/x2bee/POLAR-14B-DPO-v1.3/result.json
CHANGED
@@ -365,12 +365,6 @@
|
|
365 |
"mc1_stderr": 0.01687480500145318,
|
366 |
"mc2": 0.7522925779273922,
|
367 |
"mc2_stderr": 0.014568927682929578
|
368 |
-
},
|
369 |
-
"harness|commongen_v2|2": {
|
370 |
-
"acc": 0.45218417945690675,
|
371 |
-
"acc_stderr": 0.017111567130916785,
|
372 |
-
"acc_norm": 0.45454545454545453,
|
373 |
-
"acc_norm_stderr": 0.017119172208061504
|
374 |
}
|
375 |
},
|
376 |
"versions": {
|
@@ -434,8 +428,7 @@
|
|
434 |
"harness|mmlu_professional_law|5": 1,
|
435 |
"harness|mmlu_high_school_us_history|5": 1,
|
436 |
"harness|mmlu_high_school_european_history|5": 1,
|
437 |
-
"harness|truthfulqa_mc|0": 0
|
438 |
-
"harness|commongen_v2|2": 1
|
439 |
},
|
440 |
"config_general": {
|
441 |
"model_name": "x2bee/POLAR-14B-DPO-v1.3",
|
|
|
365 |
"mc1_stderr": 0.01687480500145318,
|
366 |
"mc2": 0.7522925779273922,
|
367 |
"mc2_stderr": 0.014568927682929578
|
|
|
|
|
|
|
|
|
|
|
|
|
368 |
}
|
369 |
},
|
370 |
"versions": {
|
|
|
428 |
"harness|mmlu_professional_law|5": 1,
|
429 |
"harness|mmlu_high_school_us_history|5": 1,
|
430 |
"harness|mmlu_high_school_european_history|5": 1,
|
431 |
+
"harness|truthfulqa_mc|0": 0
|
|
|
432 |
},
|
433 |
"config_general": {
|
434 |
"model_name": "x2bee/POLAR-14B-DPO-v1.3",
|
eval-results/x2bee/POLAR-14B-DPO-v1.4/result.json
CHANGED
@@ -365,12 +365,6 @@
|
|
365 |
"mc1_stderr": 0.01746379386716811,
|
366 |
"mc2": NaN,
|
367 |
"mc2_stderr": NaN
|
368 |
-
},
|
369 |
-
"harness|commongen_v2|2": {
|
370 |
-
"acc": 0.44037780401416765,
|
371 |
-
"acc_stderr": 0.01706769977431298,
|
372 |
-
"acc_norm": 0.44510035419126326,
|
373 |
-
"acc_norm_stderr": 0.01708641743100547
|
374 |
}
|
375 |
},
|
376 |
"versions": {
|
@@ -434,8 +428,7 @@
|
|
434 |
"harness|mmlu_professional_law|5": 1,
|
435 |
"harness|mmlu_high_school_us_history|5": 1,
|
436 |
"harness|mmlu_high_school_european_history|5": 1,
|
437 |
-
"harness|truthfulqa_mc|0": 0
|
438 |
-
"harness|commongen_v2|2": 1
|
439 |
},
|
440 |
"config_general": {
|
441 |
"model_name": "x2bee/POLAR-14B-DPO-v1.4",
|
|
|
365 |
"mc1_stderr": 0.01746379386716811,
|
366 |
"mc2": NaN,
|
367 |
"mc2_stderr": NaN
|
|
|
|
|
|
|
|
|
|
|
|
|
368 |
}
|
369 |
},
|
370 |
"versions": {
|
|
|
428 |
"harness|mmlu_professional_law|5": 1,
|
429 |
"harness|mmlu_high_school_us_history|5": 1,
|
430 |
"harness|mmlu_high_school_european_history|5": 1,
|
431 |
+
"harness|truthfulqa_mc|0": 0
|
|
|
432 |
},
|
433 |
"config_general": {
|
434 |
"model_name": "x2bee/POLAR-14B-DPO-v1.4",
|
eval-results/x2bee/POLAR-14B-HES-DPO-v1.5/result.json
CHANGED
@@ -365,12 +365,6 @@
|
|
365 |
"mc1_stderr": 0.0165424128094949,
|
366 |
"mc2": 0.7515104740134964,
|
367 |
"mc2_stderr": 0.014200593490054807
|
368 |
-
},
|
369 |
-
"harness|commongen_v2|2": {
|
370 |
-
"acc": 0.5147579693034239,
|
371 |
-
"acc_stderr": 0.01718286443499856,
|
372 |
-
"acc_norm": 0.526564344746163,
|
373 |
-
"acc_norm_stderr": 0.017166075717577747
|
374 |
}
|
375 |
},
|
376 |
"versions": {
|
@@ -434,8 +428,7 @@
|
|
434 |
"harness|mmlu_professional_law|5": 1,
|
435 |
"harness|mmlu_high_school_us_history|5": 1,
|
436 |
"harness|mmlu_high_school_european_history|5": 1,
|
437 |
-
"harness|truthfulqa_mc|0": 0
|
438 |
-
"harness|commongen_v2|2": 1
|
439 |
},
|
440 |
"config_general": {
|
441 |
"model_name": "x2bee/POLAR-14B-HES-DPO-v1.5",
|
|
|
365 |
"mc1_stderr": 0.0165424128094949,
|
366 |
"mc2": 0.7515104740134964,
|
367 |
"mc2_stderr": 0.014200593490054807
|
|
|
|
|
|
|
|
|
|
|
|
|
368 |
}
|
369 |
},
|
370 |
"versions": {
|
|
|
428 |
"harness|mmlu_professional_law|5": 1,
|
429 |
"harness|mmlu_high_school_us_history|5": 1,
|
430 |
"harness|mmlu_high_school_european_history|5": 1,
|
431 |
+
"harness|truthfulqa_mc|0": 0
|
|
|
432 |
},
|
433 |
"config_general": {
|
434 |
"model_name": "x2bee/POLAR-14B-HES-DPO-v1.5",
|
eval-results/x2bee/POLAR-14B-SON-SFT-v0.1/result.json
CHANGED
@@ -365,12 +365,6 @@
|
|
365 |
"mc1_stderr": 0.017106588140700332,
|
366 |
"mc2": 0.7254831072808595,
|
367 |
"mc2_stderr": 0.014162522228042162
|
368 |
-
},
|
369 |
-
"harness|commongen_v2|2": {
|
370 |
-
"acc": 0.5926800472255017,
|
371 |
-
"acc_stderr": 0.01689245669519127,
|
372 |
-
"acc_norm": 0.6269185360094451,
|
373 |
-
"acc_norm_stderr": 0.016627318275137453
|
374 |
}
|
375 |
},
|
376 |
"versions": {
|
@@ -434,8 +428,7 @@
|
|
434 |
"harness|mmlu_professional_law|5": 1,
|
435 |
"harness|mmlu_high_school_us_history|5": 1,
|
436 |
"harness|mmlu_high_school_european_history|5": 1,
|
437 |
-
"harness|truthfulqa_mc|0": 0
|
438 |
-
"harness|commongen_v2|2": 1
|
439 |
},
|
440 |
"config_general": {
|
441 |
"model_name": "x2bee/POLAR-14B-SON-SFT-v0.1",
|
|
|
365 |
"mc1_stderr": 0.017106588140700332,
|
366 |
"mc2": 0.7254831072808595,
|
367 |
"mc2_stderr": 0.014162522228042162
|
|
|
|
|
|
|
|
|
|
|
|
|
368 |
}
|
369 |
},
|
370 |
"versions": {
|
|
|
428 |
"harness|mmlu_professional_law|5": 1,
|
429 |
"harness|mmlu_high_school_us_history|5": 1,
|
430 |
"harness|mmlu_high_school_european_history|5": 1,
|
431 |
+
"harness|truthfulqa_mc|0": 0
|
|
|
432 |
},
|
433 |
"config_general": {
|
434 |
"model_name": "x2bee/POLAR-14B-SON-SFT-v0.1",
|
eval-results/x2bee/POLAR-14B-v0.2/result.json
CHANGED
@@ -365,12 +365,6 @@
|
|
365 |
"mc1_stderr": 0.01563813566777552,
|
366 |
"mc2": 0.8107575910195236,
|
367 |
"mc2_stderr": 0.013335029489665237
|
368 |
-
},
|
369 |
-
"harness|commongen_v2|2": {
|
370 |
-
"acc": 0.525383707201889,
|
371 |
-
"acc_stderr": 0.017168187201429253,
|
372 |
-
"acc_norm": 0.5442739079102715,
|
373 |
-
"acc_norm_stderr": 0.017122829143292655
|
374 |
}
|
375 |
},
|
376 |
"versions": {
|
@@ -434,8 +428,7 @@
|
|
434 |
"harness|mmlu_professional_law|5": 1,
|
435 |
"harness|mmlu_high_school_us_history|5": 1,
|
436 |
"harness|mmlu_high_school_european_history|5": 1,
|
437 |
-
"harness|truthfulqa_mc|0": 0
|
438 |
-
"harness|commongen_v2|2": 1
|
439 |
},
|
440 |
"config_general": {
|
441 |
"model_name": "x2bee/POLAR-14B-v0.2",
|
|
|
365 |
"mc1_stderr": 0.01563813566777552,
|
366 |
"mc2": 0.8107575910195236,
|
367 |
"mc2_stderr": 0.013335029489665237
|
|
|
|
|
|
|
|
|
|
|
|
|
368 |
}
|
369 |
},
|
370 |
"versions": {
|
|
|
428 |
"harness|mmlu_professional_law|5": 1,
|
429 |
"harness|mmlu_high_school_us_history|5": 1,
|
430 |
"harness|mmlu_high_school_european_history|5": 1,
|
431 |
+
"harness|truthfulqa_mc|0": 0
|
|
|
432 |
},
|
433 |
"config_general": {
|
434 |
"model_name": "x2bee/POLAR-14B-v0.2",
|
eval-results/x2bee/POLAR-14B-v0.5/result.json
CHANGED
@@ -365,12 +365,6 @@
|
|
365 |
"mc1_stderr": 0.014421468452506978,
|
366 |
"mc2": 0.8572574997405501,
|
367 |
"mc2_stderr": 0.01200311225898601
|
368 |
-
},
|
369 |
-
"harness|commongen_v2|2": {
|
370 |
-
"acc": 0.5159386068476978,
|
371 |
-
"acc_stderr": 0.017181617837190195,
|
372 |
-
"acc_norm": 0.5301062573789846,
|
373 |
-
"acc_norm_stderr": 0.01715916359017022
|
374 |
}
|
375 |
},
|
376 |
"versions": {
|
@@ -434,8 +428,7 @@
|
|
434 |
"harness|mmlu_professional_law|5": 1,
|
435 |
"harness|mmlu_high_school_us_history|5": 1,
|
436 |
"harness|mmlu_high_school_european_history|5": 1,
|
437 |
-
"harness|truthfulqa_mc|0": 0
|
438 |
-
"harness|commongen_v2|2": 1
|
439 |
},
|
440 |
"config_general": {
|
441 |
"model_name": "x2bee/POLAR-14B-v0.5",
|
|
|
365 |
"mc1_stderr": 0.014421468452506978,
|
366 |
"mc2": 0.8572574997405501,
|
367 |
"mc2_stderr": 0.01200311225898601
|
|
|
|
|
|
|
|
|
|
|
|
|
368 |
}
|
369 |
},
|
370 |
"versions": {
|
|
|
428 |
"harness|mmlu_professional_law|5": 1,
|
429 |
"harness|mmlu_high_school_us_history|5": 1,
|
430 |
"harness|mmlu_high_school_european_history|5": 1,
|
431 |
+
"harness|truthfulqa_mc|0": 0
|
|
|
432 |
},
|
433 |
"config_general": {
|
434 |
"model_name": "x2bee/POLAR-14B-v0.5",
|
src/__pycache__/populate.cpython-310.pyc
CHANGED
Binary files a/src/__pycache__/populate.cpython-310.pyc and b/src/__pycache__/populate.cpython-310.pyc differ
|
|
src/display/__pycache__/utils.cpython-310.pyc
CHANGED
Binary files a/src/display/__pycache__/utils.cpython-310.pyc and b/src/display/__pycache__/utils.cpython-310.pyc differ
|
|
src/display/utils.py
CHANGED
@@ -3,8 +3,11 @@ from enum import Enum
|
|
3 |
|
4 |
import pandas as pd
|
5 |
|
|
|
6 |
def fields(raw_class):
|
7 |
-
return [
|
|
|
|
|
8 |
|
9 |
|
10 |
@dataclass
|
@@ -13,6 +16,7 @@ class Task:
|
|
13 |
metric: str
|
14 |
col_name: str
|
15 |
|
|
|
16 |
class Tasks(Enum):
|
17 |
arc = Task("arc_challenge", "acc_norm", "ARC")
|
18 |
hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
|
@@ -20,12 +24,13 @@ class Tasks(Enum):
|
|
20 |
truthfulqa = Task("truthfulqa_mc", "mc2", "TruthfulQA")
|
21 |
# winogrande = Task("winogrande", "acc_norm", "Winogrande")
|
22 |
# gsm8k = Task("gsm8k", "acc_norm", "GSM8k")
|
23 |
-
commongen_v2 = Task("commongen_v2", "acc_norm", "CommonGen V2")
|
24 |
# eqBench = Task("eq_bench", "acc_norm", "EQ Bench")
|
25 |
# instFollow = Task("inst_follow", "acc_norm", "InstFollow")
|
26 |
# harmlessness = Task("harmlessness", "acc_norm", "Harmlessness")
|
27 |
# helpfulness = Task("helpfulness", "acc_norm", "Helpfulness")
|
28 |
|
|
|
29 |
class Ranks(Enum):
|
30 |
daily = Task("daily", "daily", "Daily Rank")
|
31 |
quarterly = Task("quarterly", "quarterly", "Quarterly Rank")
|
@@ -43,31 +48,84 @@ class ColumnContent:
|
|
43 |
never_hidden: bool = False
|
44 |
dummy: bool = False
|
45 |
|
|
|
46 |
auto_eval_column_dict = []
|
47 |
# Init
|
48 |
-
auto_eval_column_dict.append(
|
49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
# Ranks
|
51 |
-
auto_eval_column_dict.append(
|
52 |
-
|
|
|
|
|
|
|
|
|
53 |
# Scores
|
54 |
-
auto_eval_column_dict.append(
|
|
|
|
|
55 |
for task in Tasks:
|
56 |
-
auto_eval_column_dict.append(
|
|
|
|
|
57 |
# Model information
|
58 |
-
auto_eval_column_dict.append(
|
59 |
-
|
60 |
-
|
61 |
-
auto_eval_column_dict.append(
|
62 |
-
|
63 |
-
|
64 |
-
auto_eval_column_dict.append(
|
65 |
-
|
66 |
-
|
67 |
-
auto_eval_column_dict.append(
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
# Dummy column for the search bar (hidden by the custom CSS)
|
70 |
-
auto_eval_column_dict.append(
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
# We use make dataclass to dynamically fill the scores from Tasks
|
73 |
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
@@ -82,15 +140,17 @@ class EvalQueueColumn: # Queue column
|
|
82 |
weight_type = ColumnContent("weight_type", "str", "Original")
|
83 |
status = ColumnContent("status", "str", True)
|
84 |
|
|
|
85 |
# Define the human baselines
|
86 |
human_baseline_row = {
|
87 |
AutoEvalColumn.model.name: "<p>Human performance</p>",
|
88 |
}
|
89 |
|
|
|
90 |
@dataclass
|
91 |
class ModelDetails:
|
92 |
name: str
|
93 |
-
symbol: str = ""
|
94 |
|
95 |
|
96 |
class ModelType(Enum):
|
@@ -115,11 +175,13 @@ class ModelType(Enum):
|
|
115 |
return ModelType.IFT
|
116 |
return ModelType.Unknown
|
117 |
|
|
|
118 |
class WeightType(Enum):
|
119 |
Adapter = ModelDetails("Adapter")
|
120 |
Original = ModelDetails("Original")
|
121 |
Delta = ModelDetails("Delta")
|
122 |
|
|
|
123 |
class Precision(Enum):
|
124 |
float16 = ModelDetails("float16")
|
125 |
# bfloat16 = ModelDetails("bfloat16")
|
@@ -138,15 +200,17 @@ class Precision(Enum):
|
|
138 |
if precision in ["GPTQ", "None"]:
|
139 |
return Precision.qt_GPTQ
|
140 |
return Precision.Unknown
|
141 |
-
|
142 |
-
|
143 |
|
144 |
|
145 |
# Column selection
|
146 |
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
147 |
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
|
148 |
-
COLS_LITE = [
|
149 |
-
|
|
|
|
|
|
|
|
|
150 |
|
151 |
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
152 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
|
|
3 |
|
4 |
import pandas as pd
|
5 |
|
6 |
+
|
7 |
def fields(raw_class):
|
8 |
+
return [
|
9 |
+
v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"
|
10 |
+
]
|
11 |
|
12 |
|
13 |
@dataclass
|
|
|
16 |
metric: str
|
17 |
col_name: str
|
18 |
|
19 |
+
|
20 |
class Tasks(Enum):
|
21 |
arc = Task("arc_challenge", "acc_norm", "ARC")
|
22 |
hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
|
|
|
24 |
truthfulqa = Task("truthfulqa_mc", "mc2", "TruthfulQA")
|
25 |
# winogrande = Task("winogrande", "acc_norm", "Winogrande")
|
26 |
# gsm8k = Task("gsm8k", "acc_norm", "GSM8k")
|
27 |
+
# commongen_v2 = Task("commongen_v2", "acc_norm", "CommonGen V2")
|
28 |
# eqBench = Task("eq_bench", "acc_norm", "EQ Bench")
|
29 |
# instFollow = Task("inst_follow", "acc_norm", "InstFollow")
|
30 |
# harmlessness = Task("harmlessness", "acc_norm", "Harmlessness")
|
31 |
# helpfulness = Task("helpfulness", "acc_norm", "Helpfulness")
|
32 |
|
33 |
+
|
34 |
class Ranks(Enum):
|
35 |
daily = Task("daily", "daily", "Daily Rank")
|
36 |
quarterly = Task("quarterly", "quarterly", "Quarterly Rank")
|
|
|
48 |
never_hidden: bool = False
|
49 |
dummy: bool = False
|
50 |
|
51 |
+
|
52 |
auto_eval_column_dict = []
|
53 |
# Init
|
54 |
+
auto_eval_column_dict.append(
|
55 |
+
[
|
56 |
+
"model_type_symbol",
|
57 |
+
ColumnContent,
|
58 |
+
ColumnContent("T", "str", True, never_hidden=True),
|
59 |
+
]
|
60 |
+
)
|
61 |
+
auto_eval_column_dict.append(
|
62 |
+
[
|
63 |
+
"model",
|
64 |
+
ColumnContent,
|
65 |
+
ColumnContent("Model", "markdown", True, never_hidden=True),
|
66 |
+
]
|
67 |
+
)
|
68 |
# Ranks
|
69 |
+
auto_eval_column_dict.append(
|
70 |
+
["daily", ColumnContent, ColumnContent("Daily Rank", "number", True)]
|
71 |
+
)
|
72 |
+
auto_eval_column_dict.append(
|
73 |
+
["quarterly", ColumnContent, ColumnContent("Quarterly Rank", "number", True)]
|
74 |
+
)
|
75 |
# Scores
|
76 |
+
auto_eval_column_dict.append(
|
77 |
+
["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)]
|
78 |
+
)
|
79 |
for task in Tasks:
|
80 |
+
auto_eval_column_dict.append(
|
81 |
+
[task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]
|
82 |
+
)
|
83 |
# Model information
|
84 |
+
auto_eval_column_dict.append(
|
85 |
+
["model_type", ColumnContent, ColumnContent("Type", "str", False)]
|
86 |
+
)
|
87 |
+
auto_eval_column_dict.append(
|
88 |
+
["architecture", ColumnContent, ColumnContent("Architecture", "str", False)]
|
89 |
+
)
|
90 |
+
auto_eval_column_dict.append(
|
91 |
+
["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)]
|
92 |
+
)
|
93 |
+
auto_eval_column_dict.append(
|
94 |
+
["precision", ColumnContent, ColumnContent("Precision", "str", False)]
|
95 |
+
)
|
96 |
+
auto_eval_column_dict.append(
|
97 |
+
["merged", ColumnContent, ColumnContent("Merged", "bool", False)]
|
98 |
+
)
|
99 |
+
auto_eval_column_dict.append(
|
100 |
+
["license", ColumnContent, ColumnContent("Hub License", "str", False)]
|
101 |
+
)
|
102 |
+
auto_eval_column_dict.append(
|
103 |
+
["params", ColumnContent, ColumnContent("#Params (B)", "number", False)]
|
104 |
+
)
|
105 |
+
auto_eval_column_dict.append(
|
106 |
+
["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)]
|
107 |
+
)
|
108 |
+
auto_eval_column_dict.append(
|
109 |
+
[
|
110 |
+
"still_on_hub",
|
111 |
+
ColumnContent,
|
112 |
+
ColumnContent("Available on the hub", "bool", False),
|
113 |
+
]
|
114 |
+
)
|
115 |
+
auto_eval_column_dict.append(
|
116 |
+
["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)]
|
117 |
+
)
|
118 |
+
auto_eval_column_dict.append(
|
119 |
+
["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, False)]
|
120 |
+
)
|
121 |
# Dummy column for the search bar (hidden by the custom CSS)
|
122 |
+
auto_eval_column_dict.append(
|
123 |
+
[
|
124 |
+
"dummy",
|
125 |
+
ColumnContent,
|
126 |
+
ColumnContent("model_name_for_query", "str", False, dummy=True),
|
127 |
+
]
|
128 |
+
)
|
129 |
|
130 |
# We use make dataclass to dynamically fill the scores from Tasks
|
131 |
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
|
|
140 |
weight_type = ColumnContent("weight_type", "str", "Original")
|
141 |
status = ColumnContent("status", "str", True)
|
142 |
|
143 |
+
|
144 |
# Define the human baselines
|
145 |
human_baseline_row = {
|
146 |
AutoEvalColumn.model.name: "<p>Human performance</p>",
|
147 |
}
|
148 |
|
149 |
+
|
150 |
@dataclass
|
151 |
class ModelDetails:
|
152 |
name: str
|
153 |
+
symbol: str = "" # emoji, only for the model type
|
154 |
|
155 |
|
156 |
class ModelType(Enum):
|
|
|
175 |
return ModelType.IFT
|
176 |
return ModelType.Unknown
|
177 |
|
178 |
+
|
179 |
class WeightType(Enum):
|
180 |
Adapter = ModelDetails("Adapter")
|
181 |
Original = ModelDetails("Original")
|
182 |
Delta = ModelDetails("Delta")
|
183 |
|
184 |
+
|
185 |
class Precision(Enum):
|
186 |
float16 = ModelDetails("float16")
|
187 |
# bfloat16 = ModelDetails("bfloat16")
|
|
|
200 |
if precision in ["GPTQ", "None"]:
|
201 |
return Precision.qt_GPTQ
|
202 |
return Precision.Unknown
|
|
|
|
|
203 |
|
204 |
|
205 |
# Column selection
|
206 |
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
207 |
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
|
208 |
+
COLS_LITE = [
|
209 |
+
c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden
|
210 |
+
]
|
211 |
+
TYPES_LITE = [
|
212 |
+
c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden
|
213 |
+
]
|
214 |
|
215 |
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
216 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
src/populate.py
CHANGED
@@ -9,7 +9,9 @@ from src.leaderboard.filter_models import filter_models
|
|
9 |
from src.leaderboard.read_evals import get_raw_eval_results
|
10 |
|
11 |
|
12 |
-
def get_leaderboard_df(
|
|
|
|
|
13 |
raw_data = get_raw_eval_results(results_path, requests_path)
|
14 |
all_data_json = [v.to_dict() for v in raw_data]
|
15 |
# all_data_json.append(baseline_row)
|
@@ -49,7 +51,9 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
|
49 |
all_evals.append(data)
|
50 |
elif ".md" not in entry:
|
51 |
# this is a folder
|
52 |
-
sub_entries = [
|
|
|
|
|
53 |
for sub_entry in sub_entries:
|
54 |
file_path = os.path.join(save_path, entry, sub_entry)
|
55 |
with open(file_path) as fp:
|
@@ -61,7 +65,11 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
|
61 |
|
62 |
pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
|
63 |
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
|
64 |
-
finished_list = [
|
|
|
|
|
|
|
|
|
65 |
failed_list = [e for e in all_evals if e["status"] == "FAILED"]
|
66 |
df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
|
67 |
df_running = pd.DataFrame.from_records(running_list, columns=cols)
|
|
|
9 |
from src.leaderboard.read_evals import get_raw_eval_results
|
10 |
|
11 |
|
12 |
+
def get_leaderboard_df(
|
13 |
+
results_path: str, requests_path: str, cols: list, benchmark_cols: list
|
14 |
+
) -> pd.DataFrame:
|
15 |
raw_data = get_raw_eval_results(results_path, requests_path)
|
16 |
all_data_json = [v.to_dict() for v in raw_data]
|
17 |
# all_data_json.append(baseline_row)
|
|
|
51 |
all_evals.append(data)
|
52 |
elif ".md" not in entry:
|
53 |
# this is a folder
|
54 |
+
sub_entries = [
|
55 |
+
e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")
|
56 |
+
]
|
57 |
for sub_entry in sub_entries:
|
58 |
file_path = os.path.join(save_path, entry, sub_entry)
|
59 |
with open(file_path) as fp:
|
|
|
65 |
|
66 |
pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
|
67 |
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
|
68 |
+
finished_list = [
|
69 |
+
e
|
70 |
+
for e in all_evals
|
71 |
+
if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"
|
72 |
+
]
|
73 |
failed_list = [e for e in all_evals if e["status"] == "FAILED"]
|
74 |
df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
|
75 |
df_running = pd.DataFrame.from_records(running_list, columns=cols)
|