per committed on
Commit
107f774
2 Parent(s): 0f78488 24dfb58

Merge branch 'per/benchbench' into 'ibm/benchbench'

Browse files
app.py CHANGED
@@ -400,7 +400,7 @@ st.dataframe(
400
  column_order=cols_used,
401
  hide_index=True,
402
  use_container_width=True,
403
- height=300,
404
  column_config={col: {"alignment": "center"} for col in cols_used},
405
  )
406
 
 
400
  column_order=cols_used,
401
  hide_index=True,
402
  use_container_width=True,
403
+ height=500,
404
  column_config={col: {"alignment": "center"} for col in cols_used},
405
  )
406
 
cache/aggregate_scoress_cache_5e66a88dab42480065db47711c55c458.csv CHANGED
@@ -1,122 +1,138 @@
1
  model,score
2
- gpt_4o_2024_05_13,0.9847612958226769
3
- claude_3_5_sonnet_20240620,0.982905982905983
4
- gpt_4o_2024_08_06,0.9575873827791986
5
- gpt_4_turbo_2024_04_09,0.9428463693169576
6
- gpt_4_0125_preview,0.9171132221004344
7
- mistral_large_2407,0.8868286445012787
8
- llama3_1_405b_instruct,0.8672150411280846
9
- yi_large_preview,0.8641553641553642
10
- hermes_3_llama3_1_70b,0.8626160990712074
11
- smaug_qwen2_72b_instruct,0.8593911248710011
12
- claude_3_opus_20240229,0.8573567665639277
13
- llama3_1_70b_instruct,0.8528408270971201
14
- athene_70b,0.8493788819875776
15
- deepseek_coder_v2,0.8444160272804775
16
- qwen2_72b_instruct,0.8354710666091739
17
- yi_large,0.8346273291925466
18
- gpt_4_0613,0.8146763722211293
19
- llama3_70b_instruct,0.8127546753337573
20
- llama3_70b,0.8105600539811066
21
- gemma_2_27b_it,0.8045273029120115
22
- gpt_4o_mini_2024_07_18,0.8032033326150972
23
- gemma_2_9b_it_dpo,0.790057915057915
24
- llama3_instruct_8b_simpo,0.7884068278805121
25
- phi_3_5_moe_instruct,0.7808307533539731
26
- qwen1_5_110b_chat,0.776004448721167
27
- qwen1_5_32b,0.7658569500674763
28
- yi_1_5_34b_chat,0.7553884711779449
29
- llama_2_70b,0.7303193882141251
30
- mixtral_8x22b_instruct_v0_1,0.7256023690940907
31
- gemma_2_9b_it_simpo,0.7199248120300753
32
- qwen1_5_32b_chat,0.7149122807017544
33
- mixtral_8x22b_v0_1,0.7135490753911806
34
- yi_34b,0.7128879892037787
35
- internlm2_5_20b_chat,0.6842105263157895
36
- phi_3_small_128k_instruct,0.66937564499484
37
- phi_3_medium_4k_instruct,0.6675079642841117
38
- claude_3_sonnet_20240229,0.653911731916847
39
- gemma_2_9b_it,0.6422797189051059
40
- infinity_instruct_3m_0625_llama3_8b,0.6273115220483642
 
 
 
 
 
41
  mistral_v0_1_7b,0.6239316239316239
42
- phi_3_5_mini_instruct,0.6202270381836945
43
- mistral_medium,0.6122209165687427
44
- mistral_large_2402,0.6058211467418628
45
- claude_instant_1_2,0.6049896049896051
46
- claude_2_0,0.6020066889632107
47
- yi_1_5_9b_chat,0.5881787802840435
48
- qwen1_5_14b,0.5770917678812416
49
- command_r_plus,0.5761033510394125
50
- llama_65b,0.5736992052781527
51
- gpt_3_5_turbo_0613,0.5724018332713985
52
- qwen1_5_72b_chat,0.5668371367348349
53
- phi_3_mini_4k_instruct,0.5548245614035088
54
- deepseek_llm_67b_chat,0.5506756756756757
55
- claude_3_haiku_20240307,0.549424005945745
56
- yi_34b_chat,0.5455449728905107
57
- dbrx_instructruct,0.5344129554655871
58
  jurassic_2_jumbo_178b,0.532051282051282
59
- llama3_1_8b_instruct,0.5175232440678665
60
- claude_2_1,0.5110980545763154
61
- qwen2_7b_instruct,0.5034227726178191
62
- mistral_small_2402,0.49924585218702866
63
- mixtral_8x7b_v0_1,0.49324324324324326
64
- glm_4_9b_chat,0.46499582289055974
65
- qwen1_5_14b_chat,0.4621068436857911
66
- phi_3_small_8k_instruct,0.45481670929241264
67
- gpt_3_5_turbo_0301,0.4528985507246377
68
- snorkel_mistral_pairrm_dpo,0.4521151586368978
69
- gemma_7b,0.4471997300944669
70
- gpt_3_5_turbo_0125,0.4401920188365201
71
- llama3_8b,0.43302968960863697
72
- dbrx_instruct,0.4266409266409266
73
- llama3_8b_instruct,0.420135922511747
74
- phi_3_mini_128k_instruct,0.4153205904787544
75
- llama_2_13b,0.41490478332583597
76
- jurassic_2_grande_17b,0.39529914529914534
77
- openhermes_2_5_mistral_7b,0.3832617447168531
78
- mistral_7b_v0_3,0.3737553342816501
79
- mixtral_8x7b_instruct_v0_1,0.3713078251895724
80
- qwen1_5_7b,0.3508771929824561
81
- yi_1_5_6b_chat,0.3354636591478697
82
- falcon_40b,0.32812265707002547
83
- command_r,0.32386140074759
84
- internlm2_chat_20b,0.32252252252252256
85
- mistral_7b_v0_2,0.31970128022759603
86
- luminous_supreme_70b,0.30128205128205127
87
- starling_lm_7b_alpha,0.29823530624445954
88
- yi_6b,0.29234143049932526
89
- mistral_7b_instruct_v0_2,0.28609513981031004
90
- zephyr_7b_alpha,0.2838442157327606
91
- zephyr_7b_beta,0.2666234345800909
92
- gemma_1_1_7b_it,0.26226051061156724
93
- mistral_7b_instruct_v0_3,0.2537839697282422
94
- starling_lm_7b_beta,0.25234441602728047
95
- llama_2_7b,0.2391288049182786
 
 
 
 
 
 
 
 
 
 
 
 
96
  luminous_extended_30b,0.2329059829059829
97
- alpaca_7b,0.22072072072072071
98
- vicuna_33b_v1_3,0.2056404230317274
99
- phi_2,0.20087901666849037
100
- qwen2_1_5b_instruct,0.19711042311661506
101
- yi_6b_chat,0.1938854489164087
102
- qwen1_5_7b_chat,0.1916569245052217
103
- tulu_2_dpo_70b,0.17624223602484473
104
- qwen1_5_4b_chat,0.1674406604747162
105
- llama_2_70b_chat,0.15527950310559005
106
- gpt_neox_20b,0.14400584795321636
107
- vicuna_7b_v1_5,0.13619501854795973
108
- falcon_40b_instruct,0.13264580369843526
109
- gemma_7b_it,0.12136319058515854
110
- falcon_7b,0.11407257459889038
111
- gpt_j_6b,0.10160818713450293
112
  luminous_base_13b,0.08333333333333333
113
- llama_2_7b_chat,0.08304448781801049
114
- gemma_1_1_2b_it,0.07665903890160183
115
- olmo_7b,0.06545209176788123
116
- gemma_2b_it,0.05921052631578947
117
- qwen1_5_1_8b_chat,0.059167526659786716
118
- qwen2_0_5b_instruct,0.059081527347781215
119
- pythia_12b,0.054093567251461985
120
- pythia_6_9b,0.019736842105263157
121
- falcon_7b_instruct,0.013513513513513514
122
- qwen1_5_0_5b_chat,0.013157894736842105
 
1
  model,score
2
+ gpt_4o_2024_05_13,0.9767482517482518
3
+ chatgpt_4o_latest,0.9754079254079254
4
+ gpt_4o_2024_08_06,0.9652680652680652
5
+ claude_3_5_sonnet_20240620,0.9572649572649573
6
+ gemini_1_5_pro_exp_0801,0.9545454545454546
7
+ llama3_1_70b_instruct,0.9343074620852398
8
+ gpt_4_turbo_2024_04_09,0.9055819180819181
9
+ claude_3_opus_20240229,0.8824397824397824
10
+ yi_large_preview,0.8714202464202464
11
+ llama3_1_405b_instruct,0.8598484848484849
12
+ gpt_4_0125_preview,0.8492118992118992
13
+ hermes_3_llama3_1_70b,0.8451178451178452
14
+ zephyr_orpo_141b_a35b_v0_1,0.8414055080721747
15
+ mistral_large_2407,0.8375291375291375
16
+ gpt_4o_mini_2024_07_18,0.8348776223776224
17
+ claude_2_0,0.8333333333333334
18
+ smaug_qwen2_72b_instruct,0.8331088664421997
19
+ gemini_1_5_pro_api_0514,0.8294871794871794
20
+ llama3_70b_instruct,0.8172801478357034
21
+ llama3_70b,0.8129154795821463
22
+ gemma_2_9b_it_dpo,0.8100649350649352
23
+ llama3_instruct_8b_simpo,0.7992424242424242
24
+ yi_large,0.7889194139194139
25
+ gemma_2_27b_it,0.776345259678593
26
+ qwen2_72b_instruct,0.7701936951936953
27
+ qwen1_5_32b,0.7678062678062678
28
+ gpt_4_0613,0.7641802641802643
29
+ phi_3_5_moe_instruct,0.7600448933782267
30
+ qwen1_5_110b_chat,0.7419770353103686
31
+ mixtral_8x22b_v0_1,0.7382154882154882
32
+ gemma_2_9b_it_simpo,0.7328042328042329
33
+ gemini_pro,0.7298951048951049
34
+ llama_2_70b,0.7293447293447294
35
+ gemini_1_5_flash_api_0514,0.7263403263403263
36
+ yi_34b,0.7188983855650521
37
+ deepseek_coder_v2,0.713053613053613
38
+ nous_hermes_2_mixtral_8x7b_dpo,0.7094017094017094
39
+ gpt_3_5_turbo_0613,0.6851851851851851
40
+ claude_2_1,0.6693861693861693
41
+ yi_1_5_34b_chat,0.6669566544566544
42
+ mistral_medium,0.657051282051282
43
+ phi_3_small_128k_instruct,0.6561167227833894
44
+ infinity_instruct_3m_0625_llama3_8b,0.6537598204264872
45
+ claude_instant_1_2,0.6486013986013985
46
  mistral_v0_1_7b,0.6239316239316239
47
+ command_r_plus,0.6183108558108558
48
+ phi_3_5_mini_instruct,0.6103254769921437
49
+ llama3_1_8b_instruct,0.6080822469711359
50
+ gemma_2_9b_it,0.6048877048877048
51
+ yi_1_5_9b_chat,0.6041446208112875
52
+ claude_3_sonnet_20240229,0.5985236985236985
53
+ mixtral_8x22b_instruct_v0_1,0.585565052231719
54
+ qwen1_5_14b,0.5797720797720798
55
+ llama_65b,0.5759734093067427
56
+ deepseek_llm_67b_chat,0.5734841290396846
57
+ qwen1_5_32b_chat,0.571383349161127
58
+ wizardlm_70b,0.5620629370629371
59
+ yi_34b_chat,0.5558361391694725
60
+ qwen1_5_72b_chat,0.5463669663669664
61
+ dbrx_instructruct,0.5379867046533713
 
62
  jurassic_2_jumbo_178b,0.532051282051282
63
+ mixtral_8x7b_v0_1,0.5310044893378227
64
+ openchat_3_5,0.5270655270655271
65
+ mistral_large_2402,0.5105672105672105
66
+ solar_10_7b_instruct_v1_0,0.5030864197530864
67
+ qwen2_7b_instruct,0.4970445192667415
68
+ phi_3_medium_4k_instruct,0.48541540763762986
69
+ dolphin_2_2_1_mistral_7b,0.4810606060606061
70
+ mistral_small_2402,0.47785547785547783
71
+ glm_4_9b_chat,0.4769547325102881
72
+ dbrx_instruct,0.4724025974025974
73
+ qwen1_5_14b_chat,0.45340153673487005
74
+ claude_3_haiku_20240307,0.44965034965034967
75
+ gemma_7b,0.4477682811016144
76
+ llama3_8b_instruct,0.4449662477440255
77
+ llama3_8b,0.4368471035137702
78
+ wizardlm_13b,0.42773892773892774
79
+ starling_lm_7b_alpha,0.42734323289878845
80
+ jurassic_2_grande_17b,0.4230769230769231
81
+ mistral_7b_v0_3,0.4228395061728395
82
+ llama_2_13b,0.4146881924659702
83
+ llama_2_70b_chat,0.412732329398996
84
+ phi_3_mini_4k_instruct,0.4048663270885493
85
+ openhermes_2_5_mistral_7b,0.40103708020374684
86
+ llama_2_13b_chat,0.38675213675213677
87
+ guanaco_33b,0.38374125874125875
88
+ phi_3_mini_128k_instruct,0.3778468445135112
89
+ mistral_7b_v0_2,0.3773849607182941
90
+ internlm2_chat_20b,0.37196969696969695
91
+ starling_lm_7b_beta,0.3611888111888112
92
+ gpt_3_5_turbo_0125,0.3591242091242091
93
+ tulu_2_dpo_70b,0.3585164835164835
94
+ qwen1_5_7b,0.35185185185185186
95
+ falcon_40b,0.3502690724912947
96
+ yi_1_5_6b_chat,0.33974132863021755
97
+ zephyr_7b_alpha,0.33875830959164294
98
+ command_r,0.3296911421911422
99
+ luminous_supreme_70b,0.32905982905982906
100
+ yi_6b,0.295346628679962
101
+ zephyr_7b_beta,0.28937667271000606
102
+ mixtral_8x7b_instruct_v0_1,0.284326167659501
103
+ qwen_14b_chat,0.2837995337995338
104
+ gemma_2_2b_it,0.28113553113553114
105
+ phi_3_small_8k_instruct,0.27051282051282055
106
+ gemma_1_1_7b_it,0.263927019482575
107
+ llama_2_7b,0.25466919911364355
108
+ mistral_7b_instruct_v0_2,0.250669392336059
109
+ mistral_7b_instruct_v0_3,0.24534231200897869
110
+ qwen1_5_7b_chat,0.24214088380755047
111
+ alpaca_7b,0.23484848484848483
112
  luminous_extended_30b,0.2329059829059829
113
+ llama_13b,0.2222222222222222
114
+ phi_2,0.19812080923192033
115
+ qwen2_1_5b_instruct,0.1968574635241302
116
+ yi_6b_chat,0.19393939393939394
117
+ vicuna_7b,0.1885198135198135
118
+ gemma_7b_it,0.18790982679871568
119
+ olmo_7b_instruct,0.15669515669515668
120
+ vicuna_7b_v1_5,0.15454545454545454
121
+ vicuna_13b,0.14714452214452214
122
+ gpt_neox_20b,0.1419753086419753
123
+ falcon_40b_instruct,0.13187429854096522
124
+ qwen1_5_4b_chat,0.12542806987251431
125
+ falcon_7b,0.11380183602405824
126
+ llama_2_7b_chat,0.1122679789346456
127
+ gpt_j_6b,0.09876543209876543
128
  luminous_base_13b,0.08333333333333333
129
+ gemma_2b_it,0.08119658119658119
130
+ gemma_1_1_2b_it,0.07454890788224121
131
+ olmo_7b,0.06220322886989553
132
+ qwen1_5_1_8b_chat,0.05544332210998878
133
+ qwen2_0_5b_instruct,0.055218855218855216
134
+ pythia_12b,0.05246913580246913
135
+ chatglm2_6b,0.029137529137529136
136
+ pythia_6_9b,0.018518518518518517
137
+ qwen1_5_0_5b_chat,0.012345679012345678
138
+ falcon_7b_instruct,0.011363636363636364
cache/agreements_cache_5e66a88dab42480065db47711c55c458.csv CHANGED
The diff for this file is too large to render. See raw diff
 
cache/allbenchs_cache_5e66a88dab42480065db47711c55c458.csv CHANGED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- git+https://github.com/ibm/benchbench@fb72ad8b83e3bddab4520c63d1a4673caff88a71
2
 
3
  altair==5.4.1
4
  attrs==24.2.0
@@ -51,4 +51,5 @@ toml==0.10.2
51
  tornado==6.4.1
52
  typing_extensions==4.12.2
53
  tzdata==2024.1
54
- urllib3==2.2.2
 
 
1
+ git+https://github.com/ibm/benchbench@08c7757323d565b70d024d82b193861b406ddf9d
2
 
3
  altair==5.4.1
4
  attrs==24.2.0
 
51
  tornado==6.4.1
52
  typing_extensions==4.12.2
53
  tzdata==2024.1
54
+ urllib3==2.2.2
55
+ tqdm