Spaces:
Running
Running
Merge branch 'per/benchbench' into 'ibm/benchbench'
Browse files
app.py
CHANGED
@@ -400,7 +400,7 @@ st.dataframe(
|
|
400 |
column_order=cols_used,
|
401 |
hide_index=True,
|
402 |
use_container_width=True,
|
403 |
-
height=
|
404 |
column_config={col: {"alignment": "center"} for col in cols_used},
|
405 |
)
|
406 |
|
|
|
400 |
column_order=cols_used,
|
401 |
hide_index=True,
|
402 |
use_container_width=True,
|
403 |
+
height=500,
|
404 |
column_config={col: {"alignment": "center"} for col in cols_used},
|
405 |
)
|
406 |
|
cache/aggregate_scoress_cache_5e66a88dab42480065db47711c55c458.csv
CHANGED
@@ -1,122 +1,138 @@
|
|
1 |
model,score
|
2 |
-
gpt_4o_2024_05_13,0.
|
3 |
-
|
4 |
-
gpt_4o_2024_08_06,0.
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
qwen1_5_32b,0.
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
41 |
mistral_v0_1_7b,0.6239316239316239
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
llama_65b,0.
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
dbrx_instructruct,0.5344129554655871
|
58 |
jurassic_2_jumbo_178b,0.532051282051282
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
jurassic_2_grande_17b,0.
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
mistral_7b_v0_2,0.
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
luminous_extended_30b,0.2329059829059829
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
gpt_neox_20b,0.
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
gpt_j_6b,0.
|
112 |
luminous_base_13b,0.08333333333333333
|
113 |
-
|
114 |
-
gemma_1_1_2b_it,0.
|
115 |
-
olmo_7b,0.
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
pythia_6_9b,0.
|
121 |
-
|
122 |
-
|
|
|
1 |
model,score
|
2 |
+
gpt_4o_2024_05_13,0.9767482517482518
|
3 |
+
chatgpt_4o_latest,0.9754079254079254
|
4 |
+
gpt_4o_2024_08_06,0.9652680652680652
|
5 |
+
claude_3_5_sonnet_20240620,0.9572649572649573
|
6 |
+
gemini_1_5_pro_exp_0801,0.9545454545454546
|
7 |
+
llama3_1_70b_instruct,0.9343074620852398
|
8 |
+
gpt_4_turbo_2024_04_09,0.9055819180819181
|
9 |
+
claude_3_opus_20240229,0.8824397824397824
|
10 |
+
yi_large_preview,0.8714202464202464
|
11 |
+
llama3_1_405b_instruct,0.8598484848484849
|
12 |
+
gpt_4_0125_preview,0.8492118992118992
|
13 |
+
hermes_3_llama3_1_70b,0.8451178451178452
|
14 |
+
zephyr_orpo_141b_a35b_v0_1,0.8414055080721747
|
15 |
+
mistral_large_2407,0.8375291375291375
|
16 |
+
gpt_4o_mini_2024_07_18,0.8348776223776224
|
17 |
+
claude_2_0,0.8333333333333334
|
18 |
+
smaug_qwen2_72b_instruct,0.8331088664421997
|
19 |
+
gemini_1_5_pro_api_0514,0.8294871794871794
|
20 |
+
llama3_70b_instruct,0.8172801478357034
|
21 |
+
llama3_70b,0.8129154795821463
|
22 |
+
gemma_2_9b_it_dpo,0.8100649350649352
|
23 |
+
llama3_instruct_8b_simpo,0.7992424242424242
|
24 |
+
yi_large,0.7889194139194139
|
25 |
+
gemma_2_27b_it,0.776345259678593
|
26 |
+
qwen2_72b_instruct,0.7701936951936953
|
27 |
+
qwen1_5_32b,0.7678062678062678
|
28 |
+
gpt_4_0613,0.7641802641802643
|
29 |
+
phi_3_5_moe_instruct,0.7600448933782267
|
30 |
+
qwen1_5_110b_chat,0.7419770353103686
|
31 |
+
mixtral_8x22b_v0_1,0.7382154882154882
|
32 |
+
gemma_2_9b_it_simpo,0.7328042328042329
|
33 |
+
gemini_pro,0.7298951048951049
|
34 |
+
llama_2_70b,0.7293447293447294
|
35 |
+
gemini_1_5_flash_api_0514,0.7263403263403263
|
36 |
+
yi_34b,0.7188983855650521
|
37 |
+
deepseek_coder_v2,0.713053613053613
|
38 |
+
nous_hermes_2_mixtral_8x7b_dpo,0.7094017094017094
|
39 |
+
gpt_3_5_turbo_0613,0.6851851851851851
|
40 |
+
claude_2_1,0.6693861693861693
|
41 |
+
yi_1_5_34b_chat,0.6669566544566544
|
42 |
+
mistral_medium,0.657051282051282
|
43 |
+
phi_3_small_128k_instruct,0.6561167227833894
|
44 |
+
infinity_instruct_3m_0625_llama3_8b,0.6537598204264872
|
45 |
+
claude_instant_1_2,0.6486013986013985
|
46 |
mistral_v0_1_7b,0.6239316239316239
|
47 |
+
command_r_plus,0.6183108558108558
|
48 |
+
phi_3_5_mini_instruct,0.6103254769921437
|
49 |
+
llama3_1_8b_instruct,0.6080822469711359
|
50 |
+
gemma_2_9b_it,0.6048877048877048
|
51 |
+
yi_1_5_9b_chat,0.6041446208112875
|
52 |
+
claude_3_sonnet_20240229,0.5985236985236985
|
53 |
+
mixtral_8x22b_instruct_v0_1,0.585565052231719
|
54 |
+
qwen1_5_14b,0.5797720797720798
|
55 |
+
llama_65b,0.5759734093067427
|
56 |
+
deepseek_llm_67b_chat,0.5734841290396846
|
57 |
+
qwen1_5_32b_chat,0.571383349161127
|
58 |
+
wizardlm_70b,0.5620629370629371
|
59 |
+
yi_34b_chat,0.5558361391694725
|
60 |
+
qwen1_5_72b_chat,0.5463669663669664
|
61 |
+
dbrx_instructruct,0.5379867046533713
|
|
|
62 |
jurassic_2_jumbo_178b,0.532051282051282
|
63 |
+
mixtral_8x7b_v0_1,0.5310044893378227
|
64 |
+
openchat_3_5,0.5270655270655271
|
65 |
+
mistral_large_2402,0.5105672105672105
|
66 |
+
solar_10_7b_instruct_v1_0,0.5030864197530864
|
67 |
+
qwen2_7b_instruct,0.4970445192667415
|
68 |
+
phi_3_medium_4k_instruct,0.48541540763762986
|
69 |
+
dolphin_2_2_1_mistral_7b,0.4810606060606061
|
70 |
+
mistral_small_2402,0.47785547785547783
|
71 |
+
glm_4_9b_chat,0.4769547325102881
|
72 |
+
dbrx_instruct,0.4724025974025974
|
73 |
+
qwen1_5_14b_chat,0.45340153673487005
|
74 |
+
claude_3_haiku_20240307,0.44965034965034967
|
75 |
+
gemma_7b,0.4477682811016144
|
76 |
+
llama3_8b_instruct,0.4449662477440255
|
77 |
+
llama3_8b,0.4368471035137702
|
78 |
+
wizardlm_13b,0.42773892773892774
|
79 |
+
starling_lm_7b_alpha,0.42734323289878845
|
80 |
+
jurassic_2_grande_17b,0.4230769230769231
|
81 |
+
mistral_7b_v0_3,0.4228395061728395
|
82 |
+
llama_2_13b,0.4146881924659702
|
83 |
+
llama_2_70b_chat,0.412732329398996
|
84 |
+
phi_3_mini_4k_instruct,0.4048663270885493
|
85 |
+
openhermes_2_5_mistral_7b,0.40103708020374684
|
86 |
+
llama_2_13b_chat,0.38675213675213677
|
87 |
+
guanaco_33b,0.38374125874125875
|
88 |
+
phi_3_mini_128k_instruct,0.3778468445135112
|
89 |
+
mistral_7b_v0_2,0.3773849607182941
|
90 |
+
internlm2_chat_20b,0.37196969696969695
|
91 |
+
starling_lm_7b_beta,0.3611888111888112
|
92 |
+
gpt_3_5_turbo_0125,0.3591242091242091
|
93 |
+
tulu_2_dpo_70b,0.3585164835164835
|
94 |
+
qwen1_5_7b,0.35185185185185186
|
95 |
+
falcon_40b,0.3502690724912947
|
96 |
+
yi_1_5_6b_chat,0.33974132863021755
|
97 |
+
zephyr_7b_alpha,0.33875830959164294
|
98 |
+
command_r,0.3296911421911422
|
99 |
+
luminous_supreme_70b,0.32905982905982906
|
100 |
+
yi_6b,0.295346628679962
|
101 |
+
zephyr_7b_beta,0.28937667271000606
|
102 |
+
mixtral_8x7b_instruct_v0_1,0.284326167659501
|
103 |
+
qwen_14b_chat,0.2837995337995338
|
104 |
+
gemma_2_2b_it,0.28113553113553114
|
105 |
+
phi_3_small_8k_instruct,0.27051282051282055
|
106 |
+
gemma_1_1_7b_it,0.263927019482575
|
107 |
+
llama_2_7b,0.25466919911364355
|
108 |
+
mistral_7b_instruct_v0_2,0.250669392336059
|
109 |
+
mistral_7b_instruct_v0_3,0.24534231200897869
|
110 |
+
qwen1_5_7b_chat,0.24214088380755047
|
111 |
+
alpaca_7b,0.23484848484848483
|
112 |
luminous_extended_30b,0.2329059829059829
|
113 |
+
llama_13b,0.2222222222222222
|
114 |
+
phi_2,0.19812080923192033
|
115 |
+
qwen2_1_5b_instruct,0.1968574635241302
|
116 |
+
yi_6b_chat,0.19393939393939394
|
117 |
+
vicuna_7b,0.1885198135198135
|
118 |
+
gemma_7b_it,0.18790982679871568
|
119 |
+
olmo_7b_instruct,0.15669515669515668
|
120 |
+
vicuna_7b_v1_5,0.15454545454545454
|
121 |
+
vicuna_13b,0.14714452214452214
|
122 |
+
gpt_neox_20b,0.1419753086419753
|
123 |
+
falcon_40b_instruct,0.13187429854096522
|
124 |
+
qwen1_5_4b_chat,0.12542806987251431
|
125 |
+
falcon_7b,0.11380183602405824
|
126 |
+
llama_2_7b_chat,0.1122679789346456
|
127 |
+
gpt_j_6b,0.09876543209876543
|
128 |
luminous_base_13b,0.08333333333333333
|
129 |
+
gemma_2b_it,0.08119658119658119
|
130 |
+
gemma_1_1_2b_it,0.07454890788224121
|
131 |
+
olmo_7b,0.06220322886989553
|
132 |
+
qwen1_5_1_8b_chat,0.05544332210998878
|
133 |
+
qwen2_0_5b_instruct,0.055218855218855216
|
134 |
+
pythia_12b,0.05246913580246913
|
135 |
+
chatglm2_6b,0.029137529137529136
|
136 |
+
pythia_6_9b,0.018518518518518517
|
137 |
+
qwen1_5_0_5b_chat,0.012345679012345678
|
138 |
+
falcon_7b_instruct,0.011363636363636364
|
cache/agreements_cache_5e66a88dab42480065db47711c55c458.csv
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
cache/allbenchs_cache_5e66a88dab42480065db47711c55c458.csv
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
git+https://github.com/ibm/benchbench@
|
2 |
|
3 |
altair==5.4.1
|
4 |
attrs==24.2.0
|
@@ -51,4 +51,5 @@ toml==0.10.2
|
|
51 |
tornado==6.4.1
|
52 |
typing_extensions==4.12.2
|
53 |
tzdata==2024.1
|
54 |
-
urllib3==2.2.2
|
|
|
|
1 |
+
git+https://github.com/ibm/benchbench@08c7757323d565b70d024d82b193861b406ddf9d
|
2 |
|
3 |
altair==5.4.1
|
4 |
attrs==24.2.0
|
|
|
51 |
tornado==6.4.1
|
52 |
typing_extensions==4.12.2
|
53 |
tzdata==2024.1
|
54 |
+
urllib3==2.2.2
|
55 |
+
tqdm
|