diff --git "a/cache/allbenchs_cache_5e66a88dab42480065db47711c55c458.csv" "b/cache/allbenchs_cache_5e66a88dab42480065db47711c55c458.csv" --- "a/cache/allbenchs_cache_5e66a88dab42480065db47711c55c458.csv" +++ "b/cache/allbenchs_cache_5e66a88dab42480065db47711c55c458.csv" @@ -3442,123 +3442,123 @@ tablellm_deepseek_coder_7b,tablebench_overall_dp,27.98,[],tablebench_241002.csv tablellm_llama3_1_8b,tablebench_overall_dp,27.19,[],tablebench_241002.csv tablellm_llama3_8b,tablebench_overall_dp,26.93,[],tablebench_241002.csv tablellm_qwen2_7b,tablebench_overall_dp,27.14,[],tablebench_241002.csv -gemma_2b_it,trustworthy_average,67.18,[],llm_trustworthy_241001.csv -gemma_7b_it,trustworthy_average,66.87,[],llm_trustworthy_241001.csv -vicuna_7b_v1_3,trustworthy_average,60.62,[],llm_trustworthy_241001.csv -llama_2_7b_chat,trustworthy_average,74.72,[],llm_trustworthy_241001.csv -llama3_8b_instruct,trustworthy_average,80.61,[],llm_trustworthy_241001.csv -mpt_7b_chat,trustworthy_average,62.29,[],llm_trustworthy_241001.csv -gpt_3_5_turbo_0301,trustworthy_average,72.45,[],llm_trustworthy_241001.csv -gpt_4_0314,trustworthy_average,69.24,[],llm_trustworthy_241001.csv -gpt_4o_2024_05_13,trustworthy_average,82.96,[],llm_trustworthy_241001.csv -gpt_4o_mini_2024_07_18,trustworthy_average,76.31,[],llm_trustworthy_241001.csv -falcon_7b_instruct,trustworthy_average,59.49,[],llm_trustworthy_241001.csv -redpajama_incite_7b_instruct,trustworthy_average,56.58,[],llm_trustworthy_241001.csv -gemini_pro_1_0,trustworthy_average,80.61,[],llm_trustworthy_241001.csv -gemma_2b_it,trustworthy_non_toxicity,77.07,[],llm_trustworthy_241001.csv -gemma_7b_it,trustworthy_non_toxicity,75.52,[],llm_trustworthy_241001.csv -vicuna_7b_v1_3,trustworthy_non_toxicity,28.0,[],llm_trustworthy_241001.csv -llama_2_7b_chat,trustworthy_non_toxicity,80.0,[],llm_trustworthy_241001.csv -llama3_8b_instruct,trustworthy_non_toxicity,77.53,[],llm_trustworthy_241001.csv -mpt_7b_chat,trustworthy_non_toxicity,40.0,[],llm_trustworthy_241001.csv -gpt_3_5_turbo_0301,trustworthy_non_toxicity,47.0,[],llm_trustworthy_241001.csv -gpt_4_0314,trustworthy_non_toxicity,41.0,[],llm_trustworthy_241001.csv -gpt_4o_2024_05_13,trustworthy_non_toxicity,86.46,[],llm_trustworthy_241001.csv -gpt_4o_mini_2024_07_18,trustworthy_non_toxicity,59.02,[],llm_trustworthy_241001.csv -falcon_7b_instruct,trustworthy_non_toxicity,39.0,[],llm_trustworthy_241001.csv -redpajama_incite_7b_instruct,trustworthy_non_toxicity,18.0,[],llm_trustworthy_241001.csv -gemini_pro_1_0,trustworthy_non_toxicity,77.53,[],llm_trustworthy_241001.csv -gemma_2b_it,trustworthy_non_stereotype,73.33,[],llm_trustworthy_241001.csv -gemma_7b_it,trustworthy_non_stereotype,100.0,[],llm_trustworthy_241001.csv -vicuna_7b_v1_3,trustworthy_non_stereotype,81.0,[],llm_trustworthy_241001.csv -llama_2_7b_chat,trustworthy_non_stereotype,97.6,[],llm_trustworthy_241001.csv -llama3_8b_instruct,trustworthy_non_stereotype,98.33,[],llm_trustworthy_241001.csv -mpt_7b_chat,trustworthy_non_stereotype,84.6,[],llm_trustworthy_241001.csv -gpt_3_5_turbo_0301,trustworthy_non_stereotype,87.0,[],llm_trustworthy_241001.csv -gpt_4_0314,trustworthy_non_stereotype,77.0,[],llm_trustworthy_241001.csv -gpt_4o_2024_05_13,trustworthy_non_stereotype,99.67,[],llm_trustworthy_241001.csv -gpt_4o_mini_2024_07_18,trustworthy_non_stereotype,87.34,[],llm_trustworthy_241001.csv -falcon_7b_instruct,trustworthy_non_stereotype,87.0,[],llm_trustworthy_241001.csv -redpajama_incite_7b_instruct,trustworthy_non_stereotype,73.0,[],llm_trustworthy_241001.csv 
-gemini_pro_1_0,trustworthy_non_stereotype,98.33,[],llm_trustworthy_241001.csv -gemma_2b_it,trustworthy_advglue_pp,43.21,[],llm_trustworthy_241001.csv -gemma_7b_it,trustworthy_advglue_pp,43.43,[],llm_trustworthy_241001.csv -vicuna_7b_v1_3,trustworthy_advglue_pp,52.16,[],llm_trustworthy_241001.csv -llama_2_7b_chat,trustworthy_advglue_pp,51.01,[],llm_trustworthy_241001.csv -llama3_8b_instruct,trustworthy_advglue_pp,67.28,[],llm_trustworthy_241001.csv -mpt_7b_chat,trustworthy_advglue_pp,46.2,[],llm_trustworthy_241001.csv -gpt_3_5_turbo_0301,trustworthy_advglue_pp,56.69,[],llm_trustworthy_241001.csv -gpt_4_0314,trustworthy_advglue_pp,64.04,[],llm_trustworthy_241001.csv -gpt_4o_2024_05_13,trustworthy_advglue_pp,51.36,[],llm_trustworthy_241001.csv -gpt_4o_mini_2024_07_18,trustworthy_advglue_pp,50.25,[],llm_trustworthy_241001.csv -falcon_7b_instruct,trustworthy_advglue_pp,43.98,[],llm_trustworthy_241001.csv -redpajama_incite_7b_instruct,trustworthy_advglue_pp,44.81,[],llm_trustworthy_241001.csv -gemini_pro_1_0,trustworthy_advglue_pp,67.28,[],llm_trustworthy_241001.csv -gemma_2b_it,trustworthy_ood,51.43,[],llm_trustworthy_241001.csv -gemma_7b_it,trustworthy_ood,61.78,[],llm_trustworthy_241001.csv -vicuna_7b_v1_3,trustworthy_ood,59.1,[],llm_trustworthy_241001.csv -llama_2_7b_chat,trustworthy_ood,75.65,[],llm_trustworthy_241001.csv -llama3_8b_instruct,trustworthy_ood,70.85,[],llm_trustworthy_241001.csv -mpt_7b_chat,trustworthy_ood,64.26,[],llm_trustworthy_241001.csv -gpt_3_5_turbo_0301,trustworthy_ood,73.58,[],llm_trustworthy_241001.csv -gpt_4_0314,trustworthy_ood,87.55,[],llm_trustworthy_241001.csv -gpt_4o_2024_05_13,trustworthy_ood,86.59,[],llm_trustworthy_241001.csv -gpt_4o_mini_2024_07_18,trustworthy_ood,79.07,[],llm_trustworthy_241001.csv -falcon_7b_instruct,trustworthy_ood,51.45,[],llm_trustworthy_241001.csv -redpajama_incite_7b_instruct,trustworthy_ood,54.21,[],llm_trustworthy_241001.csv -gemini_pro_1_0,trustworthy_ood,70.85,[],llm_trustworthy_241001.csv -gemma_2b_it,trustworthy_adv_demo,35.55,[],llm_trustworthy_241001.csv -gemma_7b_it,trustworthy_adv_demo,33.33,[],llm_trustworthy_241001.csv -vicuna_7b_v1_3,trustworthy_adv_demo,57.99,[],llm_trustworthy_241001.csv -llama_2_7b_chat,trustworthy_adv_demo,55.54,[],llm_trustworthy_241001.csv -llama3_8b_instruct,trustworthy_adv_demo,75.54,[],llm_trustworthy_241001.csv -mpt_7b_chat,trustworthy_adv_demo,58.25,[],llm_trustworthy_241001.csv -gpt_3_5_turbo_0301,trustworthy_adv_demo,81.28,[],llm_trustworthy_241001.csv -gpt_4_0314,trustworthy_adv_demo,77.94,[],llm_trustworthy_241001.csv -gpt_4o_2024_05_13,trustworthy_adv_demo,88.1,[],llm_trustworthy_241001.csv -gpt_4o_mini_2024_07_18,trustworthy_adv_demo,88.49,[],llm_trustworthy_241001.csv -falcon_7b_instruct,trustworthy_adv_demo,33.95,[],llm_trustworthy_241001.csv -redpajama_incite_7b_instruct,trustworthy_adv_demo,58.51,[],llm_trustworthy_241001.csv -gemini_pro_1_0,trustworthy_adv_demo,75.54,[],llm_trustworthy_241001.csv -gemma_2b_it,trustworthy_privacy,88.77,[],llm_trustworthy_241001.csv -gemma_7b_it,trustworthy_privacy,83.69,[],llm_trustworthy_241001.csv -vicuna_7b_v1_3,trustworthy_privacy,72.96,[],llm_trustworthy_241001.csv -llama_2_7b_chat,trustworthy_privacy,97.39,[],llm_trustworthy_241001.csv -llama3_8b_instruct,trustworthy_privacy,81.59,[],llm_trustworthy_241001.csv -mpt_7b_chat,trustworthy_privacy,78.93,[],llm_trustworthy_241001.csv -gpt_3_5_turbo_0301,trustworthy_privacy,70.13,[],llm_trustworthy_241001.csv -gpt_4_0314,trustworthy_privacy,66.11,[],llm_trustworthy_241001.csv 
-gpt_4o_2024_05_13,trustworthy_privacy,97.04,[],llm_trustworthy_241001.csv -gpt_4o_mini_2024_07_18,trustworthy_privacy,89.38,[],llm_trustworthy_241001.csv -falcon_7b_instruct,trustworthy_privacy,70.26,[],llm_trustworthy_241001.csv -redpajama_incite_7b_instruct,trustworthy_privacy,76.64,[],llm_trustworthy_241001.csv -gemini_pro_1_0,trustworthy_privacy,81.59,[],llm_trustworthy_241001.csv -gemma_2b_it,trustworthy_ethics,75.03,[],llm_trustworthy_241001.csv -gemma_7b_it,trustworthy_ethics,43.33,[],llm_trustworthy_241001.csv -vicuna_7b_v1_3,trustworthy_ethics,48.22,[],llm_trustworthy_241001.csv -llama_2_7b_chat,trustworthy_ethics,40.58,[],llm_trustworthy_241001.csv -llama3_8b_instruct,trustworthy_ethics,93.74,[],llm_trustworthy_241001.csv -mpt_7b_chat,trustworthy_ethics,26.11,[],llm_trustworthy_241001.csv -gpt_3_5_turbo_0301,trustworthy_ethics,86.38,[],llm_trustworthy_241001.csv -gpt_4_0314,trustworthy_ethics,76.6,[],llm_trustworthy_241001.csv -gpt_4o_2024_05_13,trustworthy_ethics,92.02,[],llm_trustworthy_241001.csv -gpt_4o_mini_2024_07_18,trustworthy_ethics,87.2,[],llm_trustworthy_241001.csv -falcon_7b_instruct,trustworthy_ethics,50.28,[],llm_trustworthy_241001.csv -redpajama_incite_7b_instruct,trustworthy_ethics,27.49,[],llm_trustworthy_241001.csv -gemini_pro_1_0,trustworthy_ethics,93.74,[],llm_trustworthy_241001.csv -gemma_2b_it,trustworthy_fairness,93.02,[],llm_trustworthy_241001.csv -gemma_7b_it,trustworthy_fairness,93.88,[],llm_trustworthy_241001.csv -vicuna_7b_v1_3,trustworthy_fairness,85.53,[],llm_trustworthy_241001.csv -llama_2_7b_chat,trustworthy_fairness,100.0,[],llm_trustworthy_241001.csv -llama3_8b_instruct,trustworthy_fairness,80.05,[],llm_trustworthy_241001.csv -mpt_7b_chat,trustworthy_fairness,100.0,[],llm_trustworthy_241001.csv -gpt_3_5_turbo_0301,trustworthy_fairness,77.57,[],llm_trustworthy_241001.csv -gpt_4_0314,trustworthy_fairness,63.67,[],llm_trustworthy_241001.csv -gpt_4o_2024_05_13,trustworthy_fairness,62.47,[],llm_trustworthy_241001.csv -gpt_4o_mini_2024_07_18,trustworthy_fairness,69.74,[],llm_trustworthy_241001.csv -falcon_7b_instruct,trustworthy_fairness,100.0,[],llm_trustworthy_241001.csv -redpajama_incite_7b_instruct,trustworthy_fairness,100.0,[],llm_trustworthy_241001.csv -gemini_pro_1_0,trustworthy_fairness,80.05,[],llm_trustworthy_241001.csv +gemma_2b_it,Trustworthy Average,0.5981804397270656,[],llm_trustworthy_241001.csv +gemma_7b_it,Trustworthy Average,0.6099317664897647,[],llm_trustworthy_241001.csv +vicuna_7b_v1_3,Trustworthy Average,0.8468536770280516,[],llm_trustworthy_241001.csv +llama_2_7b_chat,Trustworthy Average,0.31235784685367685,[],llm_trustworthy_241001.csv +llama3_8b_instruct,Trustworthy Average,0.0890826383623956,[],llm_trustworthy_241001.csv +mpt_7b_chat,Trustworthy Average,0.7835481425322213,[],llm_trustworthy_241001.csv +gpt_3_5_turbo_0301,Trustworthy Average,0.3984078847611824,[],llm_trustworthy_241001.csv +gpt_4_0314,Trustworthy Average,0.5200909780136467,[],llm_trustworthy_241001.csv +gpt_4o_2024_05_13,Trustworthy Average,0.0,[],llm_trustworthy_241001.csv +gpt_4o_mini_2024_07_18,Trustworthy Average,0.2520849128127366,[],llm_trustworthy_241001.csv +falcon_7b_instruct,Trustworthy Average,0.8896891584533736,[],llm_trustworthy_241001.csv +redpajama_incite_7b_instruct,Trustworthy Average,1.0,[],llm_trustworthy_241001.csv +gemini_pro_1_0,Trustworthy Average,0.0890826383623956,[],llm_trustworthy_241001.csv +gemma_2b_it,trustworthy_non_toxicity,0.13716038562664334,[],llm_trustworthy_241001.csv 
+gemma_7b_it,trustworthy_non_toxicity,0.15980134385042355,[],llm_trustworthy_241001.csv +vicuna_7b_v1_3,trustworthy_non_toxicity,0.8539293017820625,[],llm_trustworthy_241001.csv +llama_2_7b_chat,trustworthy_non_toxicity,0.09436167104878757,[],llm_trustworthy_241001.csv +llama3_8b_instruct,trustworthy_non_toxicity,0.13044113350861808,[],llm_trustworthy_241001.csv +mpt_7b_chat,trustworthy_non_toxicity,0.6786444639205376,[],llm_trustworthy_241001.csv +gpt_3_5_turbo_0301,trustworthy_non_toxicity,0.5763949751679813,[],llm_trustworthy_241001.csv +gpt_4_0314,trustworthy_non_toxicity,0.6640373940987437,[],llm_trustworthy_241001.csv +gpt_4o_2024_05_13,trustworthy_non_toxicity,0.0,[],llm_trustworthy_241001.csv +gpt_4o_mini_2024_07_18,trustworthy_non_toxicity,0.4008179959100203,[],llm_trustworthy_241001.csv +falcon_7b_instruct,trustworthy_non_toxicity,0.6932515337423313,[],llm_trustworthy_241001.csv +redpajama_incite_7b_instruct,trustworthy_non_toxicity,1.0,[],llm_trustworthy_241001.csv +gemini_pro_1_0,trustworthy_non_toxicity,0.13044113350861808,[],llm_trustworthy_241001.csv +gemma_2b_it,trustworthy_non_stereotype,0.9877777777777779,[],llm_trustworthy_241001.csv +gemma_7b_it,trustworthy_non_stereotype,0.0,[],llm_trustworthy_241001.csv +vicuna_7b_v1_3,trustworthy_non_stereotype,0.7037037037037037,[],llm_trustworthy_241001.csv +llama_2_7b_chat,trustworthy_non_stereotype,0.08888888888888913,[],llm_trustworthy_241001.csv +llama3_8b_instruct,trustworthy_non_stereotype,0.061851851851851936,[],llm_trustworthy_241001.csv +mpt_7b_chat,trustworthy_non_stereotype,0.5703703703703706,[],llm_trustworthy_241001.csv +gpt_3_5_turbo_0301,trustworthy_non_stereotype,0.4814814814814815,[],llm_trustworthy_241001.csv +gpt_4_0314,trustworthy_non_stereotype,0.8518518518518519,[],llm_trustworthy_241001.csv +gpt_4o_2024_05_13,trustworthy_non_stereotype,0.012222222222222134,[],llm_trustworthy_241001.csv +gpt_4o_mini_2024_07_18,trustworthy_non_stereotype,0.4688888888888888,[],llm_trustworthy_241001.csv +falcon_7b_instruct,trustworthy_non_stereotype,0.4814814814814815,[],llm_trustworthy_241001.csv +redpajama_incite_7b_instruct,trustworthy_non_stereotype,1.0,[],llm_trustworthy_241001.csv +gemini_pro_1_0,trustworthy_non_stereotype,0.061851851851851936,[],llm_trustworthy_241001.csv +gemma_2b_it,trustworthy_advglue_pp,1.0,[],llm_trustworthy_241001.csv +gemma_7b_it,trustworthy_advglue_pp,0.9908599916909016,[],llm_trustworthy_241001.csv +vicuna_7b_v1_3,trustworthy_advglue_pp,0.6281678437889491,[],llm_trustworthy_241001.csv +llama_2_7b_chat,trustworthy_advglue_pp,0.6759451599501456,[],llm_trustworthy_241001.csv +llama3_8b_instruct,trustworthy_advglue_pp,0.0,[],llm_trustworthy_241001.csv +mpt_7b_chat,trustworthy_advglue_pp,0.875778977980889,[],llm_trustworthy_241001.csv +gpt_3_5_turbo_0301,trustworthy_advglue_pp,0.43996676360614884,[],llm_trustworthy_241001.csv +gpt_4_0314,trustworthy_advglue_pp,0.13460739509763164,[],llm_trustworthy_241001.csv +gpt_4o_2024_05_13,trustworthy_advglue_pp,0.661404237640216,[],llm_trustworthy_241001.csv +gpt_4o_mini_2024_07_18,trustworthy_advglue_pp,0.7075197341088493,[],llm_trustworthy_241001.csv +falcon_7b_instruct,trustworthy_advglue_pp,0.9680099709181555,[],llm_trustworthy_241001.csv +redpajama_incite_7b_instruct,trustworthy_advglue_pp,0.9335272122974657,[],llm_trustworthy_241001.csv +gemini_pro_1_0,trustworthy_advglue_pp,0.0,[],llm_trustworthy_241001.csv +gemma_2b_it,trustworthy_ood,1.0,[],llm_trustworthy_241001.csv +gemma_7b_it,trustworthy_ood,0.713455149501661,[],llm_trustworthy_241001.csv 
+vicuna_7b_v1_3,trustworthy_ood,0.7876522702104096,[],llm_trustworthy_241001.csv +llama_2_7b_chat,trustworthy_ood,0.3294573643410851,[],llm_trustworthy_241001.csv +llama3_8b_instruct,trustworthy_ood,0.4623477297895904,[],llm_trustworthy_241001.csv +mpt_7b_chat,trustworthy_ood,0.6447951273532667,[],llm_trustworthy_241001.csv +gpt_3_5_turbo_0301,trustworthy_ood,0.38676633444075303,[],llm_trustworthy_241001.csv +gpt_4_0314,trustworthy_ood,0.0,[],llm_trustworthy_241001.csv +gpt_4o_2024_05_13,trustworthy_ood,0.026578073089700838,[],llm_trustworthy_241001.csv +gpt_4o_mini_2024_07_18,trustworthy_ood,0.2347729789590256,[],llm_trustworthy_241001.csv +falcon_7b_instruct,trustworthy_ood,0.9994462901439645,[],llm_trustworthy_241001.csv +redpajama_incite_7b_instruct,trustworthy_ood,0.9230343300110742,[],llm_trustworthy_241001.csv +gemini_pro_1_0,trustworthy_ood,0.4623477297895904,[],llm_trustworthy_241001.csv +gemma_2b_it,trustworthy_adv_demo,0.9597534445250181,[],llm_trustworthy_241001.csv +gemma_7b_it,trustworthy_adv_demo,1.0,[],llm_trustworthy_241001.csv +vicuna_7b_v1_3,trustworthy_adv_demo,0.552936910804931,[],llm_trustworthy_241001.csv +llama_2_7b_chat,trustworthy_adv_demo,0.5973531544597535,[],llm_trustworthy_241001.csv +llama3_8b_instruct,trustworthy_adv_demo,0.23477157360406076,[],llm_trustworthy_241001.csv +mpt_7b_chat,trustworthy_adv_demo,0.548223350253807,[],llm_trustworthy_241001.csv +gpt_3_5_turbo_0301,trustworthy_adv_demo,0.13071065989847708,[],llm_trustworthy_241001.csv +gpt_4_0314,trustworthy_adv_demo,0.19126178390137782,[],llm_trustworthy_241001.csv +gpt_4o_2024_05_13,trustworthy_adv_demo,0.007070340826686006,[],llm_trustworthy_241001.csv +gpt_4o_mini_2024_07_18,trustworthy_adv_demo,0.0,[],llm_trustworthy_241001.csv +falcon_7b_instruct,trustworthy_adv_demo,0.9887599709934735,[],llm_trustworthy_241001.csv +redpajama_incite_7b_instruct,trustworthy_adv_demo,0.543509789702683,[],llm_trustworthy_241001.csv +gemini_pro_1_0,trustworthy_adv_demo,0.23477157360406076,[],llm_trustworthy_241001.csv +gemma_2b_it,trustworthy_privacy,0.2755754475703326,[],llm_trustworthy_241001.csv +gemma_7b_it,trustworthy_privacy,0.4379795396419438,[],llm_trustworthy_241001.csv +vicuna_7b_v1_3,trustworthy_privacy,0.7810102301790283,[],llm_trustworthy_241001.csv +llama_2_7b_chat,trustworthy_privacy,0.0,[],llm_trustworthy_241001.csv +llama3_8b_instruct,trustworthy_privacy,0.5051150895140664,[],llm_trustworthy_241001.csv +mpt_7b_chat,trustworthy_privacy,0.5901534526854217,[],llm_trustworthy_241001.csv +gpt_3_5_turbo_0301,trustworthy_privacy,0.8714833759590794,[],llm_trustworthy_241001.csv +gpt_4_0314,trustworthy_privacy,1.0,[],llm_trustworthy_241001.csv +gpt_4o_2024_05_13,trustworthy_privacy,0.011189258312020334,[],llm_trustworthy_241001.csv +gpt_4o_mini_2024_07_18,trustworthy_privacy,0.2560741687979541,[],llm_trustworthy_241001.csv +falcon_7b_instruct,trustworthy_privacy,0.8673273657289,[],llm_trustworthy_241001.csv +redpajama_incite_7b_instruct,trustworthy_privacy,0.6633631713554987,[],llm_trustworthy_241001.csv +gemini_pro_1_0,trustworthy_privacy,0.5051150895140664,[],llm_trustworthy_241001.csv +gemma_2b_it,trustworthy_ethics,0.2766523732071565,[],llm_trustworthy_241001.csv +gemma_7b_it,trustworthy_ethics,0.7453792695549313,[],llm_trustworthy_241001.csv +vicuna_7b_v1_3,trustworthy_ethics,0.6730740795504953,[],llm_trustworthy_241001.csv +llama_2_7b_chat,trustworthy_ethics,0.7860416974715363,[],llm_trustworthy_241001.csv +llama3_8b_instruct,trustworthy_ethics,0.0,[],llm_trustworthy_241001.csv 
+mpt_7b_chat,trustworthy_ethics,1.0,[],llm_trustworthy_241001.csv +gpt_3_5_turbo_0301,trustworthy_ethics,0.10882744344225936,[],llm_trustworthy_241001.csv +gpt_4_0314,trustworthy_ethics,0.2534378234511312,[],llm_trustworthy_241001.csv +gpt_4o_2024_05_13,trustworthy_ethics,0.025432500369658384,[],llm_trustworthy_241001.csv +gpt_4o_mini_2024_07_18,trustworthy_ethics,0.09670264675439877,[],llm_trustworthy_241001.csv +falcon_7b_instruct,trustworthy_ethics,0.642614224456602,[],llm_trustworthy_241001.csv +redpajama_incite_7b_instruct,trustworthy_ethics,0.9795948543545764,[],llm_trustworthy_241001.csv +gemini_pro_1_0,trustworthy_ethics,0.0,[],llm_trustworthy_241001.csv +gemma_2b_it,trustworthy_fairness,0.18598454569677603,[],llm_trustworthy_241001.csv +gemma_7b_it,trustworthy_fairness,0.1630695443645085,[],llm_trustworthy_241001.csv +vicuna_7b_v1_3,trustworthy_fairness,0.38555822009059415,[],llm_trustworthy_241001.csv +llama_2_7b_chat,trustworthy_fairness,0.0,[],llm_trustworthy_241001.csv +llama3_8b_instruct,trustworthy_fairness,0.5315747402078338,[],llm_trustworthy_241001.csv +mpt_7b_chat,trustworthy_fairness,0.0,[],llm_trustworthy_241001.csv +gpt_3_5_turbo_0301,trustworthy_fairness,0.5976552091660007,[],llm_trustworthy_241001.csv +gpt_4_0314,trustworthy_fairness,0.9680255795363708,[],llm_trustworthy_241001.csv +gpt_4o_2024_05_13,trustworthy_fairness,1.0,[],llm_trustworthy_241001.csv +gpt_4o_mini_2024_07_18,trustworthy_fairness,0.8062883026911805,[],llm_trustworthy_241001.csv +falcon_7b_instruct,trustworthy_fairness,0.0,[],llm_trustworthy_241001.csv +redpajama_incite_7b_instruct,trustworthy_fairness,0.0,[],llm_trustworthy_241001.csv +gemini_pro_1_0,trustworthy_fairness,0.5315747402078338,[],llm_trustworthy_241001.csv gpt_4o_20240513,OpenCompass Academic,77.0,[],opencompass_academic_240829.csv qwen2_72b_instruct,OpenCompass Academic,73.1,[],opencompass_academic_240829.csv gpt_4o_mini_20240718,OpenCompass Academic,72.5,[],opencompass_academic_240829.csv @@ -5671,363 +5671,363 @@ yi_34b_chat,AlphacaEval v2lc,27.2,,alphacaeval_v2lc_240829.csv yi_large_preview,AlphacaEval v2lc,51.9,,alphacaeval_v2lc_240829.csv zephyr_7b_alpha,AlphacaEval v2lc,10.3,,alphacaeval_v2lc_240829.csv zephyr_7b_beta,AlphacaEval v2lc,13.2,,alphacaeval_v2lc_240829.csv -claude_3_haiku_20240307,HELM AirBench Security Risks,0.005,[],helm_airbench_240916.csv -claude_3_sonnet_20240229,HELM AirBench Security Risks,0.009,[],helm_airbench_240916.csv -llama3_8b_chat,HELM AirBench Security Risks,0.065,[],helm_airbench_240916.csv -claude_3_opus_20240229,HELM AirBench Security Risks,0.065,[],helm_airbench_240916.csv -gemini_1_5_pro_001_safety_default,HELM AirBench Security Risks,0.097,[],helm_airbench_240916.csv -gemini_1_5_flash_001_safety_default,HELM AirBench Security Risks,0.124,[],helm_airbench_240916.csv -gpt_3_5_turbo_0613,HELM AirBench Security Risks,0.137,[],helm_airbench_240916.csv -gpt_4_turbo_2024_04_09,HELM AirBench Security Risks,0.142,[],helm_airbench_240916.csv -llama3_70b_chat,HELM AirBench Security Risks,0.158,[],helm_airbench_240916.csv -gpt_3_5_turbo_1106,HELM AirBench Security Risks,0.275,[],helm_airbench_240916.csv -gpt_4o_2024_05_13,HELM AirBench Security Risks,0.297,[],helm_airbench_240916.csv -gpt_3_5_turbo_0125,HELM AirBench Security Risks,0.405,[],helm_airbench_240916.csv -qwen1_5_72b_chat,HELM AirBench Security Risks,0.453,[],helm_airbench_240916.csv -deepseek_llm_67b_chat,HELM AirBench Security Risks,0.457,[],helm_airbench_240916.csv -yi_34b_chat,HELM AirBench Security Risks,0.509,[],helm_airbench_240916.csv 
-mixtral_8x22b_instruct_v0_1,HELM AirBench Security Risks,0.671,[],helm_airbench_240916.csv -mixtral_8x7b_instruct_v0_1,HELM AirBench Security Risks,0.777,[],helm_airbench_240916.csv -command_r,HELM AirBench Security Risks,0.782,[],helm_airbench_240916.csv -command_r_plus,HELM AirBench Security Risks,0.829,[],helm_airbench_240916.csv -mistral_7b_instruct_v0_3,HELM AirBench Security Risks,0.932,[],helm_airbench_240916.csv -dbrx_instructruct,HELM AirBench Security Risks,0.955,[],helm_airbench_240916.csv -claude_3_haiku_20240307,HELM AirBench Operational Misuses,0.572,[],helm_airbench_240916.csv -claude_3_sonnet_20240229,HELM AirBench Operational Misuses,0.473,[],helm_airbench_240916.csv -llama3_8b_chat,HELM AirBench Operational Misuses,0.694,[],helm_airbench_240916.csv -claude_3_opus_20240229,HELM AirBench Operational Misuses,0.477,[],helm_airbench_240916.csv -gemini_1_5_pro_001_safety_default,HELM AirBench Operational Misuses,0.338,[],helm_airbench_240916.csv -gemini_1_5_flash_001_safety_default,HELM AirBench Operational Misuses,0.371,[],helm_airbench_240916.csv -gpt_3_5_turbo_0613,HELM AirBench Operational Misuses,0.551,[],helm_airbench_240916.csv -gpt_4_turbo_2024_04_09,HELM AirBench Operational Misuses,0.636,[],helm_airbench_240916.csv -llama3_70b_chat,HELM AirBench Operational Misuses,0.726,[],helm_airbench_240916.csv -gpt_3_5_turbo_1106,HELM AirBench Operational Misuses,0.636,[],helm_airbench_240916.csv -gpt_4o_2024_05_13,HELM AirBench Operational Misuses,0.813,[],helm_airbench_240916.csv -gpt_3_5_turbo_0125,HELM AirBench Operational Misuses,0.768,[],helm_airbench_240916.csv -qwen1_5_72b_chat,HELM AirBench Operational Misuses,0.772,[],helm_airbench_240916.csv -deepseek_llm_67b_chat,HELM AirBench Operational Misuses,0.709,[],helm_airbench_240916.csv -yi_34b_chat,HELM AirBench Operational Misuses,0.691,[],helm_airbench_240916.csv -mixtral_8x22b_instruct_v0_1,HELM AirBench Operational Misuses,0.744,[],helm_airbench_240916.csv -mixtral_8x7b_instruct_v0_1,HELM AirBench Operational Misuses,0.818,[],helm_airbench_240916.csv -command_r,HELM AirBench Operational Misuses,0.878,[],helm_airbench_240916.csv -command_r_plus,HELM AirBench Operational Misuses,0.881,[],helm_airbench_240916.csv -mistral_7b_instruct_v0_3,HELM AirBench Operational Misuses,0.841,[],helm_airbench_240916.csv -dbrx_instructruct,HELM AirBench Operational Misuses,0.874,[],helm_airbench_240916.csv -claude_3_haiku_20240307,HELM AirBench Violence & Extremism,0.159,[],helm_airbench_240916.csv -claude_3_sonnet_20240229,HELM AirBench Violence & Extremism,0.156,[],helm_airbench_240916.csv -llama3_8b_chat,HELM AirBench Violence & Extremism,0.214,[],helm_airbench_240916.csv -claude_3_opus_20240229,HELM AirBench Violence & Extremism,0.187,[],helm_airbench_240916.csv -gemini_1_5_pro_001_safety_default,HELM AirBench Violence & Extremism,0.253,[],helm_airbench_240916.csv -gemini_1_5_flash_001_safety_default,HELM AirBench Violence & Extremism,0.289,[],helm_airbench_240916.csv -gpt_3_5_turbo_0613,HELM AirBench Violence & Extremism,0.455,[],helm_airbench_240916.csv -gpt_4_turbo_2024_04_09,HELM AirBench Violence & Extremism,0.329,[],helm_airbench_240916.csv -llama3_70b_chat,HELM AirBench Violence & Extremism,0.351,[],helm_airbench_240916.csv -gpt_3_5_turbo_1106,HELM AirBench Violence & Extremism,0.589,[],helm_airbench_240916.csv -gpt_4o_2024_05_13,HELM AirBench Violence & Extremism,0.527,[],helm_airbench_240916.csv -gpt_3_5_turbo_0125,HELM AirBench Violence & Extremism,0.664,[],helm_airbench_240916.csv -qwen1_5_72b_chat,HELM AirBench Violence & 
Extremism,0.579,[],helm_airbench_240916.csv -deepseek_llm_67b_chat,HELM AirBench Violence & Extremism,0.541,[],helm_airbench_240916.csv -yi_34b_chat,HELM AirBench Violence & Extremism,0.558,[],helm_airbench_240916.csv -mixtral_8x22b_instruct_v0_1,HELM AirBench Violence & Extremism,0.726,[],helm_airbench_240916.csv -mixtral_8x7b_instruct_v0_1,HELM AirBench Violence & Extremism,0.733,[],helm_airbench_240916.csv -command_r,HELM AirBench Violence & Extremism,0.775,[],helm_airbench_240916.csv -command_r_plus,HELM AirBench Violence & Extremism,0.816,[],helm_airbench_240916.csv -mistral_7b_instruct_v0_3,HELM AirBench Violence & Extremism,0.806,[],helm_airbench_240916.csv -dbrx_instructruct,HELM AirBench Violence & Extremism,0.841,[],helm_airbench_240916.csv -claude_3_haiku_20240307,HELM AirBench Hate/Toxicity,0.057,[],helm_airbench_240916.csv -claude_3_sonnet_20240229,HELM AirBench Hate/Toxicity,0.071,[],helm_airbench_240916.csv -llama3_8b_chat,HELM AirBench Hate/Toxicity,0.188,[],helm_airbench_240916.csv -claude_3_opus_20240229,HELM AirBench Hate/Toxicity,0.091,[],helm_airbench_240916.csv -gemini_1_5_pro_001_safety_default,HELM AirBench Hate/Toxicity,0.135,[],helm_airbench_240916.csv -gemini_1_5_flash_001_safety_default,HELM AirBench Hate/Toxicity,0.164,[],helm_airbench_240916.csv -gpt_3_5_turbo_0613,HELM AirBench Hate/Toxicity,0.274,[],helm_airbench_240916.csv -gpt_4_turbo_2024_04_09,HELM AirBench Hate/Toxicity,0.144,[],helm_airbench_240916.csv -llama3_70b_chat,HELM AirBench Hate/Toxicity,0.329,[],helm_airbench_240916.csv -gpt_3_5_turbo_1106,HELM AirBench Hate/Toxicity,0.433,[],helm_airbench_240916.csv -gpt_4o_2024_05_13,HELM AirBench Hate/Toxicity,0.327,[],helm_airbench_240916.csv -gpt_3_5_turbo_0125,HELM AirBench Hate/Toxicity,0.51,[],helm_airbench_240916.csv -qwen1_5_72b_chat,HELM AirBench Hate/Toxicity,0.371,[],helm_airbench_240916.csv -deepseek_llm_67b_chat,HELM AirBench Hate/Toxicity,0.365,[],helm_airbench_240916.csv -yi_34b_chat,HELM AirBench Hate/Toxicity,0.377,[],helm_airbench_240916.csv -mixtral_8x22b_instruct_v0_1,HELM AirBench Hate/Toxicity,0.417,[],helm_airbench_240916.csv -mixtral_8x7b_instruct_v0_1,HELM AirBench Hate/Toxicity,0.504,[],helm_airbench_240916.csv -command_r,HELM AirBench Hate/Toxicity,0.586,[],helm_airbench_240916.csv -command_r_plus,HELM AirBench Hate/Toxicity,0.653,[],helm_airbench_240916.csv -mistral_7b_instruct_v0_3,HELM AirBench Hate/Toxicity,0.501,[],helm_airbench_240916.csv -dbrx_instructruct,HELM AirBench Hate/Toxicity,0.624,[],helm_airbench_240916.csv -claude_3_haiku_20240307,HELM AirBench Sexual Content,0.219,[],helm_airbench_240916.csv -claude_3_sonnet_20240229,HELM AirBench Sexual Content,0.184,[],helm_airbench_240916.csv -llama3_8b_chat,HELM AirBench Sexual Content,0.208,[],helm_airbench_240916.csv -claude_3_opus_20240229,HELM AirBench Sexual Content,0.34,[],helm_airbench_240916.csv -gemini_1_5_pro_001_safety_default,HELM AirBench Sexual Content,0.288,[],helm_airbench_240916.csv -gemini_1_5_flash_001_safety_default,HELM AirBench Sexual Content,0.302,[],helm_airbench_240916.csv -gpt_3_5_turbo_0613,HELM AirBench Sexual Content,0.549,[],helm_airbench_240916.csv -gpt_4_turbo_2024_04_09,HELM AirBench Sexual Content,0.378,[],helm_airbench_240916.csv -llama3_70b_chat,HELM AirBench Sexual Content,0.49,[],helm_airbench_240916.csv -gpt_3_5_turbo_1106,HELM AirBench Sexual Content,0.559,[],helm_airbench_240916.csv -gpt_4o_2024_05_13,HELM AirBench Sexual Content,0.524,[],helm_airbench_240916.csv -gpt_3_5_turbo_0125,HELM AirBench Sexual 
Content,0.667,[],helm_airbench_240916.csv -qwen1_5_72b_chat,HELM AirBench Sexual Content,0.635,[],helm_airbench_240916.csv -deepseek_llm_67b_chat,HELM AirBench Sexual Content,0.622,[],helm_airbench_240916.csv -yi_34b_chat,HELM AirBench Sexual Content,0.576,[],helm_airbench_240916.csv -mixtral_8x22b_instruct_v0_1,HELM AirBench Sexual Content,0.569,[],helm_airbench_240916.csv -mixtral_8x7b_instruct_v0_1,HELM AirBench Sexual Content,0.632,[],helm_airbench_240916.csv -command_r,HELM AirBench Sexual Content,0.712,[],helm_airbench_240916.csv -command_r_plus,HELM AirBench Sexual Content,0.729,[],helm_airbench_240916.csv -mistral_7b_instruct_v0_3,HELM AirBench Sexual Content,0.597,[],helm_airbench_240916.csv -dbrx_instructruct,HELM AirBench Sexual Content,0.684,[],helm_airbench_240916.csv -claude_3_haiku_20240307,HELM AirBench Child Harm,0.152,[],helm_airbench_240916.csv -claude_3_sonnet_20240229,HELM AirBench Child Harm,0.133,[],helm_airbench_240916.csv -llama3_8b_chat,HELM AirBench Child Harm,0.276,[],helm_airbench_240916.csv -claude_3_opus_20240229,HELM AirBench Child Harm,0.19,[],helm_airbench_240916.csv -gemini_1_5_pro_001_safety_default,HELM AirBench Child Harm,0.233,[],helm_airbench_240916.csv -gemini_1_5_flash_001_safety_default,HELM AirBench Child Harm,0.286,[],helm_airbench_240916.csv -gpt_3_5_turbo_0613,HELM AirBench Child Harm,0.429,[],helm_airbench_240916.csv -gpt_4_turbo_2024_04_09,HELM AirBench Child Harm,0.381,[],helm_airbench_240916.csv -llama3_70b_chat,HELM AirBench Child Harm,0.267,[],helm_airbench_240916.csv -gpt_3_5_turbo_1106,HELM AirBench Child Harm,0.629,[],helm_airbench_240916.csv -gpt_4o_2024_05_13,HELM AirBench Child Harm,0.552,[],helm_airbench_240916.csv -gpt_3_5_turbo_0125,HELM AirBench Child Harm,0.752,[],helm_airbench_240916.csv -qwen1_5_72b_chat,HELM AirBench Child Harm,0.686,[],helm_airbench_240916.csv -deepseek_llm_67b_chat,HELM AirBench Child Harm,0.643,[],helm_airbench_240916.csv -yi_34b_chat,HELM AirBench Child Harm,0.624,[],helm_airbench_240916.csv -mixtral_8x22b_instruct_v0_1,HELM AirBench Child Harm,0.767,[],helm_airbench_240916.csv -mixtral_8x7b_instruct_v0_1,HELM AirBench Child Harm,0.848,[],helm_airbench_240916.csv -command_r,HELM AirBench Child Harm,0.824,[],helm_airbench_240916.csv -command_r_plus,HELM AirBench Child Harm,0.819,[],helm_airbench_240916.csv -mistral_7b_instruct_v0_3,HELM AirBench Child Harm,0.924,[],helm_airbench_240916.csv -dbrx_instructruct,HELM AirBench Child Harm,0.924,[],helm_airbench_240916.csv -claude_3_haiku_20240307,HELM AirBench Self Harm,0.022,[],helm_airbench_240916.csv -claude_3_sonnet_20240229,HELM AirBench Self Harm,0.0,[],helm_airbench_240916.csv -llama3_8b_chat,HELM AirBench Self Harm,0.089,[],helm_airbench_240916.csv -claude_3_opus_20240229,HELM AirBench Self Harm,0.022,[],helm_airbench_240916.csv -gemini_1_5_pro_001_safety_default,HELM AirBench Self Harm,0.078,[],helm_airbench_240916.csv -gemini_1_5_flash_001_safety_default,HELM AirBench Self Harm,0.022,[],helm_airbench_240916.csv -gpt_3_5_turbo_0613,HELM AirBench Self Harm,0.089,[],helm_airbench_240916.csv -gpt_4_turbo_2024_04_09,HELM AirBench Self Harm,0.156,[],helm_airbench_240916.csv -llama3_70b_chat,HELM AirBench Self Harm,0.078,[],helm_airbench_240916.csv -gpt_3_5_turbo_1106,HELM AirBench Self Harm,0.322,[],helm_airbench_240916.csv -gpt_4o_2024_05_13,HELM AirBench Self Harm,0.189,[],helm_airbench_240916.csv -gpt_3_5_turbo_0125,HELM AirBench Self Harm,0.422,[],helm_airbench_240916.csv -qwen1_5_72b_chat,HELM AirBench Self Harm,0.356,[],helm_airbench_240916.csv 
-deepseek_llm_67b_chat,HELM AirBench Self Harm,0.344,[],helm_airbench_240916.csv -yi_34b_chat,HELM AirBench Self Harm,0.289,[],helm_airbench_240916.csv -mixtral_8x22b_instruct_v0_1,HELM AirBench Self Harm,0.322,[],helm_airbench_240916.csv -mixtral_8x7b_instruct_v0_1,HELM AirBench Self Harm,0.533,[],helm_airbench_240916.csv -command_r,HELM AirBench Self Harm,0.578,[],helm_airbench_240916.csv -command_r_plus,HELM AirBench Self Harm,0.578,[],helm_airbench_240916.csv -mistral_7b_instruct_v0_3,HELM AirBench Self Harm,0.522,[],helm_airbench_240916.csv -dbrx_instructruct,HELM AirBench Self Harm,0.722,[],helm_airbench_240916.csv -claude_3_haiku_20240307,HELM AirBench Political Usage,0.041,[],helm_airbench_240916.csv -claude_3_sonnet_20240229,HELM AirBench Political Usage,0.031,[],helm_airbench_240916.csv -llama3_8b_chat,HELM AirBench Political Usage,0.177,[],helm_airbench_240916.csv -claude_3_opus_20240229,HELM AirBench Political Usage,0.063,[],helm_airbench_240916.csv -gemini_1_5_pro_001_safety_default,HELM AirBench Political Usage,0.161,[],helm_airbench_240916.csv -gemini_1_5_flash_001_safety_default,HELM AirBench Political Usage,0.195,[],helm_airbench_240916.csv -gpt_3_5_turbo_0613,HELM AirBench Political Usage,0.463,[],helm_airbench_240916.csv -gpt_4_turbo_2024_04_09,HELM AirBench Political Usage,0.323,[],helm_airbench_240916.csv -llama3_70b_chat,HELM AirBench Political Usage,0.339,[],helm_airbench_240916.csv -gpt_3_5_turbo_1106,HELM AirBench Political Usage,0.609,[],helm_airbench_240916.csv -gpt_4o_2024_05_13,HELM AirBench Political Usage,0.601,[],helm_airbench_240916.csv -gpt_3_5_turbo_0125,HELM AirBench Political Usage,0.725,[],helm_airbench_240916.csv -qwen1_5_72b_chat,HELM AirBench Political Usage,0.616,[],helm_airbench_240916.csv -deepseek_llm_67b_chat,HELM AirBench Political Usage,0.532,[],helm_airbench_240916.csv -yi_34b_chat,HELM AirBench Political Usage,0.52,[],helm_airbench_240916.csv -mixtral_8x22b_instruct_v0_1,HELM AirBench Political Usage,0.747,[],helm_airbench_240916.csv -mixtral_8x7b_instruct_v0_1,HELM AirBench Political Usage,0.808,[],helm_airbench_240916.csv -command_r,HELM AirBench Political Usage,0.861,[],helm_airbench_240916.csv -command_r_plus,HELM AirBench Political Usage,0.895,[],helm_airbench_240916.csv -mistral_7b_instruct_v0_3,HELM AirBench Political Usage,0.909,[],helm_airbench_240916.csv -dbrx_instructruct,HELM AirBench Political Usage,0.963,[],helm_airbench_240916.csv -claude_3_haiku_20240307,HELM AirBench Economic Harm,0.04,[],helm_airbench_240916.csv -claude_3_sonnet_20240229,HELM AirBench Economic Harm,0.02,[],helm_airbench_240916.csv -llama3_8b_chat,HELM AirBench Economic Harm,0.14,[],helm_airbench_240916.csv -claude_3_opus_20240229,HELM AirBench Economic Harm,0.06,[],helm_airbench_240916.csv -gemini_1_5_pro_001_safety_default,HELM AirBench Economic Harm,0.09,[],helm_airbench_240916.csv -gemini_1_5_flash_001_safety_default,HELM AirBench Economic Harm,0.153,[],helm_airbench_240916.csv -gpt_3_5_turbo_0613,HELM AirBench Economic Harm,0.433,[],helm_airbench_240916.csv -gpt_4_turbo_2024_04_09,HELM AirBench Economic Harm,0.293,[],helm_airbench_240916.csv -llama3_70b_chat,HELM AirBench Economic Harm,0.34,[],helm_airbench_240916.csv -gpt_3_5_turbo_1106,HELM AirBench Economic Harm,0.623,[],helm_airbench_240916.csv -gpt_4o_2024_05_13,HELM AirBench Economic Harm,0.587,[],helm_airbench_240916.csv -gpt_3_5_turbo_0125,HELM AirBench Economic Harm,0.71,[],helm_airbench_240916.csv -qwen1_5_72b_chat,HELM AirBench Economic Harm,0.623,[],helm_airbench_240916.csv 
-deepseek_llm_67b_chat,HELM AirBench Economic Harm,0.567,[],helm_airbench_240916.csv -yi_34b_chat,HELM AirBench Economic Harm,0.503,[],helm_airbench_240916.csv -mixtral_8x22b_instruct_v0_1,HELM AirBench Economic Harm,0.647,[],helm_airbench_240916.csv -mixtral_8x7b_instruct_v0_1,HELM AirBench Economic Harm,0.74,[],helm_airbench_240916.csv -command_r,HELM AirBench Economic Harm,0.82,[],helm_airbench_240916.csv -command_r_plus,HELM AirBench Economic Harm,0.897,[],helm_airbench_240916.csv -mistral_7b_instruct_v0_3,HELM AirBench Economic Harm,0.91,[],helm_airbench_240916.csv -dbrx_instructruct,HELM AirBench Economic Harm,0.953,[],helm_airbench_240916.csv -claude_3_haiku_20240307,HELM AirBench Deception,0.089,[],helm_airbench_240916.csv -claude_3_sonnet_20240229,HELM AirBench Deception,0.096,[],helm_airbench_240916.csv -llama3_8b_chat,HELM AirBench Deception,0.259,[],helm_airbench_240916.csv -claude_3_opus_20240229,HELM AirBench Deception,0.126,[],helm_airbench_240916.csv -gemini_1_5_pro_001_safety_default,HELM AirBench Deception,0.215,[],helm_airbench_240916.csv -gemini_1_5_flash_001_safety_default,HELM AirBench Deception,0.3,[],helm_airbench_240916.csv -gpt_3_5_turbo_0613,HELM AirBench Deception,0.522,[],helm_airbench_240916.csv -gpt_4_turbo_2024_04_09,HELM AirBench Deception,0.304,[],helm_airbench_240916.csv -llama3_70b_chat,HELM AirBench Deception,0.385,[],helm_airbench_240916.csv -gpt_3_5_turbo_1106,HELM AirBench Deception,0.659,[],helm_airbench_240916.csv -gpt_4o_2024_05_13,HELM AirBench Deception,0.504,[],helm_airbench_240916.csv -gpt_3_5_turbo_0125,HELM AirBench Deception,0.748,[],helm_airbench_240916.csv -qwen1_5_72b_chat,HELM AirBench Deception,0.733,[],helm_airbench_240916.csv -deepseek_llm_67b_chat,HELM AirBench Deception,0.648,[],helm_airbench_240916.csv -yi_34b_chat,HELM AirBench Deception,0.681,[],helm_airbench_240916.csv -mixtral_8x22b_instruct_v0_1,HELM AirBench Deception,0.726,[],helm_airbench_240916.csv -mixtral_8x7b_instruct_v0_1,HELM AirBench Deception,0.822,[],helm_airbench_240916.csv -command_r,HELM AirBench Deception,0.822,[],helm_airbench_240916.csv -command_r_plus,HELM AirBench Deception,0.867,[],helm_airbench_240916.csv -mistral_7b_instruct_v0_3,HELM AirBench Deception,0.889,[],helm_airbench_240916.csv -dbrx_instructruct,HELM AirBench Deception,0.926,[],helm_airbench_240916.csv -claude_3_haiku_20240307,HELM AirBench Manipulation,0.053,[],helm_airbench_240916.csv -claude_3_sonnet_20240229,HELM AirBench Manipulation,0.127,[],helm_airbench_240916.csv -llama3_8b_chat,HELM AirBench Manipulation,0.353,[],helm_airbench_240916.csv -claude_3_opus_20240229,HELM AirBench Manipulation,0.1,[],helm_airbench_240916.csv -gemini_1_5_pro_001_safety_default,HELM AirBench Manipulation,0.22,[],helm_airbench_240916.csv -gemini_1_5_flash_001_safety_default,HELM AirBench Manipulation,0.293,[],helm_airbench_240916.csv -gpt_3_5_turbo_0613,HELM AirBench Manipulation,0.433,[],helm_airbench_240916.csv -gpt_4_turbo_2024_04_09,HELM AirBench Manipulation,0.34,[],helm_airbench_240916.csv -llama3_70b_chat,HELM AirBench Manipulation,0.427,[],helm_airbench_240916.csv -gpt_3_5_turbo_1106,HELM AirBench Manipulation,0.573,[],helm_airbench_240916.csv -gpt_4o_2024_05_13,HELM AirBench Manipulation,0.54,[],helm_airbench_240916.csv -gpt_3_5_turbo_0125,HELM AirBench Manipulation,0.7,[],helm_airbench_240916.csv -qwen1_5_72b_chat,HELM AirBench Manipulation,0.633,[],helm_airbench_240916.csv -deepseek_llm_67b_chat,HELM AirBench Manipulation,0.573,[],helm_airbench_240916.csv -yi_34b_chat,HELM AirBench 
Manipulation,0.533,[],helm_airbench_240916.csv -mixtral_8x22b_instruct_v0_1,HELM AirBench Manipulation,0.66,[],helm_airbench_240916.csv -mixtral_8x7b_instruct_v0_1,HELM AirBench Manipulation,0.687,[],helm_airbench_240916.csv -command_r,HELM AirBench Manipulation,0.813,[],helm_airbench_240916.csv -command_r_plus,HELM AirBench Manipulation,0.853,[],helm_airbench_240916.csv -mistral_7b_instruct_v0_3,HELM AirBench Manipulation,0.853,[],helm_airbench_240916.csv -dbrx_instructruct,HELM AirBench Manipulation,0.953,[],helm_airbench_240916.csv -claude_3_haiku_20240307,HELM AirBench Defamation,0.037,[],helm_airbench_240916.csv -claude_3_sonnet_20240229,HELM AirBench Defamation,0.074,[],helm_airbench_240916.csv -llama3_8b_chat,HELM AirBench Defamation,0.38,[],helm_airbench_240916.csv -claude_3_opus_20240229,HELM AirBench Defamation,0.074,[],helm_airbench_240916.csv -gemini_1_5_pro_001_safety_default,HELM AirBench Defamation,0.194,[],helm_airbench_240916.csv -gemini_1_5_flash_001_safety_default,HELM AirBench Defamation,0.278,[],helm_airbench_240916.csv -gpt_3_5_turbo_0613,HELM AirBench Defamation,0.463,[],helm_airbench_240916.csv -gpt_4_turbo_2024_04_09,HELM AirBench Defamation,0.167,[],helm_airbench_240916.csv -llama3_70b_chat,HELM AirBench Defamation,0.574,[],helm_airbench_240916.csv -gpt_3_5_turbo_1106,HELM AirBench Defamation,0.481,[],helm_airbench_240916.csv -gpt_4o_2024_05_13,HELM AirBench Defamation,0.426,[],helm_airbench_240916.csv -gpt_3_5_turbo_0125,HELM AirBench Defamation,0.593,[],helm_airbench_240916.csv -qwen1_5_72b_chat,HELM AirBench Defamation,0.63,[],helm_airbench_240916.csv -deepseek_llm_67b_chat,HELM AirBench Defamation,0.407,[],helm_airbench_240916.csv -yi_34b_chat,HELM AirBench Defamation,0.491,[],helm_airbench_240916.csv -mixtral_8x22b_instruct_v0_1,HELM AirBench Defamation,0.463,[],helm_airbench_240916.csv -mixtral_8x7b_instruct_v0_1,HELM AirBench Defamation,0.602,[],helm_airbench_240916.csv -command_r,HELM AirBench Defamation,0.648,[],helm_airbench_240916.csv -command_r_plus,HELM AirBench Defamation,0.815,[],helm_airbench_240916.csv -mistral_7b_instruct_v0_3,HELM AirBench Defamation,0.648,[],helm_airbench_240916.csv -dbrx_instructruct,HELM AirBench Defamation,0.75,[],helm_airbench_240916.csv -claude_3_haiku_20240307,HELM AirBench Fundamental Rights,0.0,[],helm_airbench_240916.csv -claude_3_sonnet_20240229,HELM AirBench Fundamental Rights,0.0,[],helm_airbench_240916.csv -llama3_8b_chat,HELM AirBench Fundamental Rights,0.027,[],helm_airbench_240916.csv -claude_3_opus_20240229,HELM AirBench Fundamental Rights,0.0,[],helm_airbench_240916.csv -gemini_1_5_pro_001_safety_default,HELM AirBench Fundamental Rights,0.06,[],helm_airbench_240916.csv -gemini_1_5_flash_001_safety_default,HELM AirBench Fundamental Rights,0.053,[],helm_airbench_240916.csv -gpt_3_5_turbo_0613,HELM AirBench Fundamental Rights,0.213,[],helm_airbench_240916.csv -gpt_4_turbo_2024_04_09,HELM AirBench Fundamental Rights,0.08,[],helm_airbench_240916.csv -llama3_70b_chat,HELM AirBench Fundamental Rights,0.147,[],helm_airbench_240916.csv -gpt_3_5_turbo_1106,HELM AirBench Fundamental Rights,0.333,[],helm_airbench_240916.csv -gpt_4o_2024_05_13,HELM AirBench Fundamental Rights,0.267,[],helm_airbench_240916.csv -gpt_3_5_turbo_0125,HELM AirBench Fundamental Rights,0.52,[],helm_airbench_240916.csv -qwen1_5_72b_chat,HELM AirBench Fundamental Rights,0.467,[],helm_airbench_240916.csv -deepseek_llm_67b_chat,HELM AirBench Fundamental Rights,0.373,[],helm_airbench_240916.csv -yi_34b_chat,HELM AirBench Fundamental 
Rights,0.227,[],helm_airbench_240916.csv -mixtral_8x22b_instruct_v0_1,HELM AirBench Fundamental Rights,0.573,[],helm_airbench_240916.csv -mixtral_8x7b_instruct_v0_1,HELM AirBench Fundamental Rights,0.627,[],helm_airbench_240916.csv -command_r,HELM AirBench Fundamental Rights,0.773,[],helm_airbench_240916.csv -command_r_plus,HELM AirBench Fundamental Rights,0.8,[],helm_airbench_240916.csv -mistral_7b_instruct_v0_3,HELM AirBench Fundamental Rights,0.893,[],helm_airbench_240916.csv -dbrx_instructruct,HELM AirBench Fundamental Rights,0.947,[],helm_airbench_240916.csv -claude_3_haiku_20240307,HELM AirBench Discrimination/Bias,0.382,[],helm_airbench_240916.csv -claude_3_sonnet_20240229,HELM AirBench Discrimination/Bias,0.332,[],helm_airbench_240916.csv -llama3_8b_chat,HELM AirBench Discrimination/Bias,0.521,[],helm_airbench_240916.csv -claude_3_opus_20240229,HELM AirBench Discrimination/Bias,0.27,[],helm_airbench_240916.csv -gemini_1_5_pro_001_safety_default,HELM AirBench Discrimination/Bias,0.24,[],helm_airbench_240916.csv -gemini_1_5_flash_001_safety_default,HELM AirBench Discrimination/Bias,0.325,[],helm_airbench_240916.csv -gpt_3_5_turbo_0613,HELM AirBench Discrimination/Bias,0.516,[],helm_airbench_240916.csv -gpt_4_turbo_2024_04_09,HELM AirBench Discrimination/Bias,0.461,[],helm_airbench_240916.csv -llama3_70b_chat,HELM AirBench Discrimination/Bias,0.502,[],helm_airbench_240916.csv -gpt_3_5_turbo_1106,HELM AirBench Discrimination/Bias,0.589,[],helm_airbench_240916.csv -gpt_4o_2024_05_13,HELM AirBench Discrimination/Bias,0.575,[],helm_airbench_240916.csv -gpt_3_5_turbo_0125,HELM AirBench Discrimination/Bias,0.624,[],helm_airbench_240916.csv -qwen1_5_72b_chat,HELM AirBench Discrimination/Bias,0.571,[],helm_airbench_240916.csv -deepseek_llm_67b_chat,HELM AirBench Discrimination/Bias,0.584,[],helm_airbench_240916.csv -yi_34b_chat,HELM AirBench Discrimination/Bias,0.559,[],helm_airbench_240916.csv -mixtral_8x22b_instruct_v0_1,HELM AirBench Discrimination/Bias,0.593,[],helm_airbench_240916.csv -mixtral_8x7b_instruct_v0_1,HELM AirBench Discrimination/Bias,0.592,[],helm_airbench_240916.csv -command_r,HELM AirBench Discrimination/Bias,0.678,[],helm_airbench_240916.csv -command_r_plus,HELM AirBench Discrimination/Bias,0.68,[],helm_airbench_240916.csv -mistral_7b_instruct_v0_3,HELM AirBench Discrimination/Bias,0.624,[],helm_airbench_240916.csv -dbrx_instructruct,HELM AirBench Discrimination/Bias,0.675,[],helm_airbench_240916.csv -claude_3_haiku_20240307,HELM AirBench Privacy,0.086,[],helm_airbench_240916.csv -claude_3_sonnet_20240229,HELM AirBench Privacy,0.089,[],helm_airbench_240916.csv -llama3_8b_chat,HELM AirBench Privacy,0.225,[],helm_airbench_240916.csv -claude_3_opus_20240229,HELM AirBench Privacy,0.096,[],helm_airbench_240916.csv -gemini_1_5_pro_001_safety_default,HELM AirBench Privacy,0.123,[],helm_airbench_240916.csv -gemini_1_5_flash_001_safety_default,HELM AirBench Privacy,0.14,[],helm_airbench_240916.csv -gpt_3_5_turbo_0613,HELM AirBench Privacy,0.316,[],helm_airbench_240916.csv -gpt_4_turbo_2024_04_09,HELM AirBench Privacy,0.207,[],helm_airbench_240916.csv -llama3_70b_chat,HELM AirBench Privacy,0.274,[],helm_airbench_240916.csv -gpt_3_5_turbo_1106,HELM AirBench Privacy,0.39,[],helm_airbench_240916.csv -gpt_4o_2024_05_13,HELM AirBench Privacy,0.45,[],helm_airbench_240916.csv -gpt_3_5_turbo_0125,HELM AirBench Privacy,0.471,[],helm_airbench_240916.csv -qwen1_5_72b_chat,HELM AirBench Privacy,0.546,[],helm_airbench_240916.csv -deepseek_llm_67b_chat,HELM AirBench 
Privacy,0.515,[],helm_airbench_240916.csv -yi_34b_chat,HELM AirBench Privacy,0.436,[],helm_airbench_240916.csv -mixtral_8x22b_instruct_v0_1,HELM AirBench Privacy,0.593,[],helm_airbench_240916.csv -mixtral_8x7b_instruct_v0_1,HELM AirBench Privacy,0.579,[],helm_airbench_240916.csv -command_r,HELM AirBench Privacy,0.699,[],helm_airbench_240916.csv -command_r_plus,HELM AirBench Privacy,0.709,[],helm_airbench_240916.csv -mistral_7b_instruct_v0_3,HELM AirBench Privacy,0.717,[],helm_airbench_240916.csv -dbrx_instructruct,HELM AirBench Privacy,0.817,[],helm_airbench_240916.csv -claude_3_haiku_20240307,HELM AirBench Criminal Activities,0.008,[],helm_airbench_240916.csv -claude_3_sonnet_20240229,HELM AirBench Criminal Activities,0.0,[],helm_airbench_240916.csv -llama3_8b_chat,HELM AirBench Criminal Activities,0.0,[],helm_airbench_240916.csv -claude_3_opus_20240229,HELM AirBench Criminal Activities,0.017,[],helm_airbench_240916.csv -gemini_1_5_pro_001_safety_default,HELM AirBench Criminal Activities,0.042,[],helm_airbench_240916.csv -gemini_1_5_flash_001_safety_default,HELM AirBench Criminal Activities,0.033,[],helm_airbench_240916.csv -gpt_3_5_turbo_0613,HELM AirBench Criminal Activities,0.108,[],helm_airbench_240916.csv -gpt_4_turbo_2024_04_09,HELM AirBench Criminal Activities,0.058,[],helm_airbench_240916.csv -llama3_70b_chat,HELM AirBench Criminal Activities,0.025,[],helm_airbench_240916.csv -gpt_3_5_turbo_1106,HELM AirBench Criminal Activities,0.267,[],helm_airbench_240916.csv -gpt_4o_2024_05_13,HELM AirBench Criminal Activities,0.233,[],helm_airbench_240916.csv -gpt_3_5_turbo_0125,HELM AirBench Criminal Activities,0.45,[],helm_airbench_240916.csv -qwen1_5_72b_chat,HELM AirBench Criminal Activities,0.35,[],helm_airbench_240916.csv -deepseek_llm_67b_chat,HELM AirBench Criminal Activities,0.3,[],helm_airbench_240916.csv -yi_34b_chat,HELM AirBench Criminal Activities,0.275,[],helm_airbench_240916.csv -mixtral_8x22b_instruct_v0_1,HELM AirBench Criminal Activities,0.646,[],helm_airbench_240916.csv -mixtral_8x7b_instruct_v0_1,HELM AirBench Criminal Activities,0.742,[],helm_airbench_240916.csv -command_r,HELM AirBench Criminal Activities,0.717,[],helm_airbench_240916.csv -command_r_plus,HELM AirBench Criminal Activities,0.817,[],helm_airbench_240916.csv -mistral_7b_instruct_v0_3,HELM AirBench Criminal Activities,0.942,[],helm_airbench_240916.csv -dbrx_instructruct,HELM AirBench Criminal Activities,0.967,[],helm_airbench_240916.csv -claude_3_haiku_20240307,HELM AirBench AIR Score,0.198,[],helm_airbench_240916.csv -claude_3_sonnet_20240229,HELM AirBench AIR Score,0.177,[],helm_airbench_240916.csv -llama3_8b_chat,HELM AirBench AIR Score,0.386,[],helm_airbench_240916.csv -claude_3_opus_20240229,HELM AirBench AIR Score,0.177,[],helm_airbench_240916.csv -gemini_1_5_pro_001_safety_default,HELM AirBench AIR Score,0.189,[],helm_airbench_240916.csv -gemini_1_5_flash_001_safety_default,HELM AirBench AIR Score,0.233,[],helm_airbench_240916.csv -gpt_3_5_turbo_0613,HELM AirBench AIR Score,0.407,[],helm_airbench_240916.csv -gpt_4_turbo_2024_04_09,HELM AirBench AIR Score,0.322,[],helm_airbench_240916.csv -llama3_70b_chat,HELM AirBench AIR Score,0.386,[],helm_airbench_240916.csv -gpt_3_5_turbo_1106,HELM AirBench AIR Score,0.511,[],helm_airbench_240916.csv -gpt_4o_2024_05_13,HELM AirBench AIR Score,0.506,[],helm_airbench_240916.csv -gpt_3_5_turbo_0125,HELM AirBench AIR Score,0.593,[],helm_airbench_240916.csv -qwen1_5_72b_chat,HELM AirBench AIR Score,0.558,[],helm_airbench_240916.csv -deepseek_llm_67b_chat,HELM AirBench 
AIR Score,0.533,[],helm_airbench_240916.csv -yi_34b_chat,HELM AirBench AIR Score,0.507,[],helm_airbench_240916.csv -mixtral_8x22b_instruct_v0_1,HELM AirBench AIR Score,0.611,[],helm_airbench_240916.csv -mixtral_8x7b_instruct_v0_1,HELM AirBench AIR Score,0.645,[],helm_airbench_240916.csv -command_r,HELM AirBench AIR Score,0.722,[],helm_airbench_240916.csv -command_r_plus,HELM AirBench AIR Score,0.747,[],helm_airbench_240916.csv -mistral_7b_instruct_v0_3,HELM AirBench AIR Score,0.718,[],helm_airbench_240916.csv -dbrx_instructruct,HELM AirBench AIR Score,0.786,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Security Risks,1.0,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Security Risks,0.9957894736842106,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Security Risks,0.9368421052631579,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Security Risks,0.9368421052631579,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Security Risks,0.9031578947368422,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Security Risks,0.8747368421052631,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Security Risks,0.8610526315789473,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Security Risks,0.8557894736842105,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Security Risks,0.8389473684210527,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Security Risks,0.7157894736842105,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Security Risks,0.6926315789473685,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Security Risks,0.5789473684210527,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Security Risks,0.5284210526315789,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Security Risks,0.5242105263157895,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Security Risks,0.46947368421052627,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Security Risks,0.2989473684210525,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Security Risks,0.18736842105263152,[],helm_airbench_240916.csv +command_r,HELM AirBench Security Risks,0.18210526315789466,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Security Risks,0.13263157894736843,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Security Risks,0.02421052631578935,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Security Risks,0.0,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Operational Misuses,0.569060773480663,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Operational Misuses,0.7513812154696133,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Operational Misuses,0.3443830570902394,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Operational Misuses,0.7440147329650093,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Operational Misuses,1.0,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Operational Misuses,0.9392265193370166,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Operational Misuses,0.6077348066298341,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Operational Misuses,0.4511970534069981,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Operational 
Misuses,0.28545119705340694,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Operational Misuses,0.4511970534069981,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Operational Misuses,0.1252302025782689,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Operational Misuses,0.20810313075506437,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Operational Misuses,0.20073664825046034,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Operational Misuses,0.3167587476979742,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Operational Misuses,0.34990791896869256,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Operational Misuses,0.2523020257826887,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Operational Misuses,0.11602209944751385,[],helm_airbench_240916.csv +command_r,HELM AirBench Operational Misuses,0.0055248618784528025,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Operational Misuses,0.0,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Operational Misuses,0.07366482504604055,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Operational Misuses,0.012891344383056946,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Violence & Extremism,0.9956204379562044,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Violence & Extremism,1.0,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Violence & Extremism,0.9153284671532846,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Violence & Extremism,0.9547445255474453,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Violence & Extremism,0.8583941605839416,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Violence & Extremism,0.8058394160583942,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Violence & Extremism,0.5635036496350364,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Violence & Extremism,0.7474452554744525,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Violence & Extremism,0.7153284671532847,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Violence & Extremism,0.3678832116788322,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Violence & Extremism,0.45839416058394156,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Violence & Extremism,0.2583941605839415,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Violence & Extremism,0.3824817518248176,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Violence & Extremism,0.43795620437956195,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Violence & Extremism,0.4131386861313868,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Violence & Extremism,0.16788321167883213,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Violence & Extremism,0.15766423357664228,[],helm_airbench_240916.csv +command_r,HELM AirBench Violence & Extremism,0.09635036496350358,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Violence & Extremism,0.03649635036496357,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Violence & Extremism,0.05109489051094884,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Violence & Extremism,0.0,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Hate/Toxicity,1.0,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench 
Hate/Toxicity,0.9765100671140939,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Hate/Toxicity,0.7802013422818792,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Hate/Toxicity,0.9429530201342282,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Hate/Toxicity,0.8691275167785235,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Hate/Toxicity,0.8204697986577181,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Hate/Toxicity,0.6359060402684563,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Hate/Toxicity,0.8540268456375839,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Hate/Toxicity,0.5436241610738255,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Hate/Toxicity,0.3691275167785235,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Hate/Toxicity,0.546979865771812,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Hate/Toxicity,0.23993288590604023,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Hate/Toxicity,0.4731543624161073,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Hate/Toxicity,0.4832214765100671,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Hate/Toxicity,0.46308724832214765,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Hate/Toxicity,0.3959731543624161,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Hate/Toxicity,0.25,[],helm_airbench_240916.csv +command_r,HELM AirBench Hate/Toxicity,0.11241610738255048,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Hate/Toxicity,0.0,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Hate/Toxicity,0.25503355704697983,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Hate/Toxicity,0.04865771812080544,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Sexual Content,0.9357798165137614,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Sexual Content,1.0,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Sexual Content,0.9559633027522936,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Sexual Content,0.7137614678899082,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Sexual Content,0.8091743119266055,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Sexual Content,0.7834862385321101,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Sexual Content,0.33027522935779796,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Sexual Content,0.6440366972477063,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Sexual Content,0.4385321100917431,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Sexual Content,0.31192660550458695,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Sexual Content,0.37614678899082554,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Sexual Content,0.11376146788990804,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Sexual Content,0.17247706422018338,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Sexual Content,0.19633027522935764,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Sexual Content,0.2807339449541284,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Sexual Content,0.29357798165137616,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Sexual Content,0.17798165137614663,[],helm_airbench_240916.csv 
+command_r,HELM AirBench Sexual Content,0.03119266055045855,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Sexual Content,0.0,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Sexual Content,0.24220183486238522,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Sexual Content,0.08256880733944938,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Child Harm,0.9759797724399495,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Child Harm,1.0,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Child Harm,0.8192161820480405,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Child Harm,0.9279393173198482,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Child Harm,0.8735777496839443,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Child Harm,0.806573957016435,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Child Harm,0.6257901390644753,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Child Harm,0.6864728192161821,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Child Harm,0.8305941845764855,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Child Harm,0.37294563843236417,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Child Harm,0.47029077117572693,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Child Harm,0.21744627054361576,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Child Harm,0.3008849557522124,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Child Harm,0.35524652338811635,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Child Harm,0.37926675094816686,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Child Harm,0.19848293299620734,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Child Harm,0.09608091024020238,[],helm_airbench_240916.csv +command_r,HELM AirBench Child Harm,0.1264222503160557,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Child Harm,0.1327433628318585,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Child Harm,0.0,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Child Harm,0.0,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Self Harm,0.9695290858725761,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Self Harm,1.0,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Self Harm,0.8767313019390581,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Self Harm,0.9695290858725761,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Self Harm,0.8919667590027701,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Self Harm,0.9695290858725761,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Self Harm,0.8767313019390581,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Self Harm,0.7839335180055401,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Self Harm,0.8919667590027701,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Self Harm,0.554016620498615,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Self Harm,0.7382271468144044,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Self Harm,0.41551246537396125,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Self Harm,0.5069252077562327,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Self 
Harm,0.5235457063711911,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Self Harm,0.5997229916897506,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Self Harm,0.554016620498615,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Self Harm,0.2617728531855955,[],helm_airbench_240916.csv +command_r,HELM AirBench Self Harm,0.19944598337950137,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Self Harm,0.19944598337950137,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Self Harm,0.2770083102493074,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Self Harm,0.0,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Political Usage,0.9892703862660944,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Political Usage,1.0,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Political Usage,0.8433476394849786,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Political Usage,0.9656652360515021,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Political Usage,0.8605150214592274,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Political Usage,0.8240343347639485,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Political Usage,0.5364806866952789,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Political Usage,0.686695278969957,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Political Usage,0.6695278969957081,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Political Usage,0.3798283261802575,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Political Usage,0.38841201716738194,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Political Usage,0.25536480686695284,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Political Usage,0.37231759656652363,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Political Usage,0.46244635193133043,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Political Usage,0.47532188841201717,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Political Usage,0.23175965665236054,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Political Usage,0.16630901287553645,[],helm_airbench_240916.csv +command_r,HELM AirBench Political Usage,0.1094420600858369,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Political Usage,0.07296137339055786,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Political Usage,0.05793991416309008,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Political Usage,0.0,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Economic Harm,0.9785637727759914,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Economic Harm,1.0,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Economic Harm,0.8713826366559485,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Economic Harm,0.9571275455519829,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Economic Harm,0.92497320471597,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Economic Harm,0.857449088960343,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Economic Harm,0.557341907824223,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Economic Harm,0.707395498392283,[],helm_airbench_240916.csv +llama3_70b_chat,HELM 
AirBench Economic Harm,0.6570203644158628,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Economic Harm,0.3536977491961415,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Economic Harm,0.3922829581993569,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Economic Harm,0.26045016077170424,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Economic Harm,0.3536977491961415,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Economic Harm,0.4137191854233655,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Economic Harm,0.4823151125401929,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Economic Harm,0.32797427652733113,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Economic Harm,0.22829581993569126,[],helm_airbench_240916.csv +command_r,HELM AirBench Economic Harm,0.142550911039657,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Economic Harm,0.060021436227224,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Economic Harm,0.04608788853161838,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Economic Harm,0.0,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Deception,1.0,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Deception,0.991636798088411,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Deception,0.7968936678614098,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Deception,0.955794504181601,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Deception,0.8494623655913979,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Deception,0.7479091995221028,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Deception,0.48267622461170845,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Deception,0.7431302270011948,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Deception,0.6463560334528076,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Deception,0.31899641577060933,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Deception,0.5041816009557945,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Deception,0.21266427718040626,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Deception,0.23058542413381133,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Deception,0.3321385902031063,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Deception,0.29271206690561524,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Deception,0.23894862604540035,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Deception,0.12425328554360826,[],helm_airbench_240916.csv +command_r,HELM AirBench Deception,0.12425328554360826,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Deception,0.07048984468339314,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Deception,0.044205495818399054,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Deception,0.0,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Manipulation,1.0,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Manipulation,0.9177777777777778,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Manipulation,0.6666666666666666,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Manipulation,0.9477777777777777,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM 
AirBench Manipulation,0.8144444444444444,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Manipulation,0.7333333333333334,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Manipulation,0.5777777777777777,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Manipulation,0.681111111111111,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Manipulation,0.5844444444444443,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Manipulation,0.4222222222222223,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Manipulation,0.4588888888888888,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Manipulation,0.2811111111111111,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Manipulation,0.3555555555555555,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Manipulation,0.4222222222222223,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Manipulation,0.46666666666666656,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Manipulation,0.3255555555555555,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Manipulation,0.29555555555555546,[],helm_airbench_240916.csv +command_r,HELM AirBench Manipulation,0.15555555555555556,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Manipulation,0.11111111111111105,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Manipulation,0.11111111111111105,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Manipulation,0.0,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Defamation,1.0,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Defamation,0.9524421593830334,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Defamation,0.5591259640102827,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Defamation,0.9524421593830334,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Defamation,0.7982005141388174,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Defamation,0.6902313624678662,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Defamation,0.45244215938303334,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Defamation,0.8329048843187661,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Defamation,0.3097686375321337,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Defamation,0.42930591259640094,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Defamation,0.4999999999999999,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Defamation,0.28534704370179953,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Defamation,0.23778920308483287,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Defamation,0.5244215938303342,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Defamation,0.41645244215938293,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Defamation,0.45244215938303334,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Defamation,0.2737789203084833,[],helm_airbench_240916.csv +command_r,HELM AirBench Defamation,0.21465295629820047,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Defamation,0.0,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Defamation,0.21465295629820047,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Defamation,0.08354755784061696,[],helm_airbench_240916.csv 
+claude_3_haiku_20240307,HELM AirBench Fundamental Rights,1.0,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Fundamental Rights,1.0,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Fundamental Rights,0.9714889123548046,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Fundamental Rights,1.0,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Fundamental Rights,0.9366420274551215,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Fundamental Rights,0.9440337909186906,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Fundamental Rights,0.7750791974656811,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Fundamental Rights,0.9155227032734953,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Fundamental Rights,0.8447729672650475,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Fundamental Rights,0.648363252375924,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Fundamental Rights,0.7180570221752904,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Fundamental Rights,0.450897571277719,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Fundamental Rights,0.5068637803590285,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Fundamental Rights,0.6061246040126715,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Fundamental Rights,0.7602956705385427,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Fundamental Rights,0.3949313621964098,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Fundamental Rights,0.337909186906019,[],helm_airbench_240916.csv +command_r,HELM AirBench Fundamental Rights,0.1837381203801478,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Fundamental Rights,0.15522703273495242,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Fundamental Rights,0.057022175290390664,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Fundamental Rights,0.0,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Discrimination/Bias,0.6772727272727272,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Discrimination/Bias,0.7909090909090909,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Discrimination/Bias,0.36136363636363633,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Discrimination/Bias,0.9318181818181818,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Discrimination/Bias,1.0,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Discrimination/Bias,0.8068181818181818,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Discrimination/Bias,0.3727272727272728,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Discrimination/Bias,0.4977272727272727,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Discrimination/Bias,0.40454545454545465,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Discrimination/Bias,0.206818181818182,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Discrimination/Bias,0.23863636363636387,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Discrimination/Bias,0.12727272727272732,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Discrimination/Bias,0.2477272727272729,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Discrimination/Bias,0.21818181818181837,[],helm_airbench_240916.csv +yi_34b_chat,HELM 
AirBench Discrimination/Bias,0.2749999999999999,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Discrimination/Bias,0.19772727272727286,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Discrimination/Bias,0.20000000000000018,[],helm_airbench_240916.csv +command_r,HELM AirBench Discrimination/Bias,0.004545454545454519,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Discrimination/Bias,0.0,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Discrimination/Bias,0.12727272727272732,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Discrimination/Bias,0.011363636363636354,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Privacy,1.0,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Privacy,0.9958960328317373,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Privacy,0.8098495212038304,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Privacy,0.9863201094391245,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Privacy,0.9493844049247606,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Privacy,0.9261285909712722,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Privacy,0.6853625170998632,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Privacy,0.8344733242134063,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Privacy,0.7428180574555403,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Privacy,0.5841313269493844,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Privacy,0.5020519835841313,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Privacy,0.4733242134062927,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Privacy,0.37072503419972624,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Privacy,0.4131326949384404,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Privacy,0.5212038303693571,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Privacy,0.3064295485636115,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Privacy,0.32558139534883723,[],helm_airbench_240916.csv +command_r,HELM AirBench Privacy,0.16142270861833108,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Privacy,0.14774281805745548,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Privacy,0.13679890560875507,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Privacy,0.0,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Criminal Activities,0.9917269906928645,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Criminal Activities,1.0,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Criminal Activities,1.0,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Criminal Activities,0.9824198552223371,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Criminal Activities,0.9565667011375387,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Criminal Activities,0.9658738366080661,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Criminal Activities,0.8883143743536711,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Criminal Activities,0.9400206825232679,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Criminal Activities,0.9741468459152016,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Criminal 
Activities,0.7238883143743536,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Criminal Activities,0.7590486039296794,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Criminal Activities,0.5346432264736298,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Criminal Activities,0.6380558428128231,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Criminal Activities,0.6897621509824199,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Criminal Activities,0.7156153050672182,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Criminal Activities,0.33195449844881075,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Criminal Activities,0.23267838676318509,[],helm_airbench_240916.csv +command_r,HELM AirBench Criminal Activities,0.25853154084798347,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Criminal Activities,0.15511892450879006,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Criminal Activities,0.02585315408479838,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Criminal Activities,0.0,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench AIR Score,0.9655172413793103,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench AIR Score,1.0,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench AIR Score,0.6568144499178982,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench AIR Score,1.0,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench AIR Score,0.9802955665024631,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench AIR Score,0.9080459770114943,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench AIR Score,0.6223316912972086,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench AIR Score,0.7619047619047619,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench AIR Score,0.6568144499178982,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench AIR Score,0.451559934318555,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench AIR Score,0.45977011494252873,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench AIR Score,0.31691297208538594,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench AIR Score,0.37438423645320185,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench AIR Score,0.41543513957307054,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench AIR Score,0.458128078817734,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench AIR Score,0.28735632183908044,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench AIR Score,0.23152709359605905,[],helm_airbench_240916.csv +command_r,HELM AirBench AIR Score,0.1050903119868638,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench AIR Score,0.064039408866995,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench AIR Score,0.11165845648604278,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench AIR Score,0.0,[],helm_airbench_240916.csv claude_3_5_sonnet,OpenCompass,67.9,[],opencompass_240829.csv gpt_4o_20240513,OpenCompass,67.7,[],opencompass_240829.csv mistral_large,OpenCompass,63.2,[],opencompass_240829.csv @@ -20998,36 +20998,36 @@ orca_2_13b,BIGGEN Multilingual,2.043,[],biggen_240829.csv qwen1_5_0_5b_chat,BIGGEN Multilingual,1.159,[],biggen_240829.csv orca_2_7b,BIGGEN Multilingual,1.729,[],biggen_240829.csv aya_101,BIGGEN Multilingual,1.129,[],biggen_240829.csv 
-command_r,ruler,85.5,,ruler_bench_241002.csv -command_r_0824,ruler,86.0,,ruler_bench_241002.csv -command_r_plus,ruler,82.7,,ruler_bench_241002.csv -command_r_plus_0824,ruler,83.4,,ruler_bench_241002.csv -dbrx,ruler,38.0,,ruler_bench_241002.csv -film_7b*,ruler,66.4,,ruler_bench_241002.csv -gemini_1_5_pro,ruler,95.5,,ruler_bench_241002.csv -glm3,ruler,62.0,,ruler_bench_241002.csv -glm4,ruler,88.0,,ruler_bench_241002.csv -gpt_4_1106_preview,ruler,89.0,,ruler_bench_241002.csv -internlm2_5,ruler,77.8,,ruler_bench_241002.csv -jamba_1_5_large,ruler,95.7,,ruler_bench_241002.csv -jamba_1_5_mini,ruler,93.1,,ruler_bench_241002.csv -llama3,ruler,82.6,,ruler_bench_241002.csv -llama3_1,ruler,85.5,,ruler_bench_241002.csv -longalpaca,ruler,24.7,,ruler_bench_241002.csv -longchat,ruler,33.1,,ruler_bench_241002.csv -lwm,ruler,69.9,,ruler_bench_241002.csv -megabeam_mistral,ruler,87.3,,ruler_bench_241002.csv -mistral,ruler,55.6,,ruler_bench_241002.csv -mistral_large,ruler,70.6,,ruler_bench_241002.csv -mistral_nemo,ruler,54.7,,ruler_bench_241002.csv -mixtral_8x22b,ruler,73.5,,ruler_bench_241002.csv -mixtral_8x7b,ruler,72.8,,ruler_bench_241002.csv -phi3_medium,ruler,74.8,,ruler_bench_241002.csv -phi3_mini,ruler,80.9,,ruler_bench_241002.csv -qwen1_5,ruler,37.5,,ruler_bench_241002.csv -qwen2,ruler,79.6,,ruler_bench_241002.csv -together,ruler,33.8,,ruler_bench_241002.csv -yi,ruler,84.8,,ruler_bench_241002.csv +command_r,RULER,85.5,,ruler_bench_241002.csv +command_r_0824,RULER,86.0,,ruler_bench_241002.csv +command_r_plus,RULER,82.7,,ruler_bench_241002.csv +command_r_plus_0824,RULER,83.4,,ruler_bench_241002.csv +dbrx,RULER,38.0,,ruler_bench_241002.csv +film_7b*,RULER,66.4,,ruler_bench_241002.csv +gemini_1_5_pro,RULER,95.5,,ruler_bench_241002.csv +glm3,RULER,62.0,,ruler_bench_241002.csv +glm4,RULER,88.0,,ruler_bench_241002.csv +gpt_4_1106_preview,RULER,89.0,,ruler_bench_241002.csv +internlm2_5,RULER,77.8,,ruler_bench_241002.csv +jamba_1_5_large,RULER,95.7,,ruler_bench_241002.csv +jamba_1_5_mini,RULER,93.1,,ruler_bench_241002.csv +llama3,RULER,82.6,,ruler_bench_241002.csv +llama3_1,RULER,85.5,,ruler_bench_241002.csv +longalpaca,RULER,24.7,,ruler_bench_241002.csv +longchat,RULER,33.1,,ruler_bench_241002.csv +lwm,RULER,69.9,,ruler_bench_241002.csv +megabeam_mistral,RULER,87.3,,ruler_bench_241002.csv +mistral,RULER,55.6,,ruler_bench_241002.csv +mistral_large,RULER,70.6,,ruler_bench_241002.csv +mistral_nemo,RULER,54.7,,ruler_bench_241002.csv +mixtral_8x22b,RULER,73.5,,ruler_bench_241002.csv +mixtral_8x7b,RULER,72.8,,ruler_bench_241002.csv +phi3_medium,RULER,74.8,,ruler_bench_241002.csv +phi3_mini,RULER,80.9,,ruler_bench_241002.csv +qwen1_5,RULER,37.5,,ruler_bench_241002.csv +qwen2,RULER,79.6,,ruler_bench_241002.csv +together,RULER,33.8,,ruler_bench_241002.csv +yi,RULER,84.8,,ruler_bench_241002.csv zephyr_7b_beta,LiveBench 240624,17.32,[],livebench_240701.csv zephyr_7b_alpha,LiveBench 240624,19.28,[],livebench_240701.csv yi_6b_chat,LiveBench 240624,9.02,[],livebench_240701.csv @@ -21518,505 +21518,505 @@ claude_3_opus_20240229,LiveBench Instruction Following Average,70.87,[],livebenc claude_3_haiku_20240307,LiveBench Instruction Following Average,64.03,[],livebench_240701.csv claude_3_5_sonnet_20240620,LiveBench Instruction Following Average,72.3,[],livebench_240701.csv chatgpt_4o_latest,LiveBench Instruction Following Average,72.52,[],livebench_240701.csv -abab5_5_chat,hydrox_integrity,8.09,,hydrox_safety_241001.csv -abab5_5_chat,hydrox_overall_score,6.6,,hydrox_safety_241001.csv 
-abab5_5_chat,hydrox_privacy,5.13,,hydrox_safety_241001.csv -abab5_5_chat,hydrox_safety,8.32,,hydrox_safety_241001.csv -abab5_5_chat,hydrox_security,4.85,,hydrox_safety_241001.csv -abab5_5s_chat,hydrox_integrity,19.46,,hydrox_safety_241001.csv -abab5_5s_chat,hydrox_overall_score,19.12,,hydrox_safety_241001.csv -abab5_5s_chat,hydrox_privacy,20.63,,hydrox_safety_241001.csv -abab5_5s_chat,hydrox_safety,22.54,,hydrox_safety_241001.csv -abab5_5s_chat,hydrox_security,14.17,,hydrox_safety_241001.csv -claude_3_5_sonnet,hydrox_integrity,95.56,,hydrox_safety_241001.csv -claude_3_5_sonnet,hydrox_overall_score,94.18,,hydrox_safety_241001.csv -claude_3_5_sonnet,hydrox_privacy,93.83,,hydrox_safety_241001.csv -claude_3_5_sonnet,hydrox_safety,94.75,,hydrox_safety_241001.csv -claude_3_5_sonnet,hydrox_security,92.61,,hydrox_safety_241001.csv -claude_3_haiku,hydrox_integrity,89.53,,hydrox_safety_241001.csv -claude_3_haiku,hydrox_overall_score,91.59,,hydrox_safety_241001.csv -claude_3_haiku,hydrox_privacy,93.69,,hydrox_safety_241001.csv -claude_3_haiku,hydrox_safety,91.52,,hydrox_safety_241001.csv -claude_3_haiku,hydrox_security,91.39,,hydrox_safety_241001.csv -claude_3_opus,hydrox_integrity,94.08,,hydrox_safety_241001.csv -claude_3_opus,hydrox_overall_score,92.02,,hydrox_safety_241001.csv -claude_3_opus,hydrox_privacy,91.26,,hydrox_safety_241001.csv -claude_3_opus,hydrox_safety,92.5,,hydrox_safety_241001.csv -claude_3_opus,hydrox_security,90.47,,hydrox_safety_241001.csv -claude_3_sonnet,hydrox_integrity,94.14,,hydrox_safety_241001.csv -claude_3_sonnet,hydrox_overall_score,93.62,,hydrox_safety_241001.csv -claude_3_sonnet,hydrox_privacy,94.36,,hydrox_safety_241001.csv -claude_3_sonnet,hydrox_safety,92.33,,hydrox_safety_241001.csv -claude_3_sonnet,hydrox_security,94.62,,hydrox_safety_241001.csv -deepseek_v2_chat_0628,hydrox_integrity,0.0,,hydrox_safety_241001.csv -deepseek_v2_chat_0628,hydrox_overall_score,50.0,,hydrox_safety_241001.csv -deepseek_v2_chat_0628,hydrox_privacy,0.0,,hydrox_safety_241001.csv -deepseek_v2_chat_0628,hydrox_safety,50.0,,hydrox_safety_241001.csv -deepseek_v2_chat_0628,hydrox_security,0.0,,hydrox_safety_241001.csv -deepseek_v2_lite_chat,hydrox_integrity,45.93,,hydrox_safety_241001.csv -deepseek_v2_lite_chat,hydrox_overall_score,44.91,,hydrox_safety_241001.csv -deepseek_v2_lite_chat,hydrox_privacy,48.84,,hydrox_safety_241001.csv -deepseek_v2_lite_chat,hydrox_safety,44.26,,hydrox_safety_241001.csv -deepseek_v2_lite_chat,hydrox_security,41.91,,hydrox_safety_241001.csv -dolly_v2_12b,hydrox_integrity,3.72,,hydrox_safety_241001.csv -dolly_v2_12b,hydrox_overall_score,6.21,,hydrox_safety_241001.csv -dolly_v2_12b,hydrox_privacy,3.48,,hydrox_safety_241001.csv -dolly_v2_12b,hydrox_safety,11.46,,hydrox_safety_241001.csv -dolly_v2_12b,hydrox_security,3.39,,hydrox_safety_241001.csv -dolly_v2_3b,hydrox_integrity,0.18,,hydrox_safety_241001.csv -dolly_v2_3b,hydrox_overall_score,1.81,,hydrox_safety_241001.csv -dolly_v2_3b,hydrox_privacy,1.08,,hydrox_safety_241001.csv -dolly_v2_3b,hydrox_safety,4.08,,hydrox_safety_241001.csv -dolly_v2_3b,hydrox_security,0.55,,hydrox_safety_241001.csv -dolly_v2_7b,hydrox_integrity,8.33,,hydrox_safety_241001.csv -dolly_v2_7b,hydrox_overall_score,7.79,,hydrox_safety_241001.csv -dolly_v2_7b,hydrox_privacy,8.33,,hydrox_safety_241001.csv -dolly_v2_7b,hydrox_safety,9.92,,hydrox_safety_241001.csv -dolly_v2_7b,hydrox_security,4.96,,hydrox_safety_241001.csv -falcon_40b,hydrox_integrity,0.64,,hydrox_safety_241001.csv -falcon_40b,hydrox_overall_score,0.9,,hydrox_safety_241001.csv 
-falcon_40b,hydrox_privacy,0.25,,hydrox_safety_241001.csv -falcon_40b,hydrox_safety,2.08,,hydrox_safety_241001.csv -falcon_40b,hydrox_security,0.4,,hydrox_safety_241001.csv -falcon_40b_instruct,hydrox_integrity,30.32,,hydrox_safety_241001.csv -falcon_40b_instruct,hydrox_overall_score,27.55,,hydrox_safety_241001.csv -falcon_40b_instruct,hydrox_privacy,30.83,,hydrox_safety_241001.csv -falcon_40b_instruct,hydrox_safety,28.1,,hydrox_safety_241001.csv -falcon_40b_instruct,hydrox_security,22.97,,hydrox_safety_241001.csv -falcon_7b,hydrox_integrity,0.23,,hydrox_safety_241001.csv -falcon_7b,hydrox_overall_score,0.51,,hydrox_safety_241001.csv -falcon_7b,hydrox_privacy,0.11,,hydrox_safety_241001.csv -falcon_7b,hydrox_safety,1.05,,hydrox_safety_241001.csv -falcon_7b,hydrox_security,0.43,,hydrox_safety_241001.csv -falcon_7b_instruct,hydrox_integrity,15.76,,hydrox_safety_241001.csv -falcon_7b_instruct,hydrox_overall_score,14.01,,hydrox_safety_241001.csv -falcon_7b_instruct,hydrox_privacy,11.3,,hydrox_safety_241001.csv -falcon_7b_instruct,hydrox_safety,14.64,,hydrox_safety_241001.csv -falcon_7b_instruct,hydrox_security,14.01,,hydrox_safety_241001.csv -gemini_1_0_pro,hydrox_integrity,87.11,,hydrox_safety_241001.csv -gemini_1_0_pro,hydrox_overall_score,77.2,,hydrox_safety_241001.csv -gemini_1_0_pro,hydrox_privacy,90.39,,hydrox_safety_241001.csv -gemini_1_0_pro,hydrox_safety,65.18,,hydrox_safety_241001.csv -gemini_1_0_pro,hydrox_security,79.93,,hydrox_safety_241001.csv -gemini_1_0_pro_latest,hydrox_integrity,88.61,,hydrox_safety_241001.csv -gemini_1_0_pro_latest,hydrox_overall_score,78.29,,hydrox_safety_241001.csv -gemini_1_0_pro_latest,hydrox_privacy,87.82,,hydrox_safety_241001.csv -gemini_1_0_pro_latest,hydrox_safety,69.2,,hydrox_safety_241001.csv -gemini_1_0_pro_latest,hydrox_security,77.91,,hydrox_safety_241001.csv -gemini_1_5_flash,hydrox_integrity,60.0,,hydrox_safety_241001.csv -gemini_1_5_flash,hydrox_overall_score,74.43,,hydrox_safety_241001.csv -gemini_1_5_flash,hydrox_privacy,83.33,,hydrox_safety_241001.csv -gemini_1_5_flash,hydrox_safety,77.61,,hydrox_safety_241001.csv -gemini_1_5_flash,hydrox_security,72.05,,hydrox_safety_241001.csv -gemini_1_5_pro,hydrox_integrity,40.84,,hydrox_safety_241001.csv -gemini_1_5_pro,hydrox_overall_score,43.27,,hydrox_safety_241001.csv -gemini_1_5_pro,hydrox_privacy,40.63,,hydrox_safety_241001.csv -gemini_1_5_pro,hydrox_safety,46.99,,hydrox_safety_241001.csv -gemini_1_5_pro,hydrox_security,41.65,,hydrox_safety_241001.csv -gemini_pro,hydrox_integrity,84.42,,hydrox_safety_241001.csv -gemini_pro,hydrox_overall_score,73.04,,hydrox_safety_241001.csv -gemini_pro,hydrox_privacy,90.6,,hydrox_safety_241001.csv -gemini_pro,hydrox_safety,63.56,,hydrox_safety_241001.csv -gemini_pro,hydrox_security,67.49,,hydrox_safety_241001.csv -gemma_2_27b_it,hydrox_integrity,10.94,,hydrox_safety_241001.csv -gemma_2_27b_it,hydrox_overall_score,9.67,,hydrox_safety_241001.csv -gemma_2_27b_it,hydrox_privacy,11.11,,hydrox_safety_241001.csv -gemma_2_27b_it,hydrox_safety,8.1,,hydrox_safety_241001.csv -gemma_2_27b_it,hydrox_security,10.0,,hydrox_safety_241001.csv -gemma_2_2b,hydrox_integrity,24.88,,hydrox_safety_241001.csv -gemma_2_2b,hydrox_overall_score,25.5,,hydrox_safety_241001.csv -gemma_2_2b,hydrox_privacy,27.04,,hydrox_safety_241001.csv -gemma_2_2b,hydrox_safety,25.61,,hydrox_safety_241001.csv -gemma_2_2b,hydrox_security,24.5,,hydrox_safety_241001.csv -gemma_2_2b_it,hydrox_integrity,93.14,,hydrox_safety_241001.csv -gemma_2_2b_it,hydrox_overall_score,91.66,,hydrox_safety_241001.csv 
-gemma_2_2b_it,hydrox_privacy,92.43,,hydrox_safety_241001.csv -gemma_2_2b_it,hydrox_safety,92.15,,hydrox_safety_241001.csv -gemma_2_2b_it,hydrox_security,89.22,,hydrox_safety_241001.csv -gemma_2b,hydrox_integrity,6.39,,hydrox_safety_241001.csv -gemma_2b,hydrox_overall_score,7.99,,hydrox_safety_241001.csv -gemma_2b,hydrox_privacy,8.27,,hydrox_safety_241001.csv -gemma_2b,hydrox_safety,8.55,,hydrox_safety_241001.csv -gemma_2b,hydrox_security,8.09,,hydrox_safety_241001.csv -gpt_3_5_turbo_0613,hydrox_integrity,80.84,,hydrox_safety_241001.csv -gpt_3_5_turbo_0613,hydrox_overall_score,72.04,,hydrox_safety_241001.csv -gpt_3_5_turbo_0613,hydrox_privacy,90.0,,hydrox_safety_241001.csv -gpt_3_5_turbo_0613,hydrox_safety,56.94,,hydrox_safety_241001.csv -gpt_3_5_turbo_0613,hydrox_security,93.43,,hydrox_safety_241001.csv -gpt_4_0314,hydrox_integrity,54.0,,hydrox_safety_241001.csv -gpt_4_0314,hydrox_overall_score,62.51,,hydrox_safety_241001.csv -gpt_4_0314,hydrox_privacy,76.67,,hydrox_safety_241001.csv -gpt_4_0314,hydrox_safety,56.36,,hydrox_safety_241001.csv -gpt_4_0314,hydrox_security,72.79,,hydrox_safety_241001.csv -gpt_4_0613,hydrox_integrity,96.04,,hydrox_safety_241001.csv -gpt_4_0613,hydrox_overall_score,85.43,,hydrox_safety_241001.csv -gpt_4_0613,hydrox_privacy,91.79,,hydrox_safety_241001.csv -gpt_4_0613,hydrox_safety,79.94,,hydrox_safety_241001.csv -gpt_4_0613,hydrox_security,92.0,,hydrox_safety_241001.csv -gpt_4o_2024_05_13,hydrox_integrity,63.54,,hydrox_safety_241001.csv -gpt_4o_2024_05_13,hydrox_overall_score,65.26,,hydrox_safety_241001.csv -gpt_4o_2024_05_13,hydrox_privacy,68.46,,hydrox_safety_241001.csv -gpt_4o_2024_05_13,hydrox_safety,67.11,,hydrox_safety_241001.csv -gpt_4o_2024_05_13,hydrox_security,60.89,,hydrox_safety_241001.csv -gpt_4o_mini_2024_07_18,hydrox_integrity,81.38,,hydrox_safety_241001.csv -gpt_4o_mini_2024_07_18,hydrox_overall_score,80.43,,hydrox_safety_241001.csv -gpt_4o_mini_2024_07_18,hydrox_privacy,82.32,,hydrox_safety_241001.csv -gpt_4o_mini_2024_07_18,hydrox_safety,80.87,,hydrox_safety_241001.csv -gpt_4o_mini_2024_07_18,hydrox_security,77.55,,hydrox_safety_241001.csv -h2ogpt_4096_llama2_70b_chat,hydrox_integrity,65.75,,hydrox_safety_241001.csv -h2ogpt_4096_llama2_70b_chat,hydrox_overall_score,63.67,,hydrox_safety_241001.csv -h2ogpt_4096_llama2_70b_chat,hydrox_privacy,73.46,,hydrox_safety_241001.csv -h2ogpt_4096_llama2_70b_chat,hydrox_safety,63.64,,hydrox_safety_241001.csv -h2ogpt_4096_llama2_70b_chat,hydrox_security,63.34,,hydrox_safety_241001.csv -hydro_safe_dolly_v2_7b_dpo_full,hydrox_integrity,5.96,,hydrox_safety_241001.csv -hydro_safe_dolly_v2_7b_dpo_full,hydrox_overall_score,7.64,,hydrox_safety_241001.csv -hydro_safe_dolly_v2_7b_dpo_full,hydrox_privacy,6.16,,hydrox_safety_241001.csv -hydro_safe_dolly_v2_7b_dpo_full,hydrox_safety,11.03,,hydrox_safety_241001.csv -hydro_safe_dolly_v2_7b_dpo_full,hydrox_security,5.1,,hydrox_safety_241001.csv -hydro_safe_dolly_v2_7b_dpo_full_3_epoch,hydrox_integrity,35.51,,hydrox_safety_241001.csv -hydro_safe_dolly_v2_7b_dpo_full_3_epoch,hydrox_overall_score,27.81,,hydrox_safety_241001.csv -hydro_safe_dolly_v2_7b_dpo_full_3_epoch,hydrox_privacy,32.34,,hydrox_safety_241001.csv -hydro_safe_dolly_v2_7b_dpo_full_3_epoch,hydrox_safety,22.95,,hydrox_safety_241001.csv -hydro_safe_dolly_v2_7b_dpo_full_3_epoch,hydrox_security,25.64,,hydrox_safety_241001.csv -hydro_safe_llama2_7b_chat_dpo_full_3_epoch,hydrox_integrity,84.27,,hydrox_safety_241001.csv -hydro_safe_llama2_7b_chat_dpo_full_3_epoch,hydrox_overall_score,83.93,,hydrox_safety_241001.csv 
-hydro_safe_llama2_7b_chat_dpo_full_3_epoch,hydrox_privacy,90.63,,hydrox_safety_241001.csv -hydro_safe_llama2_7b_chat_dpo_full_3_epoch,hydrox_safety,79.83,,hydrox_safety_241001.csv -hydro_safe_llama2_7b_chat_dpo_full_3_epoch,hydrox_security,84.68,,hydrox_safety_241001.csv -hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,hydrox_integrity,97.74,,hydrox_safety_241001.csv -hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,hydrox_overall_score,91.6,,hydrox_safety_241001.csv -hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,hydrox_privacy,96.21,,hydrox_safety_241001.csv -hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,hydrox_safety,86.56,,hydrox_safety_241001.csv -hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,hydrox_security,91.35,,hydrox_safety_241001.csv -hydro_safe_mistral_7b_v0_1_dpo_full,hydrox_integrity,98.16,,hydrox_safety_241001.csv -hydro_safe_mistral_7b_v0_1_dpo_full,hydrox_overall_score,94.44,,hydrox_safety_241001.csv -hydro_safe_mistral_7b_v0_1_dpo_full,hydrox_privacy,99.62,,hydrox_safety_241001.csv -hydro_safe_mistral_7b_v0_1_dpo_full,hydrox_safety,89.41,,hydrox_safety_241001.csv -hydro_safe_mistral_7b_v0_1_dpo_full,hydrox_security,96.66,,hydrox_safety_241001.csv -hydro_safe_sheared_llama_1_3b_dpo_full,hydrox_integrity,35.98,,hydrox_safety_241001.csv -hydro_safe_sheared_llama_1_3b_dpo_full,hydrox_overall_score,31.87,,hydrox_safety_241001.csv -hydro_safe_sheared_llama_1_3b_dpo_full,hydrox_privacy,45.3,,hydrox_safety_241001.csv -hydro_safe_sheared_llama_1_3b_dpo_full,hydrox_safety,26.44,,hydrox_safety_241001.csv -hydro_safe_sheared_llama_1_3b_dpo_full,hydrox_security,27.07,,hydrox_safety_241001.csv -hydro_safe_zephyr_td_full,hydrox_integrity,71.25,,hydrox_safety_241001.csv -hydro_safe_zephyr_td_full,hydrox_overall_score,78.18,,hydrox_safety_241001.csv -hydro_safe_zephyr_td_full,hydrox_privacy,49.7,,hydrox_safety_241001.csv -hydro_safe_zephyr_td_full,hydrox_safety,78.18,,hydrox_safety_241001.csv -hydro_safe_zephyr_td_full,hydrox_security,66.63,,hydrox_safety_241001.csv -komt_mistral_7b_v1,hydrox_integrity,0.0,,hydrox_safety_241001.csv -komt_mistral_7b_v1,hydrox_overall_score,0.13,,hydrox_safety_241001.csv -komt_mistral_7b_v1,hydrox_privacy,0.02,,hydrox_safety_241001.csv -komt_mistral_7b_v1,hydrox_safety,0.65,,hydrox_safety_241001.csv -komt_mistral_7b_v1,hydrox_security,0.0,,hydrox_safety_241001.csv -llama3_2_1b_instruct,hydrox_integrity,76.98,,hydrox_safety_241001.csv -llama3_2_1b_instruct,hydrox_overall_score,75.78,,hydrox_safety_241001.csv -llama3_2_1b_instruct,hydrox_privacy,75.71,,hydrox_safety_241001.csv -llama3_2_1b_instruct,hydrox_safety,76.25,,hydrox_safety_241001.csv -llama3_2_1b_instruct,hydrox_security,74.2,,hydrox_safety_241001.csv -llama3_2_3b_instruct,hydrox_integrity,79.24,,hydrox_safety_241001.csv -llama3_2_3b_instruct,hydrox_overall_score,77.42,,hydrox_safety_241001.csv -llama3_2_3b_instruct,hydrox_privacy,77.9,,hydrox_safety_241001.csv -llama3_2_3b_instruct,hydrox_safety,79.46,,hydrox_safety_241001.csv -llama3_2_3b_instruct,hydrox_security,72.51,,hydrox_safety_241001.csv -llama3_70b_instruct,hydrox_integrity,73.55,,hydrox_safety_241001.csv -llama3_70b_instruct,hydrox_overall_score,74.44,,hydrox_safety_241001.csv -llama3_70b_instruct,hydrox_privacy,80.65,,hydrox_safety_241001.csv -llama3_70b_instruct,hydrox_safety,74.65,,hydrox_safety_241001.csv -llama3_70b_instruct,hydrox_security,70.21,,hydrox_safety_241001.csv -llama3_8b_instruct,hydrox_integrity,80.86,,hydrox_safety_241001.csv 
-llama3_8b_instruct,hydrox_overall_score,83.72,,hydrox_safety_241001.csv -llama3_8b_instruct,hydrox_privacy,88.61,,hydrox_safety_241001.csv -llama3_8b_instruct,hydrox_safety,83.32,,hydrox_safety_241001.csv -llama3_8b_instruct,hydrox_security,82.51,,hydrox_safety_241001.csv -llama_2_13b_chat,hydrox_integrity,62.67,,hydrox_safety_241001.csv -llama_2_13b_chat,hydrox_overall_score,60.0,,hydrox_safety_241001.csv -llama_2_13b_chat,hydrox_privacy,63.37,,hydrox_safety_241001.csv -llama_2_13b_chat,hydrox_safety,58.6,,hydrox_safety_241001.csv -llama_2_13b_chat,hydrox_security,57.85,,hydrox_safety_241001.csv -llama_2_70b_chat,hydrox_integrity,63.0,,hydrox_safety_241001.csv -llama_2_70b_chat,hydrox_overall_score,62.5,,hydrox_safety_241001.csv -llama_2_70b_chat,hydrox_privacy,68.87,,hydrox_safety_241001.csv -llama_2_70b_chat,hydrox_safety,61.0,,hydrox_safety_241001.csv -llama_2_70b_chat,hydrox_security,59.58,,hydrox_safety_241001.csv -llama_2_7b_chat,hydrox_integrity,51.63,,hydrox_safety_241001.csv -llama_2_7b_chat,hydrox_overall_score,51.26,,hydrox_safety_241001.csv -llama_2_7b_chat,hydrox_privacy,55.3,,hydrox_safety_241001.csv -llama_2_7b_chat,hydrox_safety,52.3,,hydrox_safety_241001.csv -llama_2_7b_chat,hydrox_security,46.71,,hydrox_safety_241001.csv -mistral_7b_instruct_v0_1,hydrox_integrity,12.39,,hydrox_safety_241001.csv -mistral_7b_instruct_v0_1,hydrox_overall_score,16.74,,hydrox_safety_241001.csv -mistral_7b_instruct_v0_1,hydrox_privacy,12.08,,hydrox_safety_241001.csv -mistral_7b_instruct_v0_1,hydrox_safety,26.91,,hydrox_safety_241001.csv -mistral_7b_instruct_v0_1,hydrox_security,10.86,,hydrox_safety_241001.csv -mistral_7b_instruct_v0_2,hydrox_integrity,32.52,,hydrox_safety_241001.csv -mistral_7b_instruct_v0_2,hydrox_overall_score,36.82,,hydrox_safety_241001.csv -mistral_7b_instruct_v0_2,hydrox_privacy,37.18,,hydrox_safety_241001.csv -mistral_7b_instruct_v0_2,hydrox_safety,41.71,,hydrox_safety_241001.csv -mistral_7b_instruct_v0_2,hydrox_security,32.24,,hydrox_safety_241001.csv -mistral_7b_v0_1,hydrox_integrity,8.53,,hydrox_safety_241001.csv -mistral_7b_v0_1,hydrox_overall_score,7.32,,hydrox_safety_241001.csv -mistral_7b_v0_1,hydrox_privacy,4.18,,hydrox_safety_241001.csv -mistral_7b_v0_1,hydrox_safety,11.38,,hydrox_safety_241001.csv -mistral_7b_v0_1,hydrox_security,4.44,,hydrox_safety_241001.csv -mixtral_8x7b_instruct_v0_1,hydrox_integrity,21.23,,hydrox_safety_241001.csv -mixtral_8x7b_instruct_v0_1,hydrox_overall_score,23.75,,hydrox_safety_241001.csv -mixtral_8x7b_instruct_v0_1,hydrox_privacy,25.04,,hydrox_safety_241001.csv -mixtral_8x7b_instruct_v0_1,hydrox_safety,27.7,,hydrox_safety_241001.csv -mixtral_8x7b_instruct_v0_1,hydrox_security,18.24,,hydrox_safety_241001.csv -mixtral_8x7b_v0_1,hydrox_integrity,8.16,,hydrox_safety_241001.csv -mixtral_8x7b_v0_1,hydrox_overall_score,8.81,,hydrox_safety_241001.csv -mixtral_8x7b_v0_1,hydrox_privacy,8.81,,hydrox_safety_241001.csv -mixtral_8x7b_v0_1,hydrox_safety,10.61,,hydrox_safety_241001.csv -mixtral_8x7b_v0_1,hydrox_security,6.73,,hydrox_safety_241001.csv -neural_chat_7b_v3_1,hydrox_integrity,22.84,,hydrox_safety_241001.csv -neural_chat_7b_v3_1,hydrox_overall_score,17.86,,hydrox_safety_241001.csv -neural_chat_7b_v3_1,hydrox_privacy,22.28,,hydrox_safety_241001.csv -neural_chat_7b_v3_1,hydrox_safety,15.86,,hydrox_safety_241001.csv -neural_chat_7b_v3_1,hydrox_security,14.72,,hydrox_safety_241001.csv -neural_chat_7b_v3_2,hydrox_integrity,15.33,,hydrox_safety_241001.csv -neural_chat_7b_v3_2,hydrox_overall_score,17.82,,hydrox_safety_241001.csv 
-neural_chat_7b_v3_2,hydrox_privacy,14.36,,hydrox_safety_241001.csv -neural_chat_7b_v3_2,hydrox_safety,19.68,,hydrox_safety_241001.csv -neural_chat_7b_v3_2,hydrox_security,18.62,,hydrox_safety_241001.csv -nexusraven_v2_13b,hydrox_integrity,4.5,,hydrox_safety_241001.csv -nexusraven_v2_13b,hydrox_overall_score,4.16,,hydrox_safety_241001.csv -nexusraven_v2_13b,hydrox_privacy,3.13,,hydrox_safety_241001.csv -nexusraven_v2_13b,hydrox_safety,3.95,,hydrox_safety_241001.csv -nexusraven_v2_13b,hydrox_security,4.77,,hydrox_safety_241001.csv -notus_7b_v1,hydrox_integrity,19.5,,hydrox_safety_241001.csv -notus_7b_v1,hydrox_overall_score,21.3,,hydrox_safety_241001.csv -notus_7b_v1,hydrox_privacy,22.05,,hydrox_safety_241001.csv -notus_7b_v1,hydrox_safety,26.55,,hydrox_safety_241001.csv -notus_7b_v1,hydrox_security,15.53,,hydrox_safety_241001.csv -orca_2_13b,hydrox_integrity,0.0,,hydrox_safety_241001.csv -orca_2_13b,hydrox_overall_score,17.48,,hydrox_safety_241001.csv -orca_2_13b,hydrox_privacy,27.78,,hydrox_safety_241001.csv -orca_2_13b,hydrox_safety,33.06,,hydrox_safety_241001.csv -orca_2_13b,hydrox_security,0.0,,hydrox_safety_241001.csv -orca_2_7b,hydrox_integrity,22.09,,hydrox_safety_241001.csv -orca_2_7b,hydrox_overall_score,19.53,,hydrox_safety_241001.csv -orca_2_7b,hydrox_privacy,18.31,,hydrox_safety_241001.csv -orca_2_7b,hydrox_safety,18.3,,hydrox_safety_241001.csv -orca_2_7b,hydrox_security,20.52,,hydrox_safety_241001.csv -pythia_70m_deduped,hydrox_integrity,0.0,,hydrox_safety_241001.csv -pythia_70m_deduped,hydrox_overall_score,0.0,,hydrox_safety_241001.csv -pythia_70m_deduped,hydrox_privacy,0.0,,hydrox_safety_241001.csv -pythia_70m_deduped,hydrox_safety,0.0,,hydrox_safety_241001.csv -pythia_70m_deduped,hydrox_security,0.0,,hydrox_safety_241001.csv -qwen2_72b_instruct,hydrox_integrity,70.13,,hydrox_safety_241001.csv -qwen2_72b_instruct,hydrox_overall_score,71.86,,hydrox_safety_241001.csv -qwen2_72b_instruct,hydrox_privacy,73.4,,hydrox_safety_241001.csv -qwen2_72b_instruct,hydrox_safety,77.1,,hydrox_safety_241001.csv -qwen2_72b_instruct,hydrox_security,65.19,,hydrox_safety_241001.csv -sheared_llama_1_3b,hydrox_integrity,0.04,,hydrox_safety_241001.csv -sheared_llama_1_3b,hydrox_overall_score,0.29,,hydrox_safety_241001.csv -sheared_llama_1_3b,hydrox_privacy,0.05,,hydrox_safety_241001.csv -sheared_llama_1_3b,hydrox_safety,1.14,,hydrox_safety_241001.csv -sheared_llama_1_3b,hydrox_security,0.03,,hydrox_safety_241001.csv -solar_0_70b_16bit,hydrox_integrity,30.25,,hydrox_safety_241001.csv -solar_0_70b_16bit,hydrox_overall_score,24.5,,hydrox_safety_241001.csv -solar_0_70b_16bit,hydrox_privacy,33.8,,hydrox_safety_241001.csv -solar_0_70b_16bit,hydrox_safety,22.4,,hydrox_safety_241001.csv -solar_0_70b_16bit,hydrox_security,17.55,,hydrox_safety_241001.csv -tinyllama_1_1b_chat_v1_0,hydrox_integrity,5.65,,hydrox_safety_241001.csv -tinyllama_1_1b_chat_v1_0,hydrox_overall_score,5.38,,hydrox_safety_241001.csv -tinyllama_1_1b_chat_v1_0,hydrox_privacy,3.3,,hydrox_safety_241001.csv -tinyllama_1_1b_chat_v1_0,hydrox_safety,6.87,,hydrox_safety_241001.csv -tinyllama_1_1b_chat_v1_0,hydrox_security,4.57,,hydrox_safety_241001.csv -vicuna_13b_v1_5,hydrox_integrity,36.08,,hydrox_safety_241001.csv -vicuna_13b_v1_5,hydrox_overall_score,34.07,,hydrox_safety_241001.csv -vicuna_13b_v1_5,hydrox_privacy,29.78,,hydrox_safety_241001.csv -vicuna_13b_v1_5,hydrox_safety,38.46,,hydrox_safety_241001.csv -vicuna_13b_v1_5,hydrox_security,30.71,,hydrox_safety_241001.csv -vicuna_13b_v1_5_16k,hydrox_integrity,22.25,,hydrox_safety_241001.csv 
-vicuna_13b_v1_5_16k,hydrox_overall_score,19.31,,hydrox_safety_241001.csv -vicuna_13b_v1_5_16k,hydrox_privacy,17.01,,hydrox_safety_241001.csv -vicuna_13b_v1_5_16k,hydrox_safety,21.14,,hydrox_safety_241001.csv -vicuna_13b_v1_5_16k,hydrox_security,16.99,,hydrox_safety_241001.csv -vicuna_33b_v1_3,hydrox_integrity,18.64,,hydrox_safety_241001.csv -vicuna_33b_v1_3,hydrox_overall_score,17.64,,hydrox_safety_241001.csv -vicuna_33b_v1_3,hydrox_privacy,21.34,,hydrox_safety_241001.csv -vicuna_33b_v1_3,hydrox_safety,18.42,,hydrox_safety_241001.csv -vicuna_33b_v1_3,hydrox_security,13.89,,hydrox_safety_241001.csv -vicuna_7b_v1_5,hydrox_integrity,11.74,,hydrox_safety_241001.csv -vicuna_7b_v1_5,hydrox_overall_score,15.37,,hydrox_safety_241001.csv -vicuna_7b_v1_5,hydrox_privacy,10.91,,hydrox_safety_241001.csv -vicuna_7b_v1_5,hydrox_safety,22.47,,hydrox_safety_241001.csv -vicuna_7b_v1_5,hydrox_security,12.61,,hydrox_safety_241001.csv -viking_13b,hydrox_integrity,7.68,,hydrox_safety_241001.csv -viking_13b,hydrox_overall_score,7.33,,hydrox_safety_241001.csv -viking_13b,hydrox_privacy,8.32,,hydrox_safety_241001.csv -viking_13b,hydrox_safety,7.75,,hydrox_safety_241001.csv -viking_13b,hydrox_security,5.76,,hydrox_safety_241001.csv -viking_33b,hydrox_integrity,6.38,,hydrox_safety_241001.csv -viking_33b,hydrox_overall_score,6.73,,hydrox_safety_241001.csv -viking_33b,hydrox_privacy,6.48,,hydrox_safety_241001.csv -viking_33b,hydrox_safety,6.87,,hydrox_safety_241001.csv -viking_33b,hydrox_security,6.92,,hydrox_safety_241001.csv -viking_7b,hydrox_integrity,9.05,,hydrox_safety_241001.csv -viking_7b,hydrox_overall_score,6.15,,hydrox_safety_241001.csv -viking_7b,hydrox_privacy,3.91,,hydrox_safety_241001.csv -viking_7b,hydrox_safety,5.37,,hydrox_safety_241001.csv -viking_7b,hydrox_security,7.6,,hydrox_safety_241001.csv -wizardlm_30b_v1_0,hydrox_integrity,5.58,,hydrox_safety_241001.csv -wizardlm_30b_v1_0,hydrox_overall_score,6.41,,hydrox_safety_241001.csv -wizardlm_30b_v1_0,hydrox_privacy,3.88,,hydrox_safety_241001.csv -wizardlm_30b_v1_0,hydrox_safety,8.0,,hydrox_safety_241001.csv -wizardlm_30b_v1_0,hydrox_security,6.49,,hydrox_safety_241001.csv -yi_6b_chat,hydrox_integrity,36.02,,hydrox_safety_241001.csv -yi_6b_chat,hydrox_overall_score,37.0,,hydrox_safety_241001.csv -yi_6b_chat,hydrox_privacy,45.36,,hydrox_safety_241001.csv -yi_6b_chat,hydrox_safety,37.35,,hydrox_safety_241001.csv -yi_6b_chat,hydrox_security,31.49,,hydrox_safety_241001.csv -zephyr_7b_beta,hydrox_integrity,24.95,,hydrox_safety_241001.csv -zephyr_7b_beta,hydrox_overall_score,23.8,,hydrox_safety_241001.csv -zephyr_7b_beta,hydrox_privacy,30.6,,hydrox_safety_241001.csv -zephyr_7b_beta,hydrox_safety,21.2,,hydrox_safety_241001.csv -zephyr_7b_beta,hydrox_security,22.4,,hydrox_safety_241001.csv -zephyr_reproduction_dpo_full,hydrox_integrity,26.05,,hydrox_safety_241001.csv -zephyr_reproduction_dpo_full,hydrox_overall_score,21.38,,hydrox_safety_241001.csv -zephyr_reproduction_dpo_full,hydrox_privacy,21.65,,hydrox_safety_241001.csv -zephyr_reproduction_dpo_full,hydrox_safety,19.35,,hydrox_safety_241001.csv -zephyr_reproduction_dpo_full,hydrox_security,21.22,,hydrox_safety_241001.csv -zephyr_reproduction_sft_full,hydrox_integrity,13.61,,hydrox_safety_241001.csv -zephyr_reproduction_sft_full,hydrox_overall_score,13.1,,hydrox_safety_241001.csv -zephyr_reproduction_sft_full,hydrox_privacy,14.94,,hydrox_safety_241001.csv -zephyr_reproduction_sft_full,hydrox_safety,14.92,,hydrox_safety_241001.csv -zephyr_reproduction_sft_full,hydrox_security,9.5,,hydrox_safety_241001.csv 
-alpaca_7b,aggregate,0.23484848484848483,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-chatglm2_6b,aggregate,0.029137529137529136,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-chatgpt_4o_latest,aggregate,0.9754079254079254,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-claude_2_0,aggregate,0.8333333333333334,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-claude_2_1,aggregate,0.6693861693861693,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-claude_3_5_sonnet_20240620,aggregate,0.9572649572649573,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-claude_3_haiku_20240307,aggregate,0.44965034965034967,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-claude_3_opus_20240229,aggregate,0.8824397824397824,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-claude_3_sonnet_20240229,aggregate,0.5985236985236985,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-claude_instant_1_2,aggregate,0.6486013986013985,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-command_r,aggregate,0.3296911421911422,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-command_r_plus,aggregate,0.6183108558108558,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-dbrx_instruct,aggregate,0.4724025974025974,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-dbrx_instructruct,aggregate,0.5379867046533713,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-deepseek_coder_v2,aggregate,0.713053613053613,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-deepseek_llm_67b_chat,aggregate,0.5734841290396846,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-dolphin_2_2_1_mistral_7b,aggregate,0.4810606060606061,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-falcon_40b,aggregate,0.3502690724912947,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-falcon_40b_instruct,aggregate,0.13187429854096522,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-falcon_7b,aggregate,0.11380183602405824,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-falcon_7b_instruct,aggregate,0.011363636363636364,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gemini_1_5_flash_api_0514,aggregate,0.7263403263403263,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gemini_1_5_pro_api_0514,aggregate,0.8294871794871794,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gemini_1_5_pro_exp_0801,aggregate,0.9545454545454546,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gemini_pro,aggregate,0.7298951048951049,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gemma_1_1_2b_it,aggregate,0.07454890788224121,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gemma_1_1_7b_it,aggregate,0.263927019482575,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gemma_2_27b_it,aggregate,0.776345259678593,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gemma_2_2b_it,aggregate,0.28113553113553114,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gemma_2_9b_it,aggregate,0.6048877048877048,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gemma_2_9b_it_dpo,aggregate,0.8100649350649352,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gemma_2_9b_it_simpo,aggregate,0.7328042328042329,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gemma_2b_it,aggregate,0.08119658119658119,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gemma_7b,aggregate,0.4477682811016144,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gemma_7b_it,aggregate,0.18790982679871568,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-glm_4_9b_chat,aggregate,0.4769547325102881,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gpt_3_5_turbo_0125,aggregate,0.3591242091242091,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gpt_3_5_turbo_0613,aggregate,0.6851851851851851,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gpt_4_0125_preview,aggregate,0.8492118992118992,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gpt_4_0613,aggregate,0.7641802641802643,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gpt_4_turbo_2024_04_09,aggregate,0.9055819180819181,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gpt_4o_2024_05_13,aggregate,0.9767482517482518,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gpt_4o_2024_08_06,aggregate,0.9652680652680652,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gpt_4o_mini_2024_07_18,aggregate,0.8348776223776224,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gpt_j_6b,aggregate,0.09876543209876543,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-gpt_neox_20b,aggregate,0.1419753086419753,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-guanaco_33b,aggregate,0.38374125874125875,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-hermes_3_llama3_1_70b,aggregate,0.8451178451178452,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate 
-infinity_instruct_3m_0625_llama3_8b,aggregate,0.6537598204264872,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate
-internlm2_chat_20b,aggregate,0.37196969696969695,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate
-jurassic_2_grande_17b,aggregate,0.4230769230769231,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate
-jurassic_2_jumbo_178b,aggregate,0.532051282051282,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate
-llama3_1_405b_instruct,aggregate,0.8598484848484849,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate
-llama3_1_70b_instruct,aggregate,0.9343074620852398,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate
-llama3_1_8b_instruct,aggregate,0.6080822469711359,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -llama3_70b,aggregate,0.8129154795821463,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -llama3_70b_instruct,aggregate,0.8172801478357034,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -llama3_8b,aggregate,0.4368471035137702,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -llama3_8b_instruct,aggregate,0.4449662477440255,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -llama3_instruct_8b_simpo,aggregate,0.7992424242424242,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -llama_13b,aggregate,0.2222222222222222,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -llama_2_13b,aggregate,0.4146881924659702,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -llama_2_13b_chat,aggregate,0.38675213675213677,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -llama_2_70b,aggregate,0.7293447293447294,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -llama_2_70b_chat,aggregate,0.412732329398996,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -llama_2_7b,aggregate,0.25466919911364355,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -llama_2_7b_chat,aggregate,0.1122679789346456,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -llama_65b,aggregate,0.5759734093067427,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -luminous_base_13b,aggregate,0.08333333333333333,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -luminous_extended_30b,aggregate,0.2329059829059829,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -luminous_supreme_70b,aggregate,0.32905982905982906,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -mistral_7b_instruct_v0_2,aggregate,0.250669392336059,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -mistral_7b_instruct_v0_3,aggregate,0.24534231200897869,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -mistral_7b_v0_2,aggregate,0.3773849607182941,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -mistral_7b_v0_3,aggregate,0.4228395061728395,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -mistral_large_2402,aggregate,0.5105672105672105,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -mistral_large_2407,aggregate,0.8375291375291375,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -mistral_medium,aggregate,0.657051282051282,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -mistral_small_2402,aggregate,0.47785547785547783,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -mistral_v0_1_7b,aggregate,0.6239316239316239,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -mixtral_8x22b_instruct_v0_1,aggregate,0.585565052231719,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -mixtral_8x22b_v0_1,aggregate,0.7382154882154882,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -mixtral_8x7b_instruct_v0_1,aggregate,0.284326167659501,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -mixtral_8x7b_v0_1,aggregate,0.5310044893378227,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -nous_hermes_2_mixtral_8x7b_dpo,aggregate,0.7094017094017094,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -olmo_7b,aggregate,0.06220322886989553,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -olmo_7b_instruct,aggregate,0.15669515669515668,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -openchat_3_5,aggregate,0.5270655270655271,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -openhermes_2_5_mistral_7b,aggregate,0.40103708020374684,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -phi_2,aggregate,0.19812080923192033,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -phi_3_5_mini_instruct,aggregate,0.6103254769921437,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -phi_3_5_moe_instruct,aggregate,0.7600448933782267,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -phi_3_medium_4k_instruct,aggregate,0.48541540763762986,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -phi_3_mini_128k_instruct,aggregate,0.3778468445135112,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -phi_3_mini_4k_instruct,aggregate,0.4048663270885493,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -phi_3_small_128k_instruct,aggregate,0.6561167227833894,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -phi_3_small_8k_instruct,aggregate,0.27051282051282055,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -pythia_12b,aggregate,0.05246913580246913,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -pythia_6_9b,aggregate,0.018518518518518517,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -qwen1_5_0_5b_chat,aggregate,0.012345679012345678,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -qwen1_5_110b_chat,aggregate,0.7419770353103686,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -qwen1_5_14b,aggregate,0.5797720797720798,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -qwen1_5_14b_chat,aggregate,0.45340153673487005,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -qwen1_5_1_8b_chat,aggregate,0.05544332210998878,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -qwen1_5_32b,aggregate,0.7678062678062678,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -qwen1_5_32b_chat,aggregate,0.571383349161127,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -qwen1_5_4b_chat,aggregate,0.12542806987251431,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -qwen1_5_72b_chat,aggregate,0.5463669663669664,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -qwen1_5_7b,aggregate,0.35185185185185186,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -qwen1_5_7b_chat,aggregate,0.24214088380755047,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -qwen2_0_5b_instruct,aggregate,0.055218855218855216,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -qwen2_1_5b_instruct,aggregate,0.1968574635241302,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -qwen2_72b_instruct,aggregate,0.7701936951936953,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -qwen2_7b_instruct,aggregate,0.4970445192667415,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -qwen_14b_chat,aggregate,0.2837995337995338,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -smaug_qwen2_72b_instruct,aggregate,0.8331088664421997,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -solar_10_7b_instruct_v1_0,aggregate,0.5030864197530864,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -starling_lm_7b_alpha,aggregate,0.42734323289878845,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -starling_lm_7b_beta,aggregate,0.3611888111888112,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -tulu_2_dpo_70b,aggregate,0.3585164835164835,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -vicuna_13b,aggregate,0.14714452214452214,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -vicuna_7b,aggregate,0.1885198135198135,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -vicuna_7b_v1_5,aggregate,0.15454545454545454,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -wizardlm_13b,aggregate,0.42773892773892774,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -wizardlm_70b,aggregate,0.5620629370629371,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -yi_1_5_34b_chat,aggregate,0.6669566544566544,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -yi_1_5_6b_chat,aggregate,0.33974132863021755,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -yi_1_5_9b_chat,aggregate,0.6041446208112875,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -yi_34b,aggregate,0.7188983855650521,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -yi_34b_chat,aggregate,0.5558361391694725,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -yi_6b,aggregate,0.295346628679962,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -yi_6b_chat,aggregate,0.19393939393939394,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -yi_large,aggregate,0.7889194139194139,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -yi_large_preview,aggregate,0.8714202464202464,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -zephyr_7b_alpha,aggregate,0.33875830959164294,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -zephyr_7b_beta,aggregate,0.28937667271000606,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate -zephyr_orpo_141b_a35b_v0_1,aggregate,0.8414055080721747,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 
'hydrox_security']",aggregate +abab5_5_chat,Hydrox Integrity,8.09,,hydrox_safety_241001.csv +abab5_5_chat,Hydrox Overall Score,6.6,,hydrox_safety_241001.csv +abab5_5_chat,Hydrox Privacy,5.13,,hydrox_safety_241001.csv +abab5_5_chat,Hydrox Safety,8.32,,hydrox_safety_241001.csv +abab5_5_chat,Hydrox Security,4.85,,hydrox_safety_241001.csv +abab5_5s_chat,Hydrox Integrity,19.46,,hydrox_safety_241001.csv +abab5_5s_chat,Hydrox Overall Score,19.12,,hydrox_safety_241001.csv +abab5_5s_chat,Hydrox Privacy,20.63,,hydrox_safety_241001.csv +abab5_5s_chat,Hydrox Safety,22.54,,hydrox_safety_241001.csv +abab5_5s_chat,Hydrox Security,14.17,,hydrox_safety_241001.csv +claude_3_5_sonnet,Hydrox Integrity,95.56,,hydrox_safety_241001.csv +claude_3_5_sonnet,Hydrox Overall Score,94.18,,hydrox_safety_241001.csv +claude_3_5_sonnet,Hydrox Privacy,93.83,,hydrox_safety_241001.csv +claude_3_5_sonnet,Hydrox Safety,94.75,,hydrox_safety_241001.csv +claude_3_5_sonnet,Hydrox Security,92.61,,hydrox_safety_241001.csv +claude_3_haiku,Hydrox Integrity,89.53,,hydrox_safety_241001.csv +claude_3_haiku,Hydrox Overall Score,91.59,,hydrox_safety_241001.csv +claude_3_haiku,Hydrox Privacy,93.69,,hydrox_safety_241001.csv +claude_3_haiku,Hydrox Safety,91.52,,hydrox_safety_241001.csv +claude_3_haiku,Hydrox Security,91.39,,hydrox_safety_241001.csv +claude_3_opus,Hydrox Integrity,94.08,,hydrox_safety_241001.csv +claude_3_opus,Hydrox Overall Score,92.02,,hydrox_safety_241001.csv +claude_3_opus,Hydrox Privacy,91.26,,hydrox_safety_241001.csv +claude_3_opus,Hydrox Safety,92.5,,hydrox_safety_241001.csv +claude_3_opus,Hydrox Security,90.47,,hydrox_safety_241001.csv +claude_3_sonnet,Hydrox Integrity,94.14,,hydrox_safety_241001.csv +claude_3_sonnet,Hydrox Overall Score,93.62,,hydrox_safety_241001.csv +claude_3_sonnet,Hydrox Privacy,94.36,,hydrox_safety_241001.csv +claude_3_sonnet,Hydrox Safety,92.33,,hydrox_safety_241001.csv +claude_3_sonnet,Hydrox Security,94.62,,hydrox_safety_241001.csv +deepseek_v2_chat_0628,Hydrox Integrity,0.0,,hydrox_safety_241001.csv +deepseek_v2_chat_0628,Hydrox Overall Score,50.0,,hydrox_safety_241001.csv +deepseek_v2_chat_0628,Hydrox Privacy,0.0,,hydrox_safety_241001.csv +deepseek_v2_chat_0628,Hydrox Safety,50.0,,hydrox_safety_241001.csv +deepseek_v2_chat_0628,Hydrox Security,0.0,,hydrox_safety_241001.csv +deepseek_v2_lite_chat,Hydrox Integrity,45.93,,hydrox_safety_241001.csv +deepseek_v2_lite_chat,Hydrox Overall Score,44.91,,hydrox_safety_241001.csv +deepseek_v2_lite_chat,Hydrox Privacy,48.84,,hydrox_safety_241001.csv +deepseek_v2_lite_chat,Hydrox Safety,44.26,,hydrox_safety_241001.csv +deepseek_v2_lite_chat,Hydrox Security,41.91,,hydrox_safety_241001.csv +dolly_v2_12b,Hydrox Integrity,3.72,,hydrox_safety_241001.csv +dolly_v2_12b,Hydrox Overall Score,6.21,,hydrox_safety_241001.csv +dolly_v2_12b,Hydrox Privacy,3.48,,hydrox_safety_241001.csv +dolly_v2_12b,Hydrox Safety,11.46,,hydrox_safety_241001.csv +dolly_v2_12b,Hydrox Security,3.39,,hydrox_safety_241001.csv +dolly_v2_3b,Hydrox Integrity,0.18,,hydrox_safety_241001.csv +dolly_v2_3b,Hydrox Overall Score,1.81,,hydrox_safety_241001.csv +dolly_v2_3b,Hydrox Privacy,1.08,,hydrox_safety_241001.csv +dolly_v2_3b,Hydrox Safety,4.08,,hydrox_safety_241001.csv +dolly_v2_3b,Hydrox Security,0.55,,hydrox_safety_241001.csv +dolly_v2_7b,Hydrox Integrity,8.33,,hydrox_safety_241001.csv +dolly_v2_7b,Hydrox Overall Score,7.79,,hydrox_safety_241001.csv +dolly_v2_7b,Hydrox Privacy,8.33,,hydrox_safety_241001.csv +dolly_v2_7b,Hydrox Safety,9.92,,hydrox_safety_241001.csv +dolly_v2_7b,Hydrox 
Security,4.96,,hydrox_safety_241001.csv +falcon_40b,Hydrox Integrity,0.64,,hydrox_safety_241001.csv +falcon_40b,Hydrox Overall Score,0.9,,hydrox_safety_241001.csv +falcon_40b,Hydrox Privacy,0.25,,hydrox_safety_241001.csv +falcon_40b,Hydrox Safety,2.08,,hydrox_safety_241001.csv +falcon_40b,Hydrox Security,0.4,,hydrox_safety_241001.csv +falcon_40b_instruct,Hydrox Integrity,30.32,,hydrox_safety_241001.csv +falcon_40b_instruct,Hydrox Overall Score,27.55,,hydrox_safety_241001.csv +falcon_40b_instruct,Hydrox Privacy,30.83,,hydrox_safety_241001.csv +falcon_40b_instruct,Hydrox Safety,28.1,,hydrox_safety_241001.csv +falcon_40b_instruct,Hydrox Security,22.97,,hydrox_safety_241001.csv +falcon_7b,Hydrox Integrity,0.23,,hydrox_safety_241001.csv +falcon_7b,Hydrox Overall Score,0.51,,hydrox_safety_241001.csv +falcon_7b,Hydrox Privacy,0.11,,hydrox_safety_241001.csv +falcon_7b,Hydrox Safety,1.05,,hydrox_safety_241001.csv +falcon_7b,Hydrox Security,0.43,,hydrox_safety_241001.csv +falcon_7b_instruct,Hydrox Integrity,15.76,,hydrox_safety_241001.csv +falcon_7b_instruct,Hydrox Overall Score,14.01,,hydrox_safety_241001.csv +falcon_7b_instruct,Hydrox Privacy,11.3,,hydrox_safety_241001.csv +falcon_7b_instruct,Hydrox Safety,14.64,,hydrox_safety_241001.csv +falcon_7b_instruct,Hydrox Security,14.01,,hydrox_safety_241001.csv +gemini_1_0_pro,Hydrox Integrity,87.11,,hydrox_safety_241001.csv +gemini_1_0_pro,Hydrox Overall Score,77.2,,hydrox_safety_241001.csv +gemini_1_0_pro,Hydrox Privacy,90.39,,hydrox_safety_241001.csv +gemini_1_0_pro,Hydrox Safety,65.18,,hydrox_safety_241001.csv +gemini_1_0_pro,Hydrox Security,79.93,,hydrox_safety_241001.csv +gemini_1_0_pro_latest,Hydrox Integrity,88.61,,hydrox_safety_241001.csv +gemini_1_0_pro_latest,Hydrox Overall Score,78.29,,hydrox_safety_241001.csv +gemini_1_0_pro_latest,Hydrox Privacy,87.82,,hydrox_safety_241001.csv +gemini_1_0_pro_latest,Hydrox Safety,69.2,,hydrox_safety_241001.csv +gemini_1_0_pro_latest,Hydrox Security,77.91,,hydrox_safety_241001.csv +gemini_1_5_flash,Hydrox Integrity,60.0,,hydrox_safety_241001.csv +gemini_1_5_flash,Hydrox Overall Score,74.43,,hydrox_safety_241001.csv +gemini_1_5_flash,Hydrox Privacy,83.33,,hydrox_safety_241001.csv +gemini_1_5_flash,Hydrox Safety,77.61,,hydrox_safety_241001.csv +gemini_1_5_flash,Hydrox Security,72.05,,hydrox_safety_241001.csv +gemini_1_5_pro,Hydrox Integrity,40.84,,hydrox_safety_241001.csv +gemini_1_5_pro,Hydrox Overall Score,43.27,,hydrox_safety_241001.csv +gemini_1_5_pro,Hydrox Privacy,40.63,,hydrox_safety_241001.csv +gemini_1_5_pro,Hydrox Safety,46.99,,hydrox_safety_241001.csv +gemini_1_5_pro,Hydrox Security,41.65,,hydrox_safety_241001.csv +gemini_pro,Hydrox Integrity,84.42,,hydrox_safety_241001.csv +gemini_pro,Hydrox Overall Score,73.04,,hydrox_safety_241001.csv +gemini_pro,Hydrox Privacy,90.6,,hydrox_safety_241001.csv +gemini_pro,Hydrox Safety,63.56,,hydrox_safety_241001.csv +gemini_pro,Hydrox Security,67.49,,hydrox_safety_241001.csv +gemma_2_27b_it,Hydrox Integrity,10.94,,hydrox_safety_241001.csv +gemma_2_27b_it,Hydrox Overall Score,9.67,,hydrox_safety_241001.csv +gemma_2_27b_it,Hydrox Privacy,11.11,,hydrox_safety_241001.csv +gemma_2_27b_it,Hydrox Safety,8.1,,hydrox_safety_241001.csv +gemma_2_27b_it,Hydrox Security,10.0,,hydrox_safety_241001.csv +gemma_2_2b,Hydrox Integrity,24.88,,hydrox_safety_241001.csv +gemma_2_2b,Hydrox Overall Score,25.5,,hydrox_safety_241001.csv +gemma_2_2b,Hydrox Privacy,27.04,,hydrox_safety_241001.csv +gemma_2_2b,Hydrox Safety,25.61,,hydrox_safety_241001.csv +gemma_2_2b,Hydrox 
Security,24.5,,hydrox_safety_241001.csv +gemma_2_2b_it,Hydrox Integrity,93.14,,hydrox_safety_241001.csv +gemma_2_2b_it,Hydrox Overall Score,91.66,,hydrox_safety_241001.csv +gemma_2_2b_it,Hydrox Privacy,92.43,,hydrox_safety_241001.csv +gemma_2_2b_it,Hydrox Safety,92.15,,hydrox_safety_241001.csv +gemma_2_2b_it,Hydrox Security,89.22,,hydrox_safety_241001.csv +gemma_2b,Hydrox Integrity,6.39,,hydrox_safety_241001.csv +gemma_2b,Hydrox Overall Score,7.99,,hydrox_safety_241001.csv +gemma_2b,Hydrox Privacy,8.27,,hydrox_safety_241001.csv +gemma_2b,Hydrox Safety,8.55,,hydrox_safety_241001.csv +gemma_2b,Hydrox Security,8.09,,hydrox_safety_241001.csv +gpt_3_5_turbo_0613,Hydrox Integrity,80.84,,hydrox_safety_241001.csv +gpt_3_5_turbo_0613,Hydrox Overall Score,72.04,,hydrox_safety_241001.csv +gpt_3_5_turbo_0613,Hydrox Privacy,90.0,,hydrox_safety_241001.csv +gpt_3_5_turbo_0613,Hydrox Safety,56.94,,hydrox_safety_241001.csv +gpt_3_5_turbo_0613,Hydrox Security,93.43,,hydrox_safety_241001.csv +gpt_4_0314,Hydrox Integrity,54.0,,hydrox_safety_241001.csv +gpt_4_0314,Hydrox Overall Score,62.51,,hydrox_safety_241001.csv +gpt_4_0314,Hydrox Privacy,76.67,,hydrox_safety_241001.csv +gpt_4_0314,Hydrox Safety,56.36,,hydrox_safety_241001.csv +gpt_4_0314,Hydrox Security,72.79,,hydrox_safety_241001.csv +gpt_4_0613,Hydrox Integrity,96.04,,hydrox_safety_241001.csv +gpt_4_0613,Hydrox Overall Score,85.43,,hydrox_safety_241001.csv +gpt_4_0613,Hydrox Privacy,91.79,,hydrox_safety_241001.csv +gpt_4_0613,Hydrox Safety,79.94,,hydrox_safety_241001.csv +gpt_4_0613,Hydrox Security,92.0,,hydrox_safety_241001.csv +gpt_4o_2024_05_13,Hydrox Integrity,63.54,,hydrox_safety_241001.csv +gpt_4o_2024_05_13,Hydrox Overall Score,65.26,,hydrox_safety_241001.csv +gpt_4o_2024_05_13,Hydrox Privacy,68.46,,hydrox_safety_241001.csv +gpt_4o_2024_05_13,Hydrox Safety,67.11,,hydrox_safety_241001.csv +gpt_4o_2024_05_13,Hydrox Security,60.89,,hydrox_safety_241001.csv +gpt_4o_mini_2024_07_18,Hydrox Integrity,81.38,,hydrox_safety_241001.csv +gpt_4o_mini_2024_07_18,Hydrox Overall Score,80.43,,hydrox_safety_241001.csv +gpt_4o_mini_2024_07_18,Hydrox Privacy,82.32,,hydrox_safety_241001.csv +gpt_4o_mini_2024_07_18,Hydrox Safety,80.87,,hydrox_safety_241001.csv +gpt_4o_mini_2024_07_18,Hydrox Security,77.55,,hydrox_safety_241001.csv +h2ogpt_4096_llama2_70b_chat,Hydrox Integrity,65.75,,hydrox_safety_241001.csv +h2ogpt_4096_llama2_70b_chat,Hydrox Overall Score,63.67,,hydrox_safety_241001.csv +h2ogpt_4096_llama2_70b_chat,Hydrox Privacy,73.46,,hydrox_safety_241001.csv +h2ogpt_4096_llama2_70b_chat,Hydrox Safety,63.64,,hydrox_safety_241001.csv +h2ogpt_4096_llama2_70b_chat,Hydrox Security,63.34,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full,Hydrox Integrity,5.96,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full,Hydrox Overall Score,7.64,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full,Hydrox Privacy,6.16,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full,Hydrox Safety,11.03,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full,Hydrox Security,5.1,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full_3_epoch,Hydrox Integrity,35.51,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full_3_epoch,Hydrox Overall Score,27.81,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full_3_epoch,Hydrox Privacy,32.34,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full_3_epoch,Hydrox Safety,22.95,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full_3_epoch,Hydrox Security,25.64,,hydrox_safety_241001.csv 
+hydro_safe_llama2_7b_chat_dpo_full_3_epoch,Hydrox Integrity,84.27,,hydrox_safety_241001.csv +hydro_safe_llama2_7b_chat_dpo_full_3_epoch,Hydrox Overall Score,83.93,,hydrox_safety_241001.csv +hydro_safe_llama2_7b_chat_dpo_full_3_epoch,Hydrox Privacy,90.63,,hydrox_safety_241001.csv +hydro_safe_llama2_7b_chat_dpo_full_3_epoch,Hydrox Safety,79.83,,hydrox_safety_241001.csv +hydro_safe_llama2_7b_chat_dpo_full_3_epoch,Hydrox Security,84.68,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,Hydrox Integrity,97.74,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,Hydrox Overall Score,91.6,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,Hydrox Privacy,96.21,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,Hydrox Safety,86.56,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,Hydrox Security,91.35,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_v0_1_dpo_full,Hydrox Integrity,98.16,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_v0_1_dpo_full,Hydrox Overall Score,94.44,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_v0_1_dpo_full,Hydrox Privacy,99.62,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_v0_1_dpo_full,Hydrox Safety,89.41,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_v0_1_dpo_full,Hydrox Security,96.66,,hydrox_safety_241001.csv +hydro_safe_sheared_llama_1_3b_dpo_full,Hydrox Integrity,35.98,,hydrox_safety_241001.csv +hydro_safe_sheared_llama_1_3b_dpo_full,Hydrox Overall Score,31.87,,hydrox_safety_241001.csv +hydro_safe_sheared_llama_1_3b_dpo_full,Hydrox Privacy,45.3,,hydrox_safety_241001.csv +hydro_safe_sheared_llama_1_3b_dpo_full,Hydrox Safety,26.44,,hydrox_safety_241001.csv +hydro_safe_sheared_llama_1_3b_dpo_full,Hydrox Security,27.07,,hydrox_safety_241001.csv +hydro_safe_zephyr_td_full,Hydrox Integrity,71.25,,hydrox_safety_241001.csv +hydro_safe_zephyr_td_full,Hydrox Overall Score,78.18,,hydrox_safety_241001.csv +hydro_safe_zephyr_td_full,Hydrox Privacy,49.7,,hydrox_safety_241001.csv +hydro_safe_zephyr_td_full,Hydrox Safety,78.18,,hydrox_safety_241001.csv +hydro_safe_zephyr_td_full,Hydrox Security,66.63,,hydrox_safety_241001.csv +komt_mistral_7b_v1,Hydrox Integrity,0.0,,hydrox_safety_241001.csv +komt_mistral_7b_v1,Hydrox Overall Score,0.13,,hydrox_safety_241001.csv +komt_mistral_7b_v1,Hydrox Privacy,0.02,,hydrox_safety_241001.csv +komt_mistral_7b_v1,Hydrox Safety,0.65,,hydrox_safety_241001.csv +komt_mistral_7b_v1,Hydrox Security,0.0,,hydrox_safety_241001.csv +llama3_2_1b_instruct,Hydrox Integrity,76.98,,hydrox_safety_241001.csv +llama3_2_1b_instruct,Hydrox Overall Score,75.78,,hydrox_safety_241001.csv +llama3_2_1b_instruct,Hydrox Privacy,75.71,,hydrox_safety_241001.csv +llama3_2_1b_instruct,Hydrox Safety,76.25,,hydrox_safety_241001.csv +llama3_2_1b_instruct,Hydrox Security,74.2,,hydrox_safety_241001.csv +llama3_2_3b_instruct,Hydrox Integrity,79.24,,hydrox_safety_241001.csv +llama3_2_3b_instruct,Hydrox Overall Score,77.42,,hydrox_safety_241001.csv +llama3_2_3b_instruct,Hydrox Privacy,77.9,,hydrox_safety_241001.csv +llama3_2_3b_instruct,Hydrox Safety,79.46,,hydrox_safety_241001.csv +llama3_2_3b_instruct,Hydrox Security,72.51,,hydrox_safety_241001.csv +llama3_70b_instruct,Hydrox Integrity,73.55,,hydrox_safety_241001.csv +llama3_70b_instruct,Hydrox Overall Score,74.44,,hydrox_safety_241001.csv +llama3_70b_instruct,Hydrox Privacy,80.65,,hydrox_safety_241001.csv +llama3_70b_instruct,Hydrox Safety,74.65,,hydrox_safety_241001.csv 
+llama3_70b_instruct,Hydrox Security,70.21,,hydrox_safety_241001.csv +llama3_8b_instruct,Hydrox Integrity,80.86,,hydrox_safety_241001.csv +llama3_8b_instruct,Hydrox Overall Score,83.72,,hydrox_safety_241001.csv +llama3_8b_instruct,Hydrox Privacy,88.61,,hydrox_safety_241001.csv +llama3_8b_instruct,Hydrox Safety,83.32,,hydrox_safety_241001.csv +llama3_8b_instruct,Hydrox Security,82.51,,hydrox_safety_241001.csv +llama_2_13b_chat,Hydrox Integrity,62.67,,hydrox_safety_241001.csv +llama_2_13b_chat,Hydrox Overall Score,60.0,,hydrox_safety_241001.csv +llama_2_13b_chat,Hydrox Privacy,63.37,,hydrox_safety_241001.csv +llama_2_13b_chat,Hydrox Safety,58.6,,hydrox_safety_241001.csv +llama_2_13b_chat,Hydrox Security,57.85,,hydrox_safety_241001.csv +llama_2_70b_chat,Hydrox Integrity,63.0,,hydrox_safety_241001.csv +llama_2_70b_chat,Hydrox Overall Score,62.5,,hydrox_safety_241001.csv +llama_2_70b_chat,Hydrox Privacy,68.87,,hydrox_safety_241001.csv +llama_2_70b_chat,Hydrox Safety,61.0,,hydrox_safety_241001.csv +llama_2_70b_chat,Hydrox Security,59.58,,hydrox_safety_241001.csv +llama_2_7b_chat,Hydrox Integrity,51.63,,hydrox_safety_241001.csv +llama_2_7b_chat,Hydrox Overall Score,51.26,,hydrox_safety_241001.csv +llama_2_7b_chat,Hydrox Privacy,55.3,,hydrox_safety_241001.csv +llama_2_7b_chat,Hydrox Safety,52.3,,hydrox_safety_241001.csv +llama_2_7b_chat,Hydrox Security,46.71,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_1,Hydrox Integrity,12.39,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_1,Hydrox Overall Score,16.74,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_1,Hydrox Privacy,12.08,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_1,Hydrox Safety,26.91,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_1,Hydrox Security,10.86,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_2,Hydrox Integrity,32.52,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_2,Hydrox Overall Score,36.82,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_2,Hydrox Privacy,37.18,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_2,Hydrox Safety,41.71,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_2,Hydrox Security,32.24,,hydrox_safety_241001.csv +mistral_7b_v0_1,Hydrox Integrity,8.53,,hydrox_safety_241001.csv +mistral_7b_v0_1,Hydrox Overall Score,7.32,,hydrox_safety_241001.csv +mistral_7b_v0_1,Hydrox Privacy,4.18,,hydrox_safety_241001.csv +mistral_7b_v0_1,Hydrox Safety,11.38,,hydrox_safety_241001.csv +mistral_7b_v0_1,Hydrox Security,4.44,,hydrox_safety_241001.csv +mixtral_8x7b_instruct_v0_1,Hydrox Integrity,21.23,,hydrox_safety_241001.csv +mixtral_8x7b_instruct_v0_1,Hydrox Overall Score,23.75,,hydrox_safety_241001.csv +mixtral_8x7b_instruct_v0_1,Hydrox Privacy,25.04,,hydrox_safety_241001.csv +mixtral_8x7b_instruct_v0_1,Hydrox Safety,27.7,,hydrox_safety_241001.csv +mixtral_8x7b_instruct_v0_1,Hydrox Security,18.24,,hydrox_safety_241001.csv +mixtral_8x7b_v0_1,Hydrox Integrity,8.16,,hydrox_safety_241001.csv +mixtral_8x7b_v0_1,Hydrox Overall Score,8.81,,hydrox_safety_241001.csv +mixtral_8x7b_v0_1,Hydrox Privacy,8.81,,hydrox_safety_241001.csv +mixtral_8x7b_v0_1,Hydrox Safety,10.61,,hydrox_safety_241001.csv +mixtral_8x7b_v0_1,Hydrox Security,6.73,,hydrox_safety_241001.csv +neural_chat_7b_v3_1,Hydrox Integrity,22.84,,hydrox_safety_241001.csv +neural_chat_7b_v3_1,Hydrox Overall Score,17.86,,hydrox_safety_241001.csv +neural_chat_7b_v3_1,Hydrox Privacy,22.28,,hydrox_safety_241001.csv +neural_chat_7b_v3_1,Hydrox Safety,15.86,,hydrox_safety_241001.csv +neural_chat_7b_v3_1,Hydrox Security,14.72,,hydrox_safety_241001.csv 
+neural_chat_7b_v3_2,Hydrox Integrity,15.33,,hydrox_safety_241001.csv +neural_chat_7b_v3_2,Hydrox Overall Score,17.82,,hydrox_safety_241001.csv +neural_chat_7b_v3_2,Hydrox Privacy,14.36,,hydrox_safety_241001.csv +neural_chat_7b_v3_2,Hydrox Safety,19.68,,hydrox_safety_241001.csv +neural_chat_7b_v3_2,Hydrox Security,18.62,,hydrox_safety_241001.csv +nexusraven_v2_13b,Hydrox Integrity,4.5,,hydrox_safety_241001.csv +nexusraven_v2_13b,Hydrox Overall Score,4.16,,hydrox_safety_241001.csv +nexusraven_v2_13b,Hydrox Privacy,3.13,,hydrox_safety_241001.csv +nexusraven_v2_13b,Hydrox Safety,3.95,,hydrox_safety_241001.csv +nexusraven_v2_13b,Hydrox Security,4.77,,hydrox_safety_241001.csv +notus_7b_v1,Hydrox Integrity,19.5,,hydrox_safety_241001.csv +notus_7b_v1,Hydrox Overall Score,21.3,,hydrox_safety_241001.csv +notus_7b_v1,Hydrox Privacy,22.05,,hydrox_safety_241001.csv +notus_7b_v1,Hydrox Safety,26.55,,hydrox_safety_241001.csv +notus_7b_v1,Hydrox Security,15.53,,hydrox_safety_241001.csv +orca_2_13b,Hydrox Integrity,0.0,,hydrox_safety_241001.csv +orca_2_13b,Hydrox Overall Score,17.48,,hydrox_safety_241001.csv +orca_2_13b,Hydrox Privacy,27.78,,hydrox_safety_241001.csv +orca_2_13b,Hydrox Safety,33.06,,hydrox_safety_241001.csv +orca_2_13b,Hydrox Security,0.0,,hydrox_safety_241001.csv +orca_2_7b,Hydrox Integrity,22.09,,hydrox_safety_241001.csv +orca_2_7b,Hydrox Overall Score,19.53,,hydrox_safety_241001.csv +orca_2_7b,Hydrox Privacy,18.31,,hydrox_safety_241001.csv +orca_2_7b,Hydrox Safety,18.3,,hydrox_safety_241001.csv +orca_2_7b,Hydrox Security,20.52,,hydrox_safety_241001.csv +pythia_70m_deduped,Hydrox Integrity,0.0,,hydrox_safety_241001.csv +pythia_70m_deduped,Hydrox Overall Score,0.0,,hydrox_safety_241001.csv +pythia_70m_deduped,Hydrox Privacy,0.0,,hydrox_safety_241001.csv +pythia_70m_deduped,Hydrox Safety,0.0,,hydrox_safety_241001.csv +pythia_70m_deduped,Hydrox Security,0.0,,hydrox_safety_241001.csv +qwen2_72b_instruct,Hydrox Integrity,70.13,,hydrox_safety_241001.csv +qwen2_72b_instruct,Hydrox Overall Score,71.86,,hydrox_safety_241001.csv +qwen2_72b_instruct,Hydrox Privacy,73.4,,hydrox_safety_241001.csv +qwen2_72b_instruct,Hydrox Safety,77.1,,hydrox_safety_241001.csv +qwen2_72b_instruct,Hydrox Security,65.19,,hydrox_safety_241001.csv +sheared_llama_1_3b,Hydrox Integrity,0.04,,hydrox_safety_241001.csv +sheared_llama_1_3b,Hydrox Overall Score,0.29,,hydrox_safety_241001.csv +sheared_llama_1_3b,Hydrox Privacy,0.05,,hydrox_safety_241001.csv +sheared_llama_1_3b,Hydrox Safety,1.14,,hydrox_safety_241001.csv +sheared_llama_1_3b,Hydrox Security,0.03,,hydrox_safety_241001.csv +solar_0_70b_16bit,Hydrox Integrity,30.25,,hydrox_safety_241001.csv +solar_0_70b_16bit,Hydrox Overall Score,24.5,,hydrox_safety_241001.csv +solar_0_70b_16bit,Hydrox Privacy,33.8,,hydrox_safety_241001.csv +solar_0_70b_16bit,Hydrox Safety,22.4,,hydrox_safety_241001.csv +solar_0_70b_16bit,Hydrox Security,17.55,,hydrox_safety_241001.csv +tinyllama_1_1b_chat_v1_0,Hydrox Integrity,5.65,,hydrox_safety_241001.csv +tinyllama_1_1b_chat_v1_0,Hydrox Overall Score,5.38,,hydrox_safety_241001.csv +tinyllama_1_1b_chat_v1_0,Hydrox Privacy,3.3,,hydrox_safety_241001.csv +tinyllama_1_1b_chat_v1_0,Hydrox Safety,6.87,,hydrox_safety_241001.csv +tinyllama_1_1b_chat_v1_0,Hydrox Security,4.57,,hydrox_safety_241001.csv +vicuna_13b_v1_5,Hydrox Integrity,36.08,,hydrox_safety_241001.csv +vicuna_13b_v1_5,Hydrox Overall Score,34.07,,hydrox_safety_241001.csv +vicuna_13b_v1_5,Hydrox Privacy,29.78,,hydrox_safety_241001.csv +vicuna_13b_v1_5,Hydrox 
Safety,38.46,,hydrox_safety_241001.csv +vicuna_13b_v1_5,Hydrox Security,30.71,,hydrox_safety_241001.csv +vicuna_13b_v1_5_16k,Hydrox Integrity,22.25,,hydrox_safety_241001.csv +vicuna_13b_v1_5_16k,Hydrox Overall Score,19.31,,hydrox_safety_241001.csv +vicuna_13b_v1_5_16k,Hydrox Privacy,17.01,,hydrox_safety_241001.csv +vicuna_13b_v1_5_16k,Hydrox Safety,21.14,,hydrox_safety_241001.csv +vicuna_13b_v1_5_16k,Hydrox Security,16.99,,hydrox_safety_241001.csv +vicuna_33b_v1_3,Hydrox Integrity,18.64,,hydrox_safety_241001.csv +vicuna_33b_v1_3,Hydrox Overall Score,17.64,,hydrox_safety_241001.csv +vicuna_33b_v1_3,Hydrox Privacy,21.34,,hydrox_safety_241001.csv +vicuna_33b_v1_3,Hydrox Safety,18.42,,hydrox_safety_241001.csv +vicuna_33b_v1_3,Hydrox Security,13.89,,hydrox_safety_241001.csv +vicuna_7b_v1_5,Hydrox Integrity,11.74,,hydrox_safety_241001.csv +vicuna_7b_v1_5,Hydrox Overall Score,15.37,,hydrox_safety_241001.csv +vicuna_7b_v1_5,Hydrox Privacy,10.91,,hydrox_safety_241001.csv +vicuna_7b_v1_5,Hydrox Safety,22.47,,hydrox_safety_241001.csv +vicuna_7b_v1_5,Hydrox Security,12.61,,hydrox_safety_241001.csv +viking_13b,Hydrox Integrity,7.68,,hydrox_safety_241001.csv +viking_13b,Hydrox Overall Score,7.33,,hydrox_safety_241001.csv +viking_13b,Hydrox Privacy,8.32,,hydrox_safety_241001.csv +viking_13b,Hydrox Safety,7.75,,hydrox_safety_241001.csv +viking_13b,Hydrox Security,5.76,,hydrox_safety_241001.csv +viking_33b,Hydrox Integrity,6.38,,hydrox_safety_241001.csv +viking_33b,Hydrox Overall Score,6.73,,hydrox_safety_241001.csv +viking_33b,Hydrox Privacy,6.48,,hydrox_safety_241001.csv +viking_33b,Hydrox Safety,6.87,,hydrox_safety_241001.csv +viking_33b,Hydrox Security,6.92,,hydrox_safety_241001.csv +viking_7b,Hydrox Integrity,9.05,,hydrox_safety_241001.csv +viking_7b,Hydrox Overall Score,6.15,,hydrox_safety_241001.csv +viking_7b,Hydrox Privacy,3.91,,hydrox_safety_241001.csv +viking_7b,Hydrox Safety,5.37,,hydrox_safety_241001.csv +viking_7b,Hydrox Security,7.6,,hydrox_safety_241001.csv +wizardlm_30b_v1_0,Hydrox Integrity,5.58,,hydrox_safety_241001.csv +wizardlm_30b_v1_0,Hydrox Overall Score,6.41,,hydrox_safety_241001.csv +wizardlm_30b_v1_0,Hydrox Privacy,3.88,,hydrox_safety_241001.csv +wizardlm_30b_v1_0,Hydrox Safety,8.0,,hydrox_safety_241001.csv +wizardlm_30b_v1_0,Hydrox Security,6.49,,hydrox_safety_241001.csv +yi_6b_chat,Hydrox Integrity,36.02,,hydrox_safety_241001.csv +yi_6b_chat,Hydrox Overall Score,37.0,,hydrox_safety_241001.csv +yi_6b_chat,Hydrox Privacy,45.36,,hydrox_safety_241001.csv +yi_6b_chat,Hydrox Safety,37.35,,hydrox_safety_241001.csv +yi_6b_chat,Hydrox Security,31.49,,hydrox_safety_241001.csv +zephyr_7b_beta,Hydrox Integrity,24.95,,hydrox_safety_241001.csv +zephyr_7b_beta,Hydrox Overall Score,23.8,,hydrox_safety_241001.csv +zephyr_7b_beta,Hydrox Privacy,30.6,,hydrox_safety_241001.csv +zephyr_7b_beta,Hydrox Safety,21.2,,hydrox_safety_241001.csv +zephyr_7b_beta,Hydrox Security,22.4,,hydrox_safety_241001.csv +zephyr_reproduction_dpo_full,Hydrox Integrity,26.05,,hydrox_safety_241001.csv +zephyr_reproduction_dpo_full,Hydrox Overall Score,21.38,,hydrox_safety_241001.csv +zephyr_reproduction_dpo_full,Hydrox Privacy,21.65,,hydrox_safety_241001.csv +zephyr_reproduction_dpo_full,Hydrox Safety,19.35,,hydrox_safety_241001.csv +zephyr_reproduction_dpo_full,Hydrox Security,21.22,,hydrox_safety_241001.csv +zephyr_reproduction_sft_full,Hydrox Integrity,13.61,,hydrox_safety_241001.csv +zephyr_reproduction_sft_full,Hydrox Overall Score,13.1,,hydrox_safety_241001.csv +zephyr_reproduction_sft_full,Hydrox 
Privacy,14.94,,hydrox_safety_241001.csv +zephyr_reproduction_sft_full,Hydrox Safety,14.92,,hydrox_safety_241001.csv +zephyr_reproduction_sft_full,Hydrox Security,9.5,,hydrox_safety_241001.csv +alpaca_7b,aggregate,0.23484848484848483,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench 
Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate +chatglm2_6b,aggregate,0.029137529137529136,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 
'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate +chatgpt_4o_latest,aggregate,0.9754079254079254,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench 
Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate +claude_2_0,aggregate,0.8333333333333334,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following 
Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate +claude_2_1,aggregate,0.6693861693861693,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox 
Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate +claude_3_5_sonnet_20240620,aggregate,0.9572649572649573,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox 
Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate +claude_3_haiku_20240307,aggregate,0.44965034965034967,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox 
Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate +claude_3_opus_20240229,aggregate,0.8824397824397824,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox 
Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate +claude_3_sonnet_20240229,aggregate,0.5985236985236985,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox 
Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate +claude_instant_1_2,aggregate,0.6486013986013985,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 
'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate +command_r,aggregate,0.3296911421911422,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall 
Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate +command_r_plus,aggregate,0.6183108558108558,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox 
Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate +dbrx_instruct,aggregate,0.4724025974025974,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox 
Safety', 'Hydrox Security']",aggregate +dbrx_instructruct,aggregate,0.5379867046533713,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 
'Hydrox Security']",aggregate +deepseek_coder_v2,aggregate,0.713053613053613,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox 
Security']",aggregate +deepseek_llm_67b_chat,aggregate,0.5734841290396846,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox 
Security']",aggregate +dolphin_2_2_1_mistral_7b,aggregate,0.4810606060606061,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox 
Security']",aggregate +falcon_40b,aggregate,0.3502690724912947,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+falcon_40b_instruct,aggregate,0.13187429854096522,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+falcon_7b,aggregate,0.11380183602405824,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+falcon_7b_instruct,aggregate,0.011363636363636364,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gemini_1_5_flash_api_0514,aggregate,0.7263403263403263,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gemini_1_5_pro_api_0514,aggregate,0.8294871794871794,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gemini_1_5_pro_exp_0801,aggregate,0.9545454545454546,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gemini_pro,aggregate,0.7298951048951049,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gemma_1_1_2b_it,aggregate,0.07454890788224121,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gemma_1_1_7b_it,aggregate,0.263927019482575,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gemma_2_27b_it,aggregate,0.776345259678593,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gemma_2_2b_it,aggregate,0.28113553113553114,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gemma_2_9b_it,aggregate,0.6048877048877048,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gemma_2_9b_it_dpo,aggregate,0.8100649350649352,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gemma_2_9b_it_simpo,aggregate,0.7328042328042329,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gemma_2b_it,aggregate,0.08119658119658119,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gemma_7b,aggregate,0.4477682811016144,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gemma_7b_it,aggregate,0.18790982679871568,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+glm_4_9b_chat,aggregate,0.4769547325102881,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gpt_3_5_turbo_0125,aggregate,0.3591242091242091,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gpt_3_5_turbo_0613,aggregate,0.6851851851851851,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gpt_4_0125_preview,aggregate,0.8492118992118992,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gpt_4_0613,aggregate,0.7641802641802643,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gpt_4_turbo_2024_04_09,aggregate,0.9055819180819181,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gpt_4o_2024_05_13,aggregate,0.9767482517482518,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gpt_4o_2024_08_06,aggregate,0.9652680652680652,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gpt_4o_mini_2024_07_18,aggregate,0.8348776223776224,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gpt_j_6b,aggregate,0.09876543209876543,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+gpt_neox_20b,aggregate,0.1419753086419753,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+guanaco_33b,aggregate,0.38374125874125875,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+hermes_3_llama3_1_70b,aggregate,0.8451178451178452,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+infinity_instruct_3m_0625_llama3_8b,aggregate,0.6537598204264872,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate
+internlm2_chat_20b,aggregate,0.37196969696969695,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate
+jurassic_2_grande_17b,aggregate,0.4230769230769231,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate
+jurassic_2_jumbo_178b,aggregate,0.532051282051282,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate
+llama3_1_405b_instruct,aggregate,0.8598484848484849,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate
+llama3_1_70b_instruct,aggregate,0.9343074620852398,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate
+llama3_1_8b_instruct,aggregate,0.6080822469711359,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate
+llama3_70b,aggregate,0.8129154795821463,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate
+llama3_70b_instruct,aggregate,0.8172801478357034,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+llama3_8b,aggregate,0.4368471035137702,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+llama3_8b_instruct,aggregate,0.4449662477440255,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+llama3_instruct_8b_simpo,aggregate,0.7992424242424242,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+llama_13b,aggregate,0.2222222222222222,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+llama_2_13b,aggregate,0.4146881924659702,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+llama_2_13b_chat,aggregate,0.38675213675213677,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+llama_2_70b,aggregate,0.7293447293447294,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+llama_2_70b_chat,aggregate,0.412732329398996,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+llama_2_7b,aggregate,0.25466919911364355,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+llama_2_7b_chat,aggregate,0.1122679789346456,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+llama_65b,aggregate,0.5759734093067427,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+luminous_base_13b,aggregate,0.08333333333333333,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+luminous_extended_30b,aggregate,0.2329059829059829,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+luminous_supreme_70b,aggregate,0.32905982905982906,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+mistral_7b_instruct_v0_2,aggregate,0.250669392336059,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+mistral_7b_instruct_v0_3,aggregate,0.24534231200897869,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+mistral_7b_v0_2,aggregate,0.3773849607182941,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+mistral_7b_v0_3,aggregate,0.4228395061728395,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+mistral_large_2402,aggregate,0.5105672105672105,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+mistral_large_2407,aggregate,0.8375291375291375,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+mistral_medium,aggregate,0.657051282051282,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+mistral_small_2402,aggregate,0.47785547785547783,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+mistral_v0_1_7b,aggregate,0.6239316239316239,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+mixtral_8x22b_instruct_v0_1,aggregate,0.585565052231719,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+mixtral_8x22b_v0_1,aggregate,0.7382154882154882,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+mixtral_8x7b_instruct_v0_1,aggregate,0.284326167659501,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+mixtral_8x7b_v0_1,aggregate,0.5310044893378227,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+nous_hermes_2_mixtral_8x7b_dpo,aggregate,0.7094017094017094,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+olmo_7b,aggregate,0.06220322886989553,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+olmo_7b_instruct,aggregate,0.15669515669515668,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+openchat_3_5,aggregate,0.5270655270655271,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+openhermes_2_5_mistral_7b,aggregate,0.40103708020374684,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+phi_2,aggregate,0.19812080923192033,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+phi_3_5_mini_instruct,aggregate,0.6103254769921437,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+phi_3_5_moe_instruct,aggregate,0.7600448933782267,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+phi_3_medium_4k_instruct,aggregate,0.48541540763762986,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+phi_3_mini_128k_instruct,aggregate,0.3778468445135112,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+phi_3_mini_4k_instruct,aggregate,0.4048663270885493,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+phi_3_small_128k_instruct,aggregate,0.6561167227833894,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+phi_3_small_8k_instruct,aggregate,0.27051282051282055,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+pythia_12b,aggregate,0.05246913580246913,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+pythia_6_9b,aggregate,0.018518518518518517,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+qwen1_5_0_5b_chat,aggregate,0.012345679012345678,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+qwen1_5_110b_chat,aggregate,0.7419770353103686,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+qwen1_5_14b,aggregate,0.5797720797720798,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+qwen1_5_14b_chat,aggregate,0.45340153673487005,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+qwen1_5_1_8b_chat,aggregate,0.05544332210998878,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+qwen1_5_32b,aggregate,0.7678062678062678,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+qwen1_5_32b_chat,aggregate,0.571383349161127,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+qwen1_5_4b_chat,aggregate,0.12542806987251431,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+qwen1_5_72b_chat,aggregate,0.5463669663669664,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+qwen1_5_7b,aggregate,0.35185185185185186,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+qwen1_5_7b_chat,aggregate,0.24214088380755047,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+qwen2_0_5b_instruct,aggregate,0.055218855218855216,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+qwen2_1_5b_instruct,aggregate,0.1968574635241302,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+qwen2_72b_instruct,aggregate,0.7701936951936953,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+qwen2_7b_instruct,aggregate,0.4970445192667415,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+qwen_14b_chat,aggregate,0.2837995337995338,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+smaug_qwen2_72b_instruct,aggregate,0.8331088664421997,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+solar_10_7b_instruct_v1_0,aggregate,0.5030864197530864,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+starling_lm_7b_alpha,aggregate,0.42734323289878845,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+starling_lm_7b_beta,aggregate,0.3611888111888112,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+tulu_2_dpo_70b,aggregate,0.3585164835164835,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+vicuna_13b,aggregate,0.14714452214452214,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+vicuna_7b,aggregate,0.1885198135198135,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+vicuna_7b_v1_5,aggregate,0.15454545454545454,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+wizardlm_13b,aggregate,0.42773892773892774,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+wizardlm_70b,aggregate,0.5620629370629371,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+yi_1_5_34b_chat,aggregate,0.6669566544566544,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+yi_1_5_6b_chat,aggregate,0.33974132863021755,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+yi_1_5_9b_chat,aggregate,0.6041446208112875,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+yi_34b,aggregate,0.7188983855650521,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+yi_34b_chat,aggregate,0.5558361391694725,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+yi_6b,aggregate,0.295346628679962,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+yi_6b_chat,aggregate,0.19393939393939394,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+yi_large,aggregate,0.7889194139194139,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+yi_large_preview,aggregate,0.8714202464202464,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+zephyr_7b_alpha,aggregate,0.33875830959164294,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+zephyr_7b_beta,aggregate,0.28937667271000606,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate 
+zephyr_orpo_141b_a35b_v0_1,aggregate,0.8414055080721747,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'LMSys Arena', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'Trustworthy Average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'RULER', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'Hydrox Integrity', 'Hydrox Overall Score', 'Hydrox Privacy', 'Hydrox Safety', 'Hydrox Security']",aggregate