diff --git "a/cache/agreements_cache_5e66a88dab42480065db47711c55c458.csv" "b/cache/agreements_cache_5e66a88dab42480065db47711c55c458.csv" --- "a/cache/agreements_cache_5e66a88dab42480065db47711c55c458.csv" +++ "b/cache/agreements_cache_5e66a88dab42480065db47711c55c458.csv" @@ -1,817 +1,763 @@ scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value -Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,0,-0.14285714285714285,0.7195436507936508 -Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 -Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,2,-0.2857142857142857,0.39875992063492066 -Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6182840223353117,0.0340492747686748 -Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,-0.07142857142857142,0.9048611111111111 -Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.3571428571428571,0.27509920634920637 -Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,-0.07142857142857142,0.9048611111111111 -Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066 -Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.0,1.0 -Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 -Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066 +Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985 +Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 +Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 +Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 +Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7637626158259734,0.008839740160738534 +Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 +Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 +Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 +Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111 +Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 +Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 +Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 -Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 -Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 -Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 -Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 -Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 -Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 +Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.47280542884465016,0.10506382347888965 +Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 +Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 -Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 -Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132 -Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5455447255899809,0.0614649096074132 -Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 -Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066 -Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 +Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 +Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.836501912571304,0.004136737098676645 +Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 +Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 +Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 +Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +LMSys Arena,chatbot_arena_241104.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +LMSys Arena,chatbot_arena_241104.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 +LMSys Arena,chatbot_arena_241104.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 -HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3401680257083045,0.25175949861106117 -HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.0,1.0 -HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.14285714285714285,0.7195436507936508 -HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 -HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 -HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985 -HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5455447255899809,0.0614649096074132 -HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 -HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 -HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.10910894511799618,0.7083840532183997 -HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985 -tablebench_overall_dp,tablebench_241002.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 -tablebench_overall_dp,tablebench_241002.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 -tablebench_overall_dp,tablebench_241002.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -trustworthy_average,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -trustworthy_average,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -trustworthy_average,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -trustworthy_non_toxicity,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985 -trustworthy_non_toxicity,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985 -trustworthy_non_toxicity,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985 -trustworthy_non_stereotype,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.32732683535398854,0.2618277009271762 -trustworthy_non_stereotype,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.32732683535398854,0.2618277009271762 -trustworthy_non_stereotype,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.32732683535398854,0.2618277009271762 -trustworthy_advglue_pp,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 -trustworthy_advglue_pp,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 -trustworthy_advglue_pp,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 -trustworthy_ood,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -trustworthy_ood,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -trustworthy_ood,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -trustworthy_adv_demo,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -trustworthy_adv_demo,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -trustworthy_adv_demo,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -trustworthy_privacy,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.14285714285714285,0.7195436507936508 -trustworthy_privacy,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.14285714285714285,0.7195436507936508 -trustworthy_privacy,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508 -trustworthy_ethics,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 -trustworthy_ethics,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 -trustworthy_ethics,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 -trustworthy_fairness,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,0,-0.836501912571304,0.004136737098676645 -trustworthy_fairness,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,1,-0.836501912571304,0.004136737098676645 -trustworthy_fairness,llm_trustworthy_241001.csv,aggregate,aggregate,kendall,random,8,2,-0.836501912571304,0.004136737098676645 -OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 +HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 +HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.40006613209931935,0.17023995462900499 +HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8153742483272113,0.0057021327615243405 +HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.07142857142857142,0.9048611111111111 +HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 +HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 +HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 +HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 +HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +tablebench_overall_dp,tablebench_241002.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +tablebench_overall_dp,tablebench_241002.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +tablebench_overall_dp,tablebench_241002.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 +OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 +OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 +OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 +OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 +OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 -OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6910233190806425,0.017844011512848347 -OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05 -OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5929994533288809,0.04437842734548688 +OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 +OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9092412093166348,0.0018276750354536814 +OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6182840223353117,0.0340492747686748 +OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6910233190806425,0.017844011512848347 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6910233190806425,0.017844011512848347 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 +OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 -LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 -LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 -Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 -Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 -Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 -Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 +Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 +Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 +Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 +Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 -Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6910233190806425,0.017844011512848347 +Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.836501912571304,0.004136737098676645 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985 -Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 -Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 -Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 -Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111 -Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985 -Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 -Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3706246583305506,0.20891238174069848 -Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7637626158259734,0.008839740160738534 +Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066 +Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.07142857142857142,0.9048611111111111 +Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 +Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.0,1.0 +Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.036369648372665396,0.9007802600472398 +Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132 +Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 +Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5455447255899809,0.0614649096074132 -Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 -MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 -MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 +Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 +Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.836501912571304,0.004136737098676645 +MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 +MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 -MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 -MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 -MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 +MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 +MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508 +MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05 -MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 -MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 +MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 +MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 +MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6910233190806425,0.017844011512848347 -MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 -MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 -MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 -MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 -MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.22237479499833035,0.45088703102517036 -MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 +MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 +MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7637626158259734,0.008839740160738534 +MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 +MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7637626158259734,0.008839740160738534 +MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5455447255899809,0.0614649096074132 -MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066 -MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7637626158259734,0.008839740160738534 -AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9092412093166348,0.0018276750354536814 -AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7637626158259734,0.008839740160738534 -AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.21428571428571427,0.5484126984126985 -HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 -HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.3571428571428571,0.27509920634920637 -HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.14285714285714285,0.7195436507936508 -HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.07142857142857142,0.9048611111111111 -HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.3571428571428571,0.27509920634920637 -HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.21428571428571427,0.5484126984126985 -HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.21428571428571427,0.5484126984126985 -HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 -HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.21428571428571427,0.5484126984126985 -HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.2857142857142857,0.39875992063492066 -HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.3571428571428571,0.27509920634920637 -HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.2857142857142857,0.39875992063492066 -HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.3571428571428571,0.27509920634920637 -HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 -HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.2857142857142857,0.39875992063492066 -HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.21428571428571427,0.5484126984126985 -HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.4999999999999999,0.10868055555555556 -HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.2545875386086578,0.38281014365989596 -HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.3571428571428571,0.27509920634920637 -HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.47280542884465016,0.10506382347888965 -HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.14285714285714285,0.7195436507936508 -HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 -HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 -HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.14285714285714285,0.7195436507936508 -HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.21428571428571427,0.5484126984126985 -HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 -HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.21428571428571427,0.5484126984126985 -HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.2857142857142857,0.39875992063492066 -HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 -HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.14285714285714285,0.7195436507936508 -HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.3571428571428571,0.27509920634920637 -HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 -HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.2857142857142857,0.39875992063492066 -HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.2545875386086578,0.38281014365989596 -HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.21428571428571427,0.5484126984126985 -HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.18184824186332696,0.5330356744917513 -HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.2545875386086578,0.38281014365989596 -HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 -HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.14285714285714285,0.7195436507936508 -HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.3571428571428571,0.27509920634920637 -HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.4999999999999999,0.10868055555555556 -HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.07142857142857142,0.9048611111111111 -HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 -HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.2857142857142857,0.39875992063492066 -HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.21428571428571427,0.5484126984126985 -HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.21428571428571427,0.5484126984126985 -HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 -HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,-0.2857142857142857,0.39875992063492066 -HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,-0.2545875386086578,0.38281014365989596 -HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 +MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7412493166611012,0.011966745157436277 +AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 +HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 +HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +HELM AirBench Security Risks,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 +HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +HELM AirBench Operational Misuses,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 +HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +HELM AirBench Violence & Extremism,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066 +HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066 +HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +HELM AirBench Sexual Content,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 +HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +HELM AirBench Child Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.32732683535398854,0.2618277009271762 +HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +HELM AirBench Self Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +HELM AirBench Political Usage,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +HELM AirBench Economic Harm,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.47280542884465016,0.10506382347888965 +HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +HELM AirBench Deception,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +HELM AirBench Manipulation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.6182840223353117,0.0340492747686748 +HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.40006613209931935,0.17023995462900499 +HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +HELM AirBench Defamation,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.40006613209931935,0.17023995462900499 +HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.40006613209931935,0.17023995462900499 +HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +HELM AirBench Fundamental Rights,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.6182840223353117,0.0340492747686748 +HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.5455447255899809,0.0614649096074132 +HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.47280542884465016,0.10506382347888965 +HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +HELM AirBench Privacy,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 +HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +HELM AirBench Criminal Activities,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 +HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +HELM AirBench AIR Score,helm_airbench_240916.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 -OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 -OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.0,1.0 +OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 +OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066 +OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 -OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 -OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 +OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 +OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 +OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 -OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 -OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.47280542884465016,0.10506382347888965 -OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 -OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 -OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 -OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 -OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 -OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,1,0.14285714285714285,0.7195436507936508 -OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 -LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6910233190806425,0.017844011512848347 -LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.836501912571304,0.004136737098676645 +OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 +OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 +OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 +OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 +OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.07142857142857142,0.9048611111111111 +OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508 +OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,0,0.47280542884465016,0.10506382347888965 +OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,1,0.07142857142857142,0.9048611111111111 +OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6182840223353117,0.0340492747686748 +LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132 -LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 -LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 -LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 -LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9819805060619657,0.0007619896395304237 -LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 +LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 +LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 +LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111 +LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 -LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 +LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 -LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 -Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,aggregate,aggregate,kendall,random,8,1,0.6910233190806425,0.017844011512848347 -Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,aggregate,aggregate,kendall,random,8,2,0.18184824186332696,0.5330356744917513 -WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 -WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 -WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 -WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,aggregate,aggregate,kendall,random,8,0,0.4447495899966607,0.1315867602811863 +Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111 +WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 +WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 +WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 +WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 +WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 -WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 +WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9092412093166348,0.0018276750354536814 +WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 +WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9092412093166348,0.0018276750354536814 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 -WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 -Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 -Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 -Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 -Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 -Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 -HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 +WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 +WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 +WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 +Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 +Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,aggregate,aggregate,kendall,random,8,1,0.7637626158259734,0.008839740160738534 +Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 +Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 +HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132 -HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 -HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 -HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 -HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 +HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 +HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 +HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 +HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 +HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 -HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 -HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 -HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 -HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 -BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 +HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 +HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.0,1.0 +HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.7637626158259734,0.008839740160738534 +HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 +HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 +BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 -eq_bench,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 -eq_bench,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -eq_bench,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -magi_hard,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 -magi_hard,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -magi_hard,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 +BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +eq_bench,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +eq_bench,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +eq_bench,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +magi_hard,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +magi_hard,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 +magi_hard,eqbench_240912.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 -BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 -BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.47280542884465016,0.10506382347888965 -BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 -BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.47280542884465016,0.10506382347888965 -BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6182840223353117,0.0340492747686748 -BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132 -BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 +BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6182840223353117,0.0340492747686748 +BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985 +BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6182840223353117,0.0340492747686748 +BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 +BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 +BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985 BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 -BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066 -BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 +BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985 +BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985 +BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508 -BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 -BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 +BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 -BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 -BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7637626158259734,0.008839740160738534 -LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 -LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 -LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 -LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 -LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 -LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6910233190806425,0.017844011512848347 -LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 -LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 -LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 -LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 -LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 -LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 +BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508 +BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 +BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 +BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6182840223353117,0.0340492747686748 +LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 +LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.836501912571304,0.004136737098676645 +LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.7412493166611012,0.011966745157436277 +LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 +LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6182840223353117,0.0340492747686748 +LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 -LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 -LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -hydrox_integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985 -hydrox_integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066 -hydrox_integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 -hydrox_overall_score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.14285714285714285,0.7195436507936508 -hydrox_overall_score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985 -hydrox_overall_score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 -hydrox_privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.14285714285714285,0.7195436507936508 -hydrox_privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.14285714285714285,0.7195436507936508 -hydrox_privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 -hydrox_safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.0,1.0 -hydrox_safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.14285714285714285,0.7195436507936508 -hydrox_safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 -hydrox_security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.14285714285714285,0.7195436507936508 -hydrox_security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985 -hydrox_security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 -aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,0,-0.14285714285714285,0.7195436507936508 -aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 -aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,2,-0.2857142857142857,0.39875992063492066 -aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,1,0.6182840223353117,0.0340492747686748 -aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,0,-0.07142857142857142,0.9048611111111111 -aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,1,-0.3571428571428571,0.27509920634920637 -aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,2,-0.07142857142857142,0.9048611111111111 -aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066 -aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,1,0.0,1.0 -aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 -aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066 +LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 +LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066 +LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +Hydrox Integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.07142857142857142,0.9048611111111111 +Hydrox Integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +Hydrox Integrity,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +Hydrox Overall Score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.0,1.0 +Hydrox Overall Score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +Hydrox Overall Score,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +Hydrox Privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.0,1.0 +Hydrox Privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +Hydrox Privacy,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985 +Hydrox Safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,-0.07142857142857142,0.9048611111111111 +Hydrox Safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +Hydrox Safety,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +Hydrox Security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,0,0.07142857142857142,0.9048611111111111 +Hydrox Security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +Hydrox Security,hydrox_safety_241001.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985 +aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 +aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,1,0.7637626158259734,0.008839740160738534 +aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 +aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 +aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111 +aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 +aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 -aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 -aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 +aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,1,0.47280542884465016,0.10506382347888965 +aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 -aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132 -aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,1,0.5455447255899809,0.0614649096074132 -aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066 -aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 +aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,0,0.836501912571304,0.004136737098676645 +aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 +aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,LMSys Arena,chatbot_arena_241104.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,LMSys Arena,chatbot_arena_241104.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,LMSys Arena,chatbot_arena_241104.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.3401680257083045,0.25175949861106117 -aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.0,1.0 -aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.14285714285714285,0.7195436507936508 -aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985 -aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.5455447255899809,0.0614649096074132 -aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 -aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.10910894511799618,0.7083840532183997 -aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985 -aggregate,aggregate,tablebench_overall_dp,tablebench_241002.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 -aggregate,aggregate,tablebench_overall_dp,tablebench_241002.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 -aggregate,aggregate,tablebench_overall_dp,tablebench_241002.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,trustworthy_average,llm_trustworthy_241001.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,trustworthy_average,llm_trustworthy_241001.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,trustworthy_average,llm_trustworthy_241001.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,trustworthy_non_toxicity,llm_trustworthy_241001.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985 -aggregate,aggregate,trustworthy_non_toxicity,llm_trustworthy_241001.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985 -aggregate,aggregate,trustworthy_non_toxicity,llm_trustworthy_241001.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985 -aggregate,aggregate,trustworthy_non_stereotype,llm_trustworthy_241001.csv,kendall,random,8,0,0.32732683535398854,0.2618277009271762 -aggregate,aggregate,trustworthy_non_stereotype,llm_trustworthy_241001.csv,kendall,random,8,1,0.32732683535398854,0.2618277009271762 -aggregate,aggregate,trustworthy_non_stereotype,llm_trustworthy_241001.csv,kendall,random,8,2,0.32732683535398854,0.2618277009271762 -aggregate,aggregate,trustworthy_advglue_pp,llm_trustworthy_241001.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 -aggregate,aggregate,trustworthy_advglue_pp,llm_trustworthy_241001.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 -aggregate,aggregate,trustworthy_advglue_pp,llm_trustworthy_241001.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 -aggregate,aggregate,trustworthy_ood,llm_trustworthy_241001.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,trustworthy_ood,llm_trustworthy_241001.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,trustworthy_ood,llm_trustworthy_241001.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,trustworthy_adv_demo,llm_trustworthy_241001.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,trustworthy_adv_demo,llm_trustworthy_241001.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,trustworthy_adv_demo,llm_trustworthy_241001.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,trustworthy_privacy,llm_trustworthy_241001.csv,kendall,random,8,0,0.14285714285714285,0.7195436507936508 -aggregate,aggregate,trustworthy_privacy,llm_trustworthy_241001.csv,kendall,random,8,1,0.14285714285714285,0.7195436507936508 -aggregate,aggregate,trustworthy_privacy,llm_trustworthy_241001.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508 -aggregate,aggregate,trustworthy_ethics,llm_trustworthy_241001.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,trustworthy_ethics,llm_trustworthy_241001.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,trustworthy_ethics,llm_trustworthy_241001.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,trustworthy_fairness,llm_trustworthy_241001.csv,kendall,random,8,0,-0.836501912571304,0.004136737098676645 -aggregate,aggregate,trustworthy_fairness,llm_trustworthy_241001.csv,kendall,random,8,1,-0.836501912571304,0.004136737098676645 -aggregate,aggregate,trustworthy_fairness,llm_trustworthy_241001.csv,kendall,random,8,2,-0.836501912571304,0.004136737098676645 -aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 +aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 +aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.40006613209931935,0.17023995462900499 +aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.8153742483272113,0.0057021327615243405 +aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.07142857142857142,0.9048611111111111 +aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 +aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 +aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 +aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +aggregate,aggregate,tablebench_overall_dp,tablebench_241002.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,tablebench_overall_dp,tablebench_241002.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,tablebench_overall_dp,tablebench_241002.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 +aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 +aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 +aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,1,0.6910233190806425,0.017844011512848347 -aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05 -aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,1,0.5929994533288809,0.04437842734548688 +aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,2,0.9092412093166348,0.0018276750354536814 +aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,0,0.6182840223353117,0.0340492747686748 +aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,1,0.6910233190806425,0.017844011512848347 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,2,0.6910233190806425,0.017844011512848347 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 -aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 -aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 -aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 -aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,8,0,0.6910233190806425,0.017844011512848347 +aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,8,0,0.836501912571304,0.004136737098676645 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985 -aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 -aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111 -aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985 -aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 -aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,2,0.3706246583305506,0.20891238174069848 -aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,8,2,0.7637626158259734,0.008839740160738534 +aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066 +aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,0,0.07142857142857142,0.9048611111111111 +aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,0,0.0,1.0 +aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,1,0.036369648372665396,0.9007802600472398 +aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132 +aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,8,1,0.5455447255899809,0.0614649096074132 -aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 -aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 +aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,8,2,0.836501912571304,0.004136737098676645 +aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 -aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 -aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 -aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508 +aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05 -aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,2,0.6910233190806425,0.017844011512848347 -aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 -aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,1,0.22237479499833035,0.45088703102517036 -aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,1,0.7637626158259734,0.008839740160738534 +aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,0,0.7637626158259734,0.008839740160738534 +aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,0,0.5455447255899809,0.0614649096074132 -aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066 -aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,2,0.7637626158259734,0.008839740160738534 -aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,0,0.9092412093166348,0.0018276750354536814 -aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,1,0.7637626158259734,0.008839740160738534 -aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,0,-0.21428571428571427,0.5484126984126985 -aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 -aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,2,-0.3571428571428571,0.27509920634920637 -aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,0,-0.14285714285714285,0.7195436507936508 -aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,1,-0.07142857142857142,0.9048611111111111 -aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,2,-0.3571428571428571,0.27509920634920637 -aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,0,-0.21428571428571427,0.5484126984126985 -aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,1,-0.21428571428571427,0.5484126984126985 -aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 -aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,0,-0.21428571428571427,0.5484126984126985 -aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,1,-0.2857142857142857,0.39875992063492066 -aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,2,-0.3571428571428571,0.27509920634920637 -aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,0,-0.2857142857142857,0.39875992063492066 -aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,1,-0.3571428571428571,0.27509920634920637 -aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 -aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,0,-0.2857142857142857,0.39875992063492066 -aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,1,-0.21428571428571427,0.5484126984126985 -aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,2,-0.4999999999999999,0.10868055555555556 -aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,0,-0.2545875386086578,0.38281014365989596 -aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,1,-0.3571428571428571,0.27509920634920637 -aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,2,-0.47280542884465016,0.10506382347888965 -aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,0,-0.14285714285714285,0.7195436507936508 -aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 -aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 -aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,0,-0.14285714285714285,0.7195436507936508 -aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,1,-0.21428571428571427,0.5484126984126985 -aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 -aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,0,-0.21428571428571427,0.5484126984126985 -aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,1,-0.2857142857142857,0.39875992063492066 -aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 -aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,0,-0.14285714285714285,0.7195436507936508 -aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,1,-0.3571428571428571,0.27509920634920637 -aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 -aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,0,-0.2857142857142857,0.39875992063492066 -aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,1,-0.2545875386086578,0.38281014365989596 -aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,2,-0.21428571428571427,0.5484126984126985 -aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,0,-0.18184824186332696,0.5330356744917513 -aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,1,-0.2545875386086578,0.38281014365989596 -aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 -aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,0,-0.14285714285714285,0.7195436507936508 -aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,1,-0.3571428571428571,0.27509920634920637 -aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,2,-0.4999999999999999,0.10868055555555556 -aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,0,-0.07142857142857142,0.9048611111111111 -aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 -aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,2,-0.2857142857142857,0.39875992063492066 -aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,0,-0.21428571428571427,0.5484126984126985 -aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,1,-0.21428571428571427,0.5484126984126985 -aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 -aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,0,-0.2857142857142857,0.39875992063492066 -aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,1,-0.2545875386086578,0.38281014365989596 -aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,2,-0.42857142857142855,0.17886904761904762 +aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,2,0.7412493166611012,0.011966745157436277 +aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 +aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,HELM AirBench Security Risks,helm_airbench_240916.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,HELM AirBench Operational Misuses,helm_airbench_240916.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,HELM AirBench Violence & Extremism,helm_airbench_240916.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066 +aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,HELM AirBench Hate/Toxicity,helm_airbench_240916.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066 +aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,HELM AirBench Sexual Content,helm_airbench_240916.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,HELM AirBench Child Harm,helm_airbench_240916.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,0,0.32732683535398854,0.2618277009271762 +aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,HELM AirBench Self Harm,helm_airbench_240916.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,HELM AirBench Political Usage,helm_airbench_240916.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,HELM AirBench Economic Harm,helm_airbench_240916.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,0,0.47280542884465016,0.10506382347888965 +aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,HELM AirBench Deception,helm_airbench_240916.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,HELM AirBench Manipulation,helm_airbench_240916.csv,kendall,random,8,2,0.6182840223353117,0.0340492747686748 +aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,0,0.40006613209931935,0.17023995462900499 +aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,HELM AirBench Defamation,helm_airbench_240916.csv,kendall,random,8,2,0.40006613209931935,0.17023995462900499 +aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,0,0.40006613209931935,0.17023995462900499 +aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,HELM AirBench Fundamental Rights,helm_airbench_240916.csv,kendall,random,8,2,0.6182840223353117,0.0340492747686748 +aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,1,0.5455447255899809,0.0614649096074132 +aggregate,aggregate,HELM AirBench Discrimination/Bias,helm_airbench_240916.csv,kendall,random,8,2,0.47280542884465016,0.10506382347888965 +aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,HELM AirBench Privacy,helm_airbench_240916.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,HELM AirBench Criminal Activities,helm_airbench_240916.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,HELM AirBench AIR Score,helm_airbench_240916.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 -aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,1,0.0,1.0 +aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066 +aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 -aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 +aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 -aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,2,0.47280542884465016,0.10506382347888965 -aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 -aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 -aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 -aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 -aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,1,0.14285714285714285,0.7195436507936508 -aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 -aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,0,0.6910233190806425,0.017844011512848347 -aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,1,0.836501912571304,0.004136737098676645 +aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,1,-0.07142857142857142,0.9048611111111111 +aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508 +aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,0,0.47280542884465016,0.10506382347888965 +aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,1,0.07142857142857142,0.9048611111111111 +aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,0,0.6182840223353117,0.0340492747686748 +aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132 -aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 -aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 -aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 -aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,0,0.9819805060619657,0.0007619896395304237 -aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111 +aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 -aggregate,aggregate,Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,kendall,random,8,1,0.6910233190806425,0.017844011512848347 -aggregate,aggregate,Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,kendall,random,8,2,0.18184824186332696,0.5330356744917513 -aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 -aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 -aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 -aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,kendall,random,8,0,0.4447495899966607,0.1315867602811863 +aggregate,aggregate,Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,Enkrypt AI Safety,enkrypt_ai_safety_240916.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111 +aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 +aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,2,0.9092412093166348,0.0018276750354536814 +aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,2,0.9092412093166348,0.0018276750354536814 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 -aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 +aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 +aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 +aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 +aggregate,aggregate,Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 +aggregate,aggregate,Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,kendall,random,8,1,0.7637626158259734,0.008839740160738534 +aggregate,aggregate,Decentralized Arena (0-1 Normalized),dec_arena_241022.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 +aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132 -aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 +aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 +aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 -aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 -aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 -aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 +aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.0,1.0 +aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.7637626158259734,0.008839740160738534 +aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,eq_bench,eqbench_240912.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,eq_bench,eqbench_240912.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,eq_bench,eqbench_240912.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,magi_hard,eqbench_240912.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,magi_hard,eqbench_240912.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,magi_hard,eqbench_240912.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,eq_bench,eqbench_240912.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,eq_bench,eqbench_240912.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,eq_bench,eqbench_240912.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,magi_hard,eqbench_240912.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,magi_hard,eqbench_240912.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,magi_hard,eqbench_240912.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 -aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,2,0.47280542884465016,0.10506382347888965 -aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,0,0.47280542884465016,0.10506382347888965 -aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,1,0.6182840223353117,0.0340492747686748 -aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132 -aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,1,0.6182840223353117,0.0340492747686748 +aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985 +aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,0,0.6182840223353117,0.0340492747686748 +aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 +aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 +aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985 aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066 -aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985 +aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985 +aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508 -aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 -aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 -aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 -aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,2,0.7637626158259734,0.008839740160738534 -aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 -aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 -aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,2,0.6910233190806425,0.017844011512848347 -aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 -aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 -aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 -aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 -aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 -aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 +aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508 +aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 +aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,2,0.6182840223353117,0.0340492747686748 +aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,0,0.836501912571304,0.004136737098676645 +aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,1,0.7412493166611012,0.011966745157436277 +aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 +aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,2,0.6182840223353117,0.0340492747686748 +aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 +aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 +aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 -aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 -aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 -aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 -aggregate,aggregate,hydrox_integrity,hydrox_safety_241001.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985 -aggregate,aggregate,hydrox_integrity,hydrox_safety_241001.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066 -aggregate,aggregate,hydrox_integrity,hydrox_safety_241001.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 -aggregate,aggregate,hydrox_overall_score,hydrox_safety_241001.csv,kendall,random,8,0,0.14285714285714285,0.7195436507936508 -aggregate,aggregate,hydrox_overall_score,hydrox_safety_241001.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985 -aggregate,aggregate,hydrox_overall_score,hydrox_safety_241001.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 -aggregate,aggregate,hydrox_privacy,hydrox_safety_241001.csv,kendall,random,8,0,0.14285714285714285,0.7195436507936508 -aggregate,aggregate,hydrox_privacy,hydrox_safety_241001.csv,kendall,random,8,1,0.14285714285714285,0.7195436507936508 -aggregate,aggregate,hydrox_privacy,hydrox_safety_241001.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 -aggregate,aggregate,hydrox_safety,hydrox_safety_241001.csv,kendall,random,8,0,0.0,1.0 -aggregate,aggregate,hydrox_safety,hydrox_safety_241001.csv,kendall,random,8,1,0.14285714285714285,0.7195436507936508 -aggregate,aggregate,hydrox_safety,hydrox_safety_241001.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 -aggregate,aggregate,hydrox_security,hydrox_safety_241001.csv,kendall,random,8,0,0.14285714285714285,0.7195436507936508 -aggregate,aggregate,hydrox_security,hydrox_safety_241001.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985 -aggregate,aggregate,hydrox_security,hydrox_safety_241001.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 +aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066 +aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,Hydrox Integrity,hydrox_safety_241001.csv,kendall,random,8,0,0.07142857142857142,0.9048611111111111 +aggregate,aggregate,Hydrox Integrity,hydrox_safety_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,Hydrox Integrity,hydrox_safety_241001.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +aggregate,aggregate,Hydrox Overall Score,hydrox_safety_241001.csv,kendall,random,8,0,0.0,1.0 +aggregate,aggregate,Hydrox Overall Score,hydrox_safety_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,Hydrox Overall Score,hydrox_safety_241001.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +aggregate,aggregate,Hydrox Privacy,hydrox_safety_241001.csv,kendall,random,8,0,0.0,1.0 +aggregate,aggregate,Hydrox Privacy,hydrox_safety_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,Hydrox Privacy,hydrox_safety_241001.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985 +aggregate,aggregate,Hydrox Safety,hydrox_safety_241001.csv,kendall,random,8,0,-0.07142857142857142,0.9048611111111111 +aggregate,aggregate,Hydrox Safety,hydrox_safety_241001.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 +aggregate,aggregate,Hydrox Safety,hydrox_safety_241001.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 +aggregate,aggregate,Hydrox Security,hydrox_safety_241001.csv,kendall,random,8,0,0.07142857142857142,0.9048611111111111 +aggregate,aggregate,Hydrox Security,hydrox_safety_241001.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 +aggregate,aggregate,Hydrox Security,hydrox_safety_241001.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066