diff --git "a/cache/agreements_cache_151f5bfbf87ac7384c2759731c72ec0c.csv" "b/cache/agreements_cache_151f5bfbf87ac7384c2759731c72ec0c.csv" deleted file mode 100644--- "a/cache/agreements_cache_151f5bfbf87ac7384c2759731c72ec0c.csv" +++ /dev/null @@ -1,1001 +0,0 @@ -scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value -Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,0,-0.017485869096098686,0.9672206778351959 -Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,1,-0.06826285140114943,0.8724042132624071 -Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,2,-0.27291992568490936,0.5131179718629255 -Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.0623085741331382,0.8834734515868299 -Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,4,0.11553071904436202,0.7852997192967395 -Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8743737489954189,0.004501296794893102 -Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8019858294586086,0.01664169341252048 -Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.865218326418788,0.005519059390504801 -Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9324959770534272,0.0007305971150650418 -Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9578331579912773,0.00018155839890573593 -Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,-0.30992157835736617,0.4550353006304514 -Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,-0.48460771469003827,0.2235972811859595 -Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,-0.1162588388208577,0.78397092283469 -Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.03180360013624742,0.9404084479868535 -Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.6310234888301745,0.09339585968843296 -Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5719061307929368,0.1385541569597628 -Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,-0.2953447949582872,0.47758892197811004 -Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,-0.08547114468780825,0.8405203853999355 -Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.02680948636066538,0.9497562944796989 -Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.4016145018471783,0.32402730112296474 -Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7247956777996108,0.04194484960329344 -Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.2767660595168839,0.5069548295866992 -Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.3337223270100439,0.4191769676693079 -Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.6126891094585267,0.10632638977302632 -Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8079257463851817,0.015261307993340337 -Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6647150497002838,0.07212235537894374 -Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9659235574949907,9.641323857066814e-05 -Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8593434484023453,0.0062437049978399314 -Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7704800482268904,0.025262942539415363 -Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9028773381740962,0.002126756432137772 -Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.748982925973149,0.032470780295939985 -Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8599957450436625,0.006160409391629476 -Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8718735582848011,0.004766072993988772 -Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9069576656171551,0.001875739334441522 -Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9502933219669614,0.00029570003340264575 -Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8905328662549648,0.003016032865892646 -Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5058552901713423,0.20090402274559316 -Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6767432630833718,0.0652968761285632 -Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7135518769682414,0.04685902831102101 -Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.873661116609048,0.004575776138454243 -Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8775217778627072,0.004181622363896538 -Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7683490298001087,0.025928082489068475 -Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.741463148953373,0.035258455741147623 -Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7891209052525207,0.019892902878583873 -Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8157900850650412,0.013547661219765379 -Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8625206786227912,0.005844699973375535 -Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.49625129009057833,0.211004712621783 -Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7482300147416783,0.0327435760119495 -Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9237060456412569,0.0010476652712265917 -Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8540419074377281,0.00694751386877189 -Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7827735900001105,0.021632253958226707 -Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7416615606437577,0.03518309274676423 -Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8208959354305796,0.01250307893717913 -Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9182336628416601,0.0012842298120423852 -Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9199026021249039,0.0012087423991030853 -Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7807842071724994,0.022196180227557687 -Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6016089012086534,0.11460809097860054 -Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.85978308688271,0.006187486327563118 -Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9471155608874564,0.00035525230596496123 -Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9238574615349179,0.0010415614421426264 -HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.780599537830846,0.022248986205867058 -HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.753379355065838,0.030905705190702806 -HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8379676352721162,0.009384640911630616 -HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8462209992405952,0.008075105621350536 -HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9020771423654268,0.0021784040615750178 -HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9392379026634557,0.000535591367028614 -HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7412355057774336,0.035345043191044964 -HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8195179387247324,0.01277979740900836 -HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9675915145186947,8.304238414993675e-05 -HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9356246311290696,0.0006351718939850358 -HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7466011946729814,0.03333852605723143 -HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9551682330569339,0.00021776057653192886 -HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.46353588273705637,0.24734250900688215 -HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8866352243352398,0.003339629955133934 -HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.937902652612242,0.0005710971446370687 -HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.2831911510498836,0.4967225093410736 -HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.2031844122583542,0.6293846722461313 -HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8894964926830444,0.0031000020401251533 -HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.549284007260608,0.15849945140105312 -HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7889373199563972,0.01994193933246426 -HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9415411104598773,0.00047780769988844555 -HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8981158348442198,0.0024460728519243077 -HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7526431927239958,0.0311644661156264 -HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8962925022649735,0.0025761063553240114 -HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.937590300147702,0.0005796196796032962 -HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5831241321997315,0.12921116102954364 -HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5561145441014004,0.1523217142123119 -HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5664450708720614,0.14323389729888122 -HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.47517181530974595,0.23407895750101468 -HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.718855715365913,0.04449992445427745 -HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7168604276016974,0.04537877960385103 -HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.18264726732113173,0.6650765454064547 -HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.04614314940391431,0.9136043258512831 -HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.6369093478690498,0.08944819108801377 -HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8650362997962656,0.005540656777637369 -OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9481614738377944,0.00033485605767966255 -OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8579024362848122,0.006430262194723998 -OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9674751054383679,8.39330376548511e-05 -OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9711920829273566,5.848502027941985e-05 -OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9121630061872308,0.0015845787994022296 -OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7303458809128464,0.03963972108447683 -OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7466964409211542,0.03330355520543848 -OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8886798251454765,0.0031672235640011434 -OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9036719475219376,0.002076262347775526 -OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7623592248502944,0.02785522986224059 -OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8902509919824877,0.0030387234498153886 -OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8349964637145074,0.009887030967730168 -OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9513669166922365,0.00027717775621958416 -OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.979588964641596,2.0934517813580252e-05 -OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8898917220751776,0.0030678038612609354 -OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8000397965603336,0.01711033114623395 -OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7666453684194998,0.026467542617941944 -OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8751438663188438,0.004421691058140597 -OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8954496186826447,0.0026376993343606783 -OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8195357136433342,0.012776203631959988 -OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8973997559676354,0.0024966210305528294 -OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9238541898435834,0.0010416930833947954 -OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9232578806881373,0.0010658683179569461 -OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9466806411756816,0.00036396834317210526 -OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9402048459613361,0.0005108048313780666 -OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7428545649568395,0.03473202812850355 -OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8962239297969814,0.0025810820467571426 -OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9528032040825007,0.0002536158007562822 -OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8136140570811612,0.01400900062666989 -OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.5749045753814719,0.13602130778385005 -OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.780595487125304,0.022250145374352125 -OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8389921086523722,0.009215256295109017 -OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8803463320171083,0.003907570379771439 -OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7142670311425445,0.04653663665491792 -OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7977979460712193,0.017660348313797546 -OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7240026280446691,0.04228069432019545 -OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8051290094703403,0.01590190576987268 -OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9226246952938778,0.0010919364406592675 -OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.770582228125362,0.025231318204288148 -OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.5188109005585113,0.18769119165787862 -Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9642212364414142,0.00011145218096014672 -Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7836454491081474,0.021387948565361206 -Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,2,0.865235745718993,0.005516995432107779 -Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,3,0.819500116935474,0.012783401302719894 -Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7432637726714306,0.034578129186903464 -LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9522400671025366,0.0002626898916961467 -LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05 -LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9110081304703664,0.001646433879397326 -LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9433518650586681,0.0004353717167521428 -LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,4,0.880586328075459,0.003884834219553849 -Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9603201312455674,0.00015157780411521223 -Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9604114108423772,0.00015054459028416203 -Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9519258192529104,0.00026784516618954716 -Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9352773832366816,0.0006453340323628832 -Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9756845534259928,3.5288470321501036e-05 -Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6730282904268812,0.06736225845470355 -Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9361725603565639,0.0006193510978979659 -Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8618105831276622,0.005932414266978994 -Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9371490197710903,0.0005918014940797798 -Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8046621876144952,0.01601044603512172 -Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.41770329390345684,0.30313696659492734 -Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6529975286213465,0.07915856325659755 -Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6950517775314824,0.05566978580633573 -Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.5130382972054114,0.19351964488420637 -Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.6825577913683614,0.062140382561143265 -Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9059635004669196,0.0019350193188838174 -Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8702987510549938,0.00493787146977232 -Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8349295032906534,0.009898545248446817 -Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8245663895988613,0.011784555837564846 -Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9186996315597573,0.0012628532368153516 -Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.899783088468177,0.002330962388754791 -Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8724919719311256,0.004699674798249593 -Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9486250828884353,0.00032606741963897914 -Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9264530754805538,0.0009405124032405977 -Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.926933634016331,0.000922537739358256 -Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6984411569502376,0.05398723363884652 -Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.754828418128203,0.03040022622820331 -Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5655988276473191,0.14396676855997925 -Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9407474980820671,0.000497230334167822 -Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.770589245932409,0.025229147116181697 -Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7775815292717585,0.023123063813025962 -Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5611200837416681,0.14787988852194642 -Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.34646366697352105,0.40049416986179387 -Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7868643731535557,0.020500867535993103 -Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8114670933196435,0.014473750045325934 -Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.4013581254554363,0.32436552572418753 -Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.28341806840646894,0.4963625961904983 -Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.3139211847524032,0.44892434309679713 -Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.2606167560977108,0.5330194398770082 -Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.32260154615753545,0.43577896021471924 -Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7827817854375669,0.021629949458519884 -Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9421767369217469,0.0004626159242720608 -Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5386185630062554,0.16841388744478442 -Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7045551126623175,0.05103000019308416 -Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8414540075802577,0.00881618884168942 -Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8748256107732684,0.0044544778532186755 -Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8614522174161048,0.005976999431835443 -Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7878166990611953,0.02024289628983945 -Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8381151096374623,0.009360136935052572 -Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.876154278920616,0.0043186280005204514 -MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9802952193136,1.884578972104051e-05 -MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9841937367574427,9.755845662836177e-06 -MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8661864185981796,0.005405102460401999 -MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8297856426405835,0.010808669505560614 -MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9329487606730291,0.000716243089312378 -MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.822202489777381,0.01224422861798353 -MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6968865871905413,0.05475511707469452 -MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9318897100616549,0.0007501099193828288 -MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7939152572032528,0.018638835543465734 -MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7761614135775217,0.02354161442763604 -MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9079242687040253,0.0018192466167481706 -MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5935991848770941,0.12081484777974201 -MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.96841302674998,7.693398893847449e-05 -MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9131963004520903,0.001530535130781307 -MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7594573765014532,0.02881968270449265 -MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6622792441367216,0.07355344210000651 -MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5835165093102912,0.1288909419896904 -MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7271748558955601,0.04094703171178795 -MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7369082697183147,0.0370157216672518 -MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7219159720057066,0.04317213020613491 -MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8973595810319037,0.002499476856786579 -MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6540145328427245,0.07853263145320354 -MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9470816844896075,0.0003559262259996983 -MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.798793471524343,0.017414760604056785 -MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.766501585020503,0.026513385703318352 -MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6776894663079587,0.06477689572321889 -MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6576248245381009,0.07633405000799688 -MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.796342090311639,0.018023378799051942 -MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.689140856921657,0.058678219175095074 -MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.6705942614169457,0.06873614015066103 -MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6842754194067544,0.0612256583562849 -MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7338112096805872,0.03824046140795786 -MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8786344078919507,0.0040722405599500165 -MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8914863638509409,0.0029400900210167272 -MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8522000994286094,0.007203358614415384 -MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7479170810940026,0.03285737031031745 -MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5899049701184135,0.1237398240474465 -MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.864013241961245,0.005663050469813282 -MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.726560560314063,0.04120326937800088 -MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7600546147835674,0.02861953111724766 -MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8675817638279608,0.00524352512595729 -MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.4358953069712842,0.280322780055143 -MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8724977849323057,0.004699053502733089 -MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.871502377377448,0.004806214049293794 -MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.636462032322589,0.08974474991245225 -MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7407371067623334,0.035535069908202585 -MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.13754152986907456,0.7453436298315592 -MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8584434869588686,0.006359804257501524 -MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9096718109287911,0.0017199423212977748 -MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.429513562091493,0.2882272134157949 -MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7258395762861067,0.04150524782255408 -MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.4140057077993773,0.3078793667149351 -MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8860840192325219,0.003387122941063616 -MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8531999374729967,0.007063738601380546 -MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.570698753672453,0.13958138247636556 -AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9462124246513754,0.00037350751375720304 -AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,1,0.820982530302196,0.012485817170678851 -AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9284819872198913,0.0008661544234609058 -AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9226572389021586,0.0010905865909148318 -AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8996834645928126,0.0023377397968761906 -OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9806889787900566,1.77437080791335e-05 -OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9467481050448351,0.00036260722071780783 -OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9051882617143683,0.001982079878231783 -OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8448816290057799,0.008279149903754354 -OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9486969514405281,0.0003247187445212263 -OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7138885174194392,0.046707103452906885 -OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.40763933138747765,0.3161269846214854 -OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5033557119680766,0.20350786972733814 -OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.4943676910774294,0.21301612937354739 -OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.3662549994154035,0.3722134961617391 -OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6943274080319848,0.05603338677616118 -OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.888202282224346,0.0032069637473251308 -OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.862959786938574,0.0057908774192851585 -OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.4422315456206938,0.2725814015162671 -OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9314197867245828,0.0007654668867563735 -OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8576726697477571,0.006460333718352682 -OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6477798867796105,0.08241558395766836 -OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7105249096891054,0.04823848031855015 -OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7433756448219943,0.034536127920169364 -OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.465629371128827,0.24492880327618063 -OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9815968610969954,1.5367458655827867e-05 -OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9215279351913577,0.0011380681078154023 -OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9028698976709195,0.0021272329705264844 -OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8115257987039834,0.014460915122317916 -OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8840656907304268,0.003564741739845647 -OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9288767434076772,0.0008521494712455959 -OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8762491857760322,0.004309027650395265 -OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.822174167720692,0.012249803466994006 -OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8388480886223416,0.009238949980481774 -OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9318866818637482,0.0007502082286076188 -OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6752208316271633,0.06613869004956173 -OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7677373687773497,0.026120973578910495 -OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7919204265038193,0.01915443839404165 -OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8238198607264919,0.01192852239680578 -OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8788769140000767,0.0040486473187813605 -OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5937971020205063,0.1206592532108973 -OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6743688104667733,0.0666125934693148 -OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6092910701405022,0.10882867605607495 -OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.02436876480189197,0.954326651607438 -OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7114255278499215,0.04782552820112736 -OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5887872724291499,0.12463254240428198 -OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,1,0.4029552549015283,0.32226121873409685 -OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,2,0.19589220319331574,0.6419903458052949 -OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,3,0.5147894627560958,0.1917415408232741 -OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,4,0.43696792691727815,0.2790047957490856 -LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9683600812057522,7.731839943750683e-05 -LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9490060035318915,0.00031896092810029624 -LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9033732116949054,0.0020951534061901173 -LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9728319200142996,4.911626350007423e-05 -LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9174158952141087,0.0013223130420052574 -LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8698029729880158,0.00499276771087744 -LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9736499620869766,4.483954353741208e-05 -LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8938963574061565,0.002753683842916408 -LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9427230009399408,0.00044981624708065733 -LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9288091831587435,0.0008545357544848401 -LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9876650170257133,4.648675321533348e-06 -LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9200698352872445,0.0012013420941124318 -LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8197843971795349,0.012725991028944833 -LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9667731014329254,8.943826166773405e-05 -LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9135236868955329,0.0015136659995374103 -LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.983826044072315,1.0449743172360012e-05 -LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9482689395026054,0.000332805134027447 -LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9334433471484072,0.0007007762613840839 -LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8998371432675459,0.0023272903802322954 -LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9131450099069247,0.0015331889972515346 -LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9445409047411082,0.00040889964932544416 -LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8996453255999854,0.00234033776853281 -LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8662449830102448,0.005398257529969565 -LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9506955154682739,0.00028866872380162265 -LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9121357775980045,0.0015860194531010332 -LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9469225816315634,0.000359102582060145 -LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.790872393374341,0.019428850798750914 -LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7384692720332464,0.03640761031575469 -LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9396936265489109,0.0005238133760109684 -LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7853349194194776,0.020919442242219075 -LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8636070293544758,0.005712124057773506 -LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.837126038633602,0.009525258316342535 -LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7663953319208139,0.026547294337781743 -LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8834569465544357,0.00361946726545403 -LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8480938359553485,0.00779520658099071 -WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9882164477730901,4.05436289119973e-06 -WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9749878899040407,3.838912250625781e-05 -WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9189017807616305,0.0012536521795481071 -WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.976785228034165,3.073554131266073e-05 -WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9683736529744773,7.721974100004276e-05 -WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8793267175321069,0.004005119722136405 -WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8760721346635911,0.004326948446281908 -WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9315137258308156,0.0007623806815109492 -WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9671655908223616,8.633181797191984e-05 -WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7675767218262903,0.026171781192995118 -WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8483878251754778,0.007751839541749867 -WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9222607240796445,0.0011071076795417618 -WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9440994017259922,0.00041860181264251746 -WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9640433681068886,0.00011310737614553013 -WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.692434840005101,0.056990052908859494 -WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9942767822652612,4.6665376445687894e-07 -WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9682235346488557,7.831565067564543e-05 -WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9370054660599566,0.0005958002530390111 -WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.969420946106877,6.985512173523951e-05 -WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9834828472581691,1.1126279772397877e-05 -WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9928216304628095,9.197638948465057e-07 -WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9611899818187688,0.00014192004448559492 -WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9411758308443503,0.0004866843681750784 -WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9688368521395198,7.390226580769654e-05 -WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9723616916410369,5.16925798887181e-05 -WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9766036636486001,3.14580315476573e-05 -WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9597878054141521,0.00015769662952759886 -WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9404428288332258,0.0005048221249291256 -WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9831715348590928,1.176456701375346e-05 -WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.97187689823272,5.4440740892278444e-05 -WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9852421877364517,7.946695487913594e-06 -WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9692179758222269,7.124441373542135e-05 -WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9412248237761267,0.000485487558057933 -WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.98025276424875,1.8967257174977277e-05 -WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9846373995357367,8.960181355366343e-06 -Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9522400671025366,0.0002626898916961467 -Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05 -Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9110081304703664,0.001646433879397326 -Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9433518650586681,0.0004353717167521428 -Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,4,0.880586328075459,0.003884834219553849 -HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.9355663499255871,0.0006368701046576545 -HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9499604642147754,0.0003016036750416735 -HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7164442699126142,0.04556339297891151 -HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.5643812833359342,0.14502482192576685 -HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.4448334653124403,0.269433453257965 -HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.9020957808919513,0.002177191904645508 -HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9140262325400854,0.0014880077902407654 -HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.6613543728531551,0.07410115498793113 -HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.4797794956768499,0.2289297958345603 -HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.49503702005526434,0.21230024172428238 -HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.8658004484348707,0.005450353400185282 -HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9239450258900821,0.0010380421984977164 -HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.6878185417270377,0.05936418242167244 -HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.6427492187377651,0.08562857067256696 -HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.19987101474191585,0.6351028985023905 -HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.7695981699173929,0.025536900476404875 -HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.851160886507116,0.00735033097799936 -HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7535063061583401,0.030861215825263487 -HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.26946310602236634,0.5186811891252074 -HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.5071239778851739,0.19958915881626008 -HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.845558834843199,0.00817557674320208 -HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.8223598748455347,0.01221327849153134 -HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7520379034546343,0.03137821860478068 -HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.5986152394502113,0.1169062576526029 -HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.766509325140422,0.026510916638992615 -HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.6388656044215879,0.08815791552969902 -HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.8220592376168137,0.012272442496278822 -HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.1610992186087647,0.7031245257171708 -HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.22938177579714764,0.584757473087143 -HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.16217150942988084,0.7012176634258844 -HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.8536693780854105,0.0069987855857581984 -HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9079591032101378,0.0018172316533511903 -HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7448797028215589,0.033974472983626124 -HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.436470242791583,0.2796159471960331 -HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.5113717481429286,0.195219904727713 -BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,0,0.8848684214582546,0.0034933971141531536 -BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,1,0.9247518427204778,0.0010059807632682822 -BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,2,0.7024798803756629,0.05202256738347333 -BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,3,0.6111548412929141,0.10745210550108082 -BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,4,0.8864983521119945,0.0033513827582610342 -BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8443252756395498,0.008364861793357709 -BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8199557285303699,0.012691469447090417 -BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6898121736766818,0.05833178396126367 -BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.1445400076243653,0.732738456710739 -BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.13444519427677581,0.7509364951619687 -BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9461712339012929,0.00037435448514068834 -BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8543556725359636,0.006904516600543572 -BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7671160990392422,0.026317800283773948 -BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.4230508906614041,0.29634091151848907 -BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.29492042180464345,0.478252042515081 -BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8192056092552416,0.01284304904344425 -BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8053230426409881,0.015856927546595193 -BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6785867773117831,0.06428605698561919 -BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.021028776761034942,0.960582665935811 -BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.25337930013147175,0.5448562000018814 -BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8101772449555595,0.014757563523095152 -BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7844308170919763,0.021169355122089707 -BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6407686957715764,0.08691312009391092 -BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.042093006210129874,0.9211687904012325 -BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.2813292229519864,0.4996795026573654 -BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8350456630970934,0.00987857623206292 -BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.879311548672376,0.004006582681021272 -BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6951300585252861,0.0556305769370549 -BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.30955291195703166,0.4556002793087552 -BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.09897629382276267,0.8156278898050575 -BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8313126956210078,0.010533178480029779 -BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8169388413464165,0.01330802664448977 -BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8065284450649773,0.015579295379409611 -BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.23722382427262312,0.5716108619128892 -BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.026088426326565897,0.9511063910298649 -BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5558829816104426,0.15252894598370506 -BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6390946692796851,0.08800754271923365 -BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.24121345447897227,0.5649619826999719 -BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.13262144042688304,0.7542351704927408 -BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.46784288126219703,0.24238975539995447 -BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7467577882406231,0.03328104267130768 -BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7611545287510072,0.028253164658278467 -BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6541774611460981,0.07843262445172178 -BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.0830822493170678,0.8449361587214159 -BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.1985934514676979,0.6373119372341151 -BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9103256104990007,0.001683717098370581 -BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8079204807250888,0.015262498588799642 -BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7253154362419392,0.0417256201301186 -BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.2776474358858506,0.5055464711128136 -BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.04029159995291984,0.9245349726533298 -BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.919432996814919,0.0012296819224052442 -BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.87005129824662,0.004965222567299112 -BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9073703100625691,0.001851485138509531 -BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8673887162219034,0.005265692212272121 -BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8916723527123611,0.0029254223429427636 -LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9804801069360884,1.832282630082123e-05 -LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.980051779203359,1.9549343460335766e-05 -LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9720131442366731,5.3658869462094946e-05 -LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9165887813382055,0.001361572704071016 -LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.9225103255266087,0.0010966889416837342 -LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9292369266176062,0.000839501038985727 -LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9505492134066896,0.00029121355501060477 -LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9415690777822339,0.00047713248045663163 -LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9576750897378552,0.00018358576102437457 -LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8850761460392197,0.0034750864462593195 -LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9798647388383346,2.0101576768271062e-05 -LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9781250835045174,2.5741076148769547e-05 -LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9598475365356987,0.00015700207944980397 -LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9317002702003969,0.000756276259880365 -LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8240635545541923,0.011881405061211926 -LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9849433970479835,8.437305784682183e-06 -LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9899107226768695,2.548168158279175e-06 -LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9645217100316719,0.00010869253777108847 -LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9447465624679983,0.00040443116308794275 -LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8760879368136391,0.0043253470355424355 -LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9469408250476264,0.0003587374254477132 -LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9498225876442147,0.000304071618749767 -LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9413785598975157,0.0004817446027243596 -LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.8197292667265523,0.012737111858293043 -LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.9057861973602506,0.0019457176947306907 -LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9413025091864188,0.000483593804288479 -LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9083254977326705,0.001796125778484392 -LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.8626635526406192,0.005827152548807454 -LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.8043418970652331,0.016085184583393794 -LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8946872852632068,0.0026942203148939193 -LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9025950086780581,0.002144887259438991 -LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.7564264003460613,0.02984872863501939 -LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9033527343998258,0.002096452391428316 -LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.8494277893147777,0.0075996673267298715 -LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8534145445088147,0.007033997470343221 -aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,0,-0.017485869096098686,0.9672206778351959 -aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,1,-0.06826285140114943,0.8724042132624071 -aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,2,-0.27291992568490936,0.5131179718629255 -aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,3,-0.0623085741331382,0.8834734515868299 -aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,4,0.11553071904436202,0.7852997192967395 -aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,0,0.8743737489954189,0.004501296794893102 -aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,1,0.8019858294586086,0.01664169341252048 -aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,2,0.865218326418788,0.005519059390504801 -aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,3,0.9324959770534272,0.0007305971150650418 -aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,4,0.9578331579912773,0.00018155839890573593 -aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,0,-0.30992157835736617,0.4550353006304514 -aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,1,-0.48460771469003827,0.2235972811859595 -aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,2,-0.1162588388208577,0.78397092283469 -aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,3,-0.03180360013624742,0.9404084479868535 -aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,4,-0.6310234888301745,0.09339585968843296 -aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,0,0.5719061307929368,0.1385541569597628 -aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,1,-0.2953447949582872,0.47758892197811004 -aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,2,-0.08547114468780825,0.8405203853999355 -aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,3,-0.02680948636066538,0.9497562944796989 -aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,4,-0.4016145018471783,0.32402730112296474 -aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,0,0.7247956777996108,0.04194484960329344 -aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,1,0.2767660595168839,0.5069548295866992 -aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,2,0.3337223270100439,0.4191769676693079 -aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,3,0.6126891094585267,0.10632638977302632 -aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,4,0.8079257463851817,0.015261307993340337 -aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,0,0.6647150497002838,0.07212235537894374 -aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,1,0.9659235574949907,9.641323857066814e-05 -aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,2,0.8593434484023453,0.0062437049978399314 -aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,3,0.7704800482268904,0.025262942539415363 -aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,4,0.9028773381740962,0.002126756432137772 -aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,0,0.748982925973149,0.032470780295939985 -aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,1,0.8599957450436625,0.006160409391629476 -aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,2,0.8718735582848011,0.004766072993988772 -aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,3,0.9069576656171551,0.001875739334441522 -aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,4,0.9502933219669614,0.00029570003340264575 -aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,0,0.8905328662549648,0.003016032865892646 -aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,1,0.5058552901713423,0.20090402274559316 -aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,2,0.6767432630833718,0.0652968761285632 -aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,3,0.7135518769682414,0.04685902831102101 -aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,4,0.873661116609048,0.004575776138454243 -aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,0,0.8775217778627072,0.004181622363896538 -aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,1,0.7683490298001087,0.025928082489068475 -aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,2,0.741463148953373,0.035258455741147623 -aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,3,0.7891209052525207,0.019892902878583873 -aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,4,0.8157900850650412,0.013547661219765379 -aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,0,0.8625206786227912,0.005844699973375535 -aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,1,0.49625129009057833,0.211004712621783 -aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,2,0.7482300147416783,0.0327435760119495 -aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,3,0.9237060456412569,0.0010476652712265917 -aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,4,0.8540419074377281,0.00694751386877189 -aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,0,0.7827735900001105,0.021632253958226707 -aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,1,0.7416615606437577,0.03518309274676423 -aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,2,0.8208959354305796,0.01250307893717913 -aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,3,0.9182336628416601,0.0012842298120423852 -aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,4,0.9199026021249039,0.0012087423991030853 -aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,0,0.7807842071724994,0.022196180227557687 -aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,1,0.6016089012086534,0.11460809097860054 -aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,2,0.85978308688271,0.006187486327563118 -aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,3,0.9471155608874564,0.00035525230596496123 -aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,4,0.9238574615349179,0.0010415614421426264 -aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.780599537830846,0.022248986205867058 -aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.753379355065838,0.030905705190702806 -aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.8379676352721162,0.009384640911630616 -aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.8462209992405952,0.008075105621350536 -aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.9020771423654268,0.0021784040615750178 -aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.9392379026634557,0.000535591367028614 -aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.7412355057774336,0.035345043191044964 -aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.8195179387247324,0.01277979740900836 -aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.9675915145186947,8.304238414993675e-05 -aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.9356246311290696,0.0006351718939850358 -aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.7466011946729814,0.03333852605723143 -aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.9551682330569339,0.00021776057653192886 -aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.46353588273705637,0.24734250900688215 -aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.8866352243352398,0.003339629955133934 -aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.937902652612242,0.0005710971446370687 -aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.2831911510498836,0.4967225093410736 -aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.2031844122583542,0.6293846722461313 -aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.8894964926830444,0.0031000020401251533 -aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.549284007260608,0.15849945140105312 -aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.7889373199563972,0.01994193933246426 -aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.9415411104598773,0.00047780769988844555 -aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.8981158348442198,0.0024460728519243077 -aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.7526431927239958,0.0311644661156264 -aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.8962925022649735,0.0025761063553240114 -aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.937590300147702,0.0005796196796032962 -aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.5831241321997315,0.12921116102954364 -aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.5561145441014004,0.1523217142123119 -aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.5664450708720614,0.14323389729888122 -aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.47517181530974595,0.23407895750101468 -aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.718855715365913,0.04449992445427745 -aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.7168604276016974,0.04537877960385103 -aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.18264726732113173,0.6650765454064547 -aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.04614314940391431,0.9136043258512831 -aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.6369093478690498,0.08944819108801377 -aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.8650362997962656,0.005540656777637369 -aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,0,0.9481614738377944,0.00033485605767966255 -aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,1,0.8579024362848122,0.006430262194723998 -aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,2,0.9674751054383679,8.39330376548511e-05 -aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,3,0.9711920829273566,5.848502027941985e-05 -aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,4,0.9121630061872308,0.0015845787994022296 -aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,0,0.7303458809128464,0.03963972108447683 -aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,1,0.7466964409211542,0.03330355520543848 -aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,2,0.8886798251454765,0.0031672235640011434 -aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,3,0.9036719475219376,0.002076262347775526 -aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,4,0.7623592248502944,0.02785522986224059 -aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,0,0.8902509919824877,0.0030387234498153886 -aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,1,0.8349964637145074,0.009887030967730168 -aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,2,0.9513669166922365,0.00027717775621958416 -aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,3,0.979588964641596,2.0934517813580252e-05 -aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,4,0.8898917220751776,0.0030678038612609354 -aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,0,0.8000397965603336,0.01711033114623395 -aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,1,0.7666453684194998,0.026467542617941944 -aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,2,0.8751438663188438,0.004421691058140597 -aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,3,0.8954496186826447,0.0026376993343606783 -aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,4,0.8195357136433342,0.012776203631959988 -aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,0,0.8973997559676354,0.0024966210305528294 -aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,1,0.9238541898435834,0.0010416930833947954 -aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,2,0.9232578806881373,0.0010658683179569461 -aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,3,0.9466806411756816,0.00036396834317210526 -aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,4,0.9402048459613361,0.0005108048313780666 -aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,0,0.7428545649568395,0.03473202812850355 -aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,1,0.8962239297969814,0.0025810820467571426 -aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,2,0.9528032040825007,0.0002536158007562822 -aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,3,0.8136140570811612,0.01400900062666989 -aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,4,0.5749045753814719,0.13602130778385005 -aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,0,0.780595487125304,0.022250145374352125 -aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,1,0.8389921086523722,0.009215256295109017 -aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,2,0.8803463320171083,0.003907570379771439 -aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,3,0.7142670311425445,0.04653663665491792 -aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,4,0.7977979460712193,0.017660348313797546 -aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,0,0.7240026280446691,0.04228069432019545 -aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,1,0.8051290094703403,0.01590190576987268 -aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,2,0.9226246952938778,0.0010919364406592675 -aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,3,0.770582228125362,0.025231318204288148 -aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,4,0.5188109005585113,0.18769119165787862 -aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,0,0.9642212364414142,0.00011145218096014672 -aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,1,0.7836454491081474,0.021387948565361206 -aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,2,0.865235745718993,0.005516995432107779 -aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,3,0.819500116935474,0.012783401302719894 -aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,4,0.7432637726714306,0.034578129186903464 -aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,0,0.9522400671025366,0.0002626898916961467 -aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05 -aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,2,0.9110081304703664,0.001646433879397326 -aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,3,0.9433518650586681,0.0004353717167521428 -aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,4,0.880586328075459,0.003884834219553849 -aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,0,0.9603201312455674,0.00015157780411521223 -aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,1,0.9604114108423772,0.00015054459028416203 -aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,2,0.9519258192529104,0.00026784516618954716 -aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,3,0.9352773832366816,0.0006453340323628832 -aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,4,0.9756845534259928,3.5288470321501036e-05 -aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,0,0.6730282904268812,0.06736225845470355 -aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,1,0.9361725603565639,0.0006193510978979659 -aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,2,0.8618105831276622,0.005932414266978994 -aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,3,0.9371490197710903,0.0005918014940797798 -aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,4,0.8046621876144952,0.01601044603512172 -aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,0,0.41770329390345684,0.30313696659492734 -aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,1,0.6529975286213465,0.07915856325659755 -aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,2,0.6950517775314824,0.05566978580633573 -aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,3,0.5130382972054114,0.19351964488420637 -aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,4,0.6825577913683614,0.062140382561143265 -aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,0,0.9059635004669196,0.0019350193188838174 -aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,1,0.8702987510549938,0.00493787146977232 -aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,2,0.8349295032906534,0.009898545248446817 -aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,3,0.8245663895988613,0.011784555837564846 -aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,4,0.9186996315597573,0.0012628532368153516 -aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,0,0.899783088468177,0.002330962388754791 -aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,1,0.8724919719311256,0.004699674798249593 -aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,2,0.9486250828884353,0.00032606741963897914 -aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,3,0.9264530754805538,0.0009405124032405977 -aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,4,0.926933634016331,0.000922537739358256 -aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,0,0.6984411569502376,0.05398723363884652 -aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,1,0.754828418128203,0.03040022622820331 -aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,2,0.5655988276473191,0.14396676855997925 -aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,3,0.9407474980820671,0.000497230334167822 -aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,4,0.770589245932409,0.025229147116181697 -aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,0,0.7775815292717585,0.023123063813025962 -aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,1,0.5611200837416681,0.14787988852194642 -aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,2,0.34646366697352105,0.40049416986179387 -aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,3,0.7868643731535557,0.020500867535993103 -aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,4,0.8114670933196435,0.014473750045325934 -aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,0,0.4013581254554363,0.32436552572418753 -aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,1,0.28341806840646894,0.4963625961904983 -aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,2,0.3139211847524032,0.44892434309679713 -aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,3,0.2606167560977108,0.5330194398770082 -aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,4,0.32260154615753545,0.43577896021471924 -aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,0,0.7827817854375669,0.021629949458519884 -aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,1,0.9421767369217469,0.0004626159242720608 -aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,2,0.5386185630062554,0.16841388744478442 -aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,3,0.7045551126623175,0.05103000019308416 -aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,4,0.8414540075802577,0.00881618884168942 -aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,0,0.8748256107732684,0.0044544778532186755 -aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,1,0.8614522174161048,0.005976999431835443 -aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,2,0.7878166990611953,0.02024289628983945 -aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,3,0.8381151096374623,0.009360136935052572 -aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,4,0.876154278920616,0.0043186280005204514 -aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,0,0.9802952193136,1.884578972104051e-05 -aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,1,0.9841937367574427,9.755845662836177e-06 -aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,2,0.8661864185981796,0.005405102460401999 -aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,3,0.8297856426405835,0.010808669505560614 -aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,4,0.9329487606730291,0.000716243089312378 -aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,0,0.822202489777381,0.01224422861798353 -aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,1,0.6968865871905413,0.05475511707469452 -aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,2,0.9318897100616549,0.0007501099193828288 -aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,3,0.7939152572032528,0.018638835543465734 -aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,4,0.7761614135775217,0.02354161442763604 -aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,0,0.9079242687040253,0.0018192466167481706 -aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,1,0.5935991848770941,0.12081484777974201 -aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,2,0.96841302674998,7.693398893847449e-05 -aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,3,0.9131963004520903,0.001530535130781307 -aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,4,0.7594573765014532,0.02881968270449265 -aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,0,0.6622792441367216,0.07355344210000651 -aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,1,0.5835165093102912,0.1288909419896904 -aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,2,0.7271748558955601,0.04094703171178795 -aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,3,0.7369082697183147,0.0370157216672518 -aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,4,0.7219159720057066,0.04317213020613491 -aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,0,0.8973595810319037,0.002499476856786579 -aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,1,0.6540145328427245,0.07853263145320354 -aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,2,0.9470816844896075,0.0003559262259996983 -aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,3,0.798793471524343,0.017414760604056785 -aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,4,0.766501585020503,0.026513385703318352 -aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,0,0.6776894663079587,0.06477689572321889 -aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,1,0.6576248245381009,0.07633405000799688 -aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,2,0.796342090311639,0.018023378799051942 -aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,3,0.689140856921657,0.058678219175095074 -aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,4,0.6705942614169457,0.06873614015066103 -aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,0,0.6842754194067544,0.0612256583562849 -aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,1,0.7338112096805872,0.03824046140795786 -aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,2,0.8786344078919507,0.0040722405599500165 -aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,3,0.8914863638509409,0.0029400900210167272 -aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,4,0.8522000994286094,0.007203358614415384 -aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,0,0.7479170810940026,0.03285737031031745 -aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,1,0.5899049701184135,0.1237398240474465 -aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,2,0.864013241961245,0.005663050469813282 -aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,3,0.726560560314063,0.04120326937800088 -aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,4,0.7600546147835674,0.02861953111724766 -aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,0,0.8675817638279608,0.00524352512595729 -aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,1,0.4358953069712842,0.280322780055143 -aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,2,0.8724977849323057,0.004699053502733089 -aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,3,0.871502377377448,0.004806214049293794 -aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,4,0.636462032322589,0.08974474991245225 -aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,0,0.7407371067623334,0.035535069908202585 -aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,1,0.13754152986907456,0.7453436298315592 -aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,2,0.8584434869588686,0.006359804257501524 -aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,3,0.9096718109287911,0.0017199423212977748 -aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,4,0.429513562091493,0.2882272134157949 -aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,0,0.7258395762861067,0.04150524782255408 -aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,1,0.4140057077993773,0.3078793667149351 -aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,2,0.8860840192325219,0.003387122941063616 -aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,3,0.8531999374729967,0.007063738601380546 -aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,4,0.570698753672453,0.13958138247636556 -aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,0,0.9462124246513754,0.00037350751375720304 -aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,1,0.820982530302196,0.012485817170678851 -aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,2,0.9284819872198913,0.0008661544234609058 -aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,3,0.9226572389021586,0.0010905865909148318 -aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,4,0.8996834645928126,0.0023377397968761906 -aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,0,0.9806889787900566,1.77437080791335e-05 -aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,1,0.9467481050448351,0.00036260722071780783 -aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,2,0.9051882617143683,0.001982079878231783 -aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,3,0.8448816290057799,0.008279149903754354 -aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,4,0.9486969514405281,0.0003247187445212263 -aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,0,0.7138885174194392,0.046707103452906885 -aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,1,0.40763933138747765,0.3161269846214854 -aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,2,0.5033557119680766,0.20350786972733814 -aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,3,0.4943676910774294,0.21301612937354739 -aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,4,0.3662549994154035,0.3722134961617391 -aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,0,0.6943274080319848,0.05603338677616118 -aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,1,0.888202282224346,0.0032069637473251308 -aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,2,0.862959786938574,0.0057908774192851585 -aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,3,0.4422315456206938,0.2725814015162671 -aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,4,0.9314197867245828,0.0007654668867563735 -aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,0,0.8576726697477571,0.006460333718352682 -aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,1,0.6477798867796105,0.08241558395766836 -aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,2,0.7105249096891054,0.04823848031855015 -aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,3,0.7433756448219943,0.034536127920169364 -aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,4,0.465629371128827,0.24492880327618063 -aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,0,0.9815968610969954,1.5367458655827867e-05 -aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,1,0.9215279351913577,0.0011380681078154023 -aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,2,0.9028698976709195,0.0021272329705264844 -aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,3,0.8115257987039834,0.014460915122317916 -aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,4,0.8840656907304268,0.003564741739845647 -aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,0,0.9288767434076772,0.0008521494712455959 -aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,1,0.8762491857760322,0.004309027650395265 -aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,2,0.822174167720692,0.012249803466994006 -aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,3,0.8388480886223416,0.009238949980481774 -aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,4,0.9318866818637482,0.0007502082286076188 -aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,0,0.6752208316271633,0.06613869004956173 -aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,1,0.7677373687773497,0.026120973578910495 -aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,2,0.7919204265038193,0.01915443839404165 -aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,3,0.8238198607264919,0.01192852239680578 -aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,4,0.8788769140000767,0.0040486473187813605 -aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,0,0.5937971020205063,0.1206592532108973 -aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,1,0.6743688104667733,0.0666125934693148 -aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,2,0.6092910701405022,0.10882867605607495 -aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,3,0.02436876480189197,0.954326651607438 -aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,4,0.7114255278499215,0.04782552820112736 -aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,0,0.5887872724291499,0.12463254240428198 -aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,1,0.4029552549015283,0.32226121873409685 -aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,2,0.19589220319331574,0.6419903458052949 -aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,3,0.5147894627560958,0.1917415408232741 -aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,4,0.43696792691727815,0.2790047957490856 -aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,0,0.9683600812057522,7.731839943750683e-05 -aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,1,0.9490060035318915,0.00031896092810029624 -aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,2,0.9033732116949054,0.0020951534061901173 -aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,3,0.9728319200142996,4.911626350007423e-05 -aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,4,0.9174158952141087,0.0013223130420052574 -aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,0,0.8698029729880158,0.00499276771087744 -aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,1,0.9736499620869766,4.483954353741208e-05 -aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,2,0.8938963574061565,0.002753683842916408 -aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,3,0.9427230009399408,0.00044981624708065733 -aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,4,0.9288091831587435,0.0008545357544848401 -aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,0,0.9876650170257133,4.648675321533348e-06 -aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,1,0.9200698352872445,0.0012013420941124318 -aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,2,0.8197843971795349,0.012725991028944833 -aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,3,0.9667731014329254,8.943826166773405e-05 -aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,4,0.9135236868955329,0.0015136659995374103 -aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,0,0.983826044072315,1.0449743172360012e-05 -aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,1,0.9482689395026054,0.000332805134027447 -aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,2,0.9334433471484072,0.0007007762613840839 -aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,3,0.8998371432675459,0.0023272903802322954 -aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,4,0.9131450099069247,0.0015331889972515346 -aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,0,0.9445409047411082,0.00040889964932544416 -aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,1,0.8996453255999854,0.00234033776853281 -aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,2,0.8662449830102448,0.005398257529969565 -aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,3,0.9506955154682739,0.00028866872380162265 -aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,4,0.9121357775980045,0.0015860194531010332 -aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,0,0.9469225816315634,0.000359102582060145 -aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,1,0.790872393374341,0.019428850798750914 -aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,2,0.7384692720332464,0.03640761031575469 -aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,3,0.9396936265489109,0.0005238133760109684 -aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,4,0.7853349194194776,0.020919442242219075 -aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,0,0.8636070293544758,0.005712124057773506 -aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,1,0.837126038633602,0.009525258316342535 -aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,2,0.7663953319208139,0.026547294337781743 -aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,3,0.8834569465544357,0.00361946726545403 -aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,4,0.8480938359553485,0.00779520658099071 -aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,0,0.9882164477730901,4.05436289119973e-06 -aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,1,0.9749878899040407,3.838912250625781e-05 -aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,2,0.9189017807616305,0.0012536521795481071 -aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,3,0.976785228034165,3.073554131266073e-05 -aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,4,0.9683736529744773,7.721974100004276e-05 -aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,0,0.8793267175321069,0.004005119722136405 -aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,1,0.8760721346635911,0.004326948446281908 -aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,2,0.9315137258308156,0.0007623806815109492 -aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,3,0.9671655908223616,8.633181797191984e-05 -aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,4,0.7675767218262903,0.026171781192995118 -aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,0,0.8483878251754778,0.007751839541749867 -aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,1,0.9222607240796445,0.0011071076795417618 -aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,2,0.9440994017259922,0.00041860181264251746 -aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,3,0.9640433681068886,0.00011310737614553013 -aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,4,0.692434840005101,0.056990052908859494 -aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,0,0.9942767822652612,4.6665376445687894e-07 -aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,1,0.9682235346488557,7.831565067564543e-05 -aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,2,0.9370054660599566,0.0005958002530390111 -aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,3,0.969420946106877,6.985512173523951e-05 -aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,4,0.9834828472581691,1.1126279772397877e-05 -aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,0,0.9928216304628095,9.197638948465057e-07 -aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,1,0.9611899818187688,0.00014192004448559492 -aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,2,0.9411758308443503,0.0004866843681750784 -aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,3,0.9688368521395198,7.390226580769654e-05 -aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,4,0.9723616916410369,5.16925798887181e-05 -aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,0,0.9766036636486001,3.14580315476573e-05 -aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,1,0.9597878054141521,0.00015769662952759886 -aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,2,0.9404428288332258,0.0005048221249291256 -aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,3,0.9831715348590928,1.176456701375346e-05 -aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,4,0.97187689823272,5.4440740892278444e-05 -aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,0,0.9852421877364517,7.946695487913594e-06 -aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,1,0.9692179758222269,7.124441373542135e-05 -aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,2,0.9412248237761267,0.000485487558057933 -aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,3,0.98025276424875,1.8967257174977277e-05 -aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,4,0.9846373995357367,8.960181355366343e-06 -aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,0,0.9522400671025366,0.0002626898916961467 -aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05 -aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,2,0.9110081304703664,0.001646433879397326 -aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,3,0.9433518650586681,0.0004353717167521428 -aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,4,0.880586328075459,0.003884834219553849 -aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.9355663499255871,0.0006368701046576545 -aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9499604642147754,0.0003016036750416735 -aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7164442699126142,0.04556339297891151 -aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.5643812833359342,0.14502482192576685 -aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.4448334653124403,0.269433453257965 -aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.9020957808919513,0.002177191904645508 -aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9140262325400854,0.0014880077902407654 -aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.6613543728531551,0.07410115498793113 -aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.4797794956768499,0.2289297958345603 -aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.49503702005526434,0.21230024172428238 -aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.8658004484348707,0.005450353400185282 -aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9239450258900821,0.0010380421984977164 -aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.6878185417270377,0.05936418242167244 -aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.6427492187377651,0.08562857067256696 -aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.19987101474191585,0.6351028985023905 -aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.7695981699173929,0.025536900476404875 -aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.851160886507116,0.00735033097799936 -aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7535063061583401,0.030861215825263487 -aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.26946310602236634,0.5186811891252074 -aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.5071239778851739,0.19958915881626008 -aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.845558834843199,0.00817557674320208 -aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.8223598748455347,0.01221327849153134 -aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7520379034546343,0.03137821860478068 -aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.5986152394502113,0.1169062576526029 -aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.766509325140422,0.026510916638992615 -aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.6388656044215879,0.08815791552969902 -aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.8220592376168137,0.012272442496278822 -aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.1610992186087647,0.7031245257171708 -aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.22938177579714764,0.584757473087143 -aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.16217150942988084,0.7012176634258844 -aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.8536693780854105,0.0069987855857581984 -aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9079591032101378,0.0018172316533511903 -aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7448797028215589,0.033974472983626124 -aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.436470242791583,0.2796159471960331 -aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.5113717481429286,0.195219904727713 -aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,0,0.8848684214582546,0.0034933971141531536 -aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,1,0.9247518427204778,0.0010059807632682822 -aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,2,0.7024798803756629,0.05202256738347333 -aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,3,0.6111548412929141,0.10745210550108082 -aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,4,0.8864983521119945,0.0033513827582610342 -aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,0,0.8443252756395498,0.008364861793357709 -aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,1,0.8199557285303699,0.012691469447090417 -aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,2,0.6898121736766818,0.05833178396126367 -aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,3,0.1445400076243653,0.732738456710739 -aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,4,-0.13444519427677581,0.7509364951619687 -aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,0,0.9461712339012929,0.00037435448514068834 -aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,1,0.8543556725359636,0.006904516600543572 -aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,2,0.7671160990392422,0.026317800283773948 -aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,3,0.4230508906614041,0.29634091151848907 -aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,4,0.29492042180464345,0.478252042515081 -aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,0,0.8192056092552416,0.01284304904344425 -aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,1,0.8053230426409881,0.015856927546595193 -aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,2,0.6785867773117831,0.06428605698561919 -aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,3,0.021028776761034942,0.960582665935811 -aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,4,-0.25337930013147175,0.5448562000018814 -aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,0,0.8101772449555595,0.014757563523095152 -aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,1,0.7844308170919763,0.021169355122089707 -aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,2,0.6407686957715764,0.08691312009391092 -aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,3,0.042093006210129874,0.9211687904012325 -aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,4,-0.2813292229519864,0.4996795026573654 -aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,0,0.8350456630970934,0.00987857623206292 -aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,1,0.879311548672376,0.004006582681021272 -aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,2,0.6951300585252861,0.0556305769370549 -aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,3,0.30955291195703166,0.4556002793087552 -aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,4,0.09897629382276267,0.8156278898050575 -aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,0,0.8313126956210078,0.010533178480029779 -aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,1,0.8169388413464165,0.01330802664448977 -aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,2,0.8065284450649773,0.015579295379409611 -aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,3,0.23722382427262312,0.5716108619128892 -aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,4,0.026088426326565897,0.9511063910298649 -aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,0,0.5558829816104426,0.15252894598370506 -aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,1,0.6390946692796851,0.08800754271923365 -aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,2,0.24121345447897227,0.5649619826999719 -aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,3,-0.13262144042688304,0.7542351704927408 -aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,4,-0.46784288126219703,0.24238975539995447 -aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,0,0.7467577882406231,0.03328104267130768 -aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,1,0.7611545287510072,0.028253164658278467 -aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,2,0.6541774611460981,0.07843262445172178 -aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,3,0.0830822493170678,0.8449361587214159 -aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,4,-0.1985934514676979,0.6373119372341151 -aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,0,0.9103256104990007,0.001683717098370581 -aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,1,0.8079204807250888,0.015262498588799642 -aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,2,0.7253154362419392,0.0417256201301186 -aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,3,0.2776474358858506,0.5055464711128136 -aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,4,-0.04029159995291984,0.9245349726533298 -aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,0,0.919432996814919,0.0012296819224052442 -aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,1,0.87005129824662,0.004965222567299112 -aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,2,0.9073703100625691,0.001851485138509531 -aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,3,0.8673887162219034,0.005265692212272121 -aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,4,0.8916723527123611,0.0029254223429427636 -aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,0,0.9804801069360884,1.832282630082123e-05 -aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,1,0.980051779203359,1.9549343460335766e-05 -aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,2,0.9720131442366731,5.3658869462094946e-05 -aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,3,0.9165887813382055,0.001361572704071016 -aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,4,0.9225103255266087,0.0010966889416837342 -aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,0,0.9292369266176062,0.000839501038985727 -aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,1,0.9505492134066896,0.00029121355501060477 -aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,2,0.9415690777822339,0.00047713248045663163 -aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,3,0.9576750897378552,0.00018358576102437457 -aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,4,0.8850761460392197,0.0034750864462593195 -aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,0,0.9798647388383346,2.0101576768271062e-05 -aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,1,0.9781250835045174,2.5741076148769547e-05 -aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,2,0.9598475365356987,0.00015700207944980397 -aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,3,0.9317002702003969,0.000756276259880365 -aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,4,0.8240635545541923,0.011881405061211926 -aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,0,0.9849433970479835,8.437305784682183e-06 -aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,1,0.9899107226768695,2.548168158279175e-06 -aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,2,0.9645217100316719,0.00010869253777108847 -aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,3,0.9447465624679983,0.00040443116308794275 -aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,4,0.8760879368136391,0.0043253470355424355 -aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,0,0.9469408250476264,0.0003587374254477132 -aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,1,0.9498225876442147,0.000304071618749767 -aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,2,0.9413785598975157,0.0004817446027243596 -aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,3,0.8197292667265523,0.012737111858293043 -aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,4,0.9057861973602506,0.0019457176947306907 -aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,0,0.9413025091864188,0.000483593804288479 -aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,1,0.9083254977326705,0.001796125778484392 -aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,2,0.8626635526406192,0.005827152548807454 -aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,3,0.8043418970652331,0.016085184583393794 -aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,4,0.8946872852632068,0.0026942203148939193 -aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,0,0.9025950086780581,0.002144887259438991 -aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,1,0.7564264003460613,0.02984872863501939 -aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,2,0.9033527343998258,0.002096452391428316 -aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,3,0.8494277893147777,0.0075996673267298715 -aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,4,0.8534145445088147,0.007033997470343221