diff --git "a/cache/allbenchs_cache_5e66a88dab42480065db47711c55c458.csv" "b/cache/allbenchs_cache_5e66a88dab42480065db47711c55c458.csv" new file mode 100644--- /dev/null +++ "b/cache/allbenchs_cache_5e66a88dab42480065db47711c55c458.csv" @@ -0,0 +1,21938 @@ +model,scenario,score,aggragated_from,source +flan_flan-ul2,Holmes,72.2,[],holmes_240829.csv +flan_t5_xxl,Holmes,70.5,[],holmes_240829.csv +t5_xxl_lm_adapt,Holmes,70.2,[],holmes_240829.csv +vicuna_13b_v1_5,Holmes,68.6,[],holmes_240829.csv +llama_2_70b_chat,Holmes,66.3,[],holmes_240829.csv +labradorite_13b,Holmes,66.1,[],holmes_240829.csv +llama_2_13b,Holmes,65.0,[],holmes_240829.csv +llama_2_13b_chat,Holmes,64.1,[],holmes_240829.csv +pythia_12b_deduped,Holmes,63.1,[],holmes_240829.csv +bart_base,Holmes,63.0,[],holmes_240829.csv +orca_2_13b,Holmes,62.7,[],holmes_240829.csv +pythia_6_9b_deduped,Holmes,62.3,[],holmes_240829.csv +flan-ul2,Holmes,60.5,[],holmes_240829.csv +flan_t5_xl,Holmes,60.0,[],holmes_240829.csv +t5_xl_lm_adapt,Holmes,59.5,[],holmes_240829.csv +electra_base_discriminator,Holmes,58.3,[],holmes_240829.csv +dolly_v2_12b,Holmes,58.2,[],holmes_240829.csv +pythia_12b,Holmes,58.0,[],holmes_240829.csv +tulu_2_13b,Holmes,57.6,[],holmes_240829.csv +pythia_6_9b,Holmes,56.6,[],holmes_240829.csv +deberta_v3_base,Holmes,56.0,[],holmes_240829.csv +pythia_2_8b_deduped,Holmes,56.0,[],holmes_240829.csv +llama_2_70b,Holmes,55.9,[],holmes_240829.csv +tulu_2_dpo_13b,Holmes,55.5,[],holmes_240829.csv +wizardlm_13b_v1_2,Holmes,55.4,[],holmes_240829.csv +deberta_base,Holmes,55.3,[],holmes_240829.csv +pythia_1_4b,Holmes,54.2,[],holmes_240829.csv +pythia_2_8b,Holmes,54.0,[],holmes_240829.csv +tulu_2_70b,Holmes,53.5,[],holmes_240829.csv +mistral_7b_instruct_v0_1,Holmes,52.9,[],holmes_240829.csv +albert_base_v2,Holmes,52.3,[],holmes_240829.csv +tk_instruct_11b_def,Holmes,51.7,[],holmes_240829.csv +tulu_2_dpo_70b,Holmes,51.2,[],holmes_240829.csv +flan_t5_large,Holmes,50.9,[],holmes_240829.csv +t5_base_lm_adapt,Holmes,48.7,[],holmes_240829.csv +flan_t5_base,Holmes,48.7,[],holmes_240829.csv +pythia_1b_deduped,Holmes,47.5,[],holmes_240829.csv +llama_2_7b,Holmes,47.2,[],holmes_240829.csv +pythia_1_4b_deduped,Holmes,47.2,[],holmes_240829.csv +mixtral_8x7b_instruct_v0_1,Holmes,46.5,[],holmes_240829.csv +bert_base_uncased,Holmes,45.3,[],holmes_240829.csv +mistral_7b_v0_1,Holmes,45.2,[],holmes_240829.csv +llama_2_7b_chat,Holmes,45.0,[],holmes_240829.csv +merlinite_7b,Holmes,44.1,[],holmes_240829.csv +roberta_base,Holmes,43.2,[],holmes_240829.csv +t5_large_lm_adapt,Holmes,42.4,[],holmes_240829.csv +mixtral_8x7b_v0_1,Holmes,42.2,[],holmes_240829.csv +gpt2,Holmes,40.6,[],holmes_240829.csv +pythia_410m,Holmes,40.0,[],holmes_240829.csv +flan_t5_small,Holmes,38.9,[],holmes_240829.csv +t5_small_lm_adapt,Holmes,36.0,[],holmes_240829.csv +pythia_410m_deduped,Holmes,31.3,[],holmes_240829.csv +glove_840b,Holmes,26.6,[],holmes_240829.csv +pythia_160m_deduped,Holmes,17.2,[],holmes_240829.csv +pythia_160m,Holmes,16.3,[],holmes_240829.csv +pythia_70m,Holmes,15.6,[],holmes_240829.csv +pythia_70m_deduped,Holmes,14.4,[],holmes_240829.csv +mistral_large_2_2407,eureka_information_retrieval_fact_recall,36.3,[],eureka_241002.csv +llama3_70b,eureka_information_retrieval_fact_recall,37.4,[],eureka_241002.csv +llama3_1_70b,eureka_information_retrieval_fact_recall,44.2,[],eureka_241002.csv +llama3_1_405b,eureka_information_retrieval_fact_recall,54.9,[],eureka_241002.csv +gpt_4o_2024_05_13,eureka_information_retrieval_fact_recall,53.7,[],eureka_241002.csv +gpt_4_1106_preview,eureka_information_retrieval_fact_recall,47.0,[],eureka_241002.csv +gemini_1_5_pro,eureka_information_retrieval_fact_recall,41.3,[],eureka_241002.csv +claude_3_opus,eureka_information_retrieval_fact_recall,50.5,[],eureka_241002.csv +claude_3_5_sonnet,eureka_information_retrieval_fact_recall,55.3,[],eureka_241002.csv +mistral_large_2_2407,eureka_information_retrieval_fact_precision,17.6,[],eureka_241002.csv +llama3_70b,eureka_information_retrieval_fact_precision,15.0,[],eureka_241002.csv +llama3_1_70b,eureka_information_retrieval_fact_precision,16.0,[],eureka_241002.csv +llama3_1_405b,eureka_information_retrieval_fact_precision,16.8,[],eureka_241002.csv +gpt_4o_2024_05_13,eureka_information_retrieval_fact_precision,20.3,[],eureka_241002.csv +gpt_4_1106_preview,eureka_information_retrieval_fact_precision,23.3,[],eureka_241002.csv +gemini_1_5_pro,eureka_information_retrieval_fact_precision,9.8,[],eureka_241002.csv +claude_3_opus,eureka_information_retrieval_fact_precision,18.7,[],eureka_241002.csv +claude_3_5_sonnet,eureka_information_retrieval_fact_precision,20.6,[],eureka_241002.csv +mistral_large_2_2407,eureka_instruction_following,77.3,[],eureka_241002.csv +llama3_70b,eureka_instruction_following,77.3,[],eureka_241002.csv +llama3_1_70b,eureka_instruction_following,80.8,[],eureka_241002.csv +llama3_1_405b,eureka_instruction_following,83.5,[],eureka_241002.csv +gpt_4o_2024_05_13,eureka_instruction_following,81.3,[],eureka_241002.csv +gpt_4_1106_preview,eureka_instruction_following,75.2,[],eureka_241002.csv +gemini_1_5_pro,eureka_instruction_following,75.2,[],eureka_241002.csv +claude_3_opus,eureka_instruction_following,81.9,[],eureka_241002.csv +claude_3_5_sonnet,eureka_instruction_following,81.3,[],eureka_241002.csv +mistral_large_2_2407,eureka_long_context_qa_average,84.8,[],eureka_241002.csv +llama3_70b,eureka_long_context_qa_average,86.8,[],eureka_241002.csv +llama3_1_70b,eureka_long_context_qa_average,93.9,[],eureka_241002.csv +llama3_1_405b,eureka_long_context_qa_average,96.6,[],eureka_241002.csv +gpt_4o_2024_05_13,eureka_long_context_qa_average,95.5,[],eureka_241002.csv +gpt_4_1106_preview,eureka_long_context_qa_average,92.7,[],eureka_241002.csv +gemini_1_5_pro,eureka_long_context_qa_average,87.7,[],eureka_241002.csv +claude_3_opus,eureka_long_context_qa_average,82.0,[],eureka_241002.csv +claude_3_5_sonnet,eureka_long_context_qa_average,84.5,[],eureka_241002.csv +mistral_large_2_2407,eureka_long_context_qa_longest_context_3k,66.7,[],eureka_241002.csv +llama3_70b,eureka_long_context_qa_longest_context_3k,74.9,[],eureka_241002.csv +llama3_1_70b,eureka_long_context_qa_longest_context_3k,86.4,[],eureka_241002.csv +llama3_1_405b,eureka_long_context_qa_longest_context_3k,92.5,[],eureka_241002.csv +gpt_4o_2024_05_13,eureka_long_context_qa_longest_context_3k,90.8,[],eureka_241002.csv +gpt_4_1106_preview,eureka_long_context_qa_longest_context_3k,85.5,[],eureka_241002.csv +gemini_1_5_pro,eureka_long_context_qa_longest_context_3k,75.8,[],eureka_241002.csv +claude_3_opus,eureka_long_context_qa_longest_context_3k,73.0,[],eureka_241002.csv +claude_3_5_sonnet,eureka_long_context_qa_longest_context_3k,75.6,[],eureka_241002.csv +mistral_large_2_2407,eureka_toxicity_detection,84.1,[],eureka_241002.csv +llama3_70b,eureka_toxicity_detection,87.4,[],eureka_241002.csv +llama3_1_70b,eureka_toxicity_detection,86.0,[],eureka_241002.csv +llama3_1_405b,eureka_toxicity_detection,57.1,[],eureka_241002.csv +gpt_4o_2024_05_13,eureka_toxicity_detection,86.1,[],eureka_241002.csv +gpt_4_1106_preview,eureka_toxicity_detection,84.1,[],eureka_241002.csv +gemini_1_5_pro,eureka_toxicity_detection,42.6,[],eureka_241002.csv +claude_3_opus,eureka_toxicity_detection,53.2,[],eureka_241002.csv +claude_3_5_sonnet,eureka_toxicity_detection,67.6,[],eureka_241002.csv +gpt_4o_2024_05_13,Helm Lite,0.963,[],helm_lite_240829.csv +claude_3_5_sonnet_20240620,Helm Lite,0.915,[],helm_lite_240829.csv +gpt_4_0613,Helm Lite,0.915,[],helm_lite_240829.csv +gpt_4_turbo_2024_04_09,Helm Lite,0.908,[],helm_lite_240829.csv +llama3_1_instruct_turbo_405b,Helm Lite,0.896,[],helm_lite_240829.csv +llama3_1_instruct_turbo_70b,Helm Lite,0.858,[],helm_lite_240829.csv +llama3_70b,Helm Lite,0.838,[],helm_lite_240829.csv +qwen2_instruct_72b,Helm Lite,0.827,[],helm_lite_240829.csv +mistral_large_2_2407,Helm Lite,0.803,[],helm_lite_240829.csv +gemini_1_5_pro_001,Helm Lite,0.793,[],helm_lite_240829.csv +gpt_4o_mini_2024_07_18,Helm Lite,0.776,[],helm_lite_240829.csv +mixtral_8x22b,Helm Lite,0.767,[],helm_lite_240829.csv +gpt_4_turbo_1106_preview,Helm Lite,0.758,[],helm_lite_240829.csv +palmyra_x_v3_72b,Helm Lite,0.749,[],helm_lite_240829.csv +gemma_2_instruct_27b,Helm Lite,0.742,[],helm_lite_240829.csv +gemini_1_5_flash_001,Helm Lite,0.733,[],helm_lite_240829.csv +claude_3_opus_20240229,Helm Lite,0.722,[],helm_lite_240829.csv +palm_2_unicorn,Helm Lite,0.703,[],helm_lite_240829.csv +qwen1_5_72b,Helm Lite,0.68,[],helm_lite_240829.csv +palmyra_x_v2_33b,Helm Lite,0.659,[],helm_lite_240829.csv +gemma_2_instruct_9b,Helm Lite,0.639,[],helm_lite_240829.csv +yi_34b,Helm Lite,0.634,[],helm_lite_240829.csv +qwen1_5_chat_110b,Helm Lite,0.619,[],helm_lite_240829.csv +qwen1_5_32b,Helm Lite,0.615,[],helm_lite_240829.csv +claude_v1_3,Helm Lite,0.594,[],helm_lite_240829.csv +palm_2_bison,Helm Lite,0.584,[],helm_lite_240829.csv +mixtral_8x7b_32k_seqlen,Helm Lite,0.582,[],helm_lite_240829.csv +phi_3_14b,Helm Lite,0.579,[],helm_lite_240829.csv +claude_2_0,Helm Lite,0.56,[],helm_lite_240829.csv +deepseek_llm_chat_67b,Helm Lite,0.556,[],helm_lite_240829.csv +phi_3_7b,Helm Lite,0.545,[],helm_lite_240829.csv +llama_2_70b,Helm Lite,0.537,[],helm_lite_240829.csv +yi_large_preview,Helm Lite,0.53,[],helm_lite_240829.csv +command_r_plus,Helm Lite,0.509,[],helm_lite_240829.csv +gpt_3_5_text_davinci_003,Helm Lite,0.503,[],helm_lite_240829.csv +claude_2_1,Helm Lite,0.503,[],helm_lite_240829.csv +qwen1_5_14b,Helm Lite,0.491,[],helm_lite_240829.csv +gemini_1_0_pro_002,Helm Lite,0.484,[],helm_lite_240829.csv +claude_instant_1_2,Helm Lite,0.464,[],helm_lite_240829.csv +llama3_8b,Helm Lite,0.441,[],helm_lite_240829.csv +gpt_3_5_turbo_0613,Helm Lite,0.42,[],helm_lite_240829.csv +claude_3_sonnet_20240229,Helm Lite,0.42,[],helm_lite_240829.csv +mistral_nemo_2402,Helm Lite,0.401,[],helm_lite_240829.csv +arctic_instruct,Helm Lite,0.399,[],helm_lite_240829.csv +gemma_7b,Helm Lite,0.392,[],helm_lite_240829.csv +gpt_3_5_text_davinci_002,Helm Lite,0.392,[],helm_lite_240829.csv +llama_65b,Helm Lite,0.39,[],helm_lite_240829.csv +mistral_large_2402,Helm Lite,0.382,[],helm_lite_240829.csv +command,Helm Lite,0.365,[],helm_lite_240829.csv +command_r,Helm Lite,0.35,[],helm_lite_240829.csv +llama3_1_instruct_turbo_8b,Helm Lite,0.347,[],helm_lite_240829.csv +mistral_small_2402,Helm Lite,0.342,[],helm_lite_240829.csv +dbrx_instructruct,Helm Lite,0.341,[],helm_lite_240829.csv +jamba_instruct,Helm Lite,0.339,[],helm_lite_240829.csv +mistral_v0_1_7b,Helm Lite,0.338,[],helm_lite_240829.csv +mistral_medium_2312,Helm Lite,0.318,[],helm_lite_240829.csv +qwen1_5_7b,Helm Lite,0.317,[],helm_lite_240829.csv +claude_3_haiku_20240307,Helm Lite,0.309,[],helm_lite_240829.csv +yi_6b,Helm Lite,0.289,[],helm_lite_240829.csv +llama_2_13b,Helm Lite,0.273,[],helm_lite_240829.csv +jurassic_2_jumbo_178b,Helm Lite,0.254,[],helm_lite_240829.csv +falcon_40b,Helm Lite,0.249,[],helm_lite_240829.csv +mistral_instruct_v0_3_7b,Helm Lite,0.233,[],helm_lite_240829.csv +jurassic_2_grande_17b,Helm Lite,0.203,[],helm_lite_240829.csv +phi_2,Helm Lite,0.202,[],helm_lite_240829.csv +llama_2_7b,Helm Lite,0.18,[],helm_lite_240829.csv +luminous_supreme_70b,Helm Lite,0.172,[],helm_lite_240829.csv +command_light,Helm Lite,0.125,[],helm_lite_240829.csv +luminous_extended_30b,Helm Lite,0.093,[],helm_lite_240829.csv +falcon_7b,Helm Lite,0.078,[],helm_lite_240829.csv +olmo_7b,Helm Lite,0.063,[],helm_lite_240829.csv +luminous_base_13b,Helm Lite,0.052,[],helm_lite_240829.csv +gpt_4o_2024_05_13,Helm Lite NarrativeQA,0.804,[],helm_lite_240829.csv +claude_3_5_sonnet_20240620,Helm Lite NarrativeQA,0.746,[],helm_lite_240829.csv +gpt_4_0613,Helm Lite NarrativeQA,0.768,[],helm_lite_240829.csv +gpt_4_turbo_2024_04_09,Helm Lite NarrativeQA,0.761,[],helm_lite_240829.csv +llama3_1_instruct_turbo_405b,Helm Lite NarrativeQA,0.749,[],helm_lite_240829.csv +llama3_1_instruct_turbo_70b,Helm Lite NarrativeQA,0.772,[],helm_lite_240829.csv +llama3_70b,Helm Lite NarrativeQA,0.798,[],helm_lite_240829.csv +qwen2_instruct_72b,Helm Lite NarrativeQA,0.727,[],helm_lite_240829.csv +mistral_large_2_2407,Helm Lite NarrativeQA,0.779,[],helm_lite_240829.csv +gemini_1_5_pro_001,Helm Lite NarrativeQA,0.783,[],helm_lite_240829.csv +gpt_4o_mini_2024_07_18,Helm Lite NarrativeQA,0.768,[],helm_lite_240829.csv +mixtral_8x22b,Helm Lite NarrativeQA,0.779,[],helm_lite_240829.csv +gpt_4_turbo_1106_preview,Helm Lite NarrativeQA,0.727,[],helm_lite_240829.csv +palmyra_x_v3_72b,Helm Lite NarrativeQA,0.706,[],helm_lite_240829.csv +gemma_2_instruct_27b,Helm Lite NarrativeQA,0.79,[],helm_lite_240829.csv +gemini_1_5_flash_001,Helm Lite NarrativeQA,0.783,[],helm_lite_240829.csv +claude_3_opus_20240229,Helm Lite NarrativeQA,0.351,[],helm_lite_240829.csv +palm_2_unicorn,Helm Lite NarrativeQA,0.583,[],helm_lite_240829.csv +qwen1_5_72b,Helm Lite NarrativeQA,0.601,[],helm_lite_240829.csv +palmyra_x_v2_33b,Helm Lite NarrativeQA,0.752,[],helm_lite_240829.csv +gemma_2_instruct_9b,Helm Lite NarrativeQA,0.768,[],helm_lite_240829.csv +yi_34b,Helm Lite NarrativeQA,0.782,[],helm_lite_240829.csv +qwen1_5_chat_110b,Helm Lite NarrativeQA,0.721,[],helm_lite_240829.csv +qwen1_5_32b,Helm Lite NarrativeQA,0.589,[],helm_lite_240829.csv +claude_v1_3,Helm Lite NarrativeQA,0.723,[],helm_lite_240829.csv +palm_2_bison,Helm Lite NarrativeQA,0.718,[],helm_lite_240829.csv +mixtral_8x7b_32k_seqlen,Helm Lite NarrativeQA,0.767,[],helm_lite_240829.csv +phi_3_14b,Helm Lite NarrativeQA,0.724,[],helm_lite_240829.csv +claude_2_0,Helm Lite NarrativeQA,0.718,[],helm_lite_240829.csv +deepseek_llm_chat_67b,Helm Lite NarrativeQA,0.581,[],helm_lite_240829.csv +phi_3_7b,Helm Lite NarrativeQA,0.754,[],helm_lite_240829.csv +llama_2_70b,Helm Lite NarrativeQA,0.763,[],helm_lite_240829.csv +yi_large_preview,Helm Lite NarrativeQA,0.373,[],helm_lite_240829.csv +command_r_plus,Helm Lite NarrativeQA,0.735,[],helm_lite_240829.csv +gpt_3_5_text_davinci_003,Helm Lite NarrativeQA,0.731,[],helm_lite_240829.csv +claude_2_1,Helm Lite NarrativeQA,0.677,[],helm_lite_240829.csv +qwen1_5_14b,Helm Lite NarrativeQA,0.711,[],helm_lite_240829.csv +gemini_1_0_pro_002,Helm Lite NarrativeQA,0.751,[],helm_lite_240829.csv +claude_instant_1_2,Helm Lite NarrativeQA,0.616,[],helm_lite_240829.csv +llama3_8b,Helm Lite NarrativeQA,0.754,[],helm_lite_240829.csv +gpt_3_5_turbo_0613,Helm Lite NarrativeQA,0.655,[],helm_lite_240829.csv +claude_3_sonnet_20240229,Helm Lite NarrativeQA,0.111,[],helm_lite_240829.csv +mistral_nemo_2402,Helm Lite NarrativeQA,0.731,[],helm_lite_240829.csv +arctic_instruct,Helm Lite NarrativeQA,0.654,[],helm_lite_240829.csv +gemma_7b,Helm Lite NarrativeQA,0.752,[],helm_lite_240829.csv +gpt_3_5_text_davinci_002,Helm Lite NarrativeQA,0.719,[],helm_lite_240829.csv +llama_65b,Helm Lite NarrativeQA,0.755,[],helm_lite_240829.csv +mistral_large_2402,Helm Lite NarrativeQA,0.454,[],helm_lite_240829.csv +command,Helm Lite NarrativeQA,0.749,[],helm_lite_240829.csv +command_r,Helm Lite NarrativeQA,0.742,[],helm_lite_240829.csv +llama3_1_instruct_turbo_8b,Helm Lite NarrativeQA,0.756,[],helm_lite_240829.csv +mistral_small_2402,Helm Lite NarrativeQA,0.519,[],helm_lite_240829.csv +dbrx_instructruct,Helm Lite NarrativeQA,0.488,[],helm_lite_240829.csv +jamba_instruct,Helm Lite NarrativeQA,0.658,[],helm_lite_240829.csv +mistral_v0_1_7b,Helm Lite NarrativeQA,0.716,[],helm_lite_240829.csv +mistral_medium_2312,Helm Lite NarrativeQA,0.449,[],helm_lite_240829.csv +qwen1_5_7b,Helm Lite NarrativeQA,0.448,[],helm_lite_240829.csv +claude_3_haiku_20240307,Helm Lite NarrativeQA,0.244,[],helm_lite_240829.csv +yi_6b,Helm Lite NarrativeQA,0.702,[],helm_lite_240829.csv +llama_2_13b,Helm Lite NarrativeQA,0.741,[],helm_lite_240829.csv +jurassic_2_jumbo_178b,Helm Lite NarrativeQA,0.728,[],helm_lite_240829.csv +falcon_40b,Helm Lite NarrativeQA,0.671,[],helm_lite_240829.csv +mistral_instruct_v0_3_7b,Helm Lite NarrativeQA,0.716,[],helm_lite_240829.csv +jurassic_2_grande_17b,Helm Lite NarrativeQA,0.744,[],helm_lite_240829.csv +phi_2,Helm Lite NarrativeQA,0.703,[],helm_lite_240829.csv +llama_2_7b,Helm Lite NarrativeQA,0.686,[],helm_lite_240829.csv +luminous_supreme_70b,Helm Lite NarrativeQA,0.743,[],helm_lite_240829.csv +command_light,Helm Lite NarrativeQA,0.629,[],helm_lite_240829.csv +luminous_extended_30b,Helm Lite NarrativeQA,0.684,[],helm_lite_240829.csv +falcon_7b,Helm Lite NarrativeQA,0.621,[],helm_lite_240829.csv +olmo_7b,Helm Lite NarrativeQA,0.597,[],helm_lite_240829.csv +luminous_base_13b,Helm Lite NarrativeQA,0.633,[],helm_lite_240829.csv +gpt_4o_2024_05_13,Helm Lite NaturalQuestionsOpen,0.803,[],helm_lite_240829.csv +claude_3_5_sonnet_20240620,Helm Lite NaturalQuestionsOpen,0.749,[],helm_lite_240829.csv +gpt_4_0613,Helm Lite NaturalQuestionsOpen,0.79,[],helm_lite_240829.csv +gpt_4_turbo_2024_04_09,Helm Lite NaturalQuestionsOpen,0.795,[],helm_lite_240829.csv +llama3_1_instruct_turbo_405b,Helm Lite NaturalQuestionsOpen,0.756,[],helm_lite_240829.csv +llama3_1_instruct_turbo_70b,Helm Lite NaturalQuestionsOpen,0.738,[],helm_lite_240829.csv +llama3_70b,Helm Lite NaturalQuestionsOpen,0.743,[],helm_lite_240829.csv +qwen2_instruct_72b,Helm Lite NaturalQuestionsOpen,0.776,[],helm_lite_240829.csv +mistral_large_2_2407,Helm Lite NaturalQuestionsOpen,0.734,[],helm_lite_240829.csv +gemini_1_5_pro_001,Helm Lite NaturalQuestionsOpen,0.748,[],helm_lite_240829.csv +gpt_4o_mini_2024_07_18,Helm Lite NaturalQuestionsOpen,0.746,[],helm_lite_240829.csv +mixtral_8x22b,Helm Lite NaturalQuestionsOpen,0.726,[],helm_lite_240829.csv +gpt_4_turbo_1106_preview,Helm Lite NaturalQuestionsOpen,0.763,[],helm_lite_240829.csv +palmyra_x_v3_72b,Helm Lite NaturalQuestionsOpen,0.685,[],helm_lite_240829.csv +gemma_2_instruct_27b,Helm Lite NaturalQuestionsOpen,0.731,[],helm_lite_240829.csv +gemini_1_5_flash_001,Helm Lite NaturalQuestionsOpen,0.723,[],helm_lite_240829.csv +claude_3_opus_20240229,Helm Lite NaturalQuestionsOpen,0.264,[],helm_lite_240829.csv +palm_2_unicorn,Helm Lite NaturalQuestionsOpen,0.674,[],helm_lite_240829.csv +qwen1_5_72b,Helm Lite NaturalQuestionsOpen,0.758,[],helm_lite_240829.csv +palmyra_x_v2_33b,Helm Lite NaturalQuestionsOpen,0.752,[],helm_lite_240829.csv +gemma_2_instruct_9b,Helm Lite NaturalQuestionsOpen,0.738,[],helm_lite_240829.csv +yi_34b,Helm Lite NaturalQuestionsOpen,0.775,[],helm_lite_240829.csv +qwen1_5_chat_110b,Helm Lite NaturalQuestionsOpen,0.739,[],helm_lite_240829.csv +qwen1_5_32b,Helm Lite NaturalQuestionsOpen,0.777,[],helm_lite_240829.csv +claude_v1_3,Helm Lite NaturalQuestionsOpen,0.699,[],helm_lite_240829.csv +palm_2_bison,Helm Lite NaturalQuestionsOpen,0.813,[],helm_lite_240829.csv +mixtral_8x7b_32k_seqlen,Helm Lite NaturalQuestionsOpen,0.699,[],helm_lite_240829.csv +phi_3_14b,Helm Lite NaturalQuestionsOpen,0.729,[],helm_lite_240829.csv +claude_2_0,Helm Lite NaturalQuestionsOpen,0.67,[],helm_lite_240829.csv +deepseek_llm_chat_67b,Helm Lite NaturalQuestionsOpen,0.733,[],helm_lite_240829.csv +phi_3_7b,Helm Lite NaturalQuestionsOpen,0.675,[],helm_lite_240829.csv +llama_2_70b,Helm Lite NaturalQuestionsOpen,0.674,[],helm_lite_240829.csv +yi_large_preview,Helm Lite NaturalQuestionsOpen,0.586,[],helm_lite_240829.csv +command_r_plus,Helm Lite NaturalQuestionsOpen,0.711,[],helm_lite_240829.csv +gpt_3_5_text_davinci_003,Helm Lite NaturalQuestionsOpen,0.77,[],helm_lite_240829.csv +claude_2_1,Helm Lite NaturalQuestionsOpen,0.611,[],helm_lite_240829.csv +qwen1_5_14b,Helm Lite NaturalQuestionsOpen,0.772,[],helm_lite_240829.csv +gemini_1_0_pro_002,Helm Lite NaturalQuestionsOpen,0.714,[],helm_lite_240829.csv +claude_instant_1_2,Helm Lite NaturalQuestionsOpen,0.731,[],helm_lite_240829.csv +llama3_8b,Helm Lite NaturalQuestionsOpen,0.681,[],helm_lite_240829.csv +gpt_3_5_turbo_0613,Helm Lite NaturalQuestionsOpen,0.678,[],helm_lite_240829.csv +claude_3_sonnet_20240229,Helm Lite NaturalQuestionsOpen,0.072,[],helm_lite_240829.csv +mistral_nemo_2402,Helm Lite NaturalQuestionsOpen,0.65,[],helm_lite_240829.csv +arctic_instruct,Helm Lite NaturalQuestionsOpen,0.586,[],helm_lite_240829.csv +gemma_7b,Helm Lite NaturalQuestionsOpen,0.665,[],helm_lite_240829.csv +gpt_3_5_text_davinci_002,Helm Lite NaturalQuestionsOpen,0.71,[],helm_lite_240829.csv +llama_65b,Helm Lite NaturalQuestionsOpen,0.672,[],helm_lite_240829.csv +mistral_large_2402,Helm Lite NaturalQuestionsOpen,0.485,[],helm_lite_240829.csv +command,Helm Lite NaturalQuestionsOpen,0.777,[],helm_lite_240829.csv +command_r,Helm Lite NaturalQuestionsOpen,0.72,[],helm_lite_240829.csv +llama3_1_instruct_turbo_8b,Helm Lite NaturalQuestionsOpen,0.677,[],helm_lite_240829.csv +mistral_small_2402,Helm Lite NaturalQuestionsOpen,0.587,[],helm_lite_240829.csv +dbrx_instructruct,Helm Lite NaturalQuestionsOpen,0.55,[],helm_lite_240829.csv +jamba_instruct,Helm Lite NaturalQuestionsOpen,0.636,[],helm_lite_240829.csv +mistral_v0_1_7b,Helm Lite NaturalQuestionsOpen,0.687,[],helm_lite_240829.csv +mistral_medium_2312,Helm Lite NaturalQuestionsOpen,0.468,[],helm_lite_240829.csv +qwen1_5_7b,Helm Lite NaturalQuestionsOpen,0.749,[],helm_lite_240829.csv +claude_3_haiku_20240307,Helm Lite NaturalQuestionsOpen,0.252,[],helm_lite_240829.csv +yi_6b,Helm Lite NaturalQuestionsOpen,0.748,[],helm_lite_240829.csv +llama_2_13b,Helm Lite NaturalQuestionsOpen,0.64,[],helm_lite_240829.csv +jurassic_2_jumbo_178b,Helm Lite NaturalQuestionsOpen,0.65,[],helm_lite_240829.csv +falcon_40b,Helm Lite NaturalQuestionsOpen,0.676,[],helm_lite_240829.csv +mistral_instruct_v0_3_7b,Helm Lite NaturalQuestionsOpen,0.68,[],helm_lite_240829.csv +jurassic_2_grande_17b,Helm Lite NaturalQuestionsOpen,0.627,[],helm_lite_240829.csv +phi_2,Helm Lite NaturalQuestionsOpen,0.68,[],helm_lite_240829.csv +llama_2_7b,Helm Lite NaturalQuestionsOpen,0.612,[],helm_lite_240829.csv +luminous_supreme_70b,Helm Lite NaturalQuestionsOpen,0.656,[],helm_lite_240829.csv +command_light,Helm Lite NaturalQuestionsOpen,0.686,[],helm_lite_240829.csv +luminous_extended_30b,Helm Lite NaturalQuestionsOpen,0.611,[],helm_lite_240829.csv +falcon_7b,Helm Lite NaturalQuestionsOpen,0.58,[],helm_lite_240829.csv +olmo_7b,Helm Lite NaturalQuestionsOpen,0.603,[],helm_lite_240829.csv +luminous_base_13b,Helm Lite NaturalQuestionsOpen,0.577,[],helm_lite_240829.csv +gpt_4o_2024_05_13,Helm Lite NaturalQuestionsClosed,0.501,[],helm_lite_240829.csv +claude_3_5_sonnet_20240620,Helm Lite NaturalQuestionsClosed,0.502,[],helm_lite_240829.csv +gpt_4_0613,Helm Lite NaturalQuestionsClosed,0.457,[],helm_lite_240829.csv +gpt_4_turbo_2024_04_09,Helm Lite NaturalQuestionsClosed,0.482,[],helm_lite_240829.csv +llama3_1_instruct_turbo_405b,Helm Lite NaturalQuestionsClosed,0.456,[],helm_lite_240829.csv +llama3_1_instruct_turbo_70b,Helm Lite NaturalQuestionsClosed,0.452,[],helm_lite_240829.csv +llama3_70b,Helm Lite NaturalQuestionsClosed,0.475,[],helm_lite_240829.csv +qwen2_instruct_72b,Helm Lite NaturalQuestionsClosed,0.39,[],helm_lite_240829.csv +mistral_large_2_2407,Helm Lite NaturalQuestionsClosed,0.453,[],helm_lite_240829.csv +gemini_1_5_pro_001,Helm Lite NaturalQuestionsClosed,0.378,[],helm_lite_240829.csv +gpt_4o_mini_2024_07_18,Helm Lite NaturalQuestionsClosed,0.386,[],helm_lite_240829.csv +mixtral_8x22b,Helm Lite NaturalQuestionsClosed,0.478,[],helm_lite_240829.csv +gpt_4_turbo_1106_preview,Helm Lite NaturalQuestionsClosed,0.435,[],helm_lite_240829.csv +palmyra_x_v3_72b,Helm Lite NaturalQuestionsClosed,0.407,[],helm_lite_240829.csv +gemma_2_instruct_27b,Helm Lite NaturalQuestionsClosed,0.353,[],helm_lite_240829.csv +gemini_1_5_flash_001,Helm Lite NaturalQuestionsClosed,0.332,[],helm_lite_240829.csv +claude_3_opus_20240229,Helm Lite NaturalQuestionsClosed,0.441,[],helm_lite_240829.csv +palm_2_unicorn,Helm Lite NaturalQuestionsClosed,0.435,[],helm_lite_240829.csv +qwen1_5_72b,Helm Lite NaturalQuestionsClosed,0.417,[],helm_lite_240829.csv +palmyra_x_v2_33b,Helm Lite NaturalQuestionsClosed,0.428,[],helm_lite_240829.csv +gemma_2_instruct_9b,Helm Lite NaturalQuestionsClosed,0.328,[],helm_lite_240829.csv +yi_34b,Helm Lite NaturalQuestionsClosed,0.443,[],helm_lite_240829.csv +qwen1_5_chat_110b,Helm Lite NaturalQuestionsClosed,0.35,[],helm_lite_240829.csv +qwen1_5_32b,Helm Lite NaturalQuestionsClosed,0.353,[],helm_lite_240829.csv +claude_v1_3,Helm Lite NaturalQuestionsClosed,0.409,[],helm_lite_240829.csv +palm_2_bison,Helm Lite NaturalQuestionsClosed,0.39,[],helm_lite_240829.csv +mixtral_8x7b_32k_seqlen,Helm Lite NaturalQuestionsClosed,0.427,[],helm_lite_240829.csv +phi_3_14b,Helm Lite NaturalQuestionsClosed,0.278,[],helm_lite_240829.csv +claude_2_0,Helm Lite NaturalQuestionsClosed,0.428,[],helm_lite_240829.csv +deepseek_llm_chat_67b,Helm Lite NaturalQuestionsClosed,0.412,[],helm_lite_240829.csv +phi_3_7b,Helm Lite NaturalQuestionsClosed,0.324,[],helm_lite_240829.csv +llama_2_70b,Helm Lite NaturalQuestionsClosed,0.46,[],helm_lite_240829.csv +yi_large_preview,Helm Lite NaturalQuestionsClosed,0.428,[],helm_lite_240829.csv +command_r_plus,Helm Lite NaturalQuestionsClosed,0.343,[],helm_lite_240829.csv +gpt_3_5_text_davinci_003,Helm Lite NaturalQuestionsClosed,0.413,[],helm_lite_240829.csv +claude_2_1,Helm Lite NaturalQuestionsClosed,0.375,[],helm_lite_240829.csv +qwen1_5_14b,Helm Lite NaturalQuestionsClosed,0.3,[],helm_lite_240829.csv +gemini_1_0_pro_002,Helm Lite NaturalQuestionsClosed,0.391,[],helm_lite_240829.csv +claude_instant_1_2,Helm Lite NaturalQuestionsClosed,0.343,[],helm_lite_240829.csv +llama3_8b,Helm Lite NaturalQuestionsClosed,0.378,[],helm_lite_240829.csv +gpt_3_5_turbo_0613,Helm Lite NaturalQuestionsClosed,0.335,[],helm_lite_240829.csv +claude_3_sonnet_20240229,Helm Lite NaturalQuestionsClosed,0.028,[],helm_lite_240829.csv +mistral_nemo_2402,Helm Lite NaturalQuestionsClosed,0.265,[],helm_lite_240829.csv +arctic_instruct,Helm Lite NaturalQuestionsClosed,0.39,[],helm_lite_240829.csv +gemma_7b,Helm Lite NaturalQuestionsClosed,0.336,[],helm_lite_240829.csv +gpt_3_5_text_davinci_002,Helm Lite NaturalQuestionsClosed,0.394,[],helm_lite_240829.csv +llama_65b,Helm Lite NaturalQuestionsClosed,0.433,[],helm_lite_240829.csv +mistral_large_2402,Helm Lite NaturalQuestionsClosed,0.311,[],helm_lite_240829.csv +command,Helm Lite NaturalQuestionsClosed,0.391,[],helm_lite_240829.csv +command_r,Helm Lite NaturalQuestionsClosed,0.352,[],helm_lite_240829.csv +llama3_1_instruct_turbo_8b,Helm Lite NaturalQuestionsClosed,0.209,[],helm_lite_240829.csv +mistral_small_2402,Helm Lite NaturalQuestionsClosed,0.304,[],helm_lite_240829.csv +dbrx_instructruct,Helm Lite NaturalQuestionsClosed,0.284,[],helm_lite_240829.csv +jamba_instruct,Helm Lite NaturalQuestionsClosed,0.384,[],helm_lite_240829.csv +mistral_v0_1_7b,Helm Lite NaturalQuestionsClosed,0.367,[],helm_lite_240829.csv +mistral_medium_2312,Helm Lite NaturalQuestionsClosed,0.29,[],helm_lite_240829.csv +qwen1_5_7b,Helm Lite NaturalQuestionsClosed,0.27,[],helm_lite_240829.csv +claude_3_haiku_20240307,Helm Lite NaturalQuestionsClosed,0.144,[],helm_lite_240829.csv +yi_6b,Helm Lite NaturalQuestionsClosed,0.31,[],helm_lite_240829.csv +llama_2_13b,Helm Lite NaturalQuestionsClosed,0.371,[],helm_lite_240829.csv +jurassic_2_jumbo_178b,Helm Lite NaturalQuestionsClosed,0.385,[],helm_lite_240829.csv +falcon_40b,Helm Lite NaturalQuestionsClosed,0.392,[],helm_lite_240829.csv +mistral_instruct_v0_3_7b,Helm Lite NaturalQuestionsClosed,0.253,[],helm_lite_240829.csv +jurassic_2_grande_17b,Helm Lite NaturalQuestionsClosed,0.35,[],helm_lite_240829.csv +phi_2,Helm Lite NaturalQuestionsClosed,0.155,[],helm_lite_240829.csv +llama_2_7b,Helm Lite NaturalQuestionsClosed,0.333,[],helm_lite_240829.csv +luminous_supreme_70b,Helm Lite NaturalQuestionsClosed,0.299,[],helm_lite_240829.csv +command_light,Helm Lite NaturalQuestionsClosed,0.195,[],helm_lite_240829.csv +luminous_extended_30b,Helm Lite NaturalQuestionsClosed,0.253,[],helm_lite_240829.csv +falcon_7b,Helm Lite NaturalQuestionsClosed,0.285,[],helm_lite_240829.csv +olmo_7b,Helm Lite NaturalQuestionsClosed,0.259,[],helm_lite_240829.csv +luminous_base_13b,Helm Lite NaturalQuestionsClosed,0.197,[],helm_lite_240829.csv +gpt_4o_2024_05_13,Helm Lite OpenBookQA,0.966,[],helm_lite_240829.csv +claude_3_5_sonnet_20240620,Helm Lite OpenBookQA,0.972,[],helm_lite_240829.csv +gpt_4_0613,Helm Lite OpenBookQA,0.96,[],helm_lite_240829.csv +gpt_4_turbo_2024_04_09,Helm Lite OpenBookQA,0.97,[],helm_lite_240829.csv +llama3_1_instruct_turbo_405b,Helm Lite OpenBookQA,0.94,[],helm_lite_240829.csv +llama3_1_instruct_turbo_70b,Helm Lite OpenBookQA,0.938,[],helm_lite_240829.csv +llama3_70b,Helm Lite OpenBookQA,0.934,[],helm_lite_240829.csv +qwen2_instruct_72b,Helm Lite OpenBookQA,0.954,[],helm_lite_240829.csv +mistral_large_2_2407,Helm Lite OpenBookQA,0.932,[],helm_lite_240829.csv +gemini_1_5_pro_001,Helm Lite OpenBookQA,0.902,[],helm_lite_240829.csv +gpt_4o_mini_2024_07_18,Helm Lite OpenBookQA,0.92,[],helm_lite_240829.csv +mixtral_8x22b,Helm Lite OpenBookQA,0.882,[],helm_lite_240829.csv +gpt_4_turbo_1106_preview,Helm Lite OpenBookQA,0.95,[],helm_lite_240829.csv +palmyra_x_v3_72b,Helm Lite OpenBookQA,0.938,[],helm_lite_240829.csv +gemma_2_instruct_27b,Helm Lite OpenBookQA,0.918,[],helm_lite_240829.csv +gemini_1_5_flash_001,Helm Lite OpenBookQA,0.928,[],helm_lite_240829.csv +claude_3_opus_20240229,Helm Lite OpenBookQA,0.956,[],helm_lite_240829.csv +palm_2_unicorn,Helm Lite OpenBookQA,0.938,[],helm_lite_240829.csv +qwen1_5_72b,Helm Lite OpenBookQA,0.93,[],helm_lite_240829.csv +palmyra_x_v2_33b,Helm Lite OpenBookQA,0.878,[],helm_lite_240829.csv +gemma_2_instruct_9b,Helm Lite OpenBookQA,0.91,[],helm_lite_240829.csv +yi_34b,Helm Lite OpenBookQA,0.92,[],helm_lite_240829.csv +qwen1_5_chat_110b,Helm Lite OpenBookQA,0.922,[],helm_lite_240829.csv +qwen1_5_32b,Helm Lite OpenBookQA,0.932,[],helm_lite_240829.csv +claude_v1_3,Helm Lite OpenBookQA,0.908,[],helm_lite_240829.csv +palm_2_bison,Helm Lite OpenBookQA,0.878,[],helm_lite_240829.csv +mixtral_8x7b_32k_seqlen,Helm Lite OpenBookQA,0.868,[],helm_lite_240829.csv +phi_3_14b,Helm Lite OpenBookQA,0.916,[],helm_lite_240829.csv +claude_2_0,Helm Lite OpenBookQA,0.862,[],helm_lite_240829.csv +deepseek_llm_chat_67b,Helm Lite OpenBookQA,0.88,[],helm_lite_240829.csv +phi_3_7b,Helm Lite OpenBookQA,0.912,[],helm_lite_240829.csv +llama_2_70b,Helm Lite OpenBookQA,0.838,[],helm_lite_240829.csv +yi_large_preview,Helm Lite OpenBookQA,0.946,[],helm_lite_240829.csv +command_r_plus,Helm Lite OpenBookQA,0.828,[],helm_lite_240829.csv +gpt_3_5_text_davinci_003,Helm Lite OpenBookQA,0.828,[],helm_lite_240829.csv +claude_2_1,Helm Lite OpenBookQA,0.872,[],helm_lite_240829.csv +qwen1_5_14b,Helm Lite OpenBookQA,0.862,[],helm_lite_240829.csv +gemini_1_0_pro_002,Helm Lite OpenBookQA,0.788,[],helm_lite_240829.csv +claude_instant_1_2,Helm Lite OpenBookQA,0.844,[],helm_lite_240829.csv +llama3_8b,Helm Lite OpenBookQA,0.766,[],helm_lite_240829.csv +gpt_3_5_turbo_0613,Helm Lite OpenBookQA,0.838,[],helm_lite_240829.csv +claude_3_sonnet_20240229,Helm Lite OpenBookQA,0.918,[],helm_lite_240829.csv +mistral_nemo_2402,Helm Lite OpenBookQA,0.822,[],helm_lite_240829.csv +arctic_instruct,Helm Lite OpenBookQA,0.828,[],helm_lite_240829.csv +gemma_7b,Helm Lite OpenBookQA,0.808,[],helm_lite_240829.csv +gpt_3_5_text_davinci_002,Helm Lite OpenBookQA,0.796,[],helm_lite_240829.csv +llama_65b,Helm Lite OpenBookQA,0.754,[],helm_lite_240829.csv +mistral_large_2402,Helm Lite OpenBookQA,0.894,[],helm_lite_240829.csv +command,Helm Lite OpenBookQA,0.774,[],helm_lite_240829.csv +command_r,Helm Lite OpenBookQA,0.782,[],helm_lite_240829.csv +llama3_1_instruct_turbo_8b,Helm Lite OpenBookQA,0.74,[],helm_lite_240829.csv +mistral_small_2402,Helm Lite OpenBookQA,0.862,[],helm_lite_240829.csv +dbrx_instructruct,Helm Lite OpenBookQA,0.91,[],helm_lite_240829.csv +jamba_instruct,Helm Lite OpenBookQA,0.796,[],helm_lite_240829.csv +mistral_v0_1_7b,Helm Lite OpenBookQA,0.776,[],helm_lite_240829.csv +mistral_medium_2312,Helm Lite OpenBookQA,0.83,[],helm_lite_240829.csv +qwen1_5_7b,Helm Lite OpenBookQA,0.806,[],helm_lite_240829.csv +claude_3_haiku_20240307,Helm Lite OpenBookQA,0.838,[],helm_lite_240829.csv +yi_6b,Helm Lite OpenBookQA,0.8,[],helm_lite_240829.csv +llama_2_13b,Helm Lite OpenBookQA,0.634,[],helm_lite_240829.csv +jurassic_2_jumbo_178b,Helm Lite OpenBookQA,0.688,[],helm_lite_240829.csv +falcon_40b,Helm Lite OpenBookQA,0.662,[],helm_lite_240829.csv +mistral_instruct_v0_3_7b,Helm Lite OpenBookQA,0.79,[],helm_lite_240829.csv +jurassic_2_grande_17b,Helm Lite OpenBookQA,0.614,[],helm_lite_240829.csv +phi_2,Helm Lite OpenBookQA,0.798,[],helm_lite_240829.csv +llama_2_7b,Helm Lite OpenBookQA,0.544,[],helm_lite_240829.csv +luminous_supreme_70b,Helm Lite OpenBookQA,0.284,[],helm_lite_240829.csv +command_light,Helm Lite OpenBookQA,0.398,[],helm_lite_240829.csv +luminous_extended_30b,Helm Lite OpenBookQA,0.272,[],helm_lite_240829.csv +falcon_7b,Helm Lite OpenBookQA,0.26,[],helm_lite_240829.csv +olmo_7b,Helm Lite OpenBookQA,0.222,[],helm_lite_240829.csv +luminous_base_13b,Helm Lite OpenBookQA,0.286,[],helm_lite_240829.csv +gpt_4o_2024_05_13,Helm Lite MMLU,0.748,[],helm_lite_240829.csv +claude_3_5_sonnet_20240620,Helm Lite MMLU,0.799,[],helm_lite_240829.csv +gpt_4_0613,Helm Lite MMLU,0.735,[],helm_lite_240829.csv +gpt_4_turbo_2024_04_09,Helm Lite MMLU,0.711,[],helm_lite_240829.csv +llama3_1_instruct_turbo_405b,Helm Lite MMLU,0.759,[],helm_lite_240829.csv +llama3_1_instruct_turbo_70b,Helm Lite MMLU,0.709,[],helm_lite_240829.csv +llama3_70b,Helm Lite MMLU,0.695,[],helm_lite_240829.csv +qwen2_instruct_72b,Helm Lite MMLU,0.769,[],helm_lite_240829.csv +mistral_large_2_2407,Helm Lite MMLU,0.725,[],helm_lite_240829.csv +gemini_1_5_pro_001,Helm Lite MMLU,0.772,[],helm_lite_240829.csv +gpt_4o_mini_2024_07_18,Helm Lite MMLU,0.668,[],helm_lite_240829.csv +mixtral_8x22b,Helm Lite MMLU,0.701,[],helm_lite_240829.csv +gpt_4_turbo_1106_preview,Helm Lite MMLU,0.699,[],helm_lite_240829.csv +palmyra_x_v3_72b,Helm Lite MMLU,0.702,[],helm_lite_240829.csv +gemma_2_instruct_27b,Helm Lite MMLU,0.664,[],helm_lite_240829.csv +gemini_1_5_flash_001,Helm Lite MMLU,0.703,[],helm_lite_240829.csv +claude_3_opus_20240229,Helm Lite MMLU,0.768,[],helm_lite_240829.csv +palm_2_unicorn,Helm Lite MMLU,0.702,[],helm_lite_240829.csv +qwen1_5_72b,Helm Lite MMLU,0.647,[],helm_lite_240829.csv +palmyra_x_v2_33b,Helm Lite MMLU,0.621,[],helm_lite_240829.csv +gemma_2_instruct_9b,Helm Lite MMLU,0.645,[],helm_lite_240829.csv +yi_34b,Helm Lite MMLU,0.65,[],helm_lite_240829.csv +qwen1_5_chat_110b,Helm Lite MMLU,0.704,[],helm_lite_240829.csv +qwen1_5_32b,Helm Lite MMLU,0.628,[],helm_lite_240829.csv +claude_v1_3,Helm Lite MMLU,0.631,[],helm_lite_240829.csv +palm_2_bison,Helm Lite MMLU,0.608,[],helm_lite_240829.csv +mixtral_8x7b_32k_seqlen,Helm Lite MMLU,0.649,[],helm_lite_240829.csv +phi_3_14b,Helm Lite MMLU,0.675,[],helm_lite_240829.csv +claude_2_0,Helm Lite MMLU,0.639,[],helm_lite_240829.csv +deepseek_llm_chat_67b,Helm Lite MMLU,0.641,[],helm_lite_240829.csv +phi_3_7b,Helm Lite MMLU,0.659,[],helm_lite_240829.csv +llama_2_70b,Helm Lite MMLU,0.58,[],helm_lite_240829.csv +yi_large_preview,Helm Lite MMLU,0.712,[],helm_lite_240829.csv +command_r_plus,Helm Lite MMLU,0.59,[],helm_lite_240829.csv +gpt_3_5_text_davinci_003,Helm Lite MMLU,0.555,[],helm_lite_240829.csv +claude_2_1,Helm Lite MMLU,0.643,[],helm_lite_240829.csv +qwen1_5_14b,Helm Lite MMLU,0.626,[],helm_lite_240829.csv +gemini_1_0_pro_002,Helm Lite MMLU,0.534,[],helm_lite_240829.csv +claude_instant_1_2,Helm Lite MMLU,0.631,[],helm_lite_240829.csv +llama3_8b,Helm Lite MMLU,0.602,[],helm_lite_240829.csv +gpt_3_5_turbo_0613,Helm Lite MMLU,0.614,[],helm_lite_240829.csv +claude_3_sonnet_20240229,Helm Lite MMLU,0.652,[],helm_lite_240829.csv +mistral_nemo_2402,Helm Lite MMLU,0.604,[],helm_lite_240829.csv +arctic_instruct,Helm Lite MMLU,0.575,[],helm_lite_240829.csv +gemma_7b,Helm Lite MMLU,0.571,[],helm_lite_240829.csv +gpt_3_5_text_davinci_002,Helm Lite MMLU,0.568,[],helm_lite_240829.csv +llama_65b,Helm Lite MMLU,0.584,[],helm_lite_240829.csv +mistral_large_2402,Helm Lite MMLU,0.638,[],helm_lite_240829.csv +command,Helm Lite MMLU,0.525,[],helm_lite_240829.csv +command_r,Helm Lite MMLU,0.567,[],helm_lite_240829.csv +llama3_1_instruct_turbo_8b,Helm Lite MMLU,0.5,[],helm_lite_240829.csv +mistral_small_2402,Helm Lite MMLU,0.593,[],helm_lite_240829.csv +dbrx_instructruct,Helm Lite MMLU,0.643,[],helm_lite_240829.csv +jamba_instruct,Helm Lite MMLU,0.582,[],helm_lite_240829.csv +mistral_v0_1_7b,Helm Lite MMLU,0.584,[],helm_lite_240829.csv +mistral_medium_2312,Helm Lite MMLU,0.618,[],helm_lite_240829.csv +qwen1_5_7b,Helm Lite MMLU,0.569,[],helm_lite_240829.csv +claude_3_haiku_20240307,Helm Lite MMLU,0.662,[],helm_lite_240829.csv +yi_6b,Helm Lite MMLU,0.53,[],helm_lite_240829.csv +llama_2_13b,Helm Lite MMLU,0.505,[],helm_lite_240829.csv +jurassic_2_jumbo_178b,Helm Lite MMLU,0.483,[],helm_lite_240829.csv +falcon_40b,Helm Lite MMLU,0.507,[],helm_lite_240829.csv +mistral_instruct_v0_3_7b,Helm Lite MMLU,0.51,[],helm_lite_240829.csv +jurassic_2_grande_17b,Helm Lite MMLU,0.471,[],helm_lite_240829.csv +phi_2,Helm Lite MMLU,0.518,[],helm_lite_240829.csv +llama_2_7b,Helm Lite MMLU,0.425,[],helm_lite_240829.csv +luminous_supreme_70b,Helm Lite MMLU,0.316,[],helm_lite_240829.csv +command_light,Helm Lite MMLU,0.386,[],helm_lite_240829.csv +luminous_extended_30b,Helm Lite MMLU,0.248,[],helm_lite_240829.csv +falcon_7b,Helm Lite MMLU,0.288,[],helm_lite_240829.csv +olmo_7b,Helm Lite MMLU,0.305,[],helm_lite_240829.csv +luminous_base_13b,Helm Lite MMLU,0.243,[],helm_lite_240829.csv +gpt_4o_2024_05_13,Helm Lite MathEquivalentCOT,0.829,[],helm_lite_240829.csv +claude_3_5_sonnet_20240620,Helm Lite MathEquivalentCOT,0.813,[],helm_lite_240829.csv +gpt_4_0613,Helm Lite MathEquivalentCOT,0.802,[],helm_lite_240829.csv +gpt_4_turbo_2024_04_09,Helm Lite MathEquivalentCOT,0.833,[],helm_lite_240829.csv +llama3_1_instruct_turbo_405b,Helm Lite MathEquivalentCOT,0.827,[],helm_lite_240829.csv +llama3_1_instruct_turbo_70b,Helm Lite MathEquivalentCOT,0.783,[],helm_lite_240829.csv +llama3_70b,Helm Lite MathEquivalentCOT,0.663,[],helm_lite_240829.csv +qwen2_instruct_72b,Helm Lite MathEquivalentCOT,0.79,[],helm_lite_240829.csv +mistral_large_2_2407,Helm Lite MathEquivalentCOT,0.677,[],helm_lite_240829.csv +gemini_1_5_pro_001,Helm Lite MathEquivalentCOT,0.825,[],helm_lite_240829.csv +gpt_4o_mini_2024_07_18,Helm Lite MathEquivalentCOT,0.802,[],helm_lite_240829.csv +mixtral_8x22b,Helm Lite MathEquivalentCOT,0.656,[],helm_lite_240829.csv +gpt_4_turbo_1106_preview,Helm Lite MathEquivalentCOT,0.857,[],helm_lite_240829.csv +palmyra_x_v3_72b,Helm Lite MathEquivalentCOT,0.723,[],helm_lite_240829.csv +gemma_2_instruct_27b,Helm Lite MathEquivalentCOT,0.746,[],helm_lite_240829.csv +gemini_1_5_flash_001,Helm Lite MathEquivalentCOT,0.753,[],helm_lite_240829.csv +claude_3_opus_20240229,Helm Lite MathEquivalentCOT,0.76,[],helm_lite_240829.csv +palm_2_unicorn,Helm Lite MathEquivalentCOT,0.674,[],helm_lite_240829.csv +qwen1_5_72b,Helm Lite MathEquivalentCOT,0.683,[],helm_lite_240829.csv +palmyra_x_v2_33b,Helm Lite MathEquivalentCOT,0.58,[],helm_lite_240829.csv +gemma_2_instruct_9b,Helm Lite MathEquivalentCOT,0.724,[],helm_lite_240829.csv +yi_34b,Helm Lite MathEquivalentCOT,0.375,[],helm_lite_240829.csv +qwen1_5_chat_110b,Helm Lite MathEquivalentCOT,0.568,[],helm_lite_240829.csv +qwen1_5_32b,Helm Lite MathEquivalentCOT,0.733,[],helm_lite_240829.csv +claude_v1_3,Helm Lite MathEquivalentCOT,0.54,[],helm_lite_240829.csv +palm_2_bison,Helm Lite MathEquivalentCOT,0.421,[],helm_lite_240829.csv +mixtral_8x7b_32k_seqlen,Helm Lite MathEquivalentCOT,0.494,[],helm_lite_240829.csv +phi_3_14b,Helm Lite MathEquivalentCOT,0.611,[],helm_lite_240829.csv +claude_2_0,Helm Lite MathEquivalentCOT,0.603,[],helm_lite_240829.csv +deepseek_llm_chat_67b,Helm Lite MathEquivalentCOT,0.615,[],helm_lite_240829.csv +phi_3_7b,Helm Lite MathEquivalentCOT,0.703,[],helm_lite_240829.csv +llama_2_70b,Helm Lite MathEquivalentCOT,0.323,[],helm_lite_240829.csv +yi_large_preview,Helm Lite MathEquivalentCOT,0.712,[],helm_lite_240829.csv +command_r_plus,Helm Lite MathEquivalentCOT,0.403,[],helm_lite_240829.csv +gpt_3_5_text_davinci_003,Helm Lite MathEquivalentCOT,0.449,[],helm_lite_240829.csv +claude_2_1,Helm Lite MathEquivalentCOT,0.632,[],helm_lite_240829.csv +qwen1_5_14b,Helm Lite MathEquivalentCOT,0.686,[],helm_lite_240829.csv +gemini_1_0_pro_002,Helm Lite MathEquivalentCOT,0.665,[],helm_lite_240829.csv +claude_instant_1_2,Helm Lite MathEquivalentCOT,0.499,[],helm_lite_240829.csv +llama3_8b,Helm Lite MathEquivalentCOT,0.391,[],helm_lite_240829.csv +gpt_3_5_turbo_0613,Helm Lite MathEquivalentCOT,0.667,[],helm_lite_240829.csv +claude_3_sonnet_20240229,Helm Lite MathEquivalentCOT,0.084,[],helm_lite_240829.csv +mistral_nemo_2402,Helm Lite MathEquivalentCOT,0.668,[],helm_lite_240829.csv +arctic_instruct,Helm Lite MathEquivalentCOT,0.519,[],helm_lite_240829.csv +gemma_7b,Helm Lite MathEquivalentCOT,0.5,[],helm_lite_240829.csv +gpt_3_5_text_davinci_002,Helm Lite MathEquivalentCOT,0.428,[],helm_lite_240829.csv +llama_65b,Helm Lite MathEquivalentCOT,0.257,[],helm_lite_240829.csv +mistral_large_2402,Helm Lite MathEquivalentCOT,0.75,[],helm_lite_240829.csv +command,Helm Lite MathEquivalentCOT,0.236,[],helm_lite_240829.csv +command_r,Helm Lite MathEquivalentCOT,0.266,[],helm_lite_240829.csv +llama3_1_instruct_turbo_8b,Helm Lite MathEquivalentCOT,0.703,[],helm_lite_240829.csv +mistral_small_2402,Helm Lite MathEquivalentCOT,0.621,[],helm_lite_240829.csv +dbrx_instructruct,Helm Lite MathEquivalentCOT,0.358,[],helm_lite_240829.csv +jamba_instruct,Helm Lite MathEquivalentCOT,0.38,[],helm_lite_240829.csv +mistral_v0_1_7b,Helm Lite MathEquivalentCOT,0.297,[],helm_lite_240829.csv +mistral_medium_2312,Helm Lite MathEquivalentCOT,0.565,[],helm_lite_240829.csv +qwen1_5_7b,Helm Lite MathEquivalentCOT,0.561,[],helm_lite_240829.csv +claude_3_haiku_20240307,Helm Lite MathEquivalentCOT,0.131,[],helm_lite_240829.csv +yi_6b,Helm Lite MathEquivalentCOT,0.126,[],helm_lite_240829.csv +llama_2_13b,Helm Lite MathEquivalentCOT,0.102,[],helm_lite_240829.csv +jurassic_2_jumbo_178b,Helm Lite MathEquivalentCOT,0.103,[],helm_lite_240829.csv +falcon_40b,Helm Lite MathEquivalentCOT,0.128,[],helm_lite_240829.csv +mistral_instruct_v0_3_7b,Helm Lite MathEquivalentCOT,0.289,[],helm_lite_240829.csv +jurassic_2_grande_17b,Helm Lite MathEquivalentCOT,0.064,[],helm_lite_240829.csv +phi_2,Helm Lite MathEquivalentCOT,0.255,[],helm_lite_240829.csv +llama_2_7b,Helm Lite MathEquivalentCOT,0.097,[],helm_lite_240829.csv +luminous_supreme_70b,Helm Lite MathEquivalentCOT,0.078,[],helm_lite_240829.csv +command_light,Helm Lite MathEquivalentCOT,0.098,[],helm_lite_240829.csv +luminous_extended_30b,Helm Lite MathEquivalentCOT,0.04,[],helm_lite_240829.csv +falcon_7b,Helm Lite MathEquivalentCOT,0.044,[],helm_lite_240829.csv +olmo_7b,Helm Lite MathEquivalentCOT,0.029,[],helm_lite_240829.csv +luminous_base_13b,Helm Lite MathEquivalentCOT,0.026,[],helm_lite_240829.csv +gpt_4o_2024_05_13,Helm Lite GSM8K,0.905,[],helm_lite_240829.csv +claude_3_5_sonnet_20240620,Helm Lite GSM8K,0.949,[],helm_lite_240829.csv +gpt_4_0613,Helm Lite GSM8K,0.932,[],helm_lite_240829.csv +gpt_4_turbo_2024_04_09,Helm Lite GSM8K,0.824,[],helm_lite_240829.csv +llama3_1_instruct_turbo_405b,Helm Lite GSM8K,0.949,[],helm_lite_240829.csv +llama3_1_instruct_turbo_70b,Helm Lite GSM8K,0.938,[],helm_lite_240829.csv +llama3_70b,Helm Lite GSM8K,0.805,[],helm_lite_240829.csv +qwen2_instruct_72b,Helm Lite GSM8K,0.92,[],helm_lite_240829.csv +mistral_large_2_2407,Helm Lite GSM8K,0.912,[],helm_lite_240829.csv +gemini_1_5_pro_001,Helm Lite GSM8K,0.836,[],helm_lite_240829.csv +gpt_4o_mini_2024_07_18,Helm Lite GSM8K,0.843,[],helm_lite_240829.csv +mixtral_8x22b,Helm Lite GSM8K,0.8,[],helm_lite_240829.csv +gpt_4_turbo_1106_preview,Helm Lite GSM8K,0.668,[],helm_lite_240829.csv +palmyra_x_v3_72b,Helm Lite GSM8K,0.831,[],helm_lite_240829.csv +gemma_2_instruct_27b,Helm Lite GSM8K,0.812,[],helm_lite_240829.csv +gemini_1_5_flash_001,Helm Lite GSM8K,0.785,[],helm_lite_240829.csv +claude_3_opus_20240229,Helm Lite GSM8K,0.924,[],helm_lite_240829.csv +palm_2_unicorn,Helm Lite GSM8K,0.831,[],helm_lite_240829.csv +qwen1_5_72b,Helm Lite GSM8K,0.799,[],helm_lite_240829.csv +palmyra_x_v2_33b,Helm Lite GSM8K,0.735,[],helm_lite_240829.csv +gemma_2_instruct_9b,Helm Lite GSM8K,0.762,[],helm_lite_240829.csv +yi_34b,Helm Lite GSM8K,0.648,[],helm_lite_240829.csv +qwen1_5_chat_110b,Helm Lite GSM8K,0.815,[],helm_lite_240829.csv +qwen1_5_32b,Helm Lite GSM8K,0.773,[],helm_lite_240829.csv +claude_v1_3,Helm Lite GSM8K,0.784,[],helm_lite_240829.csv +palm_2_bison,Helm Lite GSM8K,0.61,[],helm_lite_240829.csv +mixtral_8x7b_32k_seqlen,Helm Lite GSM8K,0.622,[],helm_lite_240829.csv +phi_3_14b,Helm Lite GSM8K,0.878,[],helm_lite_240829.csv +claude_2_0,Helm Lite GSM8K,0.583,[],helm_lite_240829.csv +deepseek_llm_chat_67b,Helm Lite GSM8K,0.795,[],helm_lite_240829.csv +llama_2_70b,Helm Lite GSM8K,0.567,[],helm_lite_240829.csv +yi_large_preview,Helm Lite GSM8K,0.69,[],helm_lite_240829.csv +command_r_plus,Helm Lite GSM8K,0.738,[],helm_lite_240829.csv +gpt_3_5_text_davinci_003,Helm Lite GSM8K,0.615,[],helm_lite_240829.csv +claude_2_1,Helm Lite GSM8K,0.604,[],helm_lite_240829.csv +qwen1_5_14b,Helm Lite GSM8K,0.693,[],helm_lite_240829.csv +gemini_1_0_pro_002,Helm Lite GSM8K,0.816,[],helm_lite_240829.csv +claude_instant_1_2,Helm Lite GSM8K,0.721,[],helm_lite_240829.csv +llama3_8b,Helm Lite GSM8K,0.499,[],helm_lite_240829.csv +gpt_3_5_turbo_0613,Helm Lite GSM8K,0.501,[],helm_lite_240829.csv +claude_3_sonnet_20240229,Helm Lite GSM8K,0.907,[],helm_lite_240829.csv +mistral_nemo_2402,Helm Lite GSM8K,0.782,[],helm_lite_240829.csv +arctic_instruct,Helm Lite GSM8K,0.768,[],helm_lite_240829.csv +gemma_7b,Helm Lite GSM8K,0.559,[],helm_lite_240829.csv +gpt_3_5_text_davinci_002,Helm Lite GSM8K,0.479,[],helm_lite_240829.csv +llama_65b,Helm Lite GSM8K,0.489,[],helm_lite_240829.csv +mistral_large_2402,Helm Lite GSM8K,0.694,[],helm_lite_240829.csv +command,Helm Lite GSM8K,0.452,[],helm_lite_240829.csv +command_r,Helm Lite GSM8K,0.551,[],helm_lite_240829.csv +llama3_1_instruct_turbo_8b,Helm Lite GSM8K,0.798,[],helm_lite_240829.csv +mistral_small_2402,Helm Lite GSM8K,0.734,[],helm_lite_240829.csv +dbrx_instructruct,Helm Lite GSM8K,0.671,[],helm_lite_240829.csv +jamba_instruct,Helm Lite GSM8K,0.67,[],helm_lite_240829.csv +mistral_v0_1_7b,Helm Lite GSM8K,0.377,[],helm_lite_240829.csv +mistral_medium_2312,Helm Lite GSM8K,0.706,[],helm_lite_240829.csv +qwen1_5_7b,Helm Lite GSM8K,0.6,[],helm_lite_240829.csv +claude_3_haiku_20240307,Helm Lite GSM8K,0.699,[],helm_lite_240829.csv +yi_6b,Helm Lite GSM8K,0.375,[],helm_lite_240829.csv +llama_2_13b,Helm Lite GSM8K,0.266,[],helm_lite_240829.csv +jurassic_2_jumbo_178b,Helm Lite GSM8K,0.239,[],helm_lite_240829.csv +falcon_40b,Helm Lite GSM8K,0.267,[],helm_lite_240829.csv +mistral_instruct_v0_3_7b,Helm Lite GSM8K,0.538,[],helm_lite_240829.csv +jurassic_2_grande_17b,Helm Lite GSM8K,0.159,[],helm_lite_240829.csv +phi_2,Helm Lite GSM8K,0.581,[],helm_lite_240829.csv +llama_2_7b,Helm Lite GSM8K,0.154,[],helm_lite_240829.csv +luminous_supreme_70b,Helm Lite GSM8K,0.137,[],helm_lite_240829.csv +command_light,Helm Lite GSM8K,0.149,[],helm_lite_240829.csv +luminous_extended_30b,Helm Lite GSM8K,0.075,[],helm_lite_240829.csv +falcon_7b,Helm Lite GSM8K,0.055,[],helm_lite_240829.csv +olmo_7b,Helm Lite GSM8K,0.044,[],helm_lite_240829.csv +luminous_base_13b,Helm Lite GSM8K,0.028,[],helm_lite_240829.csv +gpt_4o_2024_05_13,Helm Lite LegalBench,0.733,[],helm_lite_240829.csv +claude_3_5_sonnet_20240620,Helm Lite LegalBench,0.707,[],helm_lite_240829.csv +gpt_4_0613,Helm Lite LegalBench,0.713,[],helm_lite_240829.csv +gpt_4_turbo_2024_04_09,Helm Lite LegalBench,0.727,[],helm_lite_240829.csv +llama3_1_instruct_turbo_405b,Helm Lite LegalBench,0.707,[],helm_lite_240829.csv +llama3_1_instruct_turbo_70b,Helm Lite LegalBench,0.687,[],helm_lite_240829.csv +llama3_70b,Helm Lite LegalBench,0.733,[],helm_lite_240829.csv +qwen2_instruct_72b,Helm Lite LegalBench,0.712,[],helm_lite_240829.csv +mistral_large_2_2407,Helm Lite LegalBench,0.646,[],helm_lite_240829.csv +gemini_1_5_pro_001,Helm Lite LegalBench,0.757,[],helm_lite_240829.csv +gpt_4o_mini_2024_07_18,Helm Lite LegalBench,0.653,[],helm_lite_240829.csv +mixtral_8x22b,Helm Lite LegalBench,0.708,[],helm_lite_240829.csv +gpt_4_turbo_1106_preview,Helm Lite LegalBench,0.626,[],helm_lite_240829.csv +palmyra_x_v3_72b,Helm Lite LegalBench,0.709,[],helm_lite_240829.csv +gemma_2_instruct_27b,Helm Lite LegalBench,0.7,[],helm_lite_240829.csv +gemini_1_5_flash_001,Helm Lite LegalBench,0.661,[],helm_lite_240829.csv +claude_3_opus_20240229,Helm Lite LegalBench,0.662,[],helm_lite_240829.csv +palm_2_unicorn,Helm Lite LegalBench,0.677,[],helm_lite_240829.csv +qwen1_5_72b,Helm Lite LegalBench,0.694,[],helm_lite_240829.csv +palmyra_x_v2_33b,Helm Lite LegalBench,0.644,[],helm_lite_240829.csv +gemma_2_instruct_9b,Helm Lite LegalBench,0.639,[],helm_lite_240829.csv +yi_34b,Helm Lite LegalBench,0.618,[],helm_lite_240829.csv +qwen1_5_chat_110b,Helm Lite LegalBench,0.624,[],helm_lite_240829.csv +qwen1_5_32b,Helm Lite LegalBench,0.636,[],helm_lite_240829.csv +claude_v1_3,Helm Lite LegalBench,0.629,[],helm_lite_240829.csv +palm_2_bison,Helm Lite LegalBench,0.645,[],helm_lite_240829.csv +mixtral_8x7b_32k_seqlen,Helm Lite LegalBench,0.63,[],helm_lite_240829.csv +phi_3_14b,Helm Lite LegalBench,0.593,[],helm_lite_240829.csv +claude_2_0,Helm Lite LegalBench,0.643,[],helm_lite_240829.csv +deepseek_llm_chat_67b,Helm Lite LegalBench,0.637,[],helm_lite_240829.csv +phi_3_7b,Helm Lite LegalBench,0.584,[],helm_lite_240829.csv +llama_2_70b,Helm Lite LegalBench,0.673,[],helm_lite_240829.csv +yi_large_preview,Helm Lite LegalBench,0.519,[],helm_lite_240829.csv +command_r_plus,Helm Lite LegalBench,0.672,[],helm_lite_240829.csv +gpt_3_5_text_davinci_003,Helm Lite LegalBench,0.622,[],helm_lite_240829.csv +claude_2_1,Helm Lite LegalBench,0.643,[],helm_lite_240829.csv +qwen1_5_14b,Helm Lite LegalBench,0.593,[],helm_lite_240829.csv +gemini_1_0_pro_002,Helm Lite LegalBench,0.475,[],helm_lite_240829.csv +claude_instant_1_2,Helm Lite LegalBench,0.586,[],helm_lite_240829.csv +llama3_8b,Helm Lite LegalBench,0.637,[],helm_lite_240829.csv +gpt_3_5_turbo_0613,Helm Lite LegalBench,0.528,[],helm_lite_240829.csv +claude_3_sonnet_20240229,Helm Lite LegalBench,0.49,[],helm_lite_240829.csv +mistral_nemo_2402,Helm Lite LegalBench,0.415,[],helm_lite_240829.csv +arctic_instruct,Helm Lite LegalBench,0.588,[],helm_lite_240829.csv +gemma_7b,Helm Lite LegalBench,0.581,[],helm_lite_240829.csv +gpt_3_5_text_davinci_002,Helm Lite LegalBench,0.58,[],helm_lite_240829.csv +llama_65b,Helm Lite LegalBench,0.48,[],helm_lite_240829.csv +mistral_large_2402,Helm Lite LegalBench,0.479,[],helm_lite_240829.csv +command,Helm Lite LegalBench,0.578,[],helm_lite_240829.csv +command_r,Helm Lite LegalBench,0.507,[],helm_lite_240829.csv +llama3_1_instruct_turbo_8b,Helm Lite LegalBench,0.342,[],helm_lite_240829.csv +mistral_small_2402,Helm Lite LegalBench,0.389,[],helm_lite_240829.csv +dbrx_instructruct,Helm Lite LegalBench,0.426,[],helm_lite_240829.csv +jamba_instruct,Helm Lite LegalBench,0.54,[],helm_lite_240829.csv +mistral_v0_1_7b,Helm Lite LegalBench,0.58,[],helm_lite_240829.csv +mistral_medium_2312,Helm Lite LegalBench,0.452,[],helm_lite_240829.csv +qwen1_5_7b,Helm Lite LegalBench,0.523,[],helm_lite_240829.csv +claude_3_haiku_20240307,Helm Lite LegalBench,0.46,[],helm_lite_240829.csv +yi_6b,Helm Lite LegalBench,0.519,[],helm_lite_240829.csv +llama_2_13b,Helm Lite LegalBench,0.591,[],helm_lite_240829.csv +jurassic_2_jumbo_178b,Helm Lite LegalBench,0.533,[],helm_lite_240829.csv +falcon_40b,Helm Lite LegalBench,0.442,[],helm_lite_240829.csv +mistral_instruct_v0_3_7b,Helm Lite LegalBench,0.331,[],helm_lite_240829.csv +jurassic_2_grande_17b,Helm Lite LegalBench,0.468,[],helm_lite_240829.csv +phi_2,Helm Lite LegalBench,0.334,[],helm_lite_240829.csv +llama_2_7b,Helm Lite LegalBench,0.502,[],helm_lite_240829.csv +luminous_supreme_70b,Helm Lite LegalBench,0.452,[],helm_lite_240829.csv +command_light,Helm Lite LegalBench,0.397,[],helm_lite_240829.csv +luminous_extended_30b,Helm Lite LegalBench,0.421,[],helm_lite_240829.csv +falcon_7b,Helm Lite LegalBench,0.346,[],helm_lite_240829.csv +olmo_7b,Helm Lite LegalBench,0.341,[],helm_lite_240829.csv +luminous_base_13b,Helm Lite LegalBench,0.332,[],helm_lite_240829.csv +gpt_4o_2024_05_13,Helm Lite MedQA,0.857,[],helm_lite_240829.csv +claude_3_5_sonnet_20240620,Helm Lite MedQA,0.825,[],helm_lite_240829.csv +gpt_4_0613,Helm Lite MedQA,0.815,[],helm_lite_240829.csv +gpt_4_turbo_2024_04_09,Helm Lite MedQA,0.783,[],helm_lite_240829.csv +llama3_1_instruct_turbo_405b,Helm Lite MedQA,0.805,[],helm_lite_240829.csv +llama3_1_instruct_turbo_70b,Helm Lite MedQA,0.769,[],helm_lite_240829.csv +llama3_70b,Helm Lite MedQA,0.777,[],helm_lite_240829.csv +qwen2_instruct_72b,Helm Lite MedQA,0.746,[],helm_lite_240829.csv +mistral_large_2_2407,Helm Lite MedQA,0.775,[],helm_lite_240829.csv +gemini_1_5_pro_001,Helm Lite MedQA,0.692,[],helm_lite_240829.csv +gpt_4o_mini_2024_07_18,Helm Lite MedQA,0.748,[],helm_lite_240829.csv +mixtral_8x22b,Helm Lite MedQA,0.704,[],helm_lite_240829.csv +gpt_4_turbo_1106_preview,Helm Lite MedQA,0.817,[],helm_lite_240829.csv +palmyra_x_v3_72b,Helm Lite MedQA,0.684,[],helm_lite_240829.csv +gemma_2_instruct_27b,Helm Lite MedQA,0.684,[],helm_lite_240829.csv +gemini_1_5_flash_001,Helm Lite MedQA,0.68,[],helm_lite_240829.csv +claude_3_opus_20240229,Helm Lite MedQA,0.775,[],helm_lite_240829.csv +palm_2_unicorn,Helm Lite MedQA,0.684,[],helm_lite_240829.csv +qwen1_5_72b,Helm Lite MedQA,0.67,[],helm_lite_240829.csv +palmyra_x_v2_33b,Helm Lite MedQA,0.598,[],helm_lite_240829.csv +gemma_2_instruct_9b,Helm Lite MedQA,0.63,[],helm_lite_240829.csv +yi_34b,Helm Lite MedQA,0.656,[],helm_lite_240829.csv +qwen1_5_chat_110b,Helm Lite MedQA,0.64,[],helm_lite_240829.csv +qwen1_5_32b,Helm Lite MedQA,0.656,[],helm_lite_240829.csv +claude_v1_3,Helm Lite MedQA,0.618,[],helm_lite_240829.csv +palm_2_bison,Helm Lite MedQA,0.547,[],helm_lite_240829.csv +mixtral_8x7b_32k_seqlen,Helm Lite MedQA,0.652,[],helm_lite_240829.csv +phi_3_14b,Helm Lite MedQA,0.696,[],helm_lite_240829.csv +claude_2_0,Helm Lite MedQA,0.652,[],helm_lite_240829.csv +deepseek_llm_chat_67b,Helm Lite MedQA,0.628,[],helm_lite_240829.csv +phi_3_7b,Helm Lite MedQA,0.672,[],helm_lite_240829.csv +llama_2_70b,Helm Lite MedQA,0.618,[],helm_lite_240829.csv +yi_large_preview,Helm Lite MedQA,0.66,[],helm_lite_240829.csv +command_r_plus,Helm Lite MedQA,0.567,[],helm_lite_240829.csv +gpt_3_5_text_davinci_003,Helm Lite MedQA,0.531,[],helm_lite_240829.csv +claude_2_1,Helm Lite MedQA,0.644,[],helm_lite_240829.csv +qwen1_5_14b,Helm Lite MedQA,0.515,[],helm_lite_240829.csv +gemini_1_0_pro_002,Helm Lite MedQA,0.483,[],helm_lite_240829.csv +claude_instant_1_2,Helm Lite MedQA,0.559,[],helm_lite_240829.csv +llama3_8b,Helm Lite MedQA,0.581,[],helm_lite_240829.csv +gpt_3_5_turbo_0613,Helm Lite MedQA,0.622,[],helm_lite_240829.csv +claude_3_sonnet_20240229,Helm Lite MedQA,0.684,[],helm_lite_240829.csv +mistral_nemo_2402,Helm Lite MedQA,0.59,[],helm_lite_240829.csv +arctic_instruct,Helm Lite MedQA,0.581,[],helm_lite_240829.csv +gemma_7b,Helm Lite MedQA,0.513,[],helm_lite_240829.csv +gpt_3_5_text_davinci_002,Helm Lite MedQA,0.525,[],helm_lite_240829.csv +llama_65b,Helm Lite MedQA,0.507,[],helm_lite_240829.csv +mistral_large_2402,Helm Lite MedQA,0.499,[],helm_lite_240829.csv +command,Helm Lite MedQA,0.445,[],helm_lite_240829.csv +command_r,Helm Lite MedQA,0.555,[],helm_lite_240829.csv +llama3_1_instruct_turbo_8b,Helm Lite MedQA,0.245,[],helm_lite_240829.csv +mistral_small_2402,Helm Lite MedQA,0.616,[],helm_lite_240829.csv +dbrx_instructruct,Helm Lite MedQA,0.694,[],helm_lite_240829.csv +jamba_instruct,Helm Lite MedQA,0.519,[],helm_lite_240829.csv +mistral_v0_1_7b,Helm Lite MedQA,0.525,[],helm_lite_240829.csv +mistral_medium_2312,Helm Lite MedQA,0.61,[],helm_lite_240829.csv +qwen1_5_7b,Helm Lite MedQA,0.479,[],helm_lite_240829.csv +claude_3_haiku_20240307,Helm Lite MedQA,0.702,[],helm_lite_240829.csv +yi_6b,Helm Lite MedQA,0.497,[],helm_lite_240829.csv +llama_2_13b,Helm Lite MedQA,0.392,[],helm_lite_240829.csv +jurassic_2_jumbo_178b,Helm Lite MedQA,0.431,[],helm_lite_240829.csv +falcon_40b,Helm Lite MedQA,0.419,[],helm_lite_240829.csv +mistral_instruct_v0_3_7b,Helm Lite MedQA,0.517,[],helm_lite_240829.csv +jurassic_2_grande_17b,Helm Lite MedQA,0.39,[],helm_lite_240829.csv +phi_2,Helm Lite MedQA,0.41,[],helm_lite_240829.csv +llama_2_7b,Helm Lite MedQA,0.392,[],helm_lite_240829.csv +luminous_supreme_70b,Helm Lite MedQA,0.276,[],helm_lite_240829.csv +command_light,Helm Lite MedQA,0.312,[],helm_lite_240829.csv +luminous_extended_30b,Helm Lite MedQA,0.276,[],helm_lite_240829.csv +falcon_7b,Helm Lite MedQA,0.254,[],helm_lite_240829.csv +olmo_7b,Helm Lite MedQA,0.229,[],helm_lite_240829.csv +luminous_base_13b,Helm Lite MedQA,0.26,[],helm_lite_240829.csv +gpt_4o_2024_05_13,Helm Lite WMT2014,0.231,[],helm_lite_240829.csv +claude_3_5_sonnet_20240620,Helm Lite WMT2014,0.229,[],helm_lite_240829.csv +gpt_4_0613,Helm Lite WMT2014,0.211,[],helm_lite_240829.csv +gpt_4_turbo_2024_04_09,Helm Lite WMT2014,0.218,[],helm_lite_240829.csv +llama3_1_instruct_turbo_405b,Helm Lite WMT2014,0.238,[],helm_lite_240829.csv +llama3_1_instruct_turbo_70b,Helm Lite WMT2014,0.223,[],helm_lite_240829.csv +llama3_70b,Helm Lite WMT2014,0.225,[],helm_lite_240829.csv +qwen2_instruct_72b,Helm Lite WMT2014,0.207,[],helm_lite_240829.csv +mistral_large_2_2407,Helm Lite WMT2014,0.192,[],helm_lite_240829.csv +gemini_1_5_pro_001,Helm Lite WMT2014,0.189,[],helm_lite_240829.csv +gpt_4o_mini_2024_07_18,Helm Lite WMT2014,0.206,[],helm_lite_240829.csv +mixtral_8x22b,Helm Lite WMT2014,0.209,[],helm_lite_240829.csv +gpt_4_turbo_1106_preview,Helm Lite WMT2014,0.205,[],helm_lite_240829.csv +palmyra_x_v3_72b,Helm Lite WMT2014,0.262,[],helm_lite_240829.csv +gemma_2_instruct_27b,Helm Lite WMT2014,0.214,[],helm_lite_240829.csv +gemini_1_5_flash_001,Helm Lite WMT2014,0.225,[],helm_lite_240829.csv +claude_3_opus_20240229,Helm Lite WMT2014,0.24,[],helm_lite_240829.csv +palm_2_unicorn,Helm Lite WMT2014,0.26,[],helm_lite_240829.csv +qwen1_5_72b,Helm Lite WMT2014,0.201,[],helm_lite_240829.csv +palmyra_x_v2_33b,Helm Lite WMT2014,0.239,[],helm_lite_240829.csv +gemma_2_instruct_9b,Helm Lite WMT2014,0.201,[],helm_lite_240829.csv +yi_34b,Helm Lite WMT2014,0.172,[],helm_lite_240829.csv +qwen1_5_chat_110b,Helm Lite WMT2014,0.192,[],helm_lite_240829.csv +qwen1_5_32b,Helm Lite WMT2014,0.193,[],helm_lite_240829.csv +claude_v1_3,Helm Lite WMT2014,0.219,[],helm_lite_240829.csv +palm_2_bison,Helm Lite WMT2014,0.241,[],helm_lite_240829.csv +mixtral_8x7b_32k_seqlen,Helm Lite WMT2014,0.19,[],helm_lite_240829.csv +phi_3_14b,Helm Lite WMT2014,0.17,[],helm_lite_240829.csv +claude_2_0,Helm Lite WMT2014,0.219,[],helm_lite_240829.csv +deepseek_llm_chat_67b,Helm Lite WMT2014,0.186,[],helm_lite_240829.csv +phi_3_7b,Helm Lite WMT2014,0.154,[],helm_lite_240829.csv +llama_2_70b,Helm Lite WMT2014,0.196,[],helm_lite_240829.csv +yi_large_preview,Helm Lite WMT2014,0.176,[],helm_lite_240829.csv +command_r_plus,Helm Lite WMT2014,0.203,[],helm_lite_240829.csv +gpt_3_5_text_davinci_003,Helm Lite WMT2014,0.191,[],helm_lite_240829.csv +claude_2_1,Helm Lite WMT2014,0.204,[],helm_lite_240829.csv +qwen1_5_14b,Helm Lite WMT2014,0.178,[],helm_lite_240829.csv +gemini_1_0_pro_002,Helm Lite WMT2014,0.194,[],helm_lite_240829.csv +claude_instant_1_2,Helm Lite WMT2014,0.194,[],helm_lite_240829.csv +llama3_8b,Helm Lite WMT2014,0.183,[],helm_lite_240829.csv +gpt_3_5_turbo_0613,Helm Lite WMT2014,0.187,[],helm_lite_240829.csv +claude_3_sonnet_20240229,Helm Lite WMT2014,0.218,[],helm_lite_240829.csv +mistral_nemo_2402,Helm Lite WMT2014,0.177,[],helm_lite_240829.csv +arctic_instruct,Helm Lite WMT2014,0.172,[],helm_lite_240829.csv +gemma_7b,Helm Lite WMT2014,0.187,[],helm_lite_240829.csv +gpt_3_5_text_davinci_002,Helm Lite WMT2014,0.174,[],helm_lite_240829.csv +llama_65b,Helm Lite WMT2014,0.189,[],helm_lite_240829.csv +mistral_large_2402,Helm Lite WMT2014,0.182,[],helm_lite_240829.csv +command,Helm Lite WMT2014,0.088,[],helm_lite_240829.csv +command_r,Helm Lite WMT2014,0.149,[],helm_lite_240829.csv +llama3_1_instruct_turbo_8b,Helm Lite WMT2014,0.181,[],helm_lite_240829.csv +mistral_small_2402,Helm Lite WMT2014,0.169,[],helm_lite_240829.csv +dbrx_instructruct,Helm Lite WMT2014,0.131,[],helm_lite_240829.csv +jamba_instruct,Helm Lite WMT2014,0.164,[],helm_lite_240829.csv +mistral_v0_1_7b,Helm Lite WMT2014,0.16,[],helm_lite_240829.csv +mistral_medium_2312,Helm Lite WMT2014,0.169,[],helm_lite_240829.csv +qwen1_5_7b,Helm Lite WMT2014,0.153,[],helm_lite_240829.csv +claude_3_haiku_20240307,Helm Lite WMT2014,0.148,[],helm_lite_240829.csv +yi_6b,Helm Lite WMT2014,0.117,[],helm_lite_240829.csv +llama_2_13b,Helm Lite WMT2014,0.167,[],helm_lite_240829.csv +jurassic_2_jumbo_178b,Helm Lite WMT2014,0.114,[],helm_lite_240829.csv +falcon_40b,Helm Lite WMT2014,0.162,[],helm_lite_240829.csv +mistral_instruct_v0_3_7b,Helm Lite WMT2014,0.142,[],helm_lite_240829.csv +jurassic_2_grande_17b,Helm Lite WMT2014,0.102,[],helm_lite_240829.csv +phi_2,Helm Lite WMT2014,0.038,[],helm_lite_240829.csv +llama_2_7b,Helm Lite WMT2014,0.144,[],helm_lite_240829.csv +luminous_supreme_70b,Helm Lite WMT2014,0.102,[],helm_lite_240829.csv +command_light,Helm Lite WMT2014,0.023,[],helm_lite_240829.csv +luminous_extended_30b,Helm Lite WMT2014,0.083,[],helm_lite_240829.csv +falcon_7b,Helm Lite WMT2014,0.094,[],helm_lite_240829.csv +olmo_7b,Helm Lite WMT2014,0.097,[],helm_lite_240829.csv +luminous_base_13b,Helm Lite WMT2014,0.066,[],helm_lite_240829.csv +alphamonarch_7b,HF OpenLLM v2,17.59,,hf_open_llm_v2_240829.csv +alphamonarch_7b,HFv2 BBH,23.95,,hf_open_llm_v2_240829.csv +alphamonarch_7b,HFv2 GPQA,2.68,,hf_open_llm_v2_240829.csv +alphamonarch_7b,HFv2 IFEval,49.39,,hf_open_llm_v2_240829.csv +alphamonarch_7b,HFv2 MMLU Pro,16.36,,hf_open_llm_v2_240829.csv +alphamonarch_7b,HFv2 Math Level 5,3.85,,hf_open_llm_v2_240829.csv +alphamonarch_7b,HFv2 MuSR,9.32,,hf_open_llm_v2_240829.csv +arcee_spark,HF OpenLLM v2,25.33,,hf_open_llm_v2_240829.csv +arcee_spark,HFv2 BBH,36.92,,hf_open_llm_v2_240829.csv +arcee_spark,HFv2 GPQA,7.49,,hf_open_llm_v2_240829.csv +arcee_spark,HFv2 IFEval,57.18,,hf_open_llm_v2_240829.csv +arcee_spark,HFv2 MMLU Pro,31.26,,hf_open_llm_v2_240829.csv +arcee_spark,HFv2 Math Level 5,10.73,,hf_open_llm_v2_240829.csv +arcee_spark,HFv2 MuSR,8.4,,hf_open_llm_v2_240829.csv +autotrain_llama3_orpo_v2,HF OpenLLM v2,12.2,,hf_open_llm_v2_240829.csv +autotrain_llama3_orpo_v2,HFv2 BBH,4.38,,hf_open_llm_v2_240829.csv +autotrain_llama3_orpo_v2,HFv2 GPQA,2.24,,hf_open_llm_v2_240829.csv +autotrain_llama3_orpo_v2,HFv2 IFEval,43.72,,hf_open_llm_v2_240829.csv +autotrain_llama3_orpo_v2,HFv2 MMLU Pro,13.54,,hf_open_llm_v2_240829.csv +autotrain_llama3_orpo_v2,HFv2 Math Level 5,4.23,,hf_open_llm_v2_240829.csv +autotrain_llama3_orpo_v2,HFv2 MuSR,5.1,,hf_open_llm_v2_240829.csv +aya_23_35b,HF OpenLLM v2,24.62,,hf_open_llm_v2_240829.csv +aya_23_35b,HFv2 BBH,34.86,,hf_open_llm_v2_240829.csv +aya_23_35b,HFv2 GPQA,5.93,,hf_open_llm_v2_240829.csv +aya_23_35b,HFv2 IFEval,64.62,,hf_open_llm_v2_240829.csv +aya_23_35b,HFv2 MMLU Pro,26.18,,hf_open_llm_v2_240829.csv +aya_23_35b,HFv2 Math Level 5,2.64,,hf_open_llm_v2_240829.csv +aya_23_35b,HFv2 MuSR,13.47,,hf_open_llm_v2_240829.csv +aya_23_8b,HF OpenLLM v2,15.97,,hf_open_llm_v2_240829.csv +aya_23_8b,HFv2 BBH,20.2,,hf_open_llm_v2_240829.csv +aya_23_8b,HFv2 GPQA,4.59,,hf_open_llm_v2_240829.csv +aya_23_8b,HFv2 IFEval,46.99,,hf_open_llm_v2_240829.csv +aya_23_8b,HFv2 MMLU Pro,14.2,,hf_open_llm_v2_240829.csv +aya_23_8b,HFv2 Math Level 5,1.44,,hf_open_llm_v2_240829.csv +aya_23_8b,HFv2 MuSR,8.42,,hf_open_llm_v2_240829.csv +bagelmisterytour_v2_8x7b,HF OpenLLM v2,24.55,,hf_open_llm_v2_240829.csv +bagelmisterytour_v2_8x7b,HFv2 BBH,31.37,,hf_open_llm_v2_240829.csv +bagelmisterytour_v2_8x7b,HFv2 GPQA,7.72,,hf_open_llm_v2_240829.csv +bagelmisterytour_v2_8x7b,HFv2 IFEval,62.62,,hf_open_llm_v2_240829.csv +bagelmisterytour_v2_8x7b,HFv2 MMLU Pro,27.56,,hf_open_llm_v2_240829.csv +bagelmisterytour_v2_8x7b,HFv2 Math Level 5,7.7,,hf_open_llm_v2_240829.csv +bagelmisterytour_v2_8x7b,HFv2 MuSR,10.32,,hf_open_llm_v2_240829.csv +barcenas_14b_phi_3_medium_orpo,HF OpenLLM v2,31.42,,hf_open_llm_v2_240829.csv +barcenas_14b_phi_3_medium_orpo,HFv2 BBH,51.03,,hf_open_llm_v2_240829.csv +barcenas_14b_phi_3_medium_orpo,HFv2 GPQA,10.18,,hf_open_llm_v2_240829.csv +barcenas_14b_phi_3_medium_orpo,HFv2 IFEval,47.99,,hf_open_llm_v2_240829.csv +barcenas_14b_phi_3_medium_orpo,HFv2 MMLU Pro,41.37,,hf_open_llm_v2_240829.csv +barcenas_14b_phi_3_medium_orpo,HFv2 Math Level 5,17.45,,hf_open_llm_v2_240829.csv +barcenas_14b_phi_3_medium_orpo,HFv2 MuSR,20.53,,hf_open_llm_v2_240829.csv +barcenas_llama3_8b_orpo,HF OpenLLM v2,26.38,,hf_open_llm_v2_240829.csv +barcenas_llama3_8b_orpo,HFv2 BBH,28.6,,hf_open_llm_v2_240829.csv +barcenas_llama3_8b_orpo,HFv2 GPQA,7.61,,hf_open_llm_v2_240829.csv +barcenas_llama3_8b_orpo,HFv2 IFEval,73.72,,hf_open_llm_v2_240829.csv +barcenas_llama3_8b_orpo,HFv2 MMLU Pro,31.44,,hf_open_llm_v2_240829.csv +barcenas_llama3_8b_orpo,HFv2 Math Level 5,5.74,,hf_open_llm_v2_240829.csv +barcenas_llama3_8b_orpo,HFv2 MuSR,11.17,,hf_open_llm_v2_240829.csv +bloom_1b1,HF OpenLLM v2,3.96,,hf_open_llm_v2_240829.csv +bloom_1b1,HFv2 BBH,4.04,,hf_open_llm_v2_240829.csv +bloom_1b1,HFv2 GPQA,1.23,,hf_open_llm_v2_240829.csv +bloom_1b1,HFv2 IFEval,13.73,,hf_open_llm_v2_240829.csv +bloom_1b1,HFv2 MMLU Pro,1.2,,hf_open_llm_v2_240829.csv +bloom_1b1,HFv2 Math Level 5,0.15,,hf_open_llm_v2_240829.csv +bloom_1b1,HFv2 MuSR,3.42,,hf_open_llm_v2_240829.csv +bloom_1b7,HF OpenLLM v2,3.97,,hf_open_llm_v2_240829.csv +bloom_1b7,HFv2 BBH,4.4,,hf_open_llm_v2_240829.csv +bloom_1b7,HFv2 GPQA,1.12,,hf_open_llm_v2_240829.csv +bloom_1b7,HFv2 IFEval,10.44,,hf_open_llm_v2_240829.csv +bloom_1b7,HFv2 MMLU Pro,0.96,,hf_open_llm_v2_240829.csv +bloom_1b7,HFv2 Math Level 5,0.08,,hf_open_llm_v2_240829.csv +bloom_1b7,HFv2 MuSR,6.84,,hf_open_llm_v2_240829.csv +bloom_3b,HF OpenLLM v2,4.26,,hf_open_llm_v2_240829.csv +bloom_3b,HFv2 BBH,3.42,,hf_open_llm_v2_240829.csv +bloom_3b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +bloom_3b,HFv2 IFEval,12.71,,hf_open_llm_v2_240829.csv +bloom_3b,HFv2 MMLU Pro,1.48,,hf_open_llm_v2_240829.csv +bloom_3b,HFv2 Math Level 5,0.08,,hf_open_llm_v2_240829.csv +bloom_3b,HFv2 MuSR,7.89,,hf_open_llm_v2_240829.csv +bloom_560m,HF OpenLLM v2,3.46,,hf_open_llm_v2_240829.csv +bloom_560m,HFv2 BBH,2.89,,hf_open_llm_v2_240829.csv +bloom_560m,HFv2 GPQA,1.57,,hf_open_llm_v2_240829.csv +bloom_560m,HFv2 IFEval,6.2,,hf_open_llm_v2_240829.csv +bloom_560m,HFv2 MMLU Pro,1.83,,hf_open_llm_v2_240829.csv +bloom_560m,HFv2 Math Level 5,0.08,,hf_open_llm_v2_240829.csv +bloom_560m,HFv2 MuSR,8.19,,hf_open_llm_v2_240829.csv +bloom_7b1,HF OpenLLM v2,3.71,,hf_open_llm_v2_240829.csv +bloom_7b1,HFv2 BBH,4.04,,hf_open_llm_v2_240829.csv +bloom_7b1,HFv2 GPQA,1.9,,hf_open_llm_v2_240829.csv +bloom_7b1,HFv2 IFEval,13.22,,hf_open_llm_v2_240829.csv +bloom_7b1,HFv2 MMLU Pro,1.16,,hf_open_llm_v2_240829.csv +bloom_7b1,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +bloom_7b1,HFv2 MuSR,1.92,,hf_open_llm_v2_240829.csv +blossom_v5_1_34b,HF OpenLLM v2,28.39,,hf_open_llm_v2_240829.csv +blossom_v5_1_34b,HFv2 BBH,44.15,,hf_open_llm_v2_240829.csv +blossom_v5_1_34b,HFv2 GPQA,7.94,,hf_open_llm_v2_240829.csv +blossom_v5_1_34b,HFv2 IFEval,56.97,,hf_open_llm_v2_240829.csv +blossom_v5_1_34b,HFv2 MMLU Pro,39.53,,hf_open_llm_v2_240829.csv +blossom_v5_1_34b,HFv2 Math Level 5,14.43,,hf_open_llm_v2_240829.csv +blossom_v5_1_34b,HFv2 MuSR,7.3,,hf_open_llm_v2_240829.csv +blossom_v5_1_9b,HF OpenLLM v2,24.68,,hf_open_llm_v2_240829.csv +blossom_v5_1_9b,HFv2 BBH,34.2,,hf_open_llm_v2_240829.csv +blossom_v5_1_9b,HFv2 GPQA,11.41,,hf_open_llm_v2_240829.csv +blossom_v5_1_9b,HFv2 IFEval,50.86,,hf_open_llm_v2_240829.csv +blossom_v5_1_9b,HFv2 MMLU Pro,33.1,,hf_open_llm_v2_240829.csv +blossom_v5_1_9b,HFv2 Math Level 5,10.5,,hf_open_llm_v2_240829.csv +blossom_v5_1_9b,HFv2 MuSR,8.02,,hf_open_llm_v2_240829.csv +btlm_7b_base_v0_2,HF OpenLLM v2,8.84,,hf_open_llm_v2_240829.csv +btlm_7b_base_v0_2,HFv2 BBH,16.19,,hf_open_llm_v2_240829.csv +btlm_7b_base_v0_2,HFv2 GPQA,0.45,,hf_open_llm_v2_240829.csv +btlm_7b_base_v0_2,HFv2 IFEval,14.83,,hf_open_llm_v2_240829.csv +btlm_7b_base_v0_2,HFv2 MMLU Pro,15.0,,hf_open_llm_v2_240829.csv +btlm_7b_base_v0_2,HFv2 Math Level 5,1.06,,hf_open_llm_v2_240829.csv +btlm_7b_base_v0_2,HFv2 MuSR,5.54,,hf_open_llm_v2_240829.csv +c4ai_command_r_plus,HF OpenLLM v2,30.86,,hf_open_llm_v2_240829.csv +c4ai_command_r_plus,HFv2 BBH,39.92,,hf_open_llm_v2_240829.csv +c4ai_command_r_plus,HFv2 GPQA,7.38,,hf_open_llm_v2_240829.csv +c4ai_command_r_plus,HFv2 IFEval,76.64,,hf_open_llm_v2_240829.csv +c4ai_command_r_plus,HFv2 MMLU Pro,33.24,,hf_open_llm_v2_240829.csv +c4ai_command_r_plus,HFv2 Math Level 5,7.55,,hf_open_llm_v2_240829.csv +c4ai_command_r_plus,HFv2 MuSR,20.42,,hf_open_llm_v2_240829.csv +c4ai_command_r_v0_1,HF OpenLLM v2,25.35,,hf_open_llm_v2_240829.csv +c4ai_command_r_v0_1,HFv2 BBH,34.56,,hf_open_llm_v2_240829.csv +c4ai_command_r_v0_1,HFv2 GPQA,7.61,,hf_open_llm_v2_240829.csv +c4ai_command_r_v0_1,HFv2 IFEval,67.48,,hf_open_llm_v2_240829.csv +c4ai_command_r_v0_1,HFv2 MMLU Pro,26.33,,hf_open_llm_v2_240829.csv +c4ai_command_r_v0_1,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +c4ai_command_r_v0_1,HFv2 MuSR,16.13,,hf_open_llm_v2_240829.csv +calm3_22b_chat,HF OpenLLM v2,21.27,,hf_open_llm_v2_240829.csv +calm3_22b_chat,HFv2 BBH,29.52,,hf_open_llm_v2_240829.csv +calm3_22b_chat,HFv2 GPQA,3.58,,hf_open_llm_v2_240829.csv +calm3_22b_chat,HFv2 IFEval,50.91,,hf_open_llm_v2_240829.csv +calm3_22b_chat,HFv2 MMLU Pro,21.66,,hf_open_llm_v2_240829.csv +calm3_22b_chat,HFv2 Math Level 5,5.89,,hf_open_llm_v2_240829.csv +calm3_22b_chat,HFv2 MuSR,16.08,,hf_open_llm_v2_240829.csv +calme_2_1_phi3_4b,HF OpenLLM v2,24.51,,hf_open_llm_v2_240829.csv +calme_2_1_phi3_4b,HFv2 BBH,38.12,,hf_open_llm_v2_240829.csv +calme_2_1_phi3_4b,HFv2 GPQA,10.63,,hf_open_llm_v2_240829.csv +calme_2_1_phi3_4b,HFv2 IFEval,55.25,,hf_open_llm_v2_240829.csv +calme_2_1_phi3_4b,HFv2 MMLU Pro,30.51,,hf_open_llm_v2_240829.csv +calme_2_1_phi3_4b,HFv2 Math Level 5,4.31,,hf_open_llm_v2_240829.csv +calme_2_1_phi3_4b,HFv2 MuSR,8.26,,hf_open_llm_v2_240829.csv +calme_2_1_qwen2_72b,HF OpenLLM v2,43.61,,hf_open_llm_v2_240829.csv +calme_2_1_qwen2_72b,HFv2 BBH,57.33,,hf_open_llm_v2_240829.csv +calme_2_1_qwen2_72b,HFv2 GPQA,17.45,,hf_open_llm_v2_240829.csv +calme_2_1_qwen2_72b,HFv2 IFEval,81.63,,hf_open_llm_v2_240829.csv +calme_2_1_qwen2_72b,HFv2 MMLU Pro,49.05,,hf_open_llm_v2_240829.csv +calme_2_1_qwen2_72b,HFv2 Math Level 5,36.03,,hf_open_llm_v2_240829.csv +calme_2_1_qwen2_72b,HFv2 MuSR,20.15,,hf_open_llm_v2_240829.csv +calme_2_2_llama3_70b,HF OpenLLM v2,37.98,,hf_open_llm_v2_240829.csv +calme_2_2_llama3_70b,HFv2 BBH,48.57,,hf_open_llm_v2_240829.csv +calme_2_2_llama3_70b,HFv2 GPQA,12.19,,hf_open_llm_v2_240829.csv +calme_2_2_llama3_70b,HFv2 IFEval,82.08,,hf_open_llm_v2_240829.csv +calme_2_2_llama3_70b,HFv2 MMLU Pro,46.74,,hf_open_llm_v2_240829.csv +calme_2_2_llama3_70b,HFv2 Math Level 5,22.96,,hf_open_llm_v2_240829.csv +calme_2_2_llama3_70b,HFv2 MuSR,15.3,,hf_open_llm_v2_240829.csv +calme_2_2_phi3_4b,HF OpenLLM v2,23.21,,hf_open_llm_v2_240829.csv +calme_2_2_phi3_4b,HFv2 BBH,37.73,,hf_open_llm_v2_240829.csv +calme_2_2_phi3_4b,HFv2 GPQA,9.51,,hf_open_llm_v2_240829.csv +calme_2_2_phi3_4b,HFv2 IFEval,50.69,,hf_open_llm_v2_240829.csv +calme_2_2_phi3_4b,HFv2 MMLU Pro,31.27,,hf_open_llm_v2_240829.csv +calme_2_2_phi3_4b,HFv2 Math Level 5,2.34,,hf_open_llm_v2_240829.csv +calme_2_2_phi3_4b,HFv2 MuSR,7.7,,hf_open_llm_v2_240829.csv +calme_2_2_qwen2_72b,HF OpenLLM v2,43.4,,hf_open_llm_v2_240829.csv +calme_2_2_qwen2_72b,HFv2 BBH,56.8,,hf_open_llm_v2_240829.csv +calme_2_2_qwen2_72b,HFv2 GPQA,16.55,,hf_open_llm_v2_240829.csv +calme_2_2_qwen2_72b,HFv2 IFEval,80.08,,hf_open_llm_v2_240829.csv +calme_2_2_qwen2_72b,HFv2 MMLU Pro,49.27,,hf_open_llm_v2_240829.csv +calme_2_2_qwen2_72b,HFv2 Math Level 5,41.16,,hf_open_llm_v2_240829.csv +calme_2_2_qwen2_72b,HFv2 MuSR,16.52,,hf_open_llm_v2_240829.csv +calme_2_3_phi3_4b,HF OpenLLM v2,23.02,,hf_open_llm_v2_240829.csv +calme_2_3_phi3_4b,HFv2 BBH,37.66,,hf_open_llm_v2_240829.csv +calme_2_3_phi3_4b,HFv2 GPQA,9.06,,hf_open_llm_v2_240829.csv +calme_2_3_phi3_4b,HFv2 IFEval,49.26,,hf_open_llm_v2_240829.csv +calme_2_3_phi3_4b,HFv2 MMLU Pro,31.42,,hf_open_llm_v2_240829.csv +calme_2_3_phi3_4b,HFv2 Math Level 5,2.95,,hf_open_llm_v2_240829.csv +calme_2_3_phi3_4b,HFv2 MuSR,7.75,,hf_open_llm_v2_240829.csv +calme_2_4_llama3_70b,HF OpenLLM v2,32.18,,hf_open_llm_v2_240829.csv +calme_2_4_llama3_70b,HFv2 BBH,48.4,,hf_open_llm_v2_240829.csv +calme_2_4_llama3_70b,HFv2 GPQA,11.97,,hf_open_llm_v2_240829.csv +calme_2_4_llama3_70b,HFv2 IFEval,50.27,,hf_open_llm_v2_240829.csv +calme_2_4_llama3_70b,HFv2 MMLU Pro,46.71,,hf_open_llm_v2_240829.csv +calme_2_4_llama3_70b,HFv2 Math Level 5,22.66,,hf_open_llm_v2_240829.csv +calme_2_4_llama3_70b,HFv2 MuSR,13.1,,hf_open_llm_v2_240829.csv +carbonbeagle_11b,HF OpenLLM v2,22.36,,hf_open_llm_v2_240829.csv +carbonbeagle_11b,HFv2 BBH,33.06,,hf_open_llm_v2_240829.csv +carbonbeagle_11b,HFv2 GPQA,6.94,,hf_open_llm_v2_240829.csv +carbonbeagle_11b,HFv2 IFEval,54.15,,hf_open_llm_v2_240829.csv +carbonbeagle_11b,HFv2 MMLU Pro,25.29,,hf_open_llm_v2_240829.csv +carbonbeagle_11b,HFv2 Math Level 5,5.51,,hf_open_llm_v2_240829.csv +carbonbeagle_11b,HFv2 MuSR,9.19,,hf_open_llm_v2_240829.csv +carbonbeagle_11b_truthy,HF OpenLLM v2,21.29,,hf_open_llm_v2_240829.csv +carbonbeagle_11b_truthy,HFv2 BBH,33.99,,hf_open_llm_v2_240829.csv +carbonbeagle_11b_truthy,HFv2 GPQA,6.6,,hf_open_llm_v2_240829.csv +carbonbeagle_11b_truthy,HFv2 IFEval,52.12,,hf_open_llm_v2_240829.csv +carbonbeagle_11b_truthy,HFv2 MMLU Pro,26.19,,hf_open_llm_v2_240829.csv +carbonbeagle_11b_truthy,HFv2 Math Level 5,4.76,,hf_open_llm_v2_240829.csv +carbonbeagle_11b_truthy,HFv2 MuSR,4.11,,hf_open_llm_v2_240829.csv +chocolatine_3b_instruct_dpo_revised,HF OpenLLM v2,27.63,,hf_open_llm_v2_240829.csv +chocolatine_3b_instruct_dpo_revised,HFv2 BBH,37.16,,hf_open_llm_v2_240829.csv +chocolatine_3b_instruct_dpo_revised,HFv2 GPQA,9.62,,hf_open_llm_v2_240829.csv +chocolatine_3b_instruct_dpo_revised,HFv2 IFEval,56.23,,hf_open_llm_v2_240829.csv +chocolatine_3b_instruct_dpo_revised,HFv2 MMLU Pro,33.21,,hf_open_llm_v2_240829.csv +chocolatine_3b_instruct_dpo_revised,HFv2 Math Level 5,14.5,,hf_open_llm_v2_240829.csv +chocolatine_3b_instruct_dpo_revised,HFv2 MuSR,15.1,,hf_open_llm_v2_240829.csv +chocolatine_8b_instruct_dpo_v1_0,HF OpenLLM v2,22.03,,hf_open_llm_v2_240829.csv +chocolatine_8b_instruct_dpo_v1_0,HFv2 BBH,29.96,,hf_open_llm_v2_240829.csv +chocolatine_8b_instruct_dpo_v1_0,HFv2 GPQA,6.71,,hf_open_llm_v2_240829.csv +chocolatine_8b_instruct_dpo_v1_0,HFv2 IFEval,47.33,,hf_open_llm_v2_240829.csv +chocolatine_8b_instruct_dpo_v1_0,HFv2 MMLU Pro,31.88,,hf_open_llm_v2_240829.csv +chocolatine_8b_instruct_dpo_v1_0,HFv2 Math Level 5,7.55,,hf_open_llm_v2_240829.csv +chocolatine_8b_instruct_dpo_v1_0,HFv2 MuSR,8.74,,hf_open_llm_v2_240829.csv +codegemma_1_1_2b,HF OpenLLM v2,7.02,,hf_open_llm_v2_240829.csv +codegemma_1_1_2b,HFv2 BBH,7.55,,hf_open_llm_v2_240829.csv +codegemma_1_1_2b,HFv2 GPQA,2.01,,hf_open_llm_v2_240829.csv +codegemma_1_1_2b,HFv2 IFEval,22.94,,hf_open_llm_v2_240829.csv +codegemma_1_1_2b,HFv2 MMLU Pro,3.09,,hf_open_llm_v2_240829.csv +codegemma_1_1_2b,HFv2 Math Level 5,0.6,,hf_open_llm_v2_240829.csv +codegemma_1_1_2b,HFv2 MuSR,5.93,,hf_open_llm_v2_240829.csv +configurable_llama3_1_8b_instruct,HF OpenLLM v2,27.77,,hf_open_llm_v2_240829.csv +configurable_llama3_1_8b_instruct,HFv2 BBH,29.66,,hf_open_llm_v2_240829.csv +configurable_llama3_1_8b_instruct,HFv2 GPQA,3.24,,hf_open_llm_v2_240829.csv +configurable_llama3_1_8b_instruct,HFv2 IFEval,83.12,,hf_open_llm_v2_240829.csv +configurable_llama3_1_8b_instruct,HFv2 MMLU Pro,28.8,,hf_open_llm_v2_240829.csv +configurable_llama3_1_8b_instruct,HFv2 Math Level 5,15.86,,hf_open_llm_v2_240829.csv +configurable_llama3_1_8b_instruct,HFv2 MuSR,5.93,,hf_open_llm_v2_240829.csv +configurable_yi_1_5_9b_chat,HF OpenLLM v2,23.77,,hf_open_llm_v2_240829.csv +configurable_yi_1_5_9b_chat,HFv2 BBH,35.33,,hf_open_llm_v2_240829.csv +configurable_yi_1_5_9b_chat,HFv2 GPQA,12.42,,hf_open_llm_v2_240829.csv +configurable_yi_1_5_9b_chat,HFv2 IFEval,43.23,,hf_open_llm_v2_240829.csv +configurable_yi_1_5_9b_chat,HFv2 MMLU Pro,33.5,,hf_open_llm_v2_240829.csv +configurable_yi_1_5_9b_chat,HFv2 Math Level 5,6.12,,hf_open_llm_v2_240829.csv +configurable_yi_1_5_9b_chat,HFv2 MuSR,12.02,,hf_open_llm_v2_240829.csv +configurablebeagle_11b,HF OpenLLM v2,22.52,,hf_open_llm_v2_240829.csv +configurablebeagle_11b,HFv2 BBH,32.39,,hf_open_llm_v2_240829.csv +configurablebeagle_11b,HFv2 GPQA,6.94,,hf_open_llm_v2_240829.csv +configurablebeagle_11b,HFv2 IFEval,58.34,,hf_open_llm_v2_240829.csv +configurablebeagle_11b,HFv2 MMLU Pro,26.38,,hf_open_llm_v2_240829.csv +configurablebeagle_11b,HFv2 Math Level 5,3.7,,hf_open_llm_v2_240829.csv +configurablebeagle_11b,HFv2 MuSR,7.38,,hf_open_llm_v2_240829.csv +configurablehermes_7b,HF OpenLLM v2,19.46,,hf_open_llm_v2_240829.csv +configurablehermes_7b,HFv2 BBH,23.16,,hf_open_llm_v2_240829.csv +configurablehermes_7b,HFv2 GPQA,3.58,,hf_open_llm_v2_240829.csv +configurablehermes_7b,HFv2 IFEval,54.11,,hf_open_llm_v2_240829.csv +configurablehermes_7b,HFv2 MMLU Pro,22.5,,hf_open_llm_v2_240829.csv +configurablehermes_7b,HFv2 Math Level 5,4.31,,hf_open_llm_v2_240829.csv +configurablehermes_7b,HFv2 MuSR,9.11,,hf_open_llm_v2_240829.csv +configurablesolar_10_7b,HF OpenLLM v2,19.05,,hf_open_llm_v2_240829.csv +configurablesolar_10_7b,HFv2 BBH,27.45,,hf_open_llm_v2_240829.csv +configurablesolar_10_7b,HFv2 GPQA,6.49,,hf_open_llm_v2_240829.csv +configurablesolar_10_7b,HFv2 IFEval,51.0,,hf_open_llm_v2_240829.csv +configurablesolar_10_7b,HFv2 MMLU Pro,24.15,,hf_open_llm_v2_240829.csv +configurablesolar_10_7b,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +configurablesolar_10_7b,HFv2 MuSR,5.19,,hf_open_llm_v2_240829.csv +dbrx_instructruct,HF OpenLLM v2,25.2,,hf_open_llm_v2_240829.csv +dbrx_instructruct,HFv2 BBH,35.96,,hf_open_llm_v2_240829.csv +dbrx_instructruct,HFv2 GPQA,12.19,,hf_open_llm_v2_240829.csv +dbrx_instructruct,HFv2 IFEval,54.16,,hf_open_llm_v2_240829.csv +dbrx_instructruct,HFv2 MMLU Pro,29.81,,hf_open_llm_v2_240829.csv +dbrx_instructruct,HFv2 Math Level 5,6.87,,hf_open_llm_v2_240829.csv +dbrx_instructruct,HFv2 MuSR,12.2,,hf_open_llm_v2_240829.csv +dclm_7b,HF OpenLLM v2,13.99,,hf_open_llm_v2_240829.csv +dclm_7b,HFv2 BBH,19.76,,hf_open_llm_v2_240829.csv +dclm_7b,HFv2 GPQA,8.72,,hf_open_llm_v2_240829.csv +dclm_7b,HFv2 IFEval,21.73,,hf_open_llm_v2_240829.csv +dclm_7b,HFv2 MMLU Pro,23.45,,hf_open_llm_v2_240829.csv +dclm_7b,HFv2 Math Level 5,2.95,,hf_open_llm_v2_240829.csv +dclm_7b,HFv2 MuSR,7.31,,hf_open_llm_v2_240829.csv +decilm_7b,HF OpenLLM v2,14.95,,hf_open_llm_v2_240829.csv +decilm_7b,HFv2 BBH,21.25,,hf_open_llm_v2_240829.csv +decilm_7b,HFv2 GPQA,6.04,,hf_open_llm_v2_240829.csv +decilm_7b,HFv2 IFEval,28.13,,hf_open_llm_v2_240829.csv +decilm_7b,HFv2 MMLU Pro,18.8,,hf_open_llm_v2_240829.csv +decilm_7b,HFv2 Math Level 5,2.42,,hf_open_llm_v2_240829.csv +decilm_7b,HFv2 MuSR,13.05,,hf_open_llm_v2_240829.csv +decilm_7b_instruct,HF OpenLLM v2,17.43,,hf_open_llm_v2_240829.csv +decilm_7b_instruct,HFv2 BBH,23.89,,hf_open_llm_v2_240829.csv +decilm_7b_instruct,HFv2 GPQA,5.26,,hf_open_llm_v2_240829.csv +decilm_7b_instruct,HFv2 IFEval,48.8,,hf_open_llm_v2_240829.csv +decilm_7b_instruct,HFv2 MMLU Pro,17.87,,hf_open_llm_v2_240829.csv +decilm_7b_instruct,HFv2 Math Level 5,2.79,,hf_open_llm_v2_240829.csv +decilm_7b_instruct,HFv2 MuSR,5.99,,hf_open_llm_v2_240829.csv +deepseek_llm_67b_chat,HF OpenLLM v2,26.87,,hf_open_llm_v2_240829.csv +deepseek_llm_67b_chat,HFv2 BBH,33.23,,hf_open_llm_v2_240829.csv +deepseek_llm_67b_chat,HFv2 GPQA,8.84,,hf_open_llm_v2_240829.csv +deepseek_llm_67b_chat,HFv2 IFEval,55.87,,hf_open_llm_v2_240829.csv +deepseek_llm_67b_chat,HFv2 MMLU Pro,32.71,,hf_open_llm_v2_240829.csv +deepseek_llm_67b_chat,HFv2 Math Level 5,6.65,,hf_open_llm_v2_240829.csv +deepseek_llm_67b_chat,HFv2 MuSR,23.93,,hf_open_llm_v2_240829.csv +deepseek_llm_7b_base,HF OpenLLM v2,8.1,,hf_open_llm_v2_240829.csv +deepseek_llm_7b_base,HFv2 BBH,9.77,,hf_open_llm_v2_240829.csv +deepseek_llm_7b_base,HFv2 GPQA,3.13,,hf_open_llm_v2_240829.csv +deepseek_llm_7b_base,HFv2 IFEval,21.79,,hf_open_llm_v2_240829.csv +deepseek_llm_7b_base,HFv2 MMLU Pro,8.96,,hf_open_llm_v2_240829.csv +deepseek_llm_7b_base,HFv2 Math Level 5,1.21,,hf_open_llm_v2_240829.csv +deepseek_llm_7b_base,HFv2 MuSR,3.76,,hf_open_llm_v2_240829.csv +deepseek_llm_7b_chat,HF OpenLLM v2,14.77,,hf_open_llm_v2_240829.csv +deepseek_llm_7b_chat,HFv2 BBH,11.26,,hf_open_llm_v2_240829.csv +deepseek_llm_7b_chat,HFv2 GPQA,2.13,,hf_open_llm_v2_240829.csv +deepseek_llm_7b_chat,HFv2 IFEval,41.71,,hf_open_llm_v2_240829.csv +deepseek_llm_7b_chat,HFv2 MMLU Pro,12.59,,hf_open_llm_v2_240829.csv +deepseek_llm_7b_chat,HFv2 Math Level 5,1.74,,hf_open_llm_v2_240829.csv +deepseek_llm_7b_chat,HFv2 MuSR,19.21,,hf_open_llm_v2_240829.csv +deepseek_moe_16b_base,HF OpenLLM v2,7.37,,hf_open_llm_v2_240829.csv +deepseek_moe_16b_base,HFv2 BBH,8.36,,hf_open_llm_v2_240829.csv +deepseek_moe_16b_base,HFv2 GPQA,0.56,,hf_open_llm_v2_240829.csv +deepseek_moe_16b_base,HFv2 IFEval,24.5,,hf_open_llm_v2_240829.csv +deepseek_moe_16b_base,HFv2 MMLU Pro,5.61,,hf_open_llm_v2_240829.csv +deepseek_moe_16b_base,HFv2 Math Level 5,1.81,,hf_open_llm_v2_240829.csv +deepseek_moe_16b_base,HFv2 MuSR,3.36,,hf_open_llm_v2_240829.csv +deepseek_moe_16b_chat,HF OpenLLM v2,10.14,,hf_open_llm_v2_240829.csv +deepseek_moe_16b_chat,HFv2 BBH,6.57,,hf_open_llm_v2_240829.csv +deepseek_moe_16b_chat,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +deepseek_moe_16b_chat,HFv2 IFEval,36.63,,hf_open_llm_v2_240829.csv +deepseek_moe_16b_chat,HFv2 MMLU Pro,10.71,,hf_open_llm_v2_240829.csv +deepseek_moe_16b_chat,HFv2 Math Level 5,1.66,,hf_open_llm_v2_240829.csv +deepseek_moe_16b_chat,HFv2 MuSR,5.26,,hf_open_llm_v2_240829.csv +dialogpt_medium,HF OpenLLM v2,5.25,,hf_open_llm_v2_240829.csv +dialogpt_medium,HFv2 BBH,2.56,,hf_open_llm_v2_240829.csv +dialogpt_medium,HFv2 GPQA,0.56,,hf_open_llm_v2_240829.csv +dialogpt_medium,HFv2 IFEval,14.79,,hf_open_llm_v2_240829.csv +dialogpt_medium,HFv2 MMLU Pro,1.32,,hf_open_llm_v2_240829.csv +dialogpt_medium,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +dialogpt_medium,HFv2 MuSR,12.28,,hf_open_llm_v2_240829.csv +dictalm2_0_instruct,HF OpenLLM v2,16.58,,hf_open_llm_v2_240829.csv +dictalm2_0_instruct,HFv2 BBH,19.69,,hf_open_llm_v2_240829.csv +dictalm2_0_instruct,HFv2 GPQA,7.05,,hf_open_llm_v2_240829.csv +dictalm2_0_instruct,HFv2 IFEval,44.12,,hf_open_llm_v2_240829.csv +dictalm2_0_instruct,HFv2 MMLU Pro,17.83,,hf_open_llm_v2_240829.csv +dictalm2_0_instruct,HFv2 Math Level 5,1.06,,hf_open_llm_v2_240829.csv +dictalm2_0_instruct,HFv2 MuSR,9.72,,hf_open_llm_v2_240829.csv +distilgpt2,HF OpenLLM v2,3.9,,hf_open_llm_v2_240829.csv +distilgpt2,HFv2 BBH,2.84,,hf_open_llm_v2_240829.csv +distilgpt2,HFv2 GPQA,1.23,,hf_open_llm_v2_240829.csv +distilgpt2,HFv2 IFEval,6.11,,hf_open_llm_v2_240829.csv +distilgpt2,HFv2 MMLU Pro,2.08,,hf_open_llm_v2_240829.csv +distilgpt2,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +distilgpt2,HFv2 MuSR,11.16,,hf_open_llm_v2_240829.csv +dolly_v1_6b,HF OpenLLM v2,6.89,,hf_open_llm_v2_240829.csv +dolly_v1_6b,HFv2 BBH,4.78,,hf_open_llm_v2_240829.csv +dolly_v1_6b,HFv2 GPQA,1.9,,hf_open_llm_v2_240829.csv +dolly_v1_6b,HFv2 IFEval,22.24,,hf_open_llm_v2_240829.csv +dolly_v1_6b,HFv2 MMLU Pro,2.95,,hf_open_llm_v2_240829.csv +dolly_v1_6b,HFv2 Math Level 5,1.36,,hf_open_llm_v2_240829.csv +dolly_v1_6b,HFv2 MuSR,8.12,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_phi_3_medium,HF OpenLLM v2,25.66,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_phi_3_medium,HFv2 BBH,49.72,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_phi_3_medium,HFv2 GPQA,10.29,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_phi_3_medium,HFv2 IFEval,42.48,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_phi_3_medium,HFv2 MMLU Pro,39.5,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_phi_3_medium,HFv2 Math Level 5,0.53,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_phi_3_medium,HFv2 MuSR,11.41,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_qwen2_72b,HF OpenLLM v2,32.0,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_qwen2_72b,HFv2 BBH,47.7,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_qwen2_72b,HFv2 GPQA,16.0,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_qwen2_72b,HFv2 IFEval,40.38,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_qwen2_72b,HFv2 MMLU Pro,49.52,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_qwen2_72b,HFv2 Math Level 5,21.37,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_qwen2_72b,HFv2 MuSR,17.04,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_qwen2_7b,HF OpenLLM v2,20.96,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_qwen2_7b,HFv2 BBH,27.91,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_qwen2_7b,HFv2 GPQA,5.37,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_qwen2_7b,HFv2 IFEval,35.35,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_qwen2_7b,HFv2 MMLU Pro,33.9,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_qwen2_7b,HFv2 Math Level 5,11.56,,hf_open_llm_v2_240829.csv +dolphin_2_9_2_qwen2_7b,HFv2 MuSR,11.66,,hf_open_llm_v2_240829.csv +dolphin_2_9_3_mistral_7b_32k,HF OpenLLM v2,19.31,,hf_open_llm_v2_240829.csv +dolphin_2_9_3_mistral_7b_32k,HFv2 BBH,26.91,,hf_open_llm_v2_240829.csv +dolphin_2_9_3_mistral_7b_32k,HFv2 GPQA,4.7,,hf_open_llm_v2_240829.csv +dolphin_2_9_3_mistral_7b_32k,HFv2 IFEval,41.26,,hf_open_llm_v2_240829.csv +dolphin_2_9_3_mistral_7b_32k,HFv2 MMLU Pro,20.23,,hf_open_llm_v2_240829.csv +dolphin_2_9_3_mistral_7b_32k,HFv2 Math Level 5,4.83,,hf_open_llm_v2_240829.csv +dolphin_2_9_3_mistral_7b_32k,HFv2 MuSR,17.93,,hf_open_llm_v2_240829.csv +dolphin_2_9_llama3_8b,HF OpenLLM v2,18.3,,hf_open_llm_v2_240829.csv +dolphin_2_9_llama3_8b,HFv2 BBH,27.86,,hf_open_llm_v2_240829.csv +dolphin_2_9_llama3_8b,HFv2 GPQA,4.92,,hf_open_llm_v2_240829.csv +dolphin_2_9_llama3_8b,HFv2 IFEval,38.5,,hf_open_llm_v2_240829.csv +dolphin_2_9_llama3_8b,HFv2 MMLU Pro,19.68,,hf_open_llm_v2_240829.csv +dolphin_2_9_llama3_8b,HFv2 Math Level 5,5.06,,hf_open_llm_v2_240829.csv +dolphin_2_9_llama3_8b,HFv2 MuSR,13.79,,hf_open_llm_v2_240829.csv +einstein_v4_7b,HF OpenLLM v2,16.73,,hf_open_llm_v2_240829.csv +einstein_v4_7b,HFv2 BBH,14.3,,hf_open_llm_v2_240829.csv +einstein_v4_7b,HFv2 GPQA,4.25,,hf_open_llm_v2_240829.csv +einstein_v4_7b,HFv2 IFEval,47.08,,hf_open_llm_v2_240829.csv +einstein_v4_7b,HFv2 MMLU Pro,13.99,,hf_open_llm_v2_240829.csv +einstein_v4_7b,HFv2 Math Level 5,1.74,,hf_open_llm_v2_240829.csv +einstein_v4_7b,HFv2 MuSR,19.02,,hf_open_llm_v2_240829.csv +einstein_v6_1_developed_by_weyaxi_llama3_8b,HF OpenLLM v2,19.05,,hf_open_llm_v2_240829.csv +einstein_v6_1_developed_by_weyaxi_llama3_8b,HFv2 BBH,29.69,,hf_open_llm_v2_240829.csv +einstein_v6_1_developed_by_weyaxi_llama3_8b,HFv2 GPQA,3.13,,hf_open_llm_v2_240829.csv +einstein_v6_1_developed_by_weyaxi_llama3_8b,HFv2 IFEval,39.27,,hf_open_llm_v2_240829.csv +einstein_v6_1_developed_by_weyaxi_llama3_8b,HFv2 MMLU Pro,23.25,,hf_open_llm_v2_240829.csv +einstein_v6_1_developed_by_weyaxi_llama3_8b,HFv2 Math Level 5,5.59,,hf_open_llm_v2_240829.csv +einstein_v6_1_developed_by_weyaxi_llama3_8b,HFv2 MuSR,13.39,,hf_open_llm_v2_240829.csv +einstein_v6_1_llama3_8b,HF OpenLLM v2,19.99,,hf_open_llm_v2_240829.csv +einstein_v6_1_llama3_8b,HFv2 BBH,29.38,,hf_open_llm_v2_240829.csv +einstein_v6_1_llama3_8b,HFv2 GPQA,4.25,,hf_open_llm_v2_240829.csv +einstein_v6_1_llama3_8b,HFv2 IFEval,45.68,,hf_open_llm_v2_240829.csv +einstein_v6_1_llama3_8b,HFv2 MMLU Pro,23.68,,hf_open_llm_v2_240829.csv +einstein_v6_1_llama3_8b,HFv2 Math Level 5,5.74,,hf_open_llm_v2_240829.csv +einstein_v6_1_llama3_8b,HFv2 MuSR,11.23,,hf_open_llm_v2_240829.csv +ende_chat_0_0_7,HF OpenLLM v2,13.08,,hf_open_llm_v2_240829.csv +ende_chat_0_0_7,HFv2 BBH,13.58,,hf_open_llm_v2_240829.csv +ende_chat_0_0_7,HFv2 GPQA,4.14,,hf_open_llm_v2_240829.csv +ende_chat_0_0_7,HFv2 IFEval,44.01,,hf_open_llm_v2_240829.csv +ende_chat_0_0_7,HFv2 MMLU Pro,10.74,,hf_open_llm_v2_240829.csv +ende_chat_0_0_7,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +ende_chat_0_0_7,HFv2 MuSR,6.03,,hf_open_llm_v2_240829.csv +ennoai_pro_french_llama3_8b_v0_4,HF OpenLLM v2,15.18,,hf_open_llm_v2_240829.csv +ennoai_pro_french_llama3_8b_v0_4,HFv2 BBH,16.88,,hf_open_llm_v2_240829.csv +ennoai_pro_french_llama3_8b_v0_4,HFv2 GPQA,2.8,,hf_open_llm_v2_240829.csv +ennoai_pro_french_llama3_8b_v0_4,HFv2 IFEval,41.89,,hf_open_llm_v2_240829.csv +ennoai_pro_french_llama3_8b_v0_4,HFv2 MMLU Pro,18.16,,hf_open_llm_v2_240829.csv +ennoai_pro_french_llama3_8b_v0_4,HFv2 Math Level 5,0.6,,hf_open_llm_v2_240829.csv +ennoai_pro_french_llama3_8b_v0_4,HFv2 MuSR,10.76,,hf_open_llm_v2_240829.csv +ennoai_pro_llama3_8b,HF OpenLLM v2,12.17,,hf_open_llm_v2_240829.csv +ennoai_pro_llama3_8b,HFv2 BBH,17.51,,hf_open_llm_v2_240829.csv +ennoai_pro_llama3_8b,HFv2 GPQA,1.57,,hf_open_llm_v2_240829.csv +ennoai_pro_llama3_8b,HFv2 IFEval,31.95,,hf_open_llm_v2_240829.csv +ennoai_pro_llama3_8b,HFv2 MMLU Pro,12.79,,hf_open_llm_v2_240829.csv +ennoai_pro_llama3_8b,HFv2 Math Level 5,0.15,,hf_open_llm_v2_240829.csv +ennoai_pro_llama3_8b,HFv2 MuSR,9.08,,hf_open_llm_v2_240829.csv +exaone_3_0_7_8b_instruct,HF OpenLLM v2,21.4,,hf_open_llm_v2_240829.csv +exaone_3_0_7_8b_instruct,HFv2 BBH,17.98,,hf_open_llm_v2_240829.csv +exaone_3_0_7_8b_instruct,HFv2 GPQA,2.13,,hf_open_llm_v2_240829.csv +exaone_3_0_7_8b_instruct,HFv2 IFEval,71.93,,hf_open_llm_v2_240829.csv +exaone_3_0_7_8b_instruct,HFv2 MMLU Pro,28.63,,hf_open_llm_v2_240829.csv +exaone_3_0_7_8b_instruct,HFv2 Math Level 5,4.46,,hf_open_llm_v2_240829.csv +exaone_3_0_7_8b_instruct,HFv2 MuSR,3.3,,hf_open_llm_v2_240829.csv +falcon_11b,HF OpenLLM v2,13.78,,hf_open_llm_v2_240829.csv +falcon_11b,HFv2 BBH,21.94,,hf_open_llm_v2_240829.csv +falcon_11b,HFv2 GPQA,2.8,,hf_open_llm_v2_240829.csv +falcon_11b,HFv2 IFEval,32.61,,hf_open_llm_v2_240829.csv +falcon_11b,HFv2 MMLU Pro,15.44,,hf_open_llm_v2_240829.csv +falcon_11b,HFv2 Math Level 5,2.34,,hf_open_llm_v2_240829.csv +falcon_11b,HFv2 MuSR,7.53,,hf_open_llm_v2_240829.csv +falcon_40b,HF OpenLLM v2,11.33,,hf_open_llm_v2_240829.csv +falcon_40b,HFv2 BBH,16.58,,hf_open_llm_v2_240829.csv +falcon_40b,HFv2 GPQA,3.13,,hf_open_llm_v2_240829.csv +falcon_40b,HFv2 IFEval,24.96,,hf_open_llm_v2_240829.csv +falcon_40b,HFv2 MMLU Pro,16.72,,hf_open_llm_v2_240829.csv +falcon_40b,HFv2 Math Level 5,1.36,,hf_open_llm_v2_240829.csv +falcon_40b,HFv2 MuSR,5.19,,hf_open_llm_v2_240829.csv +falcon_40b_instruct,HF OpenLLM v2,10.41,,hf_open_llm_v2_240829.csv +falcon_40b_instruct,HFv2 BBH,17.22,,hf_open_llm_v2_240829.csv +falcon_40b_instruct,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +falcon_40b_instruct,HFv2 IFEval,24.54,,hf_open_llm_v2_240829.csv +falcon_40b_instruct,HFv2 MMLU Pro,14.02,,hf_open_llm_v2_240829.csv +falcon_40b_instruct,HFv2 Math Level 5,1.51,,hf_open_llm_v2_240829.csv +falcon_40b_instruct,HFv2 MuSR,5.16,,hf_open_llm_v2_240829.csv +falcon_7b,HF OpenLLM v2,5.1,,hf_open_llm_v2_240829.csv +falcon_7b,HFv2 BBH,5.96,,hf_open_llm_v2_240829.csv +falcon_7b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +falcon_7b,HFv2 IFEval,18.21,,hf_open_llm_v2_240829.csv +falcon_7b,HFv2 MMLU Pro,1.39,,hf_open_llm_v2_240829.csv +falcon_7b,HFv2 Math Level 5,0.53,,hf_open_llm_v2_240829.csv +falcon_7b,HFv2 MuSR,4.5,,hf_open_llm_v2_240829.csv +falcon_7b_instruct,HF OpenLLM v2,5.02,,hf_open_llm_v2_240829.csv +falcon_7b_instruct,HFv2 BBH,4.82,,hf_open_llm_v2_240829.csv +falcon_7b_instruct,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +falcon_7b_instruct,HFv2 IFEval,19.69,,hf_open_llm_v2_240829.csv +falcon_7b_instruct,HFv2 MMLU Pro,1.73,,hf_open_llm_v2_240829.csv +falcon_7b_instruct,HFv2 Math Level 5,0.6,,hf_open_llm_v2_240829.csv +falcon_7b_instruct,HFv2 MuSR,3.25,,hf_open_llm_v2_240829.csv +falcon_mamba_7b,HF OpenLLM v2,15.04,,hf_open_llm_v2_240829.csv +falcon_mamba_7b,HFv2 BBH,19.88,,hf_open_llm_v2_240829.csv +falcon_mamba_7b,HFv2 GPQA,8.05,,hf_open_llm_v2_240829.csv +falcon_mamba_7b,HFv2 IFEval,33.36,,hf_open_llm_v2_240829.csv +falcon_mamba_7b,HFv2 MMLU Pro,14.47,,hf_open_llm_v2_240829.csv +falcon_mamba_7b,HFv2 Math Level 5,3.63,,hf_open_llm_v2_240829.csv +falcon_mamba_7b,HFv2 MuSR,10.86,,hf_open_llm_v2_240829.csv +flan_flan-ul2,HF OpenLLM v2,13.55,,hf_open_llm_v2_240829.csv +flan_flan-ul2,HFv2 BBH,30.02,,hf_open_llm_v2_240829.csv +flan_flan-ul2,HFv2 GPQA,5.03,,hf_open_llm_v2_240829.csv +flan_flan-ul2,HFv2 IFEval,23.93,,hf_open_llm_v2_240829.csv +flan_flan-ul2,HFv2 MMLU Pro,16.59,,hf_open_llm_v2_240829.csv +flan_flan-ul2,HFv2 Math Level 5,0.15,,hf_open_llm_v2_240829.csv +flan_flan-ul2,HFv2 MuSR,5.58,,hf_open_llm_v2_240829.csv +flan_t5_base,HF OpenLLM v2,6.24,,hf_open_llm_v2_240829.csv +flan_t5_base,HFv2 BBH,11.34,,hf_open_llm_v2_240829.csv +flan_t5_base,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +flan_t5_base,HFv2 IFEval,18.91,,hf_open_llm_v2_240829.csv +flan_t5_base,HFv2 MMLU Pro,3.97,,hf_open_llm_v2_240829.csv +flan_t5_base,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +flan_t5_base,HFv2 MuSR,3.22,,hf_open_llm_v2_240829.csv +flan_t5_large,HF OpenLLM v2,9.42,,hf_open_llm_v2_240829.csv +flan_t5_large,HFv2 BBH,17.51,,hf_open_llm_v2_240829.csv +flan_t5_large,HFv2 GPQA,0.11,,hf_open_llm_v2_240829.csv +flan_t5_large,HFv2 IFEval,22.01,,hf_open_llm_v2_240829.csv +flan_t5_large,HFv2 MMLU Pro,7.88,,hf_open_llm_v2_240829.csv +flan_t5_large,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +flan_t5_large,HFv2 MuSR,9.01,,hf_open_llm_v2_240829.csv +flan_t5_small,HF OpenLLM v2,6.0,,hf_open_llm_v2_240829.csv +flan_t5_small,HFv2 BBH,6.36,,hf_open_llm_v2_240829.csv +flan_t5_small,HFv2 GPQA,1.45,,hf_open_llm_v2_240829.csv +flan_t5_small,HFv2 IFEval,15.24,,hf_open_llm_v2_240829.csv +flan_t5_small,HFv2 MMLU Pro,2.59,,hf_open_llm_v2_240829.csv +flan_t5_small,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +flan_t5_small,HFv2 MuSR,10.37,,hf_open_llm_v2_240829.csv +flan_t5_xl,HF OpenLLM v2,11.59,,hf_open_llm_v2_240829.csv +flan_t5_xl,HFv2 BBH,22.84,,hf_open_llm_v2_240829.csv +flan_t5_xl,HFv2 GPQA,0.34,,hf_open_llm_v2_240829.csv +flan_t5_xl,HFv2 IFEval,22.37,,hf_open_llm_v2_240829.csv +flan_t5_xl,HFv2 MMLU Pro,12.74,,hf_open_llm_v2_240829.csv +flan_t5_xl,HFv2 Math Level 5,0.08,,hf_open_llm_v2_240829.csv +flan_t5_xl,HFv2 MuSR,11.85,,hf_open_llm_v2_240829.csv +flan_t5_xxl,HF OpenLLM v2,13.49,,hf_open_llm_v2_240829.csv +flan_t5_xxl,HFv2 BBH,30.12,,hf_open_llm_v2_240829.csv +flan_t5_xxl,HFv2 GPQA,2.68,,hf_open_llm_v2_240829.csv +flan_t5_xxl,HFv2 IFEval,22.0,,hf_open_llm_v2_240829.csv +flan_t5_xxl,HFv2 MMLU Pro,14.92,,hf_open_llm_v2_240829.csv +flan_t5_xxl,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +flan_t5_xxl,HFv2 MuSR,11.19,,hf_open_llm_v2_240829.csv +fox_1_1_6b,HF OpenLLM v2,7.69,,hf_open_llm_v2_240829.csv +fox_1_1_6b,HFv2 BBH,7.4,,hf_open_llm_v2_240829.csv +fox_1_1_6b,HFv2 GPQA,1.79,,hf_open_llm_v2_240829.csv +fox_1_1_6b,HFv2 IFEval,27.66,,hf_open_llm_v2_240829.csv +fox_1_1_6b,HFv2 MMLU Pro,4.13,,hf_open_llm_v2_240829.csv +fox_1_1_6b,HFv2 Math Level 5,1.28,,hf_open_llm_v2_240829.csv +fox_1_1_6b,HFv2 MuSR,3.87,,hf_open_llm_v2_240829.csv +gemma_1_1_2b_it,HF OpenLLM v2,7.78,,hf_open_llm_v2_240829.csv +gemma_1_1_2b_it,HFv2 BBH,5.86,,hf_open_llm_v2_240829.csv +gemma_1_1_2b_it,HFv2 GPQA,2.57,,hf_open_llm_v2_240829.csv +gemma_1_1_2b_it,HFv2 IFEval,30.67,,hf_open_llm_v2_240829.csv +gemma_1_1_2b_it,HFv2 MMLU Pro,5.37,,hf_open_llm_v2_240829.csv +gemma_1_1_2b_it,HFv2 Math Level 5,0.15,,hf_open_llm_v2_240829.csv +gemma_1_1_2b_it,HFv2 MuSR,2.02,,hf_open_llm_v2_240829.csv +gemma_1_1_7b_it,HF OpenLLM v2,17.4,,hf_open_llm_v2_240829.csv +gemma_1_1_7b_it,HFv2 BBH,15.93,,hf_open_llm_v2_240829.csv +gemma_1_1_7b_it,HFv2 GPQA,5.82,,hf_open_llm_v2_240829.csv +gemma_1_1_7b_it,HFv2 IFEval,50.39,,hf_open_llm_v2_240829.csv +gemma_1_1_7b_it,HFv2 MMLU Pro,17.6,,hf_open_llm_v2_240829.csv +gemma_1_1_7b_it,HFv2 Math Level 5,3.17,,hf_open_llm_v2_240829.csv +gemma_1_1_7b_it,HFv2 MuSR,11.51,,hf_open_llm_v2_240829.csv +gemma_2_27b,HF OpenLLM v2,23.64,,hf_open_llm_v2_240829.csv +gemma_2_27b,HFv2 BBH,37.39,,hf_open_llm_v2_240829.csv +gemma_2_27b,HFv2 GPQA,13.42,,hf_open_llm_v2_240829.csv +gemma_2_27b,HFv2 IFEval,24.75,,hf_open_llm_v2_240829.csv +gemma_2_27b,HFv2 MMLU Pro,37.45,,hf_open_llm_v2_240829.csv +gemma_2_27b,HFv2 Math Level 5,14.88,,hf_open_llm_v2_240829.csv +gemma_2_27b,HFv2 MuSR,13.92,,hf_open_llm_v2_240829.csv +gemma_2_27b_it,HF OpenLLM v2,32.31,,hf_open_llm_v2_240829.csv +gemma_2_27b_it,HFv2 BBH,49.27,,hf_open_llm_v2_240829.csv +gemma_2_27b_it,HFv2 GPQA,16.67,,hf_open_llm_v2_240829.csv +gemma_2_27b_it,HFv2 IFEval,79.78,,hf_open_llm_v2_240829.csv +gemma_2_27b_it,HFv2 MMLU Pro,38.35,,hf_open_llm_v2_240829.csv +gemma_2_27b_it,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv +gemma_2_27b_it,HFv2 MuSR,9.11,,hf_open_llm_v2_240829.csv +gemma_2_2b,HF OpenLLM v2,10.13,,hf_open_llm_v2_240829.csv +gemma_2_2b,HFv2 BBH,11.76,,hf_open_llm_v2_240829.csv +gemma_2_2b,HFv2 GPQA,1.68,,hf_open_llm_v2_240829.csv +gemma_2_2b,HFv2 IFEval,19.93,,hf_open_llm_v2_240829.csv +gemma_2_2b,HFv2 MMLU Pro,13.11,,hf_open_llm_v2_240829.csv +gemma_2_2b,HFv2 Math Level 5,2.87,,hf_open_llm_v2_240829.csv +gemma_2_2b,HFv2 MuSR,11.43,,hf_open_llm_v2_240829.csv +gemma_2_9b,HF OpenLLM v2,20.93,,hf_open_llm_v2_240829.csv +gemma_2_9b,HFv2 BBH,34.1,,hf_open_llm_v2_240829.csv +gemma_2_9b,HFv2 GPQA,10.51,,hf_open_llm_v2_240829.csv +gemma_2_9b,HFv2 IFEval,20.4,,hf_open_llm_v2_240829.csv +gemma_2_9b,HFv2 MMLU Pro,34.48,,hf_open_llm_v2_240829.csv +gemma_2_9b,HFv2 Math Level 5,11.78,,hf_open_llm_v2_240829.csv +gemma_2_9b,HFv2 MuSR,14.3,,hf_open_llm_v2_240829.csv +gemma_2_9b_it,HF OpenLLM v2,28.86,,hf_open_llm_v2_240829.csv +gemma_2_9b_it,HFv2 BBH,42.14,,hf_open_llm_v2_240829.csv +gemma_2_9b_it,HFv2 GPQA,14.77,,hf_open_llm_v2_240829.csv +gemma_2_9b_it,HFv2 IFEval,74.36,,hf_open_llm_v2_240829.csv +gemma_2_9b_it,HFv2 MMLU Pro,31.95,,hf_open_llm_v2_240829.csv +gemma_2_9b_it,HFv2 Math Level 5,0.23,,hf_open_llm_v2_240829.csv +gemma_2_9b_it,HFv2 MuSR,9.74,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_simpo,HF OpenLLM v2,21.16,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_simpo,HFv2 BBH,40.09,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_simpo,HFv2 GPQA,11.41,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_simpo,HFv2 IFEval,32.07,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_simpo,HFv2 MMLU Pro,33.06,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_simpo,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_simpo,HFv2 MuSR,10.34,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter2,HF OpenLLM v2,21.22,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter2,HFv2 BBH,42.17,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter2,HFv2 GPQA,11.3,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter2,HFv2 IFEval,31.0,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter2,HFv2 MMLU Pro,31.89,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter2,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter2,HFv2 MuSR,10.94,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter_1,HF OpenLLM v2,20.55,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter_1,HFv2 BBH,41.68,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter_1,HFv2 GPQA,12.64,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter_1,HFv2 IFEval,30.15,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter_1,HFv2 MMLU Pro,31.71,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter_1,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter_1,HFv2 MuSR,7.15,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter_1_evol_1,HF OpenLLM v2,20.1,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter_1_evol_1,HFv2 BBH,41.1,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter_1_evol_1,HFv2 GPQA,12.08,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter_1_evol_1,HFv2 IFEval,29.42,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter_1_evol_1,HFv2 MMLU Pro,31.11,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter_1_evol_1,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +gemma_2_9b_it_sppo_iter_1_evol_1,HFv2 MuSR,6.9,,hf_open_llm_v2_240829.csv +gemma_2b,HF OpenLLM v2,7.31,,hf_open_llm_v2_240829.csv +gemma_2b,HFv2 BBH,8.47,,hf_open_llm_v2_240829.csv +gemma_2b,HFv2 GPQA,0.67,,hf_open_llm_v2_240829.csv +gemma_2b,HFv2 IFEval,20.38,,hf_open_llm_v2_240829.csv +gemma_2b,HFv2 MMLU Pro,4.06,,hf_open_llm_v2_240829.csv +gemma_2b,HFv2 Math Level 5,2.72,,hf_open_llm_v2_240829.csv +gemma_2b,HFv2 MuSR,7.56,,hf_open_llm_v2_240829.csv +gemma_2b_it,HF OpenLLM v2,7.22,,hf_open_llm_v2_240829.csv +gemma_2b_it,HFv2 BBH,5.21,,hf_open_llm_v2_240829.csv +gemma_2b_it,HFv2 GPQA,3.8,,hf_open_llm_v2_240829.csv +gemma_2b_it,HFv2 IFEval,26.9,,hf_open_llm_v2_240829.csv +gemma_2b_it,HFv2 MMLU Pro,3.92,,hf_open_llm_v2_240829.csv +gemma_2b_it,HFv2 Math Level 5,0.45,,hf_open_llm_v2_240829.csv +gemma_2b_it,HFv2 MuSR,3.03,,hf_open_llm_v2_240829.csv +gemma_2b_orpo,HF OpenLLM v2,7.17,,hf_open_llm_v2_240829.csv +gemma_2b_orpo,HFv2 BBH,7.95,,hf_open_llm_v2_240829.csv +gemma_2b_orpo,HFv2 GPQA,1.57,,hf_open_llm_v2_240829.csv +gemma_2b_orpo,HFv2 IFEval,24.78,,hf_open_llm_v2_240829.csv +gemma_2b_orpo,HFv2 MMLU Pro,3.4,,hf_open_llm_v2_240829.csv +gemma_2b_orpo,HFv2 Math Level 5,1.21,,hf_open_llm_v2_240829.csv +gemma_2b_orpo,HFv2 MuSR,4.13,,hf_open_llm_v2_240829.csv +gemma_7b,HF OpenLLM v2,15.28,,hf_open_llm_v2_240829.csv +gemma_7b,HFv2 BBH,21.12,,hf_open_llm_v2_240829.csv +gemma_7b,HFv2 GPQA,4.92,,hf_open_llm_v2_240829.csv +gemma_7b,HFv2 IFEval,26.59,,hf_open_llm_v2_240829.csv +gemma_7b,HFv2 MMLU Pro,21.64,,hf_open_llm_v2_240829.csv +gemma_7b,HFv2 Math Level 5,6.42,,hf_open_llm_v2_240829.csv +gemma_7b,HFv2 MuSR,10.98,,hf_open_llm_v2_240829.csv +gemma_7b_it,HF OpenLLM v2,12.83,,hf_open_llm_v2_240829.csv +gemma_7b_it,HFv2 BBH,11.88,,hf_open_llm_v2_240829.csv +gemma_7b_it,HFv2 GPQA,4.59,,hf_open_llm_v2_240829.csv +gemma_7b_it,HFv2 IFEval,38.68,,hf_open_llm_v2_240829.csv +gemma_7b_it,HFv2 MMLU Pro,7.72,,hf_open_llm_v2_240829.csv +gemma_7b_it,HFv2 Math Level 5,1.59,,hf_open_llm_v2_240829.csv +gemma_7b_it,HFv2 MuSR,12.53,,hf_open_llm_v2_240829.csv +glm_4_9b,HF OpenLLM v2,18.01,,hf_open_llm_v2_240829.csv +glm_4_9b,HFv2 BBH,35.81,,hf_open_llm_v2_240829.csv +glm_4_9b,HFv2 GPQA,8.84,,hf_open_llm_v2_240829.csv +glm_4_9b,HFv2 IFEval,14.26,,hf_open_llm_v2_240829.csv +glm_4_9b,HFv2 MMLU Pro,34.94,,hf_open_llm_v2_240829.csv +glm_4_9b,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +glm_4_9b,HFv2 MuSR,14.19,,hf_open_llm_v2_240829.csv +glm_4_9b_chat,HF OpenLLM v2,10.97,,hf_open_llm_v2_240829.csv +glm_4_9b_chat,HFv2 BBH,25.21,,hf_open_llm_v2_240829.csv +glm_4_9b_chat,HFv2 GPQA,8.5,,hf_open_llm_v2_240829.csv +glm_4_9b_chat,HFv2 IFEval,0.0,,hf_open_llm_v2_240829.csv +glm_4_9b_chat,HFv2 MMLU Pro,24.07,,hf_open_llm_v2_240829.csv +glm_4_9b_chat,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +glm_4_9b_chat,HFv2 MuSR,8.06,,hf_open_llm_v2_240829.csv +go_bruins_v2,HF OpenLLM v2,15.27,,hf_open_llm_v2_240829.csv +go_bruins_v2,HFv2 BBH,12.69,,hf_open_llm_v2_240829.csv +go_bruins_v2,HFv2 GPQA,1.68,,hf_open_llm_v2_240829.csv +go_bruins_v2,HFv2 IFEval,40.96,,hf_open_llm_v2_240829.csv +go_bruins_v2,HFv2 MMLU Pro,19.57,,hf_open_llm_v2_240829.csv +go_bruins_v2,HFv2 Math Level 5,5.74,,hf_open_llm_v2_240829.csv +go_bruins_v2,HFv2 MuSR,10.99,,hf_open_llm_v2_240829.csv +gpt2,HF OpenLLM v2,6.54,,hf_open_llm_v2_240829.csv +gpt2,HFv2 BBH,9.2,,hf_open_llm_v2_240829.csv +gpt2,HFv2 GPQA,1.12,,hf_open_llm_v2_240829.csv +gpt2,HFv2 IFEval,18.08,,hf_open_llm_v2_240829.csv +gpt2,HFv2 MMLU Pro,1.84,,hf_open_llm_v2_240829.csv +gpt2,HFv2 Math Level 5,0.3,,hf_open_llm_v2_240829.csv +gpt2,HFv2 MuSR,18.33,,hf_open_llm_v2_240829.csv +gpt2_large,HF OpenLLM v2,5.48,,hf_open_llm_v2_240829.csv +gpt2_large,HFv2 BBH,3.25,,hf_open_llm_v2_240829.csv +gpt2_large,HFv2 GPQA,1.23,,hf_open_llm_v2_240829.csv +gpt2_large,HFv2 IFEval,20.48,,hf_open_llm_v2_240829.csv +gpt2_large,HFv2 MMLU Pro,1.58,,hf_open_llm_v2_240829.csv +gpt2_large,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv +gpt2_large,HFv2 MuSR,5.66,,hf_open_llm_v2_240829.csv +gpt2_medium,HF OpenLLM v2,5.81,,hf_open_llm_v2_240829.csv +gpt2_medium,HFv2 BBH,2.72,,hf_open_llm_v2_240829.csv +gpt2_medium,HFv2 GPQA,1.68,,hf_open_llm_v2_240829.csv +gpt2_medium,HFv2 IFEval,22.08,,hf_open_llm_v2_240829.csv +gpt2_medium,HFv2 MMLU Pro,2.02,,hf_open_llm_v2_240829.csv +gpt2_medium,HFv2 Math Level 5,0.23,,hf_open_llm_v2_240829.csv +gpt2_medium,HFv2 MuSR,6.16,,hf_open_llm_v2_240829.csv +gpt2_xl,HF OpenLLM v2,4.98,,hf_open_llm_v2_240829.csv +gpt2_xl,HFv2 BBH,2.58,,hf_open_llm_v2_240829.csv +gpt2_xl,HFv2 GPQA,1.12,,hf_open_llm_v2_240829.csv +gpt2_xl,HFv2 IFEval,20.39,,hf_open_llm_v2_240829.csv +gpt2_xl,HFv2 MMLU Pro,1.46,,hf_open_llm_v2_240829.csv +gpt2_xl,HFv2 Math Level 5,0.3,,hf_open_llm_v2_240829.csv +gpt2_xl,HFv2 MuSR,4.04,,hf_open_llm_v2_240829.csv +gpt_j_6b,HF OpenLLM v2,6.55,,hf_open_llm_v2_240829.csv +gpt_j_6b,HFv2 BBH,4.91,,hf_open_llm_v2_240829.csv +gpt_j_6b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +gpt_j_6b,HFv2 IFEval,25.22,,hf_open_llm_v2_240829.csv +gpt_j_6b,HFv2 MMLU Pro,2.68,,hf_open_llm_v2_240829.csv +gpt_j_6b,HFv2 Math Level 5,1.21,,hf_open_llm_v2_240829.csv +gpt_j_6b,HFv2 MuSR,5.25,,hf_open_llm_v2_240829.csv +gpt_neo_125m,HF OpenLLM v2,4.38,,hf_open_llm_v2_240829.csv +gpt_neo_125m,HFv2 BBH,3.44,,hf_open_llm_v2_240829.csv +gpt_neo_125m,HFv2 GPQA,0.45,,hf_open_llm_v2_240829.csv +gpt_neo_125m,HFv2 IFEval,19.05,,hf_open_llm_v2_240829.csv +gpt_neo_125m,HFv2 MMLU Pro,0.28,,hf_open_llm_v2_240829.csv +gpt_neo_125m,HFv2 Math Level 5,0.45,,hf_open_llm_v2_240829.csv +gpt_neo_125m,HFv2 MuSR,2.62,,hf_open_llm_v2_240829.csv +gpt_neo_1_3b,HF OpenLLM v2,5.33,,hf_open_llm_v2_240829.csv +gpt_neo_1_3b,HFv2 BBH,3.02,,hf_open_llm_v2_240829.csv +gpt_neo_1_3b,HFv2 GPQA,0.78,,hf_open_llm_v2_240829.csv +gpt_neo_1_3b,HFv2 IFEval,20.79,,hf_open_llm_v2_240829.csv +gpt_neo_1_3b,HFv2 MMLU Pro,1.82,,hf_open_llm_v2_240829.csv +gpt_neo_1_3b,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv +gpt_neo_1_3b,HFv2 MuSR,4.87,,hf_open_llm_v2_240829.csv +gpt_neo_2_7b,HF OpenLLM v2,6.34,,hf_open_llm_v2_240829.csv +gpt_neo_2_7b,HFv2 BBH,4.18,,hf_open_llm_v2_240829.csv +gpt_neo_2_7b,HFv2 GPQA,2.13,,hf_open_llm_v2_240829.csv +gpt_neo_2_7b,HFv2 IFEval,25.9,,hf_open_llm_v2_240829.csv +gpt_neo_2_7b,HFv2 MMLU Pro,1.81,,hf_open_llm_v2_240829.csv +gpt_neo_2_7b,HFv2 Math Level 5,0.53,,hf_open_llm_v2_240829.csv +gpt_neo_2_7b,HFv2 MuSR,3.52,,hf_open_llm_v2_240829.csv +gpt_neox_20b,HF OpenLLM v2,5.99,,hf_open_llm_v2_240829.csv +gpt_neox_20b,HFv2 BBH,4.93,,hf_open_llm_v2_240829.csv +gpt_neox_20b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +gpt_neox_20b,HFv2 IFEval,25.87,,hf_open_llm_v2_240829.csv +gpt_neox_20b,HFv2 MMLU Pro,1.73,,hf_open_llm_v2_240829.csv +gpt_neox_20b,HFv2 Math Level 5,0.6,,hf_open_llm_v2_240829.csv +gpt_neox_20b,HFv2 MuSR,2.82,,hf_open_llm_v2_240829.csv +gpt_sw3_40b,HF OpenLLM v2,4.68,,hf_open_llm_v2_240829.csv +gpt_sw3_40b,HFv2 BBH,6.89,,hf_open_llm_v2_240829.csv +gpt_sw3_40b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +gpt_sw3_40b,HFv2 IFEval,14.7,,hf_open_llm_v2_240829.csv +gpt_sw3_40b,HFv2 MMLU Pro,3.06,,hf_open_llm_v2_240829.csv +gpt_sw3_40b,HFv2 Math Level 5,0.6,,hf_open_llm_v2_240829.csv +gpt_sw3_40b,HFv2 MuSR,2.84,,hf_open_llm_v2_240829.csv +granite_7b_base,HF OpenLLM v2,7.75,,hf_open_llm_v2_240829.csv +granite_7b_base,HFv2 BBH,9.05,,hf_open_llm_v2_240829.csv +granite_7b_base,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +granite_7b_base,HFv2 IFEval,24.14,,hf_open_llm_v2_240829.csv +granite_7b_base,HFv2 MMLU Pro,9.27,,hf_open_llm_v2_240829.csv +granite_7b_base,HFv2 Math Level 5,0.6,,hf_open_llm_v2_240829.csv +granite_7b_base,HFv2 MuSR,3.4,,hf_open_llm_v2_240829.csv +gritlm_7b_kto,HF OpenLLM v2,19.15,,hf_open_llm_v2_240829.csv +gritlm_7b_kto,HFv2 BBH,27.9,,hf_open_llm_v2_240829.csv +gritlm_7b_kto,HFv2 GPQA,6.38,,hf_open_llm_v2_240829.csv +gritlm_7b_kto,HFv2 IFEval,53.1,,hf_open_llm_v2_240829.csv +gritlm_7b_kto,HFv2 MMLU Pro,18.67,,hf_open_llm_v2_240829.csv +gritlm_7b_kto,HFv2 Math Level 5,2.19,,hf_open_llm_v2_240829.csv +gritlm_7b_kto,HFv2 MuSR,6.64,,hf_open_llm_v2_240829.csv +gritlm_8x7b_kto,HF OpenLLM v2,25.62,,hf_open_llm_v2_240829.csv +gritlm_8x7b_kto,HFv2 BBH,40.83,,hf_open_llm_v2_240829.csv +gritlm_8x7b_kto,HFv2 GPQA,6.15,,hf_open_llm_v2_240829.csv +gritlm_8x7b_kto,HFv2 IFEval,57.14,,hf_open_llm_v2_240829.csv +gritlm_8x7b_kto,HFv2 MMLU Pro,29.42,,hf_open_llm_v2_240829.csv +gritlm_8x7b_kto,HFv2 Math Level 5,8.53,,hf_open_llm_v2_240829.csv +gritlm_8x7b_kto,HFv2 MuSR,11.67,,hf_open_llm_v2_240829.csv +h2o_danube3_4b_chat,HF OpenLLM v2,11.36,,hf_open_llm_v2_240829.csv +h2o_danube3_4b_chat,HFv2 BBH,8.84,,hf_open_llm_v2_240829.csv +h2o_danube3_4b_chat,HFv2 GPQA,1.34,,hf_open_llm_v2_240829.csv +h2o_danube3_4b_chat,HFv2 IFEval,36.29,,hf_open_llm_v2_240829.csv +h2o_danube3_4b_chat,HFv2 MMLU Pro,13.65,,hf_open_llm_v2_240829.csv +h2o_danube3_4b_chat,HFv2 Math Level 5,2.79,,hf_open_llm_v2_240829.csv +h2o_danube3_4b_chat,HFv2 MuSR,5.23,,hf_open_llm_v2_240829.csv +hare1_0_beta,HF OpenLLM v2,12.38,,hf_open_llm_v2_240829.csv +hare1_0_beta,HFv2 BBH,14.09,,hf_open_llm_v2_240829.csv +hare1_0_beta,HFv2 GPQA,0.67,,hf_open_llm_v2_240829.csv +hare1_0_beta,HFv2 IFEval,34.71,,hf_open_llm_v2_240829.csv +hare1_0_beta,HFv2 MMLU Pro,8.35,,hf_open_llm_v2_240829.csv +hare1_0_beta,HFv2 Math Level 5,0.76,,hf_open_llm_v2_240829.csv +hare1_0_beta,HFv2 MuSR,15.72,,hf_open_llm_v2_240829.csv +hare_1_1b_base,HF OpenLLM v2,1.95,,hf_open_llm_v2_240829.csv +hare_1_1b_base,HFv2 BBH,1.72,,hf_open_llm_v2_240829.csv +hare_1_1b_base,HFv2 GPQA,1.23,,hf_open_llm_v2_240829.csv +hare_1_1b_base,HFv2 IFEval,0.12,,hf_open_llm_v2_240829.csv +hare_1_1b_base,HFv2 MMLU Pro,1.04,,hf_open_llm_v2_240829.csv +hare_1_1b_base,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +hare_1_1b_base,HFv2 MuSR,7.58,,hf_open_llm_v2_240829.csv +hare_1_1b_base_0_5v,HF OpenLLM v2,10.78,,hf_open_llm_v2_240829.csv +hare_1_1b_base_0_5v,HFv2 BBH,5.14,,hf_open_llm_v2_240829.csv +hare_1_1b_base_0_5v,HFv2 GPQA,2.01,,hf_open_llm_v2_240829.csv +hare_1_1b_base_0_5v,HFv2 IFEval,36.33,,hf_open_llm_v2_240829.csv +hare_1_1b_base_0_5v,HFv2 MMLU Pro,8.08,,hf_open_llm_v2_240829.csv +hare_1_1b_base_0_5v,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv +hare_1_1b_base_0_5v,HFv2 MuSR,12.44,,hf_open_llm_v2_240829.csv +hebrew_gemma_11b_instruct,HF OpenLLM v2,13.81,,hf_open_llm_v2_240829.csv +hebrew_gemma_11b_instruct,HFv2 BBH,16.86,,hf_open_llm_v2_240829.csv +hebrew_gemma_11b_instruct,HFv2 GPQA,3.47,,hf_open_llm_v2_240829.csv +hebrew_gemma_11b_instruct,HFv2 IFEval,30.21,,hf_open_llm_v2_240829.csv +hebrew_gemma_11b_instruct,HFv2 MMLU Pro,17.27,,hf_open_llm_v2_240829.csv +hebrew_gemma_11b_instruct,HFv2 Math Level 5,5.06,,hf_open_llm_v2_240829.csv +hebrew_gemma_11b_instruct,HFv2 MuSR,9.97,,hf_open_llm_v2_240829.csv +helpingai_15b,HF OpenLLM v2,4.52,,hf_open_llm_v2_240829.csv +helpingai_15b,HFv2 BBH,1.82,,hf_open_llm_v2_240829.csv +helpingai_15b,HFv2 GPQA,1.01,,hf_open_llm_v2_240829.csv +helpingai_15b,HFv2 IFEval,20.3,,hf_open_llm_v2_240829.csv +helpingai_15b,HFv2 MMLU Pro,1.24,,hf_open_llm_v2_240829.csv +helpingai_15b,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +helpingai_15b,HFv2 MuSR,2.73,,hf_open_llm_v2_240829.csv +hermes_2_pro_llama3_8b,HF OpenLLM v2,21.63,,hf_open_llm_v2_240829.csv +hermes_2_pro_llama3_8b,HFv2 BBH,30.67,,hf_open_llm_v2_240829.csv +hermes_2_pro_llama3_8b,HFv2 GPQA,5.7,,hf_open_llm_v2_240829.csv +hermes_2_pro_llama3_8b,HFv2 IFEval,53.62,,hf_open_llm_v2_240829.csv +hermes_2_pro_llama3_8b,HFv2 MMLU Pro,22.8,,hf_open_llm_v2_240829.csv +hermes_2_pro_llama3_8b,HFv2 Math Level 5,5.74,,hf_open_llm_v2_240829.csv +hermes_2_pro_llama3_8b,HFv2 MuSR,11.25,,hf_open_llm_v2_240829.csv +hermes_2_pro_mistral_7b,HF OpenLLM v2,21.64,,hf_open_llm_v2_240829.csv +hermes_2_pro_mistral_7b,HFv2 BBH,29.43,,hf_open_llm_v2_240829.csv +hermes_2_pro_mistral_7b,HFv2 GPQA,3.13,,hf_open_llm_v2_240829.csv +hermes_2_pro_mistral_7b,HFv2 IFEval,56.68,,hf_open_llm_v2_240829.csv +hermes_2_pro_mistral_7b,HFv2 MMLU Pro,21.63,,hf_open_llm_v2_240829.csv +hermes_2_pro_mistral_7b,HFv2 Math Level 5,4.83,,hf_open_llm_v2_240829.csv +hermes_2_pro_mistral_7b,HFv2 MuSR,14.13,,hf_open_llm_v2_240829.csv +hermes_3_llama3_1_70b,HF OpenLLM v2,37.31,,hf_open_llm_v2_240829.csv +hermes_3_llama3_1_70b,HFv2 BBH,53.77,,hf_open_llm_v2_240829.csv +hermes_3_llama3_1_70b,HFv2 GPQA,14.88,,hf_open_llm_v2_240829.csv +hermes_3_llama3_1_70b,HFv2 IFEval,76.61,,hf_open_llm_v2_240829.csv +hermes_3_llama3_1_70b,HFv2 MMLU Pro,41.41,,hf_open_llm_v2_240829.csv +hermes_3_llama3_1_70b,HFv2 Math Level 5,13.75,,hf_open_llm_v2_240829.csv +hermes_3_llama3_1_70b,HFv2 MuSR,23.43,,hf_open_llm_v2_240829.csv +hermes_3_llama3_1_8b,HF OpenLLM v2,23.49,,hf_open_llm_v2_240829.csv +hermes_3_llama3_1_8b,HFv2 BBH,30.72,,hf_open_llm_v2_240829.csv +hermes_3_llama3_1_8b,HFv2 GPQA,6.38,,hf_open_llm_v2_240829.csv +hermes_3_llama3_1_8b,HFv2 IFEval,61.7,,hf_open_llm_v2_240829.csv +hermes_3_llama3_1_8b,HFv2 MMLU Pro,23.77,,hf_open_llm_v2_240829.csv +hermes_3_llama3_1_8b,HFv2 Math Level 5,4.76,,hf_open_llm_v2_240829.csv +hermes_3_llama3_1_8b,HFv2 MuSR,13.62,,hf_open_llm_v2_240829.csv +humanish_rp_llama3_1_8b,HF OpenLLM v2,25.17,,hf_open_llm_v2_240829.csv +humanish_rp_llama3_1_8b,HFv2 BBH,29.96,,hf_open_llm_v2_240829.csv +humanish_rp_llama3_1_8b,HFv2 GPQA,4.92,,hf_open_llm_v2_240829.csv +humanish_rp_llama3_1_8b,HFv2 IFEval,66.69,,hf_open_llm_v2_240829.csv +humanish_rp_llama3_1_8b,HFv2 MMLU Pro,27.52,,hf_open_llm_v2_240829.csv +humanish_rp_llama3_1_8b,HFv2 Math Level 5,13.67,,hf_open_llm_v2_240829.csv +humanish_rp_llama3_1_8b,HFv2 MuSR,8.27,,hf_open_llm_v2_240829.csv +infinity_instruct_3m_0625_llama3_8b,HF OpenLLM v2,21.47,,hf_open_llm_v2_240829.csv +infinity_instruct_3m_0625_llama3_8b,HFv2 BBH,28.99,,hf_open_llm_v2_240829.csv +infinity_instruct_3m_0625_llama3_8b,HFv2 GPQA,3.36,,hf_open_llm_v2_240829.csv +infinity_instruct_3m_0625_llama3_8b,HFv2 IFEval,60.5,,hf_open_llm_v2_240829.csv +infinity_instruct_3m_0625_llama3_8b,HFv2 MMLU Pro,25.02,,hf_open_llm_v2_240829.csv +infinity_instruct_3m_0625_llama3_8b,HFv2 Math Level 5,5.29,,hf_open_llm_v2_240829.csv +infinity_instruct_3m_0625_llama3_8b,HFv2 MuSR,5.67,,hf_open_llm_v2_240829.csv +instructlm_500m,HF OpenLLM v2,2.85,,hf_open_llm_v2_240829.csv +instructlm_500m,HFv2 BBH,2.32,,hf_open_llm_v2_240829.csv +instructlm_500m,HFv2 GPQA,0.89,,hf_open_llm_v2_240829.csv +instructlm_500m,HFv2 IFEval,10.28,,hf_open_llm_v2_240829.csv +instructlm_500m,HFv2 MMLU Pro,1.57,,hf_open_llm_v2_240829.csv +instructlm_500m,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +instructlm_500m,HFv2 MuSR,2.07,,hf_open_llm_v2_240829.csv +internlm2_1_8b,HF OpenLLM v2,8.58,,hf_open_llm_v2_240829.csv +internlm2_1_8b,HFv2 BBH,13.63,,hf_open_llm_v2_240829.csv +internlm2_1_8b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +internlm2_1_8b,HFv2 IFEval,21.98,,hf_open_llm_v2_240829.csv +internlm2_1_8b,HFv2 MMLU Pro,6.54,,hf_open_llm_v2_240829.csv +internlm2_1_8b,HFv2 Math Level 5,1.13,,hf_open_llm_v2_240829.csv +internlm2_1_8b,HFv2 MuSR,8.23,,hf_open_llm_v2_240829.csv +internlm2_5_1_8b_chat,HF OpenLLM v2,12.11,,hf_open_llm_v2_240829.csv +internlm2_5_1_8b_chat,HFv2 BBH,21.03,,hf_open_llm_v2_240829.csv +internlm2_5_1_8b_chat,HFv2 GPQA,5.37,,hf_open_llm_v2_240829.csv +internlm2_5_1_8b_chat,HFv2 IFEval,38.49,,hf_open_llm_v2_240829.csv +internlm2_5_1_8b_chat,HFv2 MMLU Pro,3.32,,hf_open_llm_v2_240829.csv +internlm2_5_1_8b_chat,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +internlm2_5_1_8b_chat,HFv2 MuSR,4.42,,hf_open_llm_v2_240829.csv +internlm2_5_20b_chat,HF OpenLLM v2,32.08,,hf_open_llm_v2_240829.csv +internlm2_5_20b_chat,HFv2 BBH,62.83,,hf_open_llm_v2_240829.csv +internlm2_5_20b_chat,HFv2 GPQA,9.51,,hf_open_llm_v2_240829.csv +internlm2_5_20b_chat,HFv2 IFEval,70.1,,hf_open_llm_v2_240829.csv +internlm2_5_20b_chat,HFv2 MMLU Pro,33.31,,hf_open_llm_v2_240829.csv +internlm2_5_20b_chat,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +internlm2_5_20b_chat,HFv2 MuSR,16.74,,hf_open_llm_v2_240829.csv +internlm2_5_7b_chat,HF OpenLLM v2,30.46,,hf_open_llm_v2_240829.csv +internlm2_5_7b_chat,HFv2 BBH,57.67,,hf_open_llm_v2_240829.csv +internlm2_5_7b_chat,HFv2 GPQA,10.63,,hf_open_llm_v2_240829.csv +internlm2_5_7b_chat,HFv2 IFEval,61.4,,hf_open_llm_v2_240829.csv +internlm2_5_7b_chat,HFv2 MMLU Pro,30.42,,hf_open_llm_v2_240829.csv +internlm2_5_7b_chat,HFv2 Math Level 5,8.31,,hf_open_llm_v2_240829.csv +internlm2_5_7b_chat,HFv2 MuSR,14.35,,hf_open_llm_v2_240829.csv +internlm2_chat_1_8b,HF OpenLLM v2,10.5,,hf_open_llm_v2_240829.csv +internlm2_chat_1_8b,HFv2 BBH,20.67,,hf_open_llm_v2_240829.csv +internlm2_chat_1_8b,HFv2 GPQA,2.13,,hf_open_llm_v2_240829.csv +internlm2_chat_1_8b,HFv2 IFEval,23.87,,hf_open_llm_v2_240829.csv +internlm2_chat_1_8b,HFv2 MMLU Pro,9.33,,hf_open_llm_v2_240829.csv +internlm2_chat_1_8b,HFv2 Math Level 5,2.42,,hf_open_llm_v2_240829.csv +internlm2_chat_1_8b,HFv2 MuSR,4.61,,hf_open_llm_v2_240829.csv +jamba_v0_1,HF OpenLLM v2,9.1,,hf_open_llm_v2_240829.csv +jamba_v0_1,HFv2 BBH,10.72,,hf_open_llm_v2_240829.csv +jamba_v0_1,HFv2 GPQA,2.46,,hf_open_llm_v2_240829.csv +jamba_v0_1,HFv2 IFEval,20.26,,hf_open_llm_v2_240829.csv +jamba_v0_1,HFv2 MMLU Pro,16.45,,hf_open_llm_v2_240829.csv +jamba_v0_1,HFv2 Math Level 5,0.98,,hf_open_llm_v2_240829.csv +jamba_v0_1,HFv2 MuSR,3.71,,hf_open_llm_v2_240829.csv +josiev4o_8b_stage1_v4,HF OpenLLM v2,15.57,,hf_open_llm_v2_240829.csv +josiev4o_8b_stage1_v4,HFv2 BBH,25.79,,hf_open_llm_v2_240829.csv +josiev4o_8b_stage1_v4,HFv2 GPQA,5.59,,hf_open_llm_v2_240829.csv +josiev4o_8b_stage1_v4,HFv2 IFEval,25.53,,hf_open_llm_v2_240829.csv +josiev4o_8b_stage1_v4,HFv2 MMLU Pro,25.74,,hf_open_llm_v2_240829.csv +josiev4o_8b_stage1_v4,HFv2 Math Level 5,4.68,,hf_open_llm_v2_240829.csv +josiev4o_8b_stage1_v4,HFv2 MuSR,6.08,,hf_open_llm_v2_240829.csv +k2,HF OpenLLM v2,14.53,,hf_open_llm_v2_240829.csv +k2,HFv2 BBH,28.22,,hf_open_llm_v2_240829.csv +k2,HFv2 GPQA,3.58,,hf_open_llm_v2_240829.csv +k2,HFv2 IFEval,22.52,,hf_open_llm_v2_240829.csv +k2,HFv2 MMLU Pro,22.27,,hf_open_llm_v2_240829.csv +k2,HFv2 Math Level 5,2.04,,hf_open_llm_v2_240829.csv +k2,HFv2 MuSR,8.55,,hf_open_llm_v2_240829.csv +k2_chat,HF OpenLLM v2,22.93,,hf_open_llm_v2_240829.csv +k2_chat,HFv2 BBH,33.79,,hf_open_llm_v2_240829.csv +k2_chat,HFv2 GPQA,7.49,,hf_open_llm_v2_240829.csv +k2_chat,HFv2 IFEval,51.52,,hf_open_llm_v2_240829.csv +k2_chat,HFv2 MMLU Pro,26.34,,hf_open_llm_v2_240829.csv +k2_chat,HFv2 Math Level 5,1.59,,hf_open_llm_v2_240829.csv +k2_chat,HFv2 MuSR,16.82,,hf_open_llm_v2_240829.csv +lion_gemma_2b_dpo_v1_0,HF OpenLLM v2,11.48,,hf_open_llm_v2_240829.csv +lion_gemma_2b_dpo_v1_0,HFv2 BBH,14.59,,hf_open_llm_v2_240829.csv +lion_gemma_2b_dpo_v1_0,HFv2 GPQA,0.45,,hf_open_llm_v2_240829.csv +lion_gemma_2b_dpo_v1_0,HFv2 IFEval,32.78,,hf_open_llm_v2_240829.csv +lion_gemma_2b_dpo_v1_0,HFv2 MMLU Pro,7.4,,hf_open_llm_v2_240829.csv +lion_gemma_2b_dpo_v1_0,HFv2 Math Level 5,4.31,,hf_open_llm_v2_240829.csv +lion_gemma_2b_dpo_v1_0,HFv2 MuSR,9.83,,hf_open_llm_v2_240829.csv +lion_gemma_2b_odpo_v1_0,HF OpenLLM v2,11.36,,hf_open_llm_v2_240829.csv +lion_gemma_2b_odpo_v1_0,HFv2 BBH,14.02,,hf_open_llm_v2_240829.csv +lion_gemma_2b_odpo_v1_0,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +lion_gemma_2b_odpo_v1_0,HFv2 IFEval,30.66,,hf_open_llm_v2_240829.csv +lion_gemma_2b_odpo_v1_0,HFv2 MMLU Pro,7.69,,hf_open_llm_v2_240829.csv +lion_gemma_2b_odpo_v1_0,HFv2 Math Level 5,3.7,,hf_open_llm_v2_240829.csv +lion_gemma_2b_odpo_v1_0,HFv2 MuSR,12.06,,hf_open_llm_v2_240829.csv +lion_gemma_2b_sft_v1_0,HF OpenLLM v2,12.33,,hf_open_llm_v2_240829.csv +lion_gemma_2b_sft_v1_0,HFv2 BBH,14.12,,hf_open_llm_v2_240829.csv +lion_gemma_2b_sft_v1_0,HFv2 GPQA,0.78,,hf_open_llm_v2_240829.csv +lion_gemma_2b_sft_v1_0,HFv2 IFEval,36.92,,hf_open_llm_v2_240829.csv +lion_gemma_2b_sft_v1_0,HFv2 MMLU Pro,8.69,,hf_open_llm_v2_240829.csv +lion_gemma_2b_sft_v1_0,HFv2 Math Level 5,5.14,,hf_open_llm_v2_240829.csv +lion_gemma_2b_sft_v1_0,HFv2 MuSR,8.31,,hf_open_llm_v2_240829.csv +lion_llama3_8b_dpo_v1_0,HF OpenLLM v2,21.34,,hf_open_llm_v2_240829.csv +lion_llama3_8b_dpo_v1_0,HFv2 BBH,30.36,,hf_open_llm_v2_240829.csv +lion_llama3_8b_dpo_v1_0,HFv2 GPQA,4.14,,hf_open_llm_v2_240829.csv +lion_llama3_8b_dpo_v1_0,HFv2 IFEval,49.57,,hf_open_llm_v2_240829.csv +lion_llama3_8b_dpo_v1_0,HFv2 MMLU Pro,24.65,,hf_open_llm_v2_240829.csv +lion_llama3_8b_dpo_v1_0,HFv2 Math Level 5,9.06,,hf_open_llm_v2_240829.csv +lion_llama3_8b_dpo_v1_0,HFv2 MuSR,10.28,,hf_open_llm_v2_240829.csv +lion_llama3_8b_odpo_v1_0,HF OpenLLM v2,19.29,,hf_open_llm_v2_240829.csv +lion_llama3_8b_odpo_v1_0,HFv2 BBH,30.46,,hf_open_llm_v2_240829.csv +lion_llama3_8b_odpo_v1_0,HFv2 GPQA,4.7,,hf_open_llm_v2_240829.csv +lion_llama3_8b_odpo_v1_0,HFv2 IFEval,39.68,,hf_open_llm_v2_240829.csv +lion_llama3_8b_odpo_v1_0,HFv2 MMLU Pro,23.92,,hf_open_llm_v2_240829.csv +lion_llama3_8b_odpo_v1_0,HFv2 Math Level 5,7.25,,hf_open_llm_v2_240829.csv +lion_llama3_8b_odpo_v1_0,HFv2 MuSR,9.72,,hf_open_llm_v2_240829.csv +lion_llama3_8b_sft_v1_0,HF OpenLLM v2,20.26,,hf_open_llm_v2_240829.csv +lion_llama3_8b_sft_v1_0,HFv2 BBH,30.88,,hf_open_llm_v2_240829.csv +lion_llama3_8b_sft_v1_0,HFv2 GPQA,3.69,,hf_open_llm_v2_240829.csv +lion_llama3_8b_sft_v1_0,HFv2 IFEval,38.17,,hf_open_llm_v2_240829.csv +lion_llama3_8b_sft_v1_0,HFv2 MMLU Pro,24.86,,hf_open_llm_v2_240829.csv +lion_llama3_8b_sft_v1_0,HFv2 Math Level 5,8.46,,hf_open_llm_v2_240829.csv +lion_llama3_8b_sft_v1_0,HFv2 MuSR,15.48,,hf_open_llm_v2_240829.csv +llama3_1_70b,HF OpenLLM v2,25.91,,hf_open_llm_v2_240829.csv +llama3_1_70b,HFv2 BBH,46.4,,hf_open_llm_v2_240829.csv +llama3_1_70b,HFv2 GPQA,18.34,,hf_open_llm_v2_240829.csv +llama3_1_70b,HFv2 IFEval,16.84,,hf_open_llm_v2_240829.csv +llama3_1_70b,HFv2 MMLU Pro,40.6,,hf_open_llm_v2_240829.csv +llama3_1_70b,HFv2 Math Level 5,16.69,,hf_open_llm_v2_240829.csv +llama3_1_70b,HFv2 MuSR,16.58,,hf_open_llm_v2_240829.csv +llama3_1_70b_instruct,HF OpenLLM v2,41.74,,hf_open_llm_v2_240829.csv +llama3_1_70b_instruct,HFv2 BBH,55.93,,hf_open_llm_v2_240829.csv +llama3_1_70b_instruct,HFv2 GPQA,14.21,,hf_open_llm_v2_240829.csv +llama3_1_70b_instruct,HFv2 IFEval,86.69,,hf_open_llm_v2_240829.csv +llama3_1_70b_instruct,HFv2 MMLU Pro,47.88,,hf_open_llm_v2_240829.csv +llama3_1_70b_instruct,HFv2 Math Level 5,28.02,,hf_open_llm_v2_240829.csv +llama3_1_70b_instruct,HFv2 MuSR,17.69,,hf_open_llm_v2_240829.csv +llama3_1_8b,HF OpenLLM v2,13.78,,hf_open_llm_v2_240829.csv +llama3_1_8b,HFv2 BBH,25.29,,hf_open_llm_v2_240829.csv +llama3_1_8b,HFv2 GPQA,6.15,,hf_open_llm_v2_240829.csv +llama3_1_8b,HFv2 IFEval,12.7,,hf_open_llm_v2_240829.csv +llama3_1_8b,HFv2 MMLU Pro,24.95,,hf_open_llm_v2_240829.csv +llama3_1_8b,HFv2 Math Level 5,4.61,,hf_open_llm_v2_240829.csv +llama3_1_8b,HFv2 MuSR,8.98,,hf_open_llm_v2_240829.csv +llama3_1_8b_fireplace2,HF OpenLLM v2,18.05,,hf_open_llm_v2_240829.csv +llama3_1_8b_fireplace2,HFv2 BBH,24.09,,hf_open_llm_v2_240829.csv +llama3_1_8b_fireplace2,HFv2 GPQA,5.26,,hf_open_llm_v2_240829.csv +llama3_1_8b_fireplace2,HFv2 IFEval,53.28,,hf_open_llm_v2_240829.csv +llama3_1_8b_fireplace2,HFv2 MMLU Pro,15.82,,hf_open_llm_v2_240829.csv +llama3_1_8b_fireplace2,HFv2 Math Level 5,5.66,,hf_open_llm_v2_240829.csv +llama3_1_8b_fireplace2,HFv2 MuSR,4.22,,hf_open_llm_v2_240829.csv +llama3_1_8b_instruct,HF OpenLLM v2,27.91,,hf_open_llm_v2_240829.csv +llama3_1_8b_instruct,HFv2 BBH,29.89,,hf_open_llm_v2_240829.csv +llama3_1_8b_instruct,HFv2 GPQA,2.35,,hf_open_llm_v2_240829.csv +llama3_1_8b_instruct,HFv2 IFEval,78.56,,hf_open_llm_v2_240829.csv +llama3_1_8b_instruct,HFv2 MMLU Pro,30.68,,hf_open_llm_v2_240829.csv +llama3_1_8b_instruct,HFv2 Math Level 5,17.6,,hf_open_llm_v2_240829.csv +llama3_1_8b_instruct,HFv2 MuSR,8.41,,hf_open_llm_v2_240829.csv +llama3_1_instruct_nsfw_pretrained_e1_plus_reddit,HF OpenLLM v2,20.74,,hf_open_llm_v2_240829.csv +llama3_1_instruct_nsfw_pretrained_e1_plus_reddit,HFv2 BBH,28.02,,hf_open_llm_v2_240829.csv +llama3_1_instruct_nsfw_pretrained_e1_plus_reddit,HFv2 GPQA,5.59,,hf_open_llm_v2_240829.csv +llama3_1_instruct_nsfw_pretrained_e1_plus_reddit,HFv2 IFEval,45.21,,hf_open_llm_v2_240829.csv +llama3_1_instruct_nsfw_pretrained_e1_plus_reddit,HFv2 MMLU Pro,28.5,,hf_open_llm_v2_240829.csv +llama3_1_instruct_nsfw_pretrained_e1_plus_reddit,HFv2 Math Level 5,8.84,,hf_open_llm_v2_240829.csv +llama3_1_instruct_nsfw_pretrained_e1_plus_reddit,HFv2 MuSR,8.3,,hf_open_llm_v2_240829.csv +llama3_70b,HF OpenLLM v2,26.37,,hf_open_llm_v2_240829.csv +llama3_70b,HFv2 BBH,48.71,,hf_open_llm_v2_240829.csv +llama3_70b,HFv2 GPQA,19.69,,hf_open_llm_v2_240829.csv +llama3_70b,HFv2 IFEval,16.03,,hf_open_llm_v2_240829.csv +llama3_70b,HFv2 MMLU Pro,41.21,,hf_open_llm_v2_240829.csv +llama3_70b,HFv2 Math Level 5,16.54,,hf_open_llm_v2_240829.csv +llama3_70b,HFv2 MuSR,16.01,,hf_open_llm_v2_240829.csv +llama3_70b_instruct,HF OpenLLM v2,36.18,,hf_open_llm_v2_240829.csv +llama3_70b_instruct,HFv2 BBH,50.19,,hf_open_llm_v2_240829.csv +llama3_70b_instruct,HFv2 GPQA,4.92,,hf_open_llm_v2_240829.csv +llama3_70b_instruct,HFv2 IFEval,80.99,,hf_open_llm_v2_240829.csv +llama3_70b_instruct,HFv2 MMLU Pro,46.74,,hf_open_llm_v2_240829.csv +llama3_70b_instruct,HFv2 Math Level 5,23.34,,hf_open_llm_v2_240829.csv +llama3_70b_instruct,HFv2 MuSR,10.92,,hf_open_llm_v2_240829.csv +llama3_70b_shiningvaliant2,HF OpenLLM v2,30.45,,hf_open_llm_v2_240829.csv +llama3_70b_shiningvaliant2,HFv2 BBH,46.71,,hf_open_llm_v2_240829.csv +llama3_70b_shiningvaliant2,HFv2 GPQA,10.74,,hf_open_llm_v2_240829.csv +llama3_70b_shiningvaliant2,HFv2 IFEval,61.22,,hf_open_llm_v2_240829.csv +llama3_70b_shiningvaliant2,HFv2 MMLU Pro,43.31,,hf_open_llm_v2_240829.csv +llama3_70b_shiningvaliant2,HFv2 Math Level 5,7.1,,hf_open_llm_v2_240829.csv +llama3_70b_shiningvaliant2,HFv2 MuSR,13.64,,hf_open_llm_v2_240829.csv +llama3_8b,HF OpenLLM v2,13.41,,hf_open_llm_v2_240829.csv +llama3_8b,HFv2 BBH,24.5,,hf_open_llm_v2_240829.csv +llama3_8b,HFv2 GPQA,7.38,,hf_open_llm_v2_240829.csv +llama3_8b,HFv2 IFEval,14.55,,hf_open_llm_v2_240829.csv +llama3_8b,HFv2 MMLU Pro,24.55,,hf_open_llm_v2_240829.csv +llama3_8b,HFv2 Math Level 5,3.25,,hf_open_llm_v2_240829.csv +llama3_8b,HFv2 MuSR,6.24,,hf_open_llm_v2_240829.csv +llama3_8b_instruct,HF OpenLLM v2,23.91,,hf_open_llm_v2_240829.csv +llama3_8b_instruct,HFv2 BBH,28.24,,hf_open_llm_v2_240829.csv +llama3_8b_instruct,HFv2 GPQA,5.7,,hf_open_llm_v2_240829.csv +llama3_8b_instruct,HFv2 IFEval,74.08,,hf_open_llm_v2_240829.csv +llama3_8b_instruct,HFv2 MMLU Pro,29.6,,hf_open_llm_v2_240829.csv +llama3_8b_instruct,HFv2 Math Level 5,8.69,,hf_open_llm_v2_240829.csv +llama3_8b_instruct,HFv2 MuSR,5.4,,hf_open_llm_v2_240829.csv +llama3_8b_instruct_gradient_1048k,HF OpenLLM v2,18.12,,hf_open_llm_v2_240829.csv +llama3_8b_instruct_gradient_1048k,HFv2 BBH,21.01,,hf_open_llm_v2_240829.csv +llama3_8b_instruct_gradient_1048k,HFv2 GPQA,3.69,,hf_open_llm_v2_240829.csv +llama3_8b_instruct_gradient_1048k,HFv2 IFEval,44.56,,hf_open_llm_v2_240829.csv +llama3_8b_instruct_gradient_1048k,HFv2 MMLU Pro,21.56,,hf_open_llm_v2_240829.csv +llama3_8b_instruct_gradient_1048k,HFv2 Math Level 5,4.38,,hf_open_llm_v2_240829.csv +llama3_8b_instruct_gradient_1048k,HFv2 MuSR,13.52,,hf_open_llm_v2_240829.csv +llama3_8b_magpie_align_v0_1,HF OpenLLM v2,16.47,,hf_open_llm_v2_240829.csv +llama3_8b_magpie_align_v0_1,HFv2 BBH,26.69,,hf_open_llm_v2_240829.csv +llama3_8b_magpie_align_v0_1,HFv2 GPQA,3.58,,hf_open_llm_v2_240829.csv +llama3_8b_magpie_align_v0_1,HFv2 IFEval,41.18,,hf_open_llm_v2_240829.csv +llama3_8b_magpie_align_v0_1,HFv2 MMLU Pro,22.29,,hf_open_llm_v2_240829.csv +llama3_8b_magpie_align_v0_1,HFv2 Math Level 5,3.4,,hf_open_llm_v2_240829.csv +llama3_8b_magpie_align_v0_1,HFv2 MuSR,1.92,,hf_open_llm_v2_240829.csv +llama3_8b_magpie_align_v0_3,HF OpenLLM v2,16.89,,hf_open_llm_v2_240829.csv +llama3_8b_magpie_align_v0_3,HFv2 BBH,24.31,,hf_open_llm_v2_240829.csv +llama3_8b_magpie_align_v0_3,HFv2 GPQA,2.01,,hf_open_llm_v2_240829.csv +llama3_8b_magpie_align_v0_3,HFv2 IFEval,44.97,,hf_open_llm_v2_240829.csv +llama3_8b_magpie_align_v0_3,HFv2 MMLU Pro,23.71,,hf_open_llm_v2_240829.csv +llama3_8b_magpie_align_v0_3,HFv2 Math Level 5,2.57,,hf_open_llm_v2_240829.csv +llama3_8b_magpie_align_v0_3,HFv2 MuSR,3.74,,hf_open_llm_v2_240829.csv +llama3_cantonese_8b_instruct,HF OpenLLM v2,24.16,,hf_open_llm_v2_240829.csv +llama3_cantonese_8b_instruct,HFv2 BBH,26.79,,hf_open_llm_v2_240829.csv +llama3_cantonese_8b_instruct,HFv2 GPQA,5.82,,hf_open_llm_v2_240829.csv +llama3_cantonese_8b_instruct,HFv2 IFEval,66.69,,hf_open_llm_v2_240829.csv +llama3_cantonese_8b_instruct,HFv2 MMLU Pro,27.94,,hf_open_llm_v2_240829.csv +llama3_cantonese_8b_instruct,HFv2 Math Level 5,8.23,,hf_open_llm_v2_240829.csv +llama3_cantonese_8b_instruct,HFv2 MuSR,9.48,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_cpo_simpo,HF OpenLLM v2,24.48,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_cpo_simpo,HFv2 BBH,29.76,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_cpo_simpo,HFv2 GPQA,5.7,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_cpo_simpo,HFv2 IFEval,70.46,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_cpo_simpo,HFv2 MMLU Pro,29.84,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_cpo_simpo,HFv2 Math Level 5,7.7,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_cpo_simpo,HFv2 MuSR,3.42,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_simpo,HF OpenLLM v2,24.71,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_simpo,HFv2 BBH,28.23,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_simpo,HFv2 GPQA,5.37,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_simpo,HFv2 IFEval,73.47,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_simpo,HFv2 MMLU Pro,30.37,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_simpo,HFv2 Math Level 5,7.1,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_simpo,HFv2 MuSR,3.74,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_sppo_iter2,HF OpenLLM v2,23.78,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_sppo_iter2,HFv2 BBH,29.87,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_sppo_iter2,HFv2 GPQA,2.24,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_sppo_iter2,HFv2 IFEval,69.89,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_sppo_iter2,HFv2 MMLU Pro,29.91,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_sppo_iter2,HFv2 Math Level 5,8.76,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_sppo_iter2,HFv2 MuSR,2.0,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_sppo_iter3,HF OpenLLM v2,23.06,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_sppo_iter3,HFv2 BBH,29.72,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_sppo_iter3,HFv2 GPQA,2.01,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_sppo_iter3,HFv2 IFEval,67.03,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_sppo_iter3,HFv2 MMLU Pro,29.53,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_sppo_iter3,HFv2 Math Level 5,7.18,,hf_open_llm_v2_240829.csv +llama3_instruct_8b_sppo_iter3,HFv2 MuSR,2.89,,hf_open_llm_v2_240829.csv +llama3_korean_bllossom_8b,HF OpenLLM v2,20.09,,hf_open_llm_v2_240829.csv +llama3_korean_bllossom_8b,HFv2 BBH,26.93,,hf_open_llm_v2_240829.csv +llama3_korean_bllossom_8b,HFv2 GPQA,1.68,,hf_open_llm_v2_240829.csv +llama3_korean_bllossom_8b,HFv2 IFEval,51.13,,hf_open_llm_v2_240829.csv +llama3_korean_bllossom_8b,HFv2 MMLU Pro,28.82,,hf_open_llm_v2_240829.csv +llama3_korean_bllossom_8b,HFv2 Math Level 5,8.38,,hf_open_llm_v2_240829.csv +llama3_korean_bllossom_8b,HFv2 MuSR,3.63,,hf_open_llm_v2_240829.csv +llama3_neuralhercules_5_0_8b,HF OpenLLM v2,15.93,,hf_open_llm_v2_240829.csv +llama3_neuralhercules_5_0_8b,HFv2 BBH,16.34,,hf_open_llm_v2_240829.csv +llama3_neuralhercules_5_0_8b,HFv2 GPQA,2.46,,hf_open_llm_v2_240829.csv +llama3_neuralhercules_5_0_8b,HFv2 IFEval,44.89,,hf_open_llm_v2_240829.csv +llama3_neuralhercules_5_0_8b,HFv2 MMLU Pro,21.48,,hf_open_llm_v2_240829.csv +llama3_neuralhercules_5_0_8b,HFv2 Math Level 5,3.63,,hf_open_llm_v2_240829.csv +llama3_neuralhercules_5_0_8b,HFv2 MuSR,6.78,,hf_open_llm_v2_240829.csv +llama3_refueled,HF OpenLLM v2,22.73,,hf_open_llm_v2_240829.csv +llama3_refueled,HFv2 BBH,41.72,,hf_open_llm_v2_240829.csv +llama3_refueled,HFv2 GPQA,6.6,,hf_open_llm_v2_240829.csv +llama3_refueled,HFv2 IFEval,46.2,,hf_open_llm_v2_240829.csv +llama3_refueled,HFv2 MMLU Pro,23.28,,hf_open_llm_v2_240829.csv +llama3_refueled,HFv2 Math Level 5,3.93,,hf_open_llm_v2_240829.csv +llama3_refueled,HFv2 MuSR,14.64,,hf_open_llm_v2_240829.csv +llama3_tenyxchat_70b,HF OpenLLM v2,36.54,,hf_open_llm_v2_240829.csv +llama3_tenyxchat_70b,HFv2 BBH,49.62,,hf_open_llm_v2_240829.csv +llama3_tenyxchat_70b,HFv2 GPQA,6.82,,hf_open_llm_v2_240829.csv +llama3_tenyxchat_70b,HFv2 IFEval,80.87,,hf_open_llm_v2_240829.csv +llama3_tenyxchat_70b,HFv2 MMLU Pro,46.78,,hf_open_llm_v2_240829.csv +llama3_tenyxchat_70b,HFv2 Math Level 5,22.66,,hf_open_llm_v2_240829.csv +llama3_tenyxchat_70b,HFv2 MuSR,12.52,,hf_open_llm_v2_240829.csv +llama_160m_chat_v1,HF OpenLLM v2,4.1,,hf_open_llm_v2_240829.csv +llama_160m_chat_v1,HFv2 BBH,3.17,,hf_open_llm_v2_240829.csv +llama_160m_chat_v1,HFv2 GPQA,1.01,,hf_open_llm_v2_240829.csv +llama_160m_chat_v1,HFv2 IFEval,15.75,,hf_open_llm_v2_240829.csv +llama_160m_chat_v1,HFv2 MMLU Pro,1.51,,hf_open_llm_v2_240829.csv +llama_160m_chat_v1,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +llama_160m_chat_v1,HFv2 MuSR,3.17,,hf_open_llm_v2_240829.csv +llama_2_13b,HF OpenLLM v2,10.99,,hf_open_llm_v2_240829.csv +llama_2_13b,HFv2 BBH,17.22,,hf_open_llm_v2_240829.csv +llama_2_13b,HFv2 GPQA,4.14,,hf_open_llm_v2_240829.csv +llama_2_13b,HFv2 IFEval,24.82,,hf_open_llm_v2_240829.csv +llama_2_13b,HFv2 MMLU Pro,15.31,,hf_open_llm_v2_240829.csv +llama_2_13b,HFv2 Math Level 5,1.06,,hf_open_llm_v2_240829.csv +llama_2_13b,HFv2 MuSR,3.39,,hf_open_llm_v2_240829.csv +llama_2_13b_chat,HF OpenLLM v2,11.0,,hf_open_llm_v2_240829.csv +llama_2_13b_chat,HFv2 BBH,7.16,,hf_open_llm_v2_240829.csv +llama_2_13b_chat,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +llama_2_13b_chat,HFv2 IFEval,39.85,,hf_open_llm_v2_240829.csv +llama_2_13b_chat,HFv2 MMLU Pro,10.26,,hf_open_llm_v2_240829.csv +llama_2_13b_chat,HFv2 Math Level 5,0.6,,hf_open_llm_v2_240829.csv +llama_2_13b_chat,HFv2 MuSR,8.16,,hf_open_llm_v2_240829.csv +llama_2_70b,HF OpenLLM v2,18.25,,hf_open_llm_v2_240829.csv +llama_2_70b,HFv2 BBH,35.9,,hf_open_llm_v2_240829.csv +llama_2_70b,HFv2 GPQA,7.05,,hf_open_llm_v2_240829.csv +llama_2_70b,HFv2 IFEval,24.07,,hf_open_llm_v2_240829.csv +llama_2_70b,HFv2 MMLU Pro,30.2,,hf_open_llm_v2_240829.csv +llama_2_70b,HFv2 Math Level 5,2.49,,hf_open_llm_v2_240829.csv +llama_2_70b,HFv2 MuSR,9.78,,hf_open_llm_v2_240829.csv +llama_2_70b_chat,HF OpenLLM v2,12.73,,hf_open_llm_v2_240829.csv +llama_2_70b_chat,HFv2 BBH,4.61,,hf_open_llm_v2_240829.csv +llama_2_70b_chat,HFv2 GPQA,1.9,,hf_open_llm_v2_240829.csv +llama_2_70b_chat,HFv2 IFEval,49.58,,hf_open_llm_v2_240829.csv +llama_2_70b_chat,HFv2 MMLU Pro,15.92,,hf_open_llm_v2_240829.csv +llama_2_70b_chat,HFv2 Math Level 5,0.91,,hf_open_llm_v2_240829.csv +llama_2_70b_chat,HFv2 MuSR,3.48,,hf_open_llm_v2_240829.csv +llama_2_7b,HF OpenLLM v2,8.72,,hf_open_llm_v2_240829.csv +llama_2_7b,HFv2 BBH,10.35,,hf_open_llm_v2_240829.csv +llama_2_7b,HFv2 GPQA,2.24,,hf_open_llm_v2_240829.csv +llama_2_7b,HFv2 IFEval,25.19,,hf_open_llm_v2_240829.csv +llama_2_7b,HFv2 MMLU Pro,9.57,,hf_open_llm_v2_240829.csv +llama_2_7b,HFv2 Math Level 5,1.21,,hf_open_llm_v2_240829.csv +llama_2_7b,HFv2 MuSR,3.76,,hf_open_llm_v2_240829.csv +llama_2_7b_chat,HF OpenLLM v2,9.4,,hf_open_llm_v2_240829.csv +llama_2_7b_chat,HFv2 BBH,4.49,,hf_open_llm_v2_240829.csv +llama_2_7b_chat,HFv2 GPQA,0.56,,hf_open_llm_v2_240829.csv +llama_2_7b_chat,HFv2 IFEval,39.65,,hf_open_llm_v2_240829.csv +llama_2_7b_chat,HFv2 MMLU Pro,7.52,,hf_open_llm_v2_240829.csv +llama_2_7b_chat,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv +llama_2_7b_chat,HFv2 MuSR,3.48,,hf_open_llm_v2_240829.csv +llama_65b,HF OpenLLM v2,13.54,,hf_open_llm_v2_240829.csv +llama_65b,HFv2 BBH,25.25,,hf_open_llm_v2_240829.csv +llama_65b,HFv2 GPQA,3.47,,hf_open_llm_v2_240829.csv +llama_65b,HFv2 IFEval,25.26,,hf_open_llm_v2_240829.csv +llama_65b,HFv2 MMLU Pro,23.08,,hf_open_llm_v2_240829.csv +llama_65b,HFv2 Math Level 5,2.19,,hf_open_llm_v2_240829.csv +llama_65b,HFv2 MuSR,1.97,,hf_open_llm_v2_240829.csv +llama_pro_8b_instruct,HF OpenLLM v2,15.14,,hf_open_llm_v2_240829.csv +llama_pro_8b_instruct,HFv2 BBH,19.49,,hf_open_llm_v2_240829.csv +llama_pro_8b_instruct,HFv2 GPQA,3.24,,hf_open_llm_v2_240829.csv +llama_pro_8b_instruct,HFv2 IFEval,44.86,,hf_open_llm_v2_240829.csv +llama_pro_8b_instruct,HFv2 MMLU Pro,10.51,,hf_open_llm_v2_240829.csv +llama_pro_8b_instruct,HFv2 Math Level 5,1.66,,hf_open_llm_v2_240829.csv +llama_pro_8b_instruct,HFv2 MuSR,11.11,,hf_open_llm_v2_240829.csv +luxia_21_4b_alignment_v1_0,HF OpenLLM v2,22.86,,hf_open_llm_v2_240829.csv +luxia_21_4b_alignment_v1_0,HFv2 BBH,48.02,,hf_open_llm_v2_240829.csv +luxia_21_4b_alignment_v1_0,HFv2 GPQA,6.82,,hf_open_llm_v2_240829.csv +luxia_21_4b_alignment_v1_0,HFv2 IFEval,36.93,,hf_open_llm_v2_240829.csv +luxia_21_4b_alignment_v1_0,HFv2 MMLU Pro,26.7,,hf_open_llm_v2_240829.csv +luxia_21_4b_alignment_v1_0,HFv2 Math Level 5,6.19,,hf_open_llm_v2_240829.csv +luxia_21_4b_alignment_v1_0,HFv2 MuSR,12.51,,hf_open_llm_v2_240829.csv +luxia_21_4b_alignment_v1_2,HF OpenLLM v2,23.44,,hf_open_llm_v2_240829.csv +luxia_21_4b_alignment_v1_2,HFv2 BBH,47.77,,hf_open_llm_v2_240829.csv +luxia_21_4b_alignment_v1_2,HFv2 GPQA,7.72,,hf_open_llm_v2_240829.csv +luxia_21_4b_alignment_v1_2,HFv2 IFEval,41.15,,hf_open_llm_v2_240829.csv +luxia_21_4b_alignment_v1_2,HFv2 MMLU Pro,27.48,,hf_open_llm_v2_240829.csv +luxia_21_4b_alignment_v1_2,HFv2 Math Level 5,1.59,,hf_open_llm_v2_240829.csv +luxia_21_4b_alignment_v1_2,HFv2 MuSR,14.9,,hf_open_llm_v2_240829.csv +magnum_72b_v1,HF OpenLLM v2,42.17,,hf_open_llm_v2_240829.csv +magnum_72b_v1,HFv2 BBH,57.65,,hf_open_llm_v2_240829.csv +magnum_72b_v1,HFv2 GPQA,18.79,,hf_open_llm_v2_240829.csv +magnum_72b_v1,HFv2 IFEval,76.06,,hf_open_llm_v2_240829.csv +magnum_72b_v1,HFv2 MMLU Pro,49.64,,hf_open_llm_v2_240829.csv +magnum_72b_v1,HFv2 Math Level 5,35.27,,hf_open_llm_v2_240829.csv +magnum_72b_v1,HFv2 MuSR,15.62,,hf_open_llm_v2_240829.csv +maid_yuzu_v7,HF OpenLLM v2,24.38,,hf_open_llm_v2_240829.csv +maid_yuzu_v7,HFv2 BBH,26.82,,hf_open_llm_v2_240829.csv +maid_yuzu_v7,HFv2 GPQA,7.94,,hf_open_llm_v2_240829.csv +maid_yuzu_v7,HFv2 IFEval,64.62,,hf_open_llm_v2_240829.csv +maid_yuzu_v7,HFv2 MMLU Pro,28.22,,hf_open_llm_v2_240829.csv +maid_yuzu_v7,HFv2 Math Level 5,8.91,,hf_open_llm_v2_240829.csv +maid_yuzu_v7,HFv2 MuSR,9.77,,hf_open_llm_v2_240829.csv +matter_0_2_7b_dpo,HF OpenLLM v2,8.81,,hf_open_llm_v2_240829.csv +matter_0_2_7b_dpo,HFv2 BBH,10.06,,hf_open_llm_v2_240829.csv +matter_0_2_7b_dpo,HFv2 GPQA,1.23,,hf_open_llm_v2_240829.csv +matter_0_2_7b_dpo,HFv2 IFEval,33.03,,hf_open_llm_v2_240829.csv +matter_0_2_7b_dpo,HFv2 MMLU Pro,1.82,,hf_open_llm_v2_240829.csv +matter_0_2_7b_dpo,HFv2 Math Level 5,0.83,,hf_open_llm_v2_240829.csv +matter_0_2_7b_dpo,HFv2 MuSR,5.87,,hf_open_llm_v2_240829.csv +merlinite_7b,HF OpenLLM v2,16.74,,hf_open_llm_v2_240829.csv +merlinite_7b,HFv2 BBH,29.98,,hf_open_llm_v2_240829.csv +merlinite_7b,HFv2 GPQA,6.26,,hf_open_llm_v2_240829.csv +merlinite_7b,HFv2 IFEval,24.99,,hf_open_llm_v2_240829.csv +merlinite_7b,HFv2 MMLU Pro,22.98,,hf_open_llm_v2_240829.csv +merlinite_7b,HFv2 Math Level 5,2.34,,hf_open_llm_v2_240829.csv +merlinite_7b,HFv2 MuSR,13.88,,hf_open_llm_v2_240829.csv +minueza_32m_ultrachat,HF OpenLLM v2,3.85,,hf_open_llm_v2_240829.csv +minueza_32m_ultrachat,HFv2 BBH,2.44,,hf_open_llm_v2_240829.csv +minueza_32m_ultrachat,HFv2 GPQA,0.78,,hf_open_llm_v2_240829.csv +minueza_32m_ultrachat,HFv2 IFEval,13.76,,hf_open_llm_v2_240829.csv +minueza_32m_ultrachat,HFv2 MMLU Pro,1.48,,hf_open_llm_v2_240829.csv +minueza_32m_ultrachat,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +minueza_32m_ultrachat,HFv2 MuSR,4.64,,hf_open_llm_v2_240829.csv +mistral7b_pairrm_sppo_iter2,HF OpenLLM v2,17.0,,hf_open_llm_v2_240829.csv +mistral7b_pairrm_sppo_iter2,HFv2 BBH,22.48,,hf_open_llm_v2_240829.csv +mistral7b_pairrm_sppo_iter2,HFv2 GPQA,5.15,,hf_open_llm_v2_240829.csv +mistral7b_pairrm_sppo_iter2,HFv2 IFEval,44.46,,hf_open_llm_v2_240829.csv +mistral7b_pairrm_sppo_iter2,HFv2 MMLU Pro,18.63,,hf_open_llm_v2_240829.csv +mistral7b_pairrm_sppo_iter2,HFv2 Math Level 5,1.51,,hf_open_llm_v2_240829.csv +mistral7b_pairrm_sppo_iter2,HFv2 MuSR,9.8,,hf_open_llm_v2_240829.csv +mistral7b_pairrm_sppo_iter3,HF OpenLLM v2,16.36,,hf_open_llm_v2_240829.csv +mistral7b_pairrm_sppo_iter3,HFv2 BBH,21.82,,hf_open_llm_v2_240829.csv +mistral7b_pairrm_sppo_iter3,HFv2 GPQA,3.36,,hf_open_llm_v2_240829.csv +mistral7b_pairrm_sppo_iter3,HFv2 IFEval,43.51,,hf_open_llm_v2_240829.csv +mistral7b_pairrm_sppo_iter3,HFv2 MMLU Pro,18.42,,hf_open_llm_v2_240829.csv +mistral7b_pairrm_sppo_iter3,HFv2 Math Level 5,1.59,,hf_open_llm_v2_240829.csv +mistral7b_pairrm_sppo_iter3,HFv2 MuSR,9.49,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_1,HF OpenLLM v2,12.67,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_1,HFv2 BBH,7.65,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_1,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_1,HFv2 IFEval,44.87,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_1,HFv2 MMLU Pro,15.72,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_1,HFv2 Math Level 5,1.66,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_1,HFv2 MuSR,6.13,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_2,HF OpenLLM v2,18.44,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_2,HFv2 BBH,22.91,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_2,HFv2 GPQA,3.47,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_2,HFv2 IFEval,54.96,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_2,HFv2 MMLU Pro,19.08,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_2,HFv2 Math Level 5,2.64,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_2,HFv2 MuSR,7.61,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_3,HF OpenLLM v2,19.11,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_3,HFv2 BBH,25.57,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_3,HFv2 GPQA,3.91,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_3,HFv2 IFEval,54.65,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_3,HFv2 MMLU Pro,23.06,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_3,HFv2 Math Level 5,3.17,,hf_open_llm_v2_240829.csv +mistral_7b_instruct_v0_3,HFv2 MuSR,4.3,,hf_open_llm_v2_240829.csv +mistral_7b_openorca,HF OpenLLM v2,17.62,,hf_open_llm_v2_240829.csv +mistral_7b_openorca,HFv2 BBH,25.84,,hf_open_llm_v2_240829.csv +mistral_7b_openorca,HFv2 GPQA,2.91,,hf_open_llm_v2_240829.csv +mistral_7b_openorca,HFv2 IFEval,49.78,,hf_open_llm_v2_240829.csv +mistral_7b_openorca,HFv2 MMLU Pro,18.37,,hf_open_llm_v2_240829.csv +mistral_7b_openorca,HFv2 Math Level 5,2.95,,hf_open_llm_v2_240829.csv +mistral_7b_openorca,HFv2 MuSR,5.89,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1,HF OpenLLM v2,14.52,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1,HFv2 BBH,22.17,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1,HFv2 GPQA,5.59,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1,HFv2 IFEval,23.86,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1,HFv2 MMLU Pro,22.36,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1,HFv2 Math Level 5,2.49,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1,HFv2 MuSR,10.68,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_1_over_2,HF OpenLLM v2,14.26,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_1_over_2,HFv2 BBH,22.4,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_1_over_2,HFv2 GPQA,7.61,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_1_over_2,HFv2 IFEval,21.79,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_1_over_2,HFv2 MMLU Pro,22.22,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_1_over_2,HFv2 Math Level 5,2.72,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_1_over_2,HFv2 MuSR,8.81,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_1_over_4,HF OpenLLM v2,8.71,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_1_over_4,HFv2 BBH,9.23,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_1_over_4,HFv2 GPQA,2.68,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_1_over_4,HFv2 IFEval,21.33,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_1_over_4,HFv2 MMLU Pro,14.56,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_1_over_4,HFv2 Math Level 5,2.27,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_1_over_4,HFv2 MuSR,2.19,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_3_over_8,HF OpenLLM v2,13.73,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_3_over_8,HFv2 BBH,20.44,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_3_over_8,HFv2 GPQA,7.16,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_3_over_8,HFv2 IFEval,23.94,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_3_over_8,HFv2 MMLU Pro,22.24,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_3_over_8,HFv2 Math Level 5,2.79,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_3_over_8,HFv2 MuSR,5.79,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_5_over_16,HF OpenLLM v2,12.16,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_5_over_16,HFv2 BBH,17.54,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_5_over_16,HFv2 GPQA,4.14,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_5_over_16,HFv2 IFEval,21.18,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_5_over_16,HFv2 MMLU Pro,21.75,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_5_over_16,HFv2 Math Level 5,2.19,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_5_over_16,HFv2 MuSR,6.14,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_7_over_16,HF OpenLLM v2,14.15,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_7_over_16,HFv2 BBH,21.04,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_7_over_16,HFv2 GPQA,7.16,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_7_over_16,HFv2 IFEval,22.94,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_7_over_16,HFv2 MMLU Pro,22.56,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_7_over_16,HFv2 Math Level 5,3.25,,hf_open_llm_v2_240829.csv +mistral_7b_v0_1_signtensors_7_over_16,HFv2 MuSR,7.93,,hf_open_llm_v2_240829.csv +mistral_7b_v0_2,HF OpenLLM v2,14.15,,hf_open_llm_v2_240829.csv +mistral_7b_v0_2,HFv2 BBH,23.95,,hf_open_llm_v2_240829.csv +mistral_7b_v0_2,HFv2 GPQA,5.59,,hf_open_llm_v2_240829.csv +mistral_7b_v0_2,HFv2 IFEval,22.66,,hf_open_llm_v2_240829.csv +mistral_7b_v0_2,HFv2 MMLU Pro,21.7,,hf_open_llm_v2_240829.csv +mistral_7b_v0_2,HFv2 Math Level 5,2.64,,hf_open_llm_v2_240829.csv +mistral_7b_v0_2,HFv2 MuSR,8.36,,hf_open_llm_v2_240829.csv +mistral_7b_v0_3,HF OpenLLM v2,14.15,,hf_open_llm_v2_240829.csv +mistral_7b_v0_3,HFv2 BBH,23.95,,hf_open_llm_v2_240829.csv +mistral_7b_v0_3,HFv2 GPQA,5.59,,hf_open_llm_v2_240829.csv +mistral_7b_v0_3,HFv2 IFEval,22.66,,hf_open_llm_v2_240829.csv +mistral_7b_v0_3,HFv2 MMLU Pro,21.7,,hf_open_llm_v2_240829.csv +mistral_7b_v0_3,HFv2 Math Level 5,2.64,,hf_open_llm_v2_240829.csv +mistral_7b_v0_3,HFv2 MuSR,8.36,,hf_open_llm_v2_240829.csv +mistral_nemo_base_2407,HF OpenLLM v2,15.08,,hf_open_llm_v2_240829.csv +mistral_nemo_base_2407,HFv2 BBH,29.37,,hf_open_llm_v2_240829.csv +mistral_nemo_base_2407,HFv2 GPQA,5.82,,hf_open_llm_v2_240829.csv +mistral_nemo_base_2407,HFv2 IFEval,16.3,,hf_open_llm_v2_240829.csv +mistral_nemo_base_2407,HFv2 MMLU Pro,27.46,,hf_open_llm_v2_240829.csv +mistral_nemo_base_2407,HFv2 Math Level 5,4.98,,hf_open_llm_v2_240829.csv +mistral_nemo_base_2407,HFv2 MuSR,6.52,,hf_open_llm_v2_240829.csv +mistral_nemo_instruct_2407,HF OpenLLM v2,22.27,,hf_open_llm_v2_240829.csv +mistral_nemo_instruct_2407,HFv2 BBH,27.11,,hf_open_llm_v2_240829.csv +mistral_nemo_instruct_2407,HFv2 GPQA,8.72,,hf_open_llm_v2_240829.csv +mistral_nemo_instruct_2407,HFv2 IFEval,62.61,,hf_open_llm_v2_240829.csv +mistral_nemo_instruct_2407,HFv2 MMLU Pro,26.37,,hf_open_llm_v2_240829.csv +mistral_nemo_instruct_2407,HFv2 Math Level 5,0.3,,hf_open_llm_v2_240829.csv +mistral_nemo_instruct_2407,HFv2 MuSR,8.48,,hf_open_llm_v2_240829.csv +mistral_nemo_minitron_8b_base,HF OpenLLM v2,17.6,,hf_open_llm_v2_240829.csv +mistral_nemo_minitron_8b_base,HFv2 BBH,32.52,,hf_open_llm_v2_240829.csv +mistral_nemo_minitron_8b_base,HFv2 GPQA,8.61,,hf_open_llm_v2_240829.csv +mistral_nemo_minitron_8b_base,HFv2 IFEval,19.46,,hf_open_llm_v2_240829.csv +mistral_nemo_minitron_8b_base,HFv2 MMLU Pro,30.92,,hf_open_llm_v2_240829.csv +mistral_nemo_minitron_8b_base,HFv2 Math Level 5,4.31,,hf_open_llm_v2_240829.csv +mistral_nemo_minitron_8b_base,HFv2 MuSR,9.77,,hf_open_llm_v2_240829.csv +mistral_v0_3_7b_orpo,HF OpenLLM v2,12.08,,hf_open_llm_v2_240829.csv +mistral_v0_3_7b_orpo,HFv2 BBH,15.59,,hf_open_llm_v2_240829.csv +mistral_v0_3_7b_orpo,HFv2 GPQA,2.57,,hf_open_llm_v2_240829.csv +mistral_v0_3_7b_orpo,HFv2 IFEval,37.7,,hf_open_llm_v2_240829.csv +mistral_v0_3_7b_orpo,HFv2 MMLU Pro,14.46,,hf_open_llm_v2_240829.csv +mistral_v0_3_7b_orpo,HFv2 Math Level 5,0.53,,hf_open_llm_v2_240829.csv +mistral_v0_3_7b_orpo,HFv2 MuSR,2.97,,hf_open_llm_v2_240829.csv +mixtral_8x22b_instruct_v0_1,HF OpenLLM v2,33.89,,hf_open_llm_v2_240829.csv +mixtral_8x22b_instruct_v0_1,HFv2 BBH,44.11,,hf_open_llm_v2_240829.csv +mixtral_8x22b_instruct_v0_1,HFv2 GPQA,16.44,,hf_open_llm_v2_240829.csv +mixtral_8x22b_instruct_v0_1,HFv2 IFEval,71.84,,hf_open_llm_v2_240829.csv +mixtral_8x22b_instruct_v0_1,HFv2 MMLU Pro,38.7,,hf_open_llm_v2_240829.csv +mixtral_8x22b_instruct_v0_1,HFv2 Math Level 5,18.73,,hf_open_llm_v2_240829.csv +mixtral_8x22b_instruct_v0_1,HFv2 MuSR,13.49,,hf_open_llm_v2_240829.csv +mixtral_8x22b_v0_1,HF OpenLLM v2,25.49,,hf_open_llm_v2_240829.csv +mixtral_8x22b_v0_1,HFv2 BBH,45.59,,hf_open_llm_v2_240829.csv +mixtral_8x22b_v0_1,HFv2 GPQA,16.78,,hf_open_llm_v2_240829.csv +mixtral_8x22b_v0_1,HFv2 IFEval,25.83,,hf_open_llm_v2_240829.csv +mixtral_8x22b_v0_1,HFv2 MMLU Pro,40.44,,hf_open_llm_v2_240829.csv +mixtral_8x22b_v0_1,HFv2 Math Level 5,16.84,,hf_open_llm_v2_240829.csv +mixtral_8x22b_v0_1,HFv2 MuSR,7.46,,hf_open_llm_v2_240829.csv +mixtral_8x22b_v0_3,HF OpenLLM v2,25.55,,hf_open_llm_v2_240829.csv +mixtral_8x22b_v0_3,HFv2 BBH,45.73,,hf_open_llm_v2_240829.csv +mixtral_8x22b_v0_3,HFv2 GPQA,17.0,,hf_open_llm_v2_240829.csv +mixtral_8x22b_v0_3,HFv2 IFEval,25.83,,hf_open_llm_v2_240829.csv +mixtral_8x22b_v0_3,HFv2 MMLU Pro,40.44,,hf_open_llm_v2_240829.csv +mixtral_8x22b_v0_3,HFv2 Math Level 5,16.84,,hf_open_llm_v2_240829.csv +mixtral_8x22b_v0_3,HFv2 MuSR,7.46,,hf_open_llm_v2_240829.csv +mixtral_8x7b_instruct_v0_1,HF OpenLLM v2,24.35,,hf_open_llm_v2_240829.csv +mixtral_8x7b_instruct_v0_1,HFv2 BBH,34.02,,hf_open_llm_v2_240829.csv +mixtral_8x7b_instruct_v0_1,HFv2 GPQA,7.61,,hf_open_llm_v2_240829.csv +mixtral_8x7b_instruct_v0_1,HFv2 IFEval,53.95,,hf_open_llm_v2_240829.csv +mixtral_8x7b_instruct_v0_1,HFv2 MMLU Pro,29.36,,hf_open_llm_v2_240829.csv +mixtral_8x7b_instruct_v0_1,HFv2 Math Level 5,9.06,,hf_open_llm_v2_240829.csv +mixtral_8x7b_instruct_v0_1,HFv2 MuSR,12.11,,hf_open_llm_v2_240829.csv +mixtral_8x7b_v0_1,HF OpenLLM v2,19.33,,hf_open_llm_v2_240829.csv +mixtral_8x7b_v0_1,HFv2 BBH,30.29,,hf_open_llm_v2_240829.csv +mixtral_8x7b_v0_1,HFv2 GPQA,8.5,,hf_open_llm_v2_240829.csv +mixtral_8x7b_v0_1,HFv2 IFEval,24.15,,hf_open_llm_v2_240829.csv +mixtral_8x7b_v0_1,HFv2 MMLU Pro,31.66,,hf_open_llm_v2_240829.csv +mixtral_8x7b_v0_1,HFv2 Math Level 5,8.76,,hf_open_llm_v2_240829.csv +mixtral_8x7b_v0_1,HFv2 MuSR,12.58,,hf_open_llm_v2_240829.csv +mpt_7b,HF OpenLLM v2,5.98,,hf_open_llm_v2_240829.csv +mpt_7b,HFv2 BBH,6.55,,hf_open_llm_v2_240829.csv +mpt_7b,HFv2 GPQA,1.34,,hf_open_llm_v2_240829.csv +mpt_7b,HFv2 IFEval,21.52,,hf_open_llm_v2_240829.csv +mpt_7b,HFv2 MMLU Pro,2.29,,hf_open_llm_v2_240829.csv +mpt_7b,HFv2 Math Level 5,1.28,,hf_open_llm_v2_240829.csv +mpt_7b,HFv2 MuSR,2.9,,hf_open_llm_v2_240829.csv +multiverse_70b,HF OpenLLM v2,31.73,,hf_open_llm_v2_240829.csv +multiverse_70b,HFv2 BBH,46.14,,hf_open_llm_v2_240829.csv +multiverse_70b,HFv2 GPQA,13.87,,hf_open_llm_v2_240829.csv +multiverse_70b,HFv2 IFEval,52.49,,hf_open_llm_v2_240829.csv +multiverse_70b,HFv2 MMLU Pro,42.89,,hf_open_llm_v2_240829.csv +multiverse_70b,HFv2 Math Level 5,16.16,,hf_open_llm_v2_240829.csv +multiverse_70b,HFv2 MuSR,18.82,,hf_open_llm_v2_240829.csv +neuralbeagle14_7b,HF OpenLLM v2,18.83,,hf_open_llm_v2_240829.csv +neuralbeagle14_7b,HFv2 BBH,23.96,,hf_open_llm_v2_240829.csv +neuralbeagle14_7b,HFv2 GPQA,4.25,,hf_open_llm_v2_240829.csv +neuralbeagle14_7b,HFv2 IFEval,49.35,,hf_open_llm_v2_240829.csv +neuralbeagle14_7b,HFv2 MMLU Pro,17.79,,hf_open_llm_v2_240829.csv +neuralbeagle14_7b,HFv2 Math Level 5,4.76,,hf_open_llm_v2_240829.csv +neuralbeagle14_7b,HFv2 MuSR,12.89,,hf_open_llm_v2_240829.csv +neuralllama3_8b_orpo_v0_3,HF OpenLLM v2,17.52,,hf_open_llm_v2_240829.csv +neuralllama3_8b_orpo_v0_3,HFv2 BBH,22.39,,hf_open_llm_v2_240829.csv +neuralllama3_8b_orpo_v0_3,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +neuralllama3_8b_orpo_v0_3,HFv2 IFEval,52.76,,hf_open_llm_v2_240829.csv +neuralllama3_8b_orpo_v0_3,HFv2 MMLU Pro,22.85,,hf_open_llm_v2_240829.csv +neuralllama3_8b_orpo_v0_3,HFv2 Math Level 5,3.47,,hf_open_llm_v2_240829.csv +neuralllama3_8b_orpo_v0_3,HFv2 MuSR,3.65,,hf_open_llm_v2_240829.csv +notus_7b_v1,HF OpenLLM v2,18.37,,hf_open_llm_v2_240829.csv +notus_7b_v1,HFv2 BBH,22.75,,hf_open_llm_v2_240829.csv +notus_7b_v1,HFv2 GPQA,5.26,,hf_open_llm_v2_240829.csv +notus_7b_v1,HFv2 IFEval,50.82,,hf_open_llm_v2_240829.csv +notus_7b_v1,HFv2 MMLU Pro,22.26,,hf_open_llm_v2_240829.csv +notus_7b_v1,HFv2 Math Level 5,2.57,,hf_open_llm_v2_240829.csv +notus_7b_v1,HFv2 MuSR,6.59,,hf_open_llm_v2_240829.csv +notux_8x7b_v1,HF OpenLLM v2,24.23,,hf_open_llm_v2_240829.csv +notux_8x7b_v1,HFv2 BBH,34.76,,hf_open_llm_v2_240829.csv +notux_8x7b_v1,HFv2 GPQA,7.83,,hf_open_llm_v2_240829.csv +notux_8x7b_v1,HFv2 IFEval,54.22,,hf_open_llm_v2_240829.csv +notux_8x7b_v1,HFv2 MMLU Pro,29.56,,hf_open_llm_v2_240829.csv +notux_8x7b_v1,HFv2 Math Level 5,8.46,,hf_open_llm_v2_240829.csv +notux_8x7b_v1,HFv2 MuSR,10.53,,hf_open_llm_v2_240829.csv +nous_hermes_2_mistral_7b_dpo,HF OpenLLM v2,21.01,,hf_open_llm_v2_240829.csv +nous_hermes_2_mistral_7b_dpo,HFv2 BBH,27.79,,hf_open_llm_v2_240829.csv +nous_hermes_2_mistral_7b_dpo,HFv2 GPQA,5.7,,hf_open_llm_v2_240829.csv +nous_hermes_2_mistral_7b_dpo,HFv2 IFEval,57.63,,hf_open_llm_v2_240829.csv +nous_hermes_2_mistral_7b_dpo,HFv2 MMLU Pro,22.39,,hf_open_llm_v2_240829.csv +nous_hermes_2_mistral_7b_dpo,HFv2 Math Level 5,4.23,,hf_open_llm_v2_240829.csv +nous_hermes_2_mistral_7b_dpo,HFv2 MuSR,8.33,,hf_open_llm_v2_240829.csv +nous_hermes_2_mixtral_8x7b_dpo,HF OpenLLM v2,27.13,,hf_open_llm_v2_240829.csv +nous_hermes_2_mixtral_8x7b_dpo,HFv2 BBH,37.11,,hf_open_llm_v2_240829.csv +nous_hermes_2_mixtral_8x7b_dpo,HFv2 GPQA,9.51,,hf_open_llm_v2_240829.csv +nous_hermes_2_mixtral_8x7b_dpo,HFv2 IFEval,58.97,,hf_open_llm_v2_240829.csv +nous_hermes_2_mixtral_8x7b_dpo,HFv2 MMLU Pro,29.62,,hf_open_llm_v2_240829.csv +nous_hermes_2_mixtral_8x7b_dpo,HFv2 Math Level 5,10.88,,hf_open_llm_v2_240829.csv +nous_hermes_2_mixtral_8x7b_dpo,HFv2 MuSR,16.68,,hf_open_llm_v2_240829.csv +nous_hermes_2_mixtral_8x7b_sft,HF OpenLLM v2,21.78,,hf_open_llm_v2_240829.csv +nous_hermes_2_mixtral_8x7b_sft,HFv2 BBH,30.59,,hf_open_llm_v2_240829.csv +nous_hermes_2_mixtral_8x7b_sft,HFv2 GPQA,6.94,,hf_open_llm_v2_240829.csv +nous_hermes_2_mixtral_8x7b_sft,HFv2 IFEval,57.31,,hf_open_llm_v2_240829.csv +nous_hermes_2_mixtral_8x7b_sft,HFv2 MMLU Pro,22.96,,hf_open_llm_v2_240829.csv +nous_hermes_2_mixtral_8x7b_sft,HFv2 Math Level 5,1.74,,hf_open_llm_v2_240829.csv +nous_hermes_2_mixtral_8x7b_sft,HFv2 MuSR,11.14,,hf_open_llm_v2_240829.csv +nous_hermes_2_solar_10_7b,HF OpenLLM v2,23.32,,hf_open_llm_v2_240829.csv +nous_hermes_2_solar_10_7b,HFv2 BBH,34.99,,hf_open_llm_v2_240829.csv +nous_hermes_2_solar_10_7b,HFv2 GPQA,5.82,,hf_open_llm_v2_240829.csv +nous_hermes_2_solar_10_7b,HFv2 IFEval,52.79,,hf_open_llm_v2_240829.csv +nous_hermes_2_solar_10_7b,HFv2 MMLU Pro,27.31,,hf_open_llm_v2_240829.csv +nous_hermes_2_solar_10_7b,HFv2 Math Level 5,5.21,,hf_open_llm_v2_240829.csv +nous_hermes_2_solar_10_7b,HFv2 MuSR,13.83,,hf_open_llm_v2_240829.csv +nucleus_22b_token_500b,HF OpenLLM v2,1.63,,hf_open_llm_v2_240829.csv +nucleus_22b_token_500b,HFv2 BBH,1.89,,hf_open_llm_v2_240829.csv +nucleus_22b_token_500b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +nucleus_22b_token_500b,HFv2 IFEval,2.57,,hf_open_llm_v2_240829.csv +nucleus_22b_token_500b,HFv2 MMLU Pro,1.8,,hf_open_llm_v2_240829.csv +nucleus_22b_token_500b,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +nucleus_22b_token_500b,HFv2 MuSR,3.55,,hf_open_llm_v2_240829.csv +nxcode_cq_7b_orpo,HF OpenLLM v2,12.3,,hf_open_llm_v2_240829.csv +nxcode_cq_7b_orpo,HFv2 BBH,17.58,,hf_open_llm_v2_240829.csv +nxcode_cq_7b_orpo,HFv2 GPQA,0.56,,hf_open_llm_v2_240829.csv +nxcode_cq_7b_orpo,HFv2 IFEval,40.07,,hf_open_llm_v2_240829.csv +nxcode_cq_7b_orpo,HFv2 MMLU Pro,6.79,,hf_open_llm_v2_240829.csv +nxcode_cq_7b_orpo,HFv2 Math Level 5,1.74,,hf_open_llm_v2_240829.csv +nxcode_cq_7b_orpo,HFv2 MuSR,7.05,,hf_open_llm_v2_240829.csv +olmo_1b,HF OpenLLM v2,6.47,,hf_open_llm_v2_240829.csv +olmo_1b,HFv2 BBH,3.2,,hf_open_llm_v2_240829.csv +olmo_1b,HFv2 GPQA,1.57,,hf_open_llm_v2_240829.csv +olmo_1b,HFv2 IFEval,21.82,,hf_open_llm_v2_240829.csv +olmo_1b,HFv2 MMLU Pro,1.93,,hf_open_llm_v2_240829.csv +olmo_1b,HFv2 Math Level 5,0.76,,hf_open_llm_v2_240829.csv +olmo_1b,HFv2 MuSR,9.56,,hf_open_llm_v2_240829.csv +olmo_7b,HF OpenLLM v2,6.78,,hf_open_llm_v2_240829.csv +olmo_7b,HFv2 BBH,5.76,,hf_open_llm_v2_240829.csv +olmo_7b,HFv2 GPQA,3.02,,hf_open_llm_v2_240829.csv +olmo_7b,HFv2 IFEval,27.19,,hf_open_llm_v2_240829.csv +olmo_7b,HFv2 MMLU Pro,1.92,,hf_open_llm_v2_240829.csv +olmo_7b,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv +olmo_7b,HFv2 MuSR,2.08,,hf_open_llm_v2_240829.csv +olmo_7b_instruct,HF OpenLLM v2,10.76,,hf_open_llm_v2_240829.csv +olmo_7b_instruct,HFv2 BBH,13.16,,hf_open_llm_v2_240829.csv +olmo_7b_instruct,HFv2 GPQA,2.8,,hf_open_llm_v2_240829.csv +olmo_7b_instruct,HFv2 IFEval,34.73,,hf_open_llm_v2_240829.csv +olmo_7b_instruct,HFv2 MMLU Pro,8.72,,hf_open_llm_v2_240829.csv +olmo_7b_instruct,HFv2 Math Level 5,0.83,,hf_open_llm_v2_240829.csv +olmo_7b_instruct,HFv2 MuSR,4.33,,hf_open_llm_v2_240829.csv +openbuddy_llama3_1_8b_v22_2_131k,HF OpenLLM v2,24.07,,hf_open_llm_v2_240829.csv +openbuddy_llama3_1_8b_v22_2_131k,HFv2 BBH,29.06,,hf_open_llm_v2_240829.csv +openbuddy_llama3_1_8b_v22_2_131k,HFv2 GPQA,3.91,,hf_open_llm_v2_240829.csv +openbuddy_llama3_1_8b_v22_2_131k,HFv2 IFEval,66.57,,hf_open_llm_v2_240829.csv +openbuddy_llama3_1_8b_v22_2_131k,HFv2 MMLU Pro,25.67,,hf_open_llm_v2_240829.csv +openbuddy_llama3_1_8b_v22_2_131k,HFv2 Math Level 5,9.37,,hf_open_llm_v2_240829.csv +openbuddy_llama3_1_8b_v22_2_131k,HFv2 MuSR,9.81,,hf_open_llm_v2_240829.csv +openbuddy_llama3_8b_v21_1_8k,HF OpenLLM v2,19.9,,hf_open_llm_v2_240829.csv +openbuddy_llama3_8b_v21_1_8k,HFv2 BBH,26.12,,hf_open_llm_v2_240829.csv +openbuddy_llama3_8b_v21_1_8k,HFv2 GPQA,2.8,,hf_open_llm_v2_240829.csv +openbuddy_llama3_8b_v21_1_8k,HFv2 IFEval,55.7,,hf_open_llm_v2_240829.csv +openbuddy_llama3_8b_v21_1_8k,HFv2 MMLU Pro,21.72,,hf_open_llm_v2_240829.csv +openbuddy_llama3_8b_v21_1_8k,HFv2 Math Level 5,2.72,,hf_open_llm_v2_240829.csv +openbuddy_llama3_8b_v21_1_8k,HFv2 MuSR,10.35,,hf_open_llm_v2_240829.csv +openbuddy_llama3_8b_v21_2_32k,HF OpenLLM v2,21.84,,hf_open_llm_v2_240829.csv +openbuddy_llama3_8b_v21_2_32k,HFv2 BBH,27.25,,hf_open_llm_v2_240829.csv +openbuddy_llama3_8b_v21_2_32k,HFv2 GPQA,3.91,,hf_open_llm_v2_240829.csv +openbuddy_llama3_8b_v21_2_32k,HFv2 IFEval,61.92,,hf_open_llm_v2_240829.csv +openbuddy_llama3_8b_v21_2_32k,HFv2 MMLU Pro,25.54,,hf_open_llm_v2_240829.csv +openbuddy_llama3_8b_v21_2_32k,HFv2 Math Level 5,6.5,,hf_open_llm_v2_240829.csv +openbuddy_llama3_8b_v21_2_32k,HFv2 MuSR,5.93,,hf_open_llm_v2_240829.csv +openbuddy_mixtral_7bx8_v18_1_32k,HF OpenLLM v2,22.12,,hf_open_llm_v2_240829.csv +openbuddy_mixtral_7bx8_v18_1_32k,HFv2 BBH,24.54,,hf_open_llm_v2_240829.csv +openbuddy_mixtral_7bx8_v18_1_32k,HFv2 GPQA,7.27,,hf_open_llm_v2_240829.csv +openbuddy_mixtral_7bx8_v18_1_32k,HFv2 IFEval,54.93,,hf_open_llm_v2_240829.csv +openbuddy_mixtral_7bx8_v18_1_32k,HFv2 MMLU Pro,31.16,,hf_open_llm_v2_240829.csv +openbuddy_mixtral_7bx8_v18_1_32k,HFv2 Math Level 5,9.52,,hf_open_llm_v2_240829.csv +openbuddy_mixtral_7bx8_v18_1_32k,HFv2 MuSR,5.28,,hf_open_llm_v2_240829.csv +openbuddy_zero_14b_v22_3_32k,HF OpenLLM v2,19.14,,hf_open_llm_v2_240829.csv +openbuddy_zero_14b_v22_3_32k,HFv2 BBH,26.29,,hf_open_llm_v2_240829.csv +openbuddy_zero_14b_v22_3_32k,HFv2 GPQA,7.61,,hf_open_llm_v2_240829.csv +openbuddy_zero_14b_v22_3_32k,HFv2 IFEval,37.53,,hf_open_llm_v2_240829.csv +openbuddy_zero_14b_v22_3_32k,HFv2 MMLU Pro,24.3,,hf_open_llm_v2_240829.csv +openbuddy_zero_14b_v22_3_32k,HFv2 Math Level 5,7.78,,hf_open_llm_v2_240829.csv +openbuddy_zero_14b_v22_3_32k,HFv2 MuSR,11.34,,hf_open_llm_v2_240829.csv +openbuddy_zero_3b_v21_2_32k,HF OpenLLM v2,11.55,,hf_open_llm_v2_240829.csv +openbuddy_zero_3b_v21_2_32k,HFv2 BBH,15.29,,hf_open_llm_v2_240829.csv +openbuddy_zero_3b_v21_2_32k,HFv2 GPQA,1.34,,hf_open_llm_v2_240829.csv +openbuddy_zero_3b_v21_2_32k,HFv2 IFEval,38.02,,hf_open_llm_v2_240829.csv +openbuddy_zero_3b_v21_2_32k,HFv2 MMLU Pro,11.49,,hf_open_llm_v2_240829.csv +openbuddy_zero_3b_v21_2_32k,HFv2 Math Level 5,0.91,,hf_open_llm_v2_240829.csv +openbuddy_zero_3b_v21_2_32k,HFv2 MuSR,2.25,,hf_open_llm_v2_240829.csv +openchat_3_5,HF OpenLLM v2,21.52,,hf_open_llm_v2_240829.csv +openchat_3_5,HFv2 BBH,21.58,,hf_open_llm_v2_240829.csv +openchat_3_5,HFv2 GPQA,6.49,,hf_open_llm_v2_240829.csv +openchat_3_5,HFv2 IFEval,59.31,,hf_open_llm_v2_240829.csv +openchat_3_5,HFv2 MMLU Pro,23.93,,hf_open_llm_v2_240829.csv +openchat_3_5,HFv2 Math Level 5,6.57,,hf_open_llm_v2_240829.csv +openchat_3_5,HFv2 MuSR,11.26,,hf_open_llm_v2_240829.csv +openchat_3_5_1210,HF OpenLLM v2,22.56,,hf_open_llm_v2_240829.csv +openchat_3_5_1210,HFv2 BBH,23.24,,hf_open_llm_v2_240829.csv +openchat_3_5_1210,HFv2 GPQA,6.82,,hf_open_llm_v2_240829.csv +openchat_3_5_1210,HFv2 IFEval,60.37,,hf_open_llm_v2_240829.csv +openchat_3_5_1210,HFv2 MMLU Pro,23.81,,hf_open_llm_v2_240829.csv +openchat_3_5_1210,HFv2 Math Level 5,6.87,,hf_open_llm_v2_240829.csv +openchat_3_5_1210,HFv2 MuSR,14.28,,hf_open_llm_v2_240829.csv +openhermes_2_5_mistral_7b,HF OpenLLM v2,21.22,,hf_open_llm_v2_240829.csv +openhermes_2_5_mistral_7b,HFv2 BBH,27.77,,hf_open_llm_v2_240829.csv +openhermes_2_5_mistral_7b,HFv2 GPQA,4.47,,hf_open_llm_v2_240829.csv +openhermes_2_5_mistral_7b,HFv2 IFEval,55.71,,hf_open_llm_v2_240829.csv +openhermes_2_5_mistral_7b,HFv2 MMLU Pro,22.83,,hf_open_llm_v2_240829.csv +openhermes_2_5_mistral_7b,HFv2 Math Level 5,4.46,,hf_open_llm_v2_240829.csv +openhermes_2_5_mistral_7b,HFv2 MuSR,12.06,,hf_open_llm_v2_240829.csv +openhermes_2_mistral_7b,HF OpenLLM v2,21.33,,hf_open_llm_v2_240829.csv +openhermes_2_mistral_7b,HFv2 BBH,29.25,,hf_open_llm_v2_240829.csv +openhermes_2_mistral_7b,HFv2 GPQA,4.47,,hf_open_llm_v2_240829.csv +openhermes_2_mistral_7b,HFv2 IFEval,52.86,,hf_open_llm_v2_240829.csv +openhermes_2_mistral_7b,HFv2 MMLU Pro,21.46,,hf_open_llm_v2_240829.csv +openhermes_2_mistral_7b,HFv2 Math Level 5,3.85,,hf_open_llm_v2_240829.csv +openhermes_2_mistral_7b,HFv2 MuSR,16.06,,hf_open_llm_v2_240829.csv +opt_1_3b,HF OpenLLM v2,5.25,,hf_open_llm_v2_240829.csv +opt_1_3b,HFv2 BBH,3.65,,hf_open_llm_v2_240829.csv +opt_1_3b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +opt_1_3b,HFv2 IFEval,23.83,,hf_open_llm_v2_240829.csv +opt_1_3b,HFv2 MMLU Pro,1.19,,hf_open_llm_v2_240829.csv +opt_1_3b,HFv2 Math Level 5,0.76,,hf_open_llm_v2_240829.csv +opt_1_3b,HFv2 MuSR,2.08,,hf_open_llm_v2_240829.csv +opt_30b,HF OpenLLM v2,6.2,,hf_open_llm_v2_240829.csv +opt_30b,HFv2 BBH,3.5,,hf_open_llm_v2_240829.csv +opt_30b,HFv2 GPQA,2.57,,hf_open_llm_v2_240829.csv +opt_30b,HFv2 IFEval,24.53,,hf_open_llm_v2_240829.csv +opt_30b,HFv2 MMLU Pro,1.82,,hf_open_llm_v2_240829.csv +opt_30b,HFv2 Math Level 5,0.6,,hf_open_llm_v2_240829.csv +opt_30b,HFv2 MuSR,4.19,,hf_open_llm_v2_240829.csv +orpollama3_8b,HF OpenLLM v2,14.87,,hf_open_llm_v2_240829.csv +orpollama3_8b,HFv2 BBH,21.95,,hf_open_llm_v2_240829.csv +orpollama3_8b,HFv2 GPQA,3.91,,hf_open_llm_v2_240829.csv +orpollama3_8b,HFv2 IFEval,36.53,,hf_open_llm_v2_240829.csv +orpollama3_8b,HFv2 MMLU Pro,18.95,,hf_open_llm_v2_240829.csv +orpollama3_8b,HFv2 Math Level 5,3.85,,hf_open_llm_v2_240829.csv +orpollama3_8b,HFv2 MuSR,4.01,,hf_open_llm_v2_240829.csv +phi_1,HF OpenLLM v2,5.52,,hf_open_llm_v2_240829.csv +phi_1,HFv2 BBH,4.27,,hf_open_llm_v2_240829.csv +phi_1,HFv2 GPQA,2.01,,hf_open_llm_v2_240829.csv +phi_1,HFv2 IFEval,20.68,,hf_open_llm_v2_240829.csv +phi_1,HFv2 MMLU Pro,1.8,,hf_open_llm_v2_240829.csv +phi_1,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv +phi_1,HFv2 MuSR,3.7,,hf_open_llm_v2_240829.csv +phi_1_5,HF OpenLLM v2,7.06,,hf_open_llm_v2_240829.csv +phi_1_5,HFv2 BBH,7.47,,hf_open_llm_v2_240829.csv +phi_1_5,HFv2 GPQA,2.35,,hf_open_llm_v2_240829.csv +phi_1_5,HFv2 IFEval,20.33,,hf_open_llm_v2_240829.csv +phi_1_5,HFv2 MMLU Pro,7.68,,hf_open_llm_v2_240829.csv +phi_1_5,HFv2 Math Level 5,1.13,,hf_open_llm_v2_240829.csv +phi_1_5,HFv2 MuSR,3.39,,hf_open_llm_v2_240829.csv +phi_1_5_instruct_v0_1,HF OpenLLM v2,6.64,,hf_open_llm_v2_240829.csv +phi_1_5_instruct_v0_1,HFv2 BBH,4.82,,hf_open_llm_v2_240829.csv +phi_1_5_instruct_v0_1,HFv2 GPQA,1.34,,hf_open_llm_v2_240829.csv +phi_1_5_instruct_v0_1,HFv2 IFEval,24.02,,hf_open_llm_v2_240829.csv +phi_1_5_instruct_v0_1,HFv2 MMLU Pro,6.24,,hf_open_llm_v2_240829.csv +phi_1_5_instruct_v0_1,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +phi_1_5_instruct_v0_1,HFv2 MuSR,3.4,,hf_open_llm_v2_240829.csv +phi_2,HF OpenLLM v2,15.45,,hf_open_llm_v2_240829.csv +phi_2,HFv2 BBH,28.04,,hf_open_llm_v2_240829.csv +phi_2,HFv2 GPQA,2.91,,hf_open_llm_v2_240829.csv +phi_2,HFv2 IFEval,27.39,,hf_open_llm_v2_240829.csv +phi_2,HFv2 MMLU Pro,18.09,,hf_open_llm_v2_240829.csv +phi_2,HFv2 Math Level 5,2.42,,hf_open_llm_v2_240829.csv +phi_2,HFv2 MuSR,13.84,,hf_open_llm_v2_240829.csv +phi_2_instruct_v0_1,HF OpenLLM v2,14.22,,hf_open_llm_v2_240829.csv +phi_2_instruct_v0_1,HFv2 BBH,26.36,,hf_open_llm_v2_240829.csv +phi_2_instruct_v0_1,HFv2 GPQA,3.24,,hf_open_llm_v2_240829.csv +phi_2_instruct_v0_1,HFv2 IFEval,36.81,,hf_open_llm_v2_240829.csv +phi_2_instruct_v0_1,HFv2 MMLU Pro,13.85,,hf_open_llm_v2_240829.csv +phi_2_instruct_v0_1,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +phi_2_instruct_v0_1,HFv2 MuSR,5.04,,hf_open_llm_v2_240829.csv +phi_3_5_mini_instruct,HF OpenLLM v2,27.4,,hf_open_llm_v2_240829.csv +phi_3_5_mini_instruct,HFv2 BBH,36.75,,hf_open_llm_v2_240829.csv +phi_3_5_mini_instruct,HFv2 GPQA,11.97,,hf_open_llm_v2_240829.csv +phi_3_5_mini_instruct,HFv2 IFEval,57.75,,hf_open_llm_v2_240829.csv +phi_3_5_mini_instruct,HFv2 MMLU Pro,32.91,,hf_open_llm_v2_240829.csv +phi_3_5_mini_instruct,HFv2 Math Level 5,14.95,,hf_open_llm_v2_240829.csv +phi_3_5_mini_instruct,HFv2 MuSR,10.1,,hf_open_llm_v2_240829.csv +phi_3_5_moe_instruct,HF OpenLLM v2,35.1,,hf_open_llm_v2_240829.csv +phi_3_5_moe_instruct,HFv2 BBH,48.77,,hf_open_llm_v2_240829.csv +phi_3_5_moe_instruct,HFv2 GPQA,14.09,,hf_open_llm_v2_240829.csv +phi_3_5_moe_instruct,HFv2 IFEval,69.25,,hf_open_llm_v2_240829.csv +phi_3_5_moe_instruct,HFv2 MMLU Pro,40.64,,hf_open_llm_v2_240829.csv +phi_3_5_moe_instruct,HFv2 Math Level 5,20.54,,hf_open_llm_v2_240829.csv +phi_3_5_moe_instruct,HFv2 MuSR,17.33,,hf_open_llm_v2_240829.csv +phi_3_medium_4k_instruct,HF OpenLLM v2,32.67,,hf_open_llm_v2_240829.csv +phi_3_medium_4k_instruct,HFv2 BBH,49.38,,hf_open_llm_v2_240829.csv +phi_3_medium_4k_instruct,HFv2 GPQA,11.52,,hf_open_llm_v2_240829.csv +phi_3_medium_4k_instruct,HFv2 IFEval,64.23,,hf_open_llm_v2_240829.csv +phi_3_medium_4k_instruct,HFv2 MMLU Pro,40.84,,hf_open_llm_v2_240829.csv +phi_3_medium_4k_instruct,HFv2 Math Level 5,16.99,,hf_open_llm_v2_240829.csv +phi_3_medium_4k_instruct,HFv2 MuSR,13.05,,hf_open_llm_v2_240829.csv +phi_3_mini_128k_instruct,HF OpenLLM v2,25.49,,hf_open_llm_v2_240829.csv +phi_3_mini_128k_instruct,HFv2 BBH,37.1,,hf_open_llm_v2_240829.csv +phi_3_mini_128k_instruct,HFv2 GPQA,9.06,,hf_open_llm_v2_240829.csv +phi_3_mini_128k_instruct,HFv2 IFEval,59.76,,hf_open_llm_v2_240829.csv +phi_3_mini_128k_instruct,HFv2 MMLU Pro,30.38,,hf_open_llm_v2_240829.csv +phi_3_mini_128k_instruct,HFv2 Math Level 5,8.91,,hf_open_llm_v2_240829.csv +phi_3_mini_128k_instruct,HFv2 MuSR,7.71,,hf_open_llm_v2_240829.csv +phi_3_mini_4k_instruct,HF OpenLLM v2,27.2,,hf_open_llm_v2_240829.csv +phi_3_mini_4k_instruct,HFv2 BBH,39.27,,hf_open_llm_v2_240829.csv +phi_3_mini_4k_instruct,HFv2 GPQA,10.96,,hf_open_llm_v2_240829.csv +phi_3_mini_4k_instruct,HFv2 IFEval,56.13,,hf_open_llm_v2_240829.csv +phi_3_mini_4k_instruct,HFv2 MMLU Pro,33.58,,hf_open_llm_v2_240829.csv +phi_3_mini_4k_instruct,HFv2 Math Level 5,14.2,,hf_open_llm_v2_240829.csv +phi_3_mini_4k_instruct,HFv2 MuSR,13.12,,hf_open_llm_v2_240829.csv +phi_3_mini_4k_instruct_cpo_simpo,HF OpenLLM v2,25.87,,hf_open_llm_v2_240829.csv +phi_3_mini_4k_instruct_cpo_simpo,HFv2 BBH,39.15,,hf_open_llm_v2_240829.csv +phi_3_mini_4k_instruct_cpo_simpo,HFv2 GPQA,10.74,,hf_open_llm_v2_240829.csv +phi_3_mini_4k_instruct_cpo_simpo,HFv2 IFEval,57.14,,hf_open_llm_v2_240829.csv +phi_3_mini_4k_instruct_cpo_simpo,HFv2 MMLU Pro,31.78,,hf_open_llm_v2_240829.csv +phi_3_mini_4k_instruct_cpo_simpo,HFv2 Math Level 5,7.63,,hf_open_llm_v2_240829.csv +phi_3_mini_4k_instruct_cpo_simpo,HFv2 MuSR,8.78,,hf_open_llm_v2_240829.csv +phi_3_small_128k_instruct,HF OpenLLM v2,28.59,,hf_open_llm_v2_240829.csv +phi_3_small_128k_instruct,HFv2 BBH,45.63,,hf_open_llm_v2_240829.csv +phi_3_small_128k_instruct,HFv2 GPQA,8.95,,hf_open_llm_v2_240829.csv +phi_3_small_128k_instruct,HFv2 IFEval,63.68,,hf_open_llm_v2_240829.csv +phi_3_small_128k_instruct,HFv2 MMLU Pro,38.78,,hf_open_llm_v2_240829.csv +phi_3_small_128k_instruct,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +phi_3_small_128k_instruct,HFv2 MuSR,14.5,,hf_open_llm_v2_240829.csv +pythia_12b,HF OpenLLM v2,5.93,,hf_open_llm_v2_240829.csv +pythia_12b,HFv2 BBH,4.99,,hf_open_llm_v2_240829.csv +pythia_12b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +pythia_12b,HFv2 IFEval,24.71,,hf_open_llm_v2_240829.csv +pythia_12b,HFv2 MMLU Pro,1.21,,hf_open_llm_v2_240829.csv +pythia_12b,HFv2 Math Level 5,0.91,,hf_open_llm_v2_240829.csv +pythia_12b,HFv2 MuSR,3.79,,hf_open_llm_v2_240829.csv +pythia_160m,HF OpenLLM v2,5.62,,hf_open_llm_v2_240829.csv +pythia_160m,HFv2 BBH,2.2,,hf_open_llm_v2_240829.csv +pythia_160m,HFv2 GPQA,1.12,,hf_open_llm_v2_240829.csv +pythia_160m,HFv2 IFEval,18.16,,hf_open_llm_v2_240829.csv +pythia_160m,HFv2 MMLU Pro,1.33,,hf_open_llm_v2_240829.csv +pythia_160m,HFv2 Math Level 5,0.23,,hf_open_llm_v2_240829.csv +pythia_160m,HFv2 MuSR,10.68,,hf_open_llm_v2_240829.csv +pythia_2_8b,HF OpenLLM v2,5.44,,hf_open_llm_v2_240829.csv +pythia_2_8b,HFv2 BBH,5.08,,hf_open_llm_v2_240829.csv +pythia_2_8b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +pythia_2_8b,HFv2 IFEval,21.73,,hf_open_llm_v2_240829.csv +pythia_2_8b,HFv2 MMLU Pro,1.52,,hf_open_llm_v2_240829.csv +pythia_2_8b,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv +pythia_2_8b,HFv2 MuSR,3.64,,hf_open_llm_v2_240829.csv +pythia_410m,HF OpenLLM v2,5.11,,hf_open_llm_v2_240829.csv +pythia_410m,HFv2 BBH,2.72,,hf_open_llm_v2_240829.csv +pythia_410m,HFv2 GPQA,1.23,,hf_open_llm_v2_240829.csv +pythia_410m,HFv2 IFEval,21.95,,hf_open_llm_v2_240829.csv +pythia_410m,HFv2 MMLU Pro,1.42,,hf_open_llm_v2_240829.csv +pythia_410m,HFv2 Math Level 5,0.3,,hf_open_llm_v2_240829.csv +pythia_410m,HFv2 MuSR,3.06,,hf_open_llm_v2_240829.csv +pythia_410m_roberta_lr_8e7_kl_01_steps_12000_rlhf_model,HF OpenLLM v2,3.82,,hf_open_llm_v2_240829.csv +pythia_410m_roberta_lr_8e7_kl_01_steps_12000_rlhf_model,HFv2 BBH,1.82,,hf_open_llm_v2_240829.csv +pythia_410m_roberta_lr_8e7_kl_01_steps_12000_rlhf_model,HFv2 GPQA,1.23,,hf_open_llm_v2_240829.csv +pythia_410m_roberta_lr_8e7_kl_01_steps_12000_rlhf_model,HFv2 IFEval,15.72,,hf_open_llm_v2_240829.csv +pythia_410m_roberta_lr_8e7_kl_01_steps_12000_rlhf_model,HFv2 MMLU Pro,1.87,,hf_open_llm_v2_240829.csv +pythia_410m_roberta_lr_8e7_kl_01_steps_12000_rlhf_model,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +pythia_410m_roberta_lr_8e7_kl_01_steps_12000_rlhf_model,HFv2 MuSR,2.25,,hf_open_llm_v2_240829.csv +pythia_6_9b,HF OpenLLM v2,5.85,,hf_open_llm_v2_240829.csv +pythia_6_9b,HFv2 BBH,5.88,,hf_open_llm_v2_240829.csv +pythia_6_9b,HFv2 GPQA,0.22,,hf_open_llm_v2_240829.csv +pythia_6_9b,HFv2 IFEval,22.81,,hf_open_llm_v2_240829.csv +pythia_6_9b,HFv2 MMLU Pro,1.63,,hf_open_llm_v2_240829.csv +pythia_6_9b,HFv2 Math Level 5,0.76,,hf_open_llm_v2_240829.csv +pythia_6_9b,HFv2 MuSR,3.81,,hf_open_llm_v2_240829.csv +qwen1_5_0_5b,HF OpenLLM v2,5.14,,hf_open_llm_v2_240829.csv +qwen1_5_0_5b,HFv2 BBH,5.04,,hf_open_llm_v2_240829.csv +qwen1_5_0_5b,HFv2 GPQA,0.56,,hf_open_llm_v2_240829.csv +qwen1_5_0_5b,HFv2 IFEval,17.06,,hf_open_llm_v2_240829.csv +qwen1_5_0_5b,HFv2 MMLU Pro,3.41,,hf_open_llm_v2_240829.csv +qwen1_5_0_5b,HFv2 Math Level 5,0.45,,hf_open_llm_v2_240829.csv +qwen1_5_0_5b,HFv2 MuSR,4.3,,hf_open_llm_v2_240829.csv +qwen1_5_0_5b_chat,HF OpenLLM v2,5.56,,hf_open_llm_v2_240829.csv +qwen1_5_0_5b_chat,HFv2 BBH,4.32,,hf_open_llm_v2_240829.csv +qwen1_5_0_5b_chat,HFv2 GPQA,2.57,,hf_open_llm_v2_240829.csv +qwen1_5_0_5b_chat,HFv2 IFEval,18.07,,hf_open_llm_v2_240829.csv +qwen1_5_0_5b_chat,HFv2 MMLU Pro,2.36,,hf_open_llm_v2_240829.csv +qwen1_5_0_5b_chat,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +qwen1_5_0_5b_chat,HFv2 MuSR,6.06,,hf_open_llm_v2_240829.csv +qwen1_5_110b,HF OpenLLM v2,29.56,,hf_open_llm_v2_240829.csv +qwen1_5_110b,HFv2 BBH,44.28,,hf_open_llm_v2_240829.csv +qwen1_5_110b,HFv2 GPQA,13.65,,hf_open_llm_v2_240829.csv +qwen1_5_110b,HFv2 IFEval,34.22,,hf_open_llm_v2_240829.csv +qwen1_5_110b,HFv2 MMLU Pro,48.45,,hf_open_llm_v2_240829.csv +qwen1_5_110b,HFv2 Math Level 5,23.04,,hf_open_llm_v2_240829.csv +qwen1_5_110b,HFv2 MuSR,13.71,,hf_open_llm_v2_240829.csv +qwen1_5_110b_chat,HF OpenLLM v2,29.22,,hf_open_llm_v2_240829.csv +qwen1_5_110b_chat,HFv2 BBH,44.98,,hf_open_llm_v2_240829.csv +qwen1_5_110b_chat,HFv2 GPQA,12.19,,hf_open_llm_v2_240829.csv +qwen1_5_110b_chat,HFv2 IFEval,59.39,,hf_open_llm_v2_240829.csv +qwen1_5_110b_chat,HFv2 MMLU Pro,42.5,,hf_open_llm_v2_240829.csv +qwen1_5_110b_chat,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +qwen1_5_110b_chat,HFv2 MuSR,16.29,,hf_open_llm_v2_240829.csv +qwen1_5_14b,HF OpenLLM v2,20.22,,hf_open_llm_v2_240829.csv +qwen1_5_14b,HFv2 BBH,30.06,,hf_open_llm_v2_240829.csv +qwen1_5_14b,HFv2 GPQA,5.93,,hf_open_llm_v2_240829.csv +qwen1_5_14b,HFv2 IFEval,29.05,,hf_open_llm_v2_240829.csv +qwen1_5_14b,HFv2 MMLU Pro,29.37,,hf_open_llm_v2_240829.csv +qwen1_5_14b,HFv2 Math Level 5,16.47,,hf_open_llm_v2_240829.csv +qwen1_5_14b,HFv2 MuSR,10.46,,hf_open_llm_v2_240829.csv +qwen1_5_14b_chat,HF OpenLLM v2,21.02,,hf_open_llm_v2_240829.csv +qwen1_5_14b_chat,HFv2 BBH,32.76,,hf_open_llm_v2_240829.csv +qwen1_5_14b_chat,HFv2 GPQA,2.68,,hf_open_llm_v2_240829.csv +qwen1_5_14b_chat,HFv2 IFEval,47.68,,hf_open_llm_v2_240829.csv +qwen1_5_14b_chat,HFv2 MMLU Pro,29.09,,hf_open_llm_v2_240829.csv +qwen1_5_14b_chat,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +qwen1_5_14b_chat,HFv2 MuSR,13.93,,hf_open_llm_v2_240829.csv +qwen1_5_1_8b,HF OpenLLM v2,9.12,,hf_open_llm_v2_240829.csv +qwen1_5_1_8b,HFv2 BBH,9.76,,hf_open_llm_v2_240829.csv +qwen1_5_1_8b,HFv2 GPQA,7.38,,hf_open_llm_v2_240829.csv +qwen1_5_1_8b,HFv2 IFEval,21.54,,hf_open_llm_v2_240829.csv +qwen1_5_1_8b,HFv2 MMLU Pro,9.8,,hf_open_llm_v2_240829.csv +qwen1_5_1_8b,HFv2 Math Level 5,2.27,,hf_open_llm_v2_240829.csv +qwen1_5_1_8b,HFv2 MuSR,3.96,,hf_open_llm_v2_240829.csv +qwen1_5_1_8b_chat,HF OpenLLM v2,9.01,,hf_open_llm_v2_240829.csv +qwen1_5_1_8b_chat,HFv2 BBH,5.91,,hf_open_llm_v2_240829.csv +qwen1_5_1_8b_chat,HFv2 GPQA,6.38,,hf_open_llm_v2_240829.csv +qwen1_5_1_8b_chat,HFv2 IFEval,20.19,,hf_open_llm_v2_240829.csv +qwen1_5_1_8b_chat,HFv2 MMLU Pro,8.93,,hf_open_llm_v2_240829.csv +qwen1_5_1_8b_chat,HFv2 Math Level 5,0.45,,hf_open_llm_v2_240829.csv +qwen1_5_1_8b_chat,HFv2 MuSR,12.18,,hf_open_llm_v2_240829.csv +qwen1_5_32b,HF OpenLLM v2,26.69,,hf_open_llm_v2_240829.csv +qwen1_5_32b,HFv2 BBH,38.98,,hf_open_llm_v2_240829.csv +qwen1_5_32b,HFv2 GPQA,10.63,,hf_open_llm_v2_240829.csv +qwen1_5_32b,HFv2 IFEval,32.97,,hf_open_llm_v2_240829.csv +qwen1_5_32b,HFv2 MMLU Pro,38.89,,hf_open_llm_v2_240829.csv +qwen1_5_32b,HFv2 Math Level 5,26.66,,hf_open_llm_v2_240829.csv +qwen1_5_32b,HFv2 MuSR,12.04,,hf_open_llm_v2_240829.csv +qwen1_5_32b_chat,HF OpenLLM v2,27.1,,hf_open_llm_v2_240829.csv +qwen1_5_32b_chat,HFv2 BBH,44.55,,hf_open_llm_v2_240829.csv +qwen1_5_32b_chat,HFv2 GPQA,7.49,,hf_open_llm_v2_240829.csv +qwen1_5_32b_chat,HFv2 IFEval,55.32,,hf_open_llm_v2_240829.csv +qwen1_5_32b_chat,HFv2 MMLU Pro,38.41,,hf_open_llm_v2_240829.csv +qwen1_5_32b_chat,HFv2 Math Level 5,6.65,,hf_open_llm_v2_240829.csv +qwen1_5_32b_chat,HFv2 MuSR,10.2,,hf_open_llm_v2_240829.csv +qwen1_5_4b,HF OpenLLM v2,11.29,,hf_open_llm_v2_240829.csv +qwen1_5_4b,HFv2 BBH,16.25,,hf_open_llm_v2_240829.csv +qwen1_5_4b,HFv2 GPQA,3.58,,hf_open_llm_v2_240829.csv +qwen1_5_4b,HFv2 IFEval,24.45,,hf_open_llm_v2_240829.csv +qwen1_5_4b,HFv2 MMLU Pro,16.22,,hf_open_llm_v2_240829.csv +qwen1_5_4b,HFv2 Math Level 5,2.42,,hf_open_llm_v2_240829.csv +qwen1_5_4b,HFv2 MuSR,4.82,,hf_open_llm_v2_240829.csv +qwen1_5_4b_chat,HF OpenLLM v2,12.33,,hf_open_llm_v2_240829.csv +qwen1_5_4b_chat,HFv2 BBH,16.3,,hf_open_llm_v2_240829.csv +qwen1_5_4b_chat,HFv2 GPQA,2.24,,hf_open_llm_v2_240829.csv +qwen1_5_4b_chat,HFv2 IFEval,31.57,,hf_open_llm_v2_240829.csv +qwen1_5_4b_chat,HFv2 MMLU Pro,15.51,,hf_open_llm_v2_240829.csv +qwen1_5_4b_chat,HFv2 Math Level 5,0.98,,hf_open_llm_v2_240829.csv +qwen1_5_4b_chat,HFv2 MuSR,7.36,,hf_open_llm_v2_240829.csv +qwen1_5_7b,HF OpenLLM v2,15.22,,hf_open_llm_v2_240829.csv +qwen1_5_7b,HFv2 BBH,23.08,,hf_open_llm_v2_240829.csv +qwen1_5_7b,HFv2 GPQA,6.49,,hf_open_llm_v2_240829.csv +qwen1_5_7b,HFv2 IFEval,26.84,,hf_open_llm_v2_240829.csv +qwen1_5_7b,HFv2 MMLU Pro,21.29,,hf_open_llm_v2_240829.csv +qwen1_5_7b,HFv2 Math Level 5,4.46,,hf_open_llm_v2_240829.csv +qwen1_5_7b,HFv2 MuSR,9.16,,hf_open_llm_v2_240829.csv +qwen1_5_7b_chat,HF OpenLLM v2,16.58,,hf_open_llm_v2_240829.csv +qwen1_5_7b_chat,HFv2 BBH,22.38,,hf_open_llm_v2_240829.csv +qwen1_5_7b_chat,HFv2 GPQA,7.05,,hf_open_llm_v2_240829.csv +qwen1_5_7b_chat,HFv2 IFEval,43.71,,hf_open_llm_v2_240829.csv +qwen1_5_7b_chat,HFv2 MMLU Pro,21.68,,hf_open_llm_v2_240829.csv +qwen1_5_7b_chat,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +qwen1_5_7b_chat,HFv2 MuSR,4.64,,hf_open_llm_v2_240829.csv +qwen1_5_moe_a2_7b,HF OpenLLM v2,12.42,,hf_open_llm_v2_240829.csv +qwen1_5_moe_a2_7b,HFv2 BBH,18.84,,hf_open_llm_v2_240829.csv +qwen1_5_moe_a2_7b,HFv2 GPQA,1.23,,hf_open_llm_v2_240829.csv +qwen1_5_moe_a2_7b,HFv2 IFEval,26.6,,hf_open_llm_v2_240829.csv +qwen1_5_moe_a2_7b,HFv2 MMLU Pro,19.75,,hf_open_llm_v2_240829.csv +qwen1_5_moe_a2_7b,HFv2 Math Level 5,0.15,,hf_open_llm_v2_240829.csv +qwen1_5_moe_a2_7b,HFv2 MuSR,7.97,,hf_open_llm_v2_240829.csv +qwen1_5_moe_a2_7b_chat,HF OpenLLM v2,14.82,,hf_open_llm_v2_240829.csv +qwen1_5_moe_a2_7b_chat,HFv2 BBH,20.04,,hf_open_llm_v2_240829.csv +qwen1_5_moe_a2_7b_chat,HFv2 GPQA,3.24,,hf_open_llm_v2_240829.csv +qwen1_5_moe_a2_7b_chat,HFv2 IFEval,37.95,,hf_open_llm_v2_240829.csv +qwen1_5_moe_a2_7b_chat,HFv2 MMLU Pro,21.37,,hf_open_llm_v2_240829.csv +qwen1_5_moe_a2_7b_chat,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +qwen1_5_moe_a2_7b_chat,HFv2 MuSR,6.33,,hf_open_llm_v2_240829.csv +qwen2_0_5b,HF OpenLLM v2,7.06,,hf_open_llm_v2_240829.csv +qwen2_0_5b,HFv2 BBH,7.99,,hf_open_llm_v2_240829.csv +qwen2_0_5b,HFv2 GPQA,0.78,,hf_open_llm_v2_240829.csv +qwen2_0_5b,HFv2 IFEval,18.67,,hf_open_llm_v2_240829.csv +qwen2_0_5b,HFv2 MMLU Pro,7.76,,hf_open_llm_v2_240829.csv +qwen2_0_5b,HFv2 Math Level 5,2.57,,hf_open_llm_v2_240829.csv +qwen2_0_5b,HFv2 MuSR,4.6,,hf_open_llm_v2_240829.csv +qwen2_0_5b_instruct,HF OpenLLM v2,6.39,,hf_open_llm_v2_240829.csv +qwen2_0_5b_instruct,HFv2 BBH,5.88,,hf_open_llm_v2_240829.csv +qwen2_0_5b_instruct,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +qwen2_0_5b_instruct,HFv2 IFEval,22.47,,hf_open_llm_v2_240829.csv +qwen2_0_5b_instruct,HFv2 MMLU Pro,5.9,,hf_open_llm_v2_240829.csv +qwen2_0_5b_instruct,HFv2 Math Level 5,1.66,,hf_open_llm_v2_240829.csv +qwen2_0_5b_instruct,HFv2 MuSR,2.41,,hf_open_llm_v2_240829.csv +qwen2_1_5b,HF OpenLLM v2,10.32,,hf_open_llm_v2_240829.csv +qwen2_1_5b,HFv2 BBH,11.78,,hf_open_llm_v2_240829.csv +qwen2_1_5b,HFv2 GPQA,1.9,,hf_open_llm_v2_240829.csv +qwen2_1_5b,HFv2 IFEval,21.13,,hf_open_llm_v2_240829.csv +qwen2_1_5b,HFv2 MMLU Pro,17.24,,hf_open_llm_v2_240829.csv +qwen2_1_5b,HFv2 Math Level 5,6.27,,hf_open_llm_v2_240829.csv +qwen2_1_5b,HFv2 MuSR,3.59,,hf_open_llm_v2_240829.csv +qwen2_1_5b_instruct,HF OpenLLM v2,13.92,,hf_open_llm_v2_240829.csv +qwen2_1_5b_instruct,HFv2 BBH,13.7,,hf_open_llm_v2_240829.csv +qwen2_1_5b_instruct,HFv2 GPQA,1.57,,hf_open_llm_v2_240829.csv +qwen2_1_5b_instruct,HFv2 IFEval,33.71,,hf_open_llm_v2_240829.csv +qwen2_1_5b_instruct,HFv2 MMLU Pro,16.68,,hf_open_llm_v2_240829.csv +qwen2_1_5b_instruct,HFv2 Math Level 5,5.82,,hf_open_llm_v2_240829.csv +qwen2_1_5b_instruct,HFv2 MuSR,12.03,,hf_open_llm_v2_240829.csv +qwen2_57b_a14b,HF OpenLLM v2,25.03,,hf_open_llm_v2_240829.csv +qwen2_57b_a14b,HFv2 BBH,38.88,,hf_open_llm_v2_240829.csv +qwen2_57b_a14b,HFv2 GPQA,7.49,,hf_open_llm_v2_240829.csv +qwen2_57b_a14b,HFv2 IFEval,31.13,,hf_open_llm_v2_240829.csv +qwen2_57b_a14b,HFv2 MMLU Pro,43.51,,hf_open_llm_v2_240829.csv +qwen2_57b_a14b,HFv2 Math Level 5,18.66,,hf_open_llm_v2_240829.csv +qwen2_57b_a14b,HFv2 MuSR,10.54,,hf_open_llm_v2_240829.csv +qwen2_57b_a14b_instruct,HF OpenLLM v2,29.6,,hf_open_llm_v2_240829.csv +qwen2_57b_a14b_instruct,HFv2 BBH,41.79,,hf_open_llm_v2_240829.csv +qwen2_57b_a14b_instruct,HFv2 GPQA,10.85,,hf_open_llm_v2_240829.csv +qwen2_57b_a14b_instruct,HFv2 IFEval,63.38,,hf_open_llm_v2_240829.csv +qwen2_57b_a14b_instruct,HFv2 MMLU Pro,39.73,,hf_open_llm_v2_240829.csv +qwen2_57b_a14b_instruct,HFv2 Math Level 5,7.7,,hf_open_llm_v2_240829.csv +qwen2_57b_a14b_instruct,HFv2 MuSR,14.18,,hf_open_llm_v2_240829.csv +qwen2_72b,HF OpenLLM v2,35.13,,hf_open_llm_v2_240829.csv +qwen2_72b,HFv2 BBH,51.86,,hf_open_llm_v2_240829.csv +qwen2_72b,HFv2 GPQA,19.24,,hf_open_llm_v2_240829.csv +qwen2_72b,HFv2 IFEval,38.24,,hf_open_llm_v2_240829.csv +qwen2_72b,HFv2 MMLU Pro,52.56,,hf_open_llm_v2_240829.csv +qwen2_72b,HFv2 Math Level 5,29.15,,hf_open_llm_v2_240829.csv +qwen2_72b,HFv2 MuSR,19.73,,hf_open_llm_v2_240829.csv +qwen2_72b_instruct,HF OpenLLM v2,42.49,,hf_open_llm_v2_240829.csv +qwen2_72b_instruct,HFv2 BBH,57.48,,hf_open_llm_v2_240829.csv +qwen2_72b_instruct,HFv2 GPQA,16.33,,hf_open_llm_v2_240829.csv +qwen2_72b_instruct,HFv2 IFEval,79.89,,hf_open_llm_v2_240829.csv +qwen2_72b_instruct,HFv2 MMLU Pro,48.92,,hf_open_llm_v2_240829.csv +qwen2_72b_instruct,HFv2 Math Level 5,35.12,,hf_open_llm_v2_240829.csv +qwen2_72b_instruct,HFv2 MuSR,17.17,,hf_open_llm_v2_240829.csv +qwen2_7b,HF OpenLLM v2,23.66,,hf_open_llm_v2_240829.csv +qwen2_7b,HFv2 BBH,34.71,,hf_open_llm_v2_240829.csv +qwen2_7b,HFv2 GPQA,7.27,,hf_open_llm_v2_240829.csv +qwen2_7b,HFv2 IFEval,31.49,,hf_open_llm_v2_240829.csv +qwen2_7b,HFv2 MMLU Pro,35.37,,hf_open_llm_v2_240829.csv +qwen2_7b,HFv2 Math Level 5,18.81,,hf_open_llm_v2_240829.csv +qwen2_7b,HFv2 MuSR,14.32,,hf_open_llm_v2_240829.csv +qwen2_7b_instruct,HF OpenLLM v2,24.76,,hf_open_llm_v2_240829.csv +qwen2_7b_instruct,HFv2 BBH,37.81,,hf_open_llm_v2_240829.csv +qwen2_7b_instruct,HFv2 GPQA,6.38,,hf_open_llm_v2_240829.csv +qwen2_7b_instruct,HFv2 IFEval,56.79,,hf_open_llm_v2_240829.csv +qwen2_7b_instruct,HFv2 MMLU Pro,31.64,,hf_open_llm_v2_240829.csv +qwen2_7b_instruct,HFv2 Math Level 5,8.61,,hf_open_llm_v2_240829.csv +qwen2_7b_instruct,HFv2 MuSR,7.37,,hf_open_llm_v2_240829.csv +qwen2_cantonese_7b_instruct,HF OpenLLM v2,23.5,,hf_open_llm_v2_240829.csv +qwen2_cantonese_7b_instruct,HFv2 BBH,32.45,,hf_open_llm_v2_240829.csv +qwen2_cantonese_7b_instruct,HFv2 GPQA,6.04,,hf_open_llm_v2_240829.csv +qwen2_cantonese_7b_instruct,HFv2 IFEval,54.35,,hf_open_llm_v2_240829.csv +qwen2_cantonese_7b_instruct,HFv2 MMLU Pro,31.59,,hf_open_llm_v2_240829.csv +qwen2_cantonese_7b_instruct,HFv2 Math Level 5,8.76,,hf_open_llm_v2_240829.csv +qwen2_cantonese_7b_instruct,HFv2 MuSR,7.81,,hf_open_llm_v2_240829.csv +recurrentgemma_2b,HF OpenLLM v2,6.94,,hf_open_llm_v2_240829.csv +recurrentgemma_2b,HFv2 BBH,4.82,,hf_open_llm_v2_240829.csv +recurrentgemma_2b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +recurrentgemma_2b,HFv2 IFEval,30.17,,hf_open_llm_v2_240829.csv +recurrentgemma_2b,HFv2 MMLU Pro,1.96,,hf_open_llm_v2_240829.csv +recurrentgemma_2b,HFv2 Math Level 5,1.59,,hf_open_llm_v2_240829.csv +recurrentgemma_2b,HFv2 MuSR,3.1,,hf_open_llm_v2_240829.csv +recurrentgemma_2b_it,HF OpenLLM v2,7.92,,hf_open_llm_v2_240829.csv +recurrentgemma_2b_it,HFv2 BBH,7.98,,hf_open_llm_v2_240829.csv +recurrentgemma_2b_it,HFv2 GPQA,0.45,,hf_open_llm_v2_240829.csv +recurrentgemma_2b_it,HFv2 IFEval,29.49,,hf_open_llm_v2_240829.csv +recurrentgemma_2b_it,HFv2 MMLU Pro,4.47,,hf_open_llm_v2_240829.csv +recurrentgemma_2b_it,HFv2 Math Level 5,1.51,,hf_open_llm_v2_240829.csv +recurrentgemma_2b_it,HFv2 MuSR,3.62,,hf_open_llm_v2_240829.csv +recurrentgemma_9b,HF OpenLLM v2,13.5,,hf_open_llm_v2_240829.csv +recurrentgemma_9b,HFv2 BBH,15.32,,hf_open_llm_v2_240829.csv +recurrentgemma_9b,HFv2 GPQA,4.7,,hf_open_llm_v2_240829.csv +recurrentgemma_9b,HFv2 IFEval,31.16,,hf_open_llm_v2_240829.csv +recurrentgemma_9b,HFv2 MMLU Pro,17.83,,hf_open_llm_v2_240829.csv +recurrentgemma_9b,HFv2 Math Level 5,5.36,,hf_open_llm_v2_240829.csv +recurrentgemma_9b,HFv2 MuSR,6.6,,hf_open_llm_v2_240829.csv +recurrentgemma_9b_it,HF OpenLLM v2,19.12,,hf_open_llm_v2_240829.csv +recurrentgemma_9b_it,HFv2 BBH,21.62,,hf_open_llm_v2_240829.csv +recurrentgemma_9b_it,HFv2 GPQA,2.68,,hf_open_llm_v2_240829.csv +recurrentgemma_9b_it,HFv2 IFEval,50.1,,hf_open_llm_v2_240829.csv +recurrentgemma_9b_it,HFv2 MMLU Pro,20.48,,hf_open_llm_v2_240829.csv +recurrentgemma_9b_it,HFv2 Math Level 5,6.04,,hf_open_llm_v2_240829.csv +recurrentgemma_9b_it,HFv2 MuSR,13.77,,hf_open_llm_v2_240829.csv +redpajama_incite_7b_base,HF OpenLLM v2,5.46,,hf_open_llm_v2_240829.csv +redpajama_incite_7b_base,HFv2 BBH,5.09,,hf_open_llm_v2_240829.csv +redpajama_incite_7b_base,HFv2 GPQA,0.67,,hf_open_llm_v2_240829.csv +redpajama_incite_7b_base,HFv2 IFEval,20.82,,hf_open_llm_v2_240829.csv +redpajama_incite_7b_base,HFv2 MMLU Pro,2.19,,hf_open_llm_v2_240829.csv +redpajama_incite_7b_base,HFv2 Math Level 5,0.98,,hf_open_llm_v2_240829.csv +redpajama_incite_7b_base,HFv2 MuSR,3.02,,hf_open_llm_v2_240829.csv +redpajama_incite_base_3b_v1,HF OpenLLM v2,5.43,,hf_open_llm_v2_240829.csv +redpajama_incite_base_3b_v1,HFv2 BBH,3.52,,hf_open_llm_v2_240829.csv +redpajama_incite_base_3b_v1,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +redpajama_incite_base_3b_v1,HFv2 IFEval,22.94,,hf_open_llm_v2_240829.csv +redpajama_incite_base_3b_v1,HFv2 MMLU Pro,1.24,,hf_open_llm_v2_240829.csv +redpajama_incite_base_3b_v1,HFv2 Math Level 5,0.91,,hf_open_llm_v2_240829.csv +redpajama_incite_base_3b_v1,HFv2 MuSR,4.0,,hf_open_llm_v2_240829.csv +rhea_72b_v0_5,HF OpenLLM v2,4.02,,hf_open_llm_v2_240829.csv +rhea_72b_v0_5,HFv2 BBH,3.67,,hf_open_llm_v2_240829.csv +rhea_72b_v0_5,HFv2 GPQA,0.34,,hf_open_llm_v2_240829.csv +rhea_72b_v0_5,HFv2 IFEval,1.45,,hf_open_llm_v2_240829.csv +rhea_72b_v0_5,HFv2 MMLU Pro,1.85,,hf_open_llm_v2_240829.csv +rhea_72b_v0_5,HFv2 Math Level 5,5.51,,hf_open_llm_v2_240829.csv +rhea_72b_v0_5,HFv2 MuSR,11.32,,hf_open_llm_v2_240829.csv +roleplay_llama3_8b,HF OpenLLM v2,23.94,,hf_open_llm_v2_240829.csv +roleplay_llama3_8b,HFv2 BBH,28.55,,hf_open_llm_v2_240829.csv +roleplay_llama3_8b,HFv2 GPQA,1.45,,hf_open_llm_v2_240829.csv +roleplay_llama3_8b,HFv2 IFEval,73.2,,hf_open_llm_v2_240829.csv +roleplay_llama3_8b,HFv2 MMLU Pro,30.09,,hf_open_llm_v2_240829.csv +roleplay_llama3_8b,HFv2 Math Level 5,8.69,,hf_open_llm_v2_240829.csv +roleplay_llama3_8b,HFv2 MuSR,1.68,,hf_open_llm_v2_240829.csv +rys_llama3_8b_instruct,HF OpenLLM v2,21.81,,hf_open_llm_v2_240829.csv +rys_llama3_8b_instruct,HFv2 BBH,25.37,,hf_open_llm_v2_240829.csv +rys_llama3_8b_instruct,HFv2 GPQA,1.01,,hf_open_llm_v2_240829.csv +rys_llama3_8b_instruct,HFv2 IFEval,69.58,,hf_open_llm_v2_240829.csv +rys_llama3_8b_instruct,HFv2 MMLU Pro,28.41,,hf_open_llm_v2_240829.csv +rys_llama3_8b_instruct,HFv2 Math Level 5,6.19,,hf_open_llm_v2_240829.csv +rys_llama3_8b_instruct,HFv2 MuSR,0.29,,hf_open_llm_v2_240829.csv +rys_llama3_huge_instruct,HF OpenLLM v2,34.37,,hf_open_llm_v2_240829.csv +rys_llama3_huge_instruct,HFv2 BBH,49.07,,hf_open_llm_v2_240829.csv +rys_llama3_huge_instruct,HFv2 GPQA,1.45,,hf_open_llm_v2_240829.csv +rys_llama3_huge_instruct,HFv2 IFEval,76.86,,hf_open_llm_v2_240829.csv +rys_llama3_huge_instruct,HFv2 MMLU Pro,45.66,,hf_open_llm_v2_240829.csv +rys_llama3_huge_instruct,HFv2 Math Level 5,21.22,,hf_open_llm_v2_240829.csv +rys_llama3_huge_instruct,HFv2 MuSR,11.93,,hf_open_llm_v2_240829.csv +rys_llama3_large_instruct,HF OpenLLM v2,35.78,,hf_open_llm_v2_240829.csv +rys_llama3_large_instruct,HFv2 BBH,49.67,,hf_open_llm_v2_240829.csv +rys_llama3_large_instruct,HFv2 GPQA,5.26,,hf_open_llm_v2_240829.csv +rys_llama3_large_instruct,HFv2 IFEval,80.51,,hf_open_llm_v2_240829.csv +rys_llama3_large_instruct,HFv2 MMLU Pro,45.97,,hf_open_llm_v2_240829.csv +rys_llama3_large_instruct,HFv2 Math Level 5,21.83,,hf_open_llm_v2_240829.csv +rys_llama3_large_instruct,HFv2 MuSR,11.45,,hf_open_llm_v2_240829.csv +rys_phi_3_medium_4k_instruct,HF OpenLLM v2,28.38,,hf_open_llm_v2_240829.csv +rys_phi_3_medium_4k_instruct,HFv2 BBH,46.75,,hf_open_llm_v2_240829.csv +rys_phi_3_medium_4k_instruct,HFv2 GPQA,13.98,,hf_open_llm_v2_240829.csv +rys_phi_3_medium_4k_instruct,HFv2 IFEval,43.91,,hf_open_llm_v2_240829.csv +rys_phi_3_medium_4k_instruct,HFv2 MMLU Pro,42.74,,hf_open_llm_v2_240829.csv +rys_phi_3_medium_4k_instruct,HFv2 Math Level 5,11.78,,hf_open_llm_v2_240829.csv +rys_phi_3_medium_4k_instruct,HFv2 MuSR,11.09,,hf_open_llm_v2_240829.csv +sauerkrautlm_una_solar_instruct,HF OpenLLM v2,19.71,,hf_open_llm_v2_240829.csv +sauerkrautlm_una_solar_instruct,HFv2 BBH,31.82,,hf_open_llm_v2_240829.csv +sauerkrautlm_una_solar_instruct,HFv2 GPQA,8.17,,hf_open_llm_v2_240829.csv +sauerkrautlm_una_solar_instruct,HFv2 IFEval,45.73,,hf_open_llm_v2_240829.csv +sauerkrautlm_una_solar_instruct,HFv2 MMLU Pro,23.93,,hf_open_llm_v2_240829.csv +sauerkrautlm_una_solar_instruct,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +sauerkrautlm_una_solar_instruct,HFv2 MuSR,8.6,,hf_open_llm_v2_240829.csv +seallms_v3_7b_chat,HF OpenLLM v2,23.63,,hf_open_llm_v2_240829.csv +seallms_v3_7b_chat,HFv2 BBH,33.8,,hf_open_llm_v2_240829.csv +seallms_v3_7b_chat,HFv2 GPQA,6.49,,hf_open_llm_v2_240829.csv +seallms_v3_7b_chat,HFv2 IFEval,43.77,,hf_open_llm_v2_240829.csv +seallms_v3_7b_chat,HFv2 MMLU Pro,32.16,,hf_open_llm_v2_240829.csv +seallms_v3_7b_chat,HFv2 Math Level 5,15.11,,hf_open_llm_v2_240829.csv +seallms_v3_7b_chat,HFv2 MuSR,10.47,,hf_open_llm_v2_240829.csv +sheared_llama_1_3b,HF OpenLLM v2,5.51,,hf_open_llm_v2_240829.csv +sheared_llama_1_3b,HFv2 BBH,4.74,,hf_open_llm_v2_240829.csv +sheared_llama_1_3b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +sheared_llama_1_3b,HFv2 IFEval,21.98,,hf_open_llm_v2_240829.csv +sheared_llama_1_3b,HFv2 MMLU Pro,1.9,,hf_open_llm_v2_240829.csv +sheared_llama_1_3b,HFv2 Math Level 5,0.83,,hf_open_llm_v2_240829.csv +sheared_llama_1_3b,HFv2 MuSR,3.58,,hf_open_llm_v2_240829.csv +sheared_llama_2_7b,HF OpenLLM v2,6.31,,hf_open_llm_v2_240829.csv +sheared_llama_2_7b,HFv2 BBH,5.66,,hf_open_llm_v2_240829.csv +sheared_llama_2_7b,HFv2 GPQA,3.36,,hf_open_llm_v2_240829.csv +sheared_llama_2_7b,HFv2 IFEval,24.17,,hf_open_llm_v2_240829.csv +sheared_llama_2_7b,HFv2 MMLU Pro,2.08,,hf_open_llm_v2_240829.csv +sheared_llama_2_7b,HFv2 Math Level 5,0.53,,hf_open_llm_v2_240829.csv +sheared_llama_2_7b,HFv2 MuSR,2.09,,hf_open_llm_v2_240829.csv +silicon_maid_7b,HF OpenLLM v2,19.32,,hf_open_llm_v2_240829.csv +silicon_maid_7b,HFv2 BBH,16.69,,hf_open_llm_v2_240829.csv +silicon_maid_7b,HFv2 GPQA,5.37,,hf_open_llm_v2_240829.csv +silicon_maid_7b,HFv2 IFEval,53.68,,hf_open_llm_v2_240829.csv +silicon_maid_7b,HFv2 MMLU Pro,23.15,,hf_open_llm_v2_240829.csv +silicon_maid_7b,HFv2 Math Level 5,5.97,,hf_open_llm_v2_240829.csv +silicon_maid_7b,HFv2 MuSR,11.09,,hf_open_llm_v2_240829.csv +smaug_34b_v0_1,HF OpenLLM v2,23.76,,hf_open_llm_v2_240829.csv +smaug_34b_v0_1,HFv2 BBH,34.26,,hf_open_llm_v2_240829.csv +smaug_34b_v0_1,HFv2 GPQA,10.63,,hf_open_llm_v2_240829.csv +smaug_34b_v0_1,HFv2 IFEval,50.16,,hf_open_llm_v2_240829.csv +smaug_34b_v0_1,HFv2 MMLU Pro,39.37,,hf_open_llm_v2_240829.csv +smaug_34b_v0_1,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +smaug_34b_v0_1,HFv2 MuSR,8.13,,hf_open_llm_v2_240829.csv +smaug_llama3_70b_instruct_32k,HF OpenLLM v2,34.72,,hf_open_llm_v2_240829.csv +smaug_llama3_70b_instruct_32k,HFv2 BBH,49.07,,hf_open_llm_v2_240829.csv +smaug_llama3_70b_instruct_32k,HFv2 GPQA,6.15,,hf_open_llm_v2_240829.csv +smaug_llama3_70b_instruct_32k,HFv2 IFEval,77.61,,hf_open_llm_v2_240829.csv +smaug_llama3_70b_instruct_32k,HFv2 MMLU Pro,41.83,,hf_open_llm_v2_240829.csv +smaug_llama3_70b_instruct_32k,HFv2 Math Level 5,21.22,,hf_open_llm_v2_240829.csv +smaug_llama3_70b_instruct_32k,HFv2 MuSR,12.43,,hf_open_llm_v2_240829.csv +smaug_qwen2_72b_instruct,HF OpenLLM v2,41.08,,hf_open_llm_v2_240829.csv +smaug_qwen2_72b_instruct,HFv2 BBH,56.27,,hf_open_llm_v2_240829.csv +smaug_qwen2_72b_instruct,HFv2 GPQA,14.88,,hf_open_llm_v2_240829.csv +smaug_qwen2_72b_instruct,HFv2 IFEval,78.25,,hf_open_llm_v2_240829.csv +smaug_qwen2_72b_instruct,HFv2 MMLU Pro,46.56,,hf_open_llm_v2_240829.csv +smaug_qwen2_72b_instruct,HFv2 Math Level 5,35.35,,hf_open_llm_v2_240829.csv +smaug_qwen2_72b_instruct,HFv2 MuSR,15.18,,hf_open_llm_v2_240829.csv +smol_llama_101m_gqa,HF OpenLLM v2,3.92,,hf_open_llm_v2_240829.csv +smol_llama_101m_gqa,HFv2 BBH,3.2,,hf_open_llm_v2_240829.csv +smol_llama_101m_gqa,HFv2 GPQA,1.01,,hf_open_llm_v2_240829.csv +smol_llama_101m_gqa,HFv2 IFEval,13.84,,hf_open_llm_v2_240829.csv +smol_llama_101m_gqa,HFv2 MMLU Pro,1.19,,hf_open_llm_v2_240829.csv +smol_llama_101m_gqa,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +smol_llama_101m_gqa,HFv2 MuSR,4.28,,hf_open_llm_v2_240829.csv +smol_llama_220m_gqa,HF OpenLLM v2,6.4,,hf_open_llm_v2_240829.csv +smol_llama_220m_gqa,HFv2 BBH,3.04,,hf_open_llm_v2_240829.csv +smol_llama_220m_gqa,HFv2 GPQA,0.78,,hf_open_llm_v2_240829.csv +smol_llama_220m_gqa,HFv2 IFEval,23.86,,hf_open_llm_v2_240829.csv +smol_llama_220m_gqa,HFv2 MMLU Pro,1.66,,hf_open_llm_v2_240829.csv +smol_llama_220m_gqa,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +smol_llama_220m_gqa,HFv2 MuSR,9.07,,hf_open_llm_v2_240829.csv +smollm_135m,HF OpenLLM v2,6.84,,hf_open_llm_v2_240829.csv +smollm_135m,HFv2 BBH,3.29,,hf_open_llm_v2_240829.csv +smollm_135m,HFv2 GPQA,1.12,,hf_open_llm_v2_240829.csv +smollm_135m,HFv2 IFEval,21.25,,hf_open_llm_v2_240829.csv +smollm_135m,HFv2 MMLU Pro,1.36,,hf_open_llm_v2_240829.csv +smollm_135m,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv +smollm_135m,HFv2 MuSR,13.34,,hf_open_llm_v2_240829.csv +smollm_135m_instruct,HF OpenLLM v2,4.23,,hf_open_llm_v2_240829.csv +smollm_135m_instruct,HFv2 BBH,2.08,,hf_open_llm_v2_240829.csv +smollm_135m_instruct,HFv2 GPQA,1.9,,hf_open_llm_v2_240829.csv +smollm_135m_instruct,HFv2 IFEval,15.96,,hf_open_llm_v2_240829.csv +smollm_135m_instruct,HFv2 MMLU Pro,1.84,,hf_open_llm_v2_240829.csv +smollm_135m_instruct,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +smollm_135m_instruct,HFv2 MuSR,3.62,,hf_open_llm_v2_240829.csv +smollm_1_7b,HF OpenLLM v2,5.43,,hf_open_llm_v2_240829.csv +smollm_1_7b,HFv2 BBH,4.41,,hf_open_llm_v2_240829.csv +smollm_1_7b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +smollm_1_7b,HFv2 IFEval,23.62,,hf_open_llm_v2_240829.csv +smollm_1_7b,HFv2 MMLU Pro,1.64,,hf_open_llm_v2_240829.csv +smollm_1_7b,HFv2 Math Level 5,0.76,,hf_open_llm_v2_240829.csv +smollm_1_7b,HFv2 MuSR,2.13,,hf_open_llm_v2_240829.csv +smollm_1_7b_instruct,HF OpenLLM v2,5.14,,hf_open_llm_v2_240829.csv +smollm_1_7b_instruct,HFv2 BBH,2.08,,hf_open_llm_v2_240829.csv +smollm_1_7b_instruct,HFv2 GPQA,1.34,,hf_open_llm_v2_240829.csv +smollm_1_7b_instruct,HFv2 IFEval,23.48,,hf_open_llm_v2_240829.csv +smollm_1_7b_instruct,HFv2 MMLU Pro,1.85,,hf_open_llm_v2_240829.csv +smollm_1_7b_instruct,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +smollm_1_7b_instruct,HFv2 MuSR,2.08,,hf_open_llm_v2_240829.csv +smollm_360m,HF OpenLLM v2,6.15,,hf_open_llm_v2_240829.csv +smollm_360m,HFv2 BBH,3.28,,hf_open_llm_v2_240829.csv +smollm_360m,HFv2 GPQA,2.35,,hf_open_llm_v2_240829.csv +smollm_360m,HFv2 IFEval,21.34,,hf_open_llm_v2_240829.csv +smollm_360m,HFv2 MMLU Pro,1.37,,hf_open_llm_v2_240829.csv +smollm_360m,HFv2 Math Level 5,0.45,,hf_open_llm_v2_240829.csv +smollm_360m,HFv2 MuSR,8.09,,hf_open_llm_v2_240829.csv +solar_10_7b_instruct_v1_0,HF OpenLLM v2,19.63,,hf_open_llm_v2_240829.csv +solar_10_7b_instruct_v1_0,HFv2 BBH,31.87,,hf_open_llm_v2_240829.csv +solar_10_7b_instruct_v1_0,HFv2 GPQA,7.83,,hf_open_llm_v2_240829.csv +solar_10_7b_instruct_v1_0,HFv2 IFEval,47.37,,hf_open_llm_v2_240829.csv +solar_10_7b_instruct_v1_0,HFv2 MMLU Pro,23.76,,hf_open_llm_v2_240829.csv +solar_10_7b_instruct_v1_0,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +solar_10_7b_instruct_v1_0,HFv2 MuSR,6.94,,hf_open_llm_v2_240829.csv +solar_10_7b_v1_0,HF OpenLLM v2,16.77,,hf_open_llm_v2_240829.csv +solar_10_7b_v1_0,HFv2 BBH,29.79,,hf_open_llm_v2_240829.csv +solar_10_7b_v1_0,HFv2 GPQA,4.14,,hf_open_llm_v2_240829.csv +solar_10_7b_v1_0,HFv2 IFEval,24.21,,hf_open_llm_v2_240829.csv +solar_10_7b_v1_0,HFv2 MMLU Pro,26.67,,hf_open_llm_v2_240829.csv +solar_10_7b_v1_0,HFv2 Math Level 5,2.11,,hf_open_llm_v2_240829.csv +solar_10_7b_v1_0,HFv2 MuSR,13.68,,hf_open_llm_v2_240829.csv +stablelm_2_12b,HF OpenLLM v2,13.86,,hf_open_llm_v2_240829.csv +stablelm_2_12b,HFv2 BBH,22.69,,hf_open_llm_v2_240829.csv +stablelm_2_12b,HFv2 GPQA,3.8,,hf_open_llm_v2_240829.csv +stablelm_2_12b,HFv2 IFEval,15.69,,hf_open_llm_v2_240829.csv +stablelm_2_12b,HFv2 MMLU Pro,23.02,,hf_open_llm_v2_240829.csv +stablelm_2_12b,HFv2 Math Level 5,3.47,,hf_open_llm_v2_240829.csv +stablelm_2_12b,HFv2 MuSR,14.49,,hf_open_llm_v2_240829.csv +stablelm_2_12b_chat,HF OpenLLM v2,16.22,,hf_open_llm_v2_240829.csv +stablelm_2_12b_chat,HFv2 BBH,25.25,,hf_open_llm_v2_240829.csv +stablelm_2_12b_chat,HFv2 GPQA,2.24,,hf_open_llm_v2_240829.csv +stablelm_2_12b_chat,HFv2 IFEval,40.82,,hf_open_llm_v2_240829.csv +stablelm_2_12b_chat,HFv2 MMLU Pro,19.27,,hf_open_llm_v2_240829.csv +stablelm_2_12b_chat,HFv2 Math Level 5,2.04,,hf_open_llm_v2_240829.csv +stablelm_2_12b_chat,HFv2 MuSR,7.73,,hf_open_llm_v2_240829.csv +stablelm_2_1_6b,HF OpenLLM v2,5.22,,hf_open_llm_v2_240829.csv +stablelm_2_1_6b,HFv2 BBH,8.63,,hf_open_llm_v2_240829.csv +stablelm_2_1_6b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +stablelm_2_1_6b,HFv2 IFEval,11.57,,hf_open_llm_v2_240829.csv +stablelm_2_1_6b,HFv2 MMLU Pro,5.15,,hf_open_llm_v2_240829.csv +stablelm_2_1_6b,HFv2 Math Level 5,0.15,,hf_open_llm_v2_240829.csv +stablelm_2_1_6b,HFv2 MuSR,5.79,,hf_open_llm_v2_240829.csv +stablelm_2_1_6b_chat,HF OpenLLM v2,8.63,,hf_open_llm_v2_240829.csv +stablelm_2_1_6b_chat,HFv2 BBH,7.49,,hf_open_llm_v2_240829.csv +stablelm_2_1_6b_chat,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +stablelm_2_1_6b_chat,HFv2 IFEval,30.6,,hf_open_llm_v2_240829.csv +stablelm_2_1_6b_chat,HFv2 MMLU Pro,6.91,,hf_open_llm_v2_240829.csv +stablelm_2_1_6b_chat,HFv2 Math Level 5,1.06,,hf_open_llm_v2_240829.csv +stablelm_2_1_6b_chat,HFv2 MuSR,5.71,,hf_open_llm_v2_240829.csv +stablelm_2_zephyr_1_6b,HF OpenLLM v2,9.26,,hf_open_llm_v2_240829.csv +stablelm_2_zephyr_1_6b,HFv2 BBH,6.71,,hf_open_llm_v2_240829.csv +stablelm_2_zephyr_1_6b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +stablelm_2_zephyr_1_6b,HFv2 IFEval,32.79,,hf_open_llm_v2_240829.csv +stablelm_2_zephyr_1_6b,HFv2 MMLU Pro,7.93,,hf_open_llm_v2_240829.csv +stablelm_2_zephyr_1_6b,HFv2 Math Level 5,2.11,,hf_open_llm_v2_240829.csv +stablelm_2_zephyr_1_6b,HFv2 MuSR,5.99,,hf_open_llm_v2_240829.csv +stablelm_3b_4e1t,HF OpenLLM v2,7.26,,hf_open_llm_v2_240829.csv +stablelm_3b_4e1t,HFv2 BBH,9.01,,hf_open_llm_v2_240829.csv +stablelm_3b_4e1t,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +stablelm_3b_4e1t,HFv2 IFEval,22.03,,hf_open_llm_v2_240829.csv +stablelm_3b_4e1t,HFv2 MMLU Pro,7.43,,hf_open_llm_v2_240829.csv +stablelm_3b_4e1t,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv +stablelm_3b_4e1t,HFv2 MuSR,4.42,,hf_open_llm_v2_240829.csv +stablelm_zephyr_3b,HF OpenLLM v2,12.33,,hf_open_llm_v2_240829.csv +stablelm_zephyr_3b,HFv2 BBH,14.76,,hf_open_llm_v2_240829.csv +stablelm_zephyr_3b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +stablelm_zephyr_3b,HFv2 IFEval,36.83,,hf_open_llm_v2_240829.csv +stablelm_zephyr_3b,HFv2 MMLU Pro,8.53,,hf_open_llm_v2_240829.csv +stablelm_zephyr_3b,HFv2 Math Level 5,4.08,,hf_open_llm_v2_240829.csv +stablelm_zephyr_3b,HFv2 MuSR,9.79,,hf_open_llm_v2_240829.csv +starcoder2_15b,HF OpenLLM v2,12.44,,hf_open_llm_v2_240829.csv +starcoder2_15b,HFv2 BBH,20.37,,hf_open_llm_v2_240829.csv +starcoder2_15b,HFv2 GPQA,3.13,,hf_open_llm_v2_240829.csv +starcoder2_15b,HFv2 IFEval,27.8,,hf_open_llm_v2_240829.csv +starcoder2_15b,HFv2 MMLU Pro,15.03,,hf_open_llm_v2_240829.csv +starcoder2_15b,HFv2 Math Level 5,5.36,,hf_open_llm_v2_240829.csv +starcoder2_15b,HFv2 MuSR,2.93,,hf_open_llm_v2_240829.csv +starcoder2_3b,HF OpenLLM v2,6.54,,hf_open_llm_v2_240829.csv +starcoder2_3b,HFv2 BBH,8.91,,hf_open_llm_v2_240829.csv +starcoder2_3b,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +starcoder2_3b,HFv2 IFEval,20.37,,hf_open_llm_v2_240829.csv +starcoder2_3b,HFv2 MMLU Pro,7.07,,hf_open_llm_v2_240829.csv +starcoder2_3b,HFv2 Math Level 5,1.44,,hf_open_llm_v2_240829.csv +starcoder2_3b,HFv2 MuSR,1.43,,hf_open_llm_v2_240829.csv +starcoder2_7b,HF OpenLLM v2,8.21,,hf_open_llm_v2_240829.csv +starcoder2_7b,HFv2 BBH,11.4,,hf_open_llm_v2_240829.csv +starcoder2_7b,HFv2 GPQA,0.22,,hf_open_llm_v2_240829.csv +starcoder2_7b,HFv2 IFEval,22.09,,hf_open_llm_v2_240829.csv +starcoder2_7b,HFv2 MMLU Pro,7.14,,hf_open_llm_v2_240829.csv +starcoder2_7b,HFv2 Math Level 5,2.57,,hf_open_llm_v2_240829.csv +starcoder2_7b,HFv2 MuSR,5.82,,hf_open_llm_v2_240829.csv +starling_lm_7b_alpha,HF OpenLLM v2,20.64,,hf_open_llm_v2_240829.csv +starling_lm_7b_alpha,HFv2 BBH,21.95,,hf_open_llm_v2_240829.csv +starling_lm_7b_alpha,HFv2 GPQA,6.26,,hf_open_llm_v2_240829.csv +starling_lm_7b_alpha,HFv2 IFEval,54.8,,hf_open_llm_v2_240829.csv +starling_lm_7b_alpha,HFv2 MMLU Pro,24.13,,hf_open_llm_v2_240829.csv +starling_lm_7b_alpha,HFv2 Math Level 5,7.18,,hf_open_llm_v2_240829.csv +starling_lm_7b_alpha,HFv2 MuSR,9.5,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_full,HF OpenLLM v2,19.53,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_full,HFv2 BBH,25.08,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_full,HFv2 GPQA,1.23,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_full,HFv2 IFEval,58.17,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_full,HFv2 MMLU Pro,25.66,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_full,HFv2 Math Level 5,3.02,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_full,HFv2 MuSR,4.04,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_half,HF OpenLLM v2,21.28,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_half,HFv2 BBH,26.35,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_half,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_half,HFv2 IFEval,62.49,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_half,HFv2 MMLU Pro,29.04,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_half,HFv2 Math Level 5,7.7,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_half,HFv2 MuSR,2.11,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_top25,HF OpenLLM v2,23.37,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_top25,HFv2 BBH,27.67,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_top25,HFv2 GPQA,3.02,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_top25,HFv2 IFEval,66.37,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_top25,HFv2 MMLU Pro,29.83,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_top25,HFv2 Math Level 5,8.53,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_top25,HFv2 MuSR,4.81,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_top75,HF OpenLLM v2,23.43,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_top75,HFv2 BBH,28.06,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_top75,HFv2 GPQA,3.02,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_top75,HFv2 IFEval,66.87,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_top75,HFv2 MMLU Pro,30.77,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_top75,HFv2 Math Level 5,6.57,,hf_open_llm_v2_240829.csv +suzume_llama3_8b_multilingual_orpo_borda_top75,HFv2 MuSR,5.31,,hf_open_llm_v2_240829.csv +tinyllama_v1_1,HF OpenLLM v2,4.7,,hf_open_llm_v2_240829.csv +tinyllama_v1_1,HFv2 BBH,3.21,,hf_open_llm_v2_240829.csv +tinyllama_v1_1,HFv2 GPQA,0.0,,hf_open_llm_v2_240829.csv +tinyllama_v1_1,HFv2 IFEval,20.01,,hf_open_llm_v2_240829.csv +tinyllama_v1_1,HFv2 MMLU Pro,0.54,,hf_open_llm_v2_240829.csv +tinyllama_v1_1,HFv2 Math Level 5,0.45,,hf_open_llm_v2_240829.csv +tinyllama_v1_1,HFv2 MuSR,3.98,,hf_open_llm_v2_240829.csv +tinyyi_7b_test,HF OpenLLM v2,4.5,,hf_open_llm_v2_240829.csv +tinyyi_7b_test,HFv2 BBH,2.27,,hf_open_llm_v2_240829.csv +tinyyi_7b_test,HFv2 GPQA,1.9,,hf_open_llm_v2_240829.csv +tinyyi_7b_test,HFv2 IFEval,18.56,,hf_open_llm_v2_240829.csv +tinyyi_7b_test,HFv2 MMLU Pro,1.01,,hf_open_llm_v2_240829.csv +tinyyi_7b_test,HFv2 Math Level 5,0.0,,hf_open_llm_v2_240829.csv +tinyyi_7b_test,HFv2 MuSR,3.22,,hf_open_llm_v2_240829.csv +windyflollm,HF OpenLLM v2,14.17,,hf_open_llm_v2_240829.csv +windyflollm,HFv2 BBH,24.4,,hf_open_llm_v2_240829.csv +windyflollm,HFv2 GPQA,3.36,,hf_open_llm_v2_240829.csv +windyflollm,HFv2 IFEval,26.69,,hf_open_llm_v2_240829.csv +windyflollm,HFv2 MMLU Pro,17.57,,hf_open_llm_v2_240829.csv +windyflollm,HFv2 Math Level 5,1.13,,hf_open_llm_v2_240829.csv +windyflollm,HFv2 MuSR,11.86,,hf_open_llm_v2_240829.csv +yi_1_5_34b,HF OpenLLM v2,25.43,,hf_open_llm_v2_240829.csv +yi_1_5_34b,HFv2 BBH,42.75,,hf_open_llm_v2_240829.csv +yi_1_5_34b,HFv2 GPQA,15.44,,hf_open_llm_v2_240829.csv +yi_1_5_34b,HFv2 IFEval,28.41,,hf_open_llm_v2_240829.csv +yi_1_5_34b,HFv2 MMLU Pro,40.73,,hf_open_llm_v2_240829.csv +yi_1_5_34b,HFv2 Math Level 5,14.05,,hf_open_llm_v2_240829.csv +yi_1_5_34b,HFv2 MuSR,11.22,,hf_open_llm_v2_240829.csv +yi_1_5_34b_32k,HF OpenLLM v2,26.4,,hf_open_llm_v2_240829.csv +yi_1_5_34b_32k,HFv2 BBH,43.38,,hf_open_llm_v2_240829.csv +yi_1_5_34b_32k,HFv2 GPQA,15.1,,hf_open_llm_v2_240829.csv +yi_1_5_34b_32k,HFv2 IFEval,31.19,,hf_open_llm_v2_240829.csv +yi_1_5_34b_32k,HFv2 MMLU Pro,41.21,,hf_open_llm_v2_240829.csv +yi_1_5_34b_32k,HFv2 Math Level 5,13.44,,hf_open_llm_v2_240829.csv +yi_1_5_34b_32k,HFv2 MuSR,14.08,,hf_open_llm_v2_240829.csv +yi_1_5_34b_chat,HF OpenLLM v2,32.63,,hf_open_llm_v2_240829.csv +yi_1_5_34b_chat,HFv2 BBH,44.26,,hf_open_llm_v2_240829.csv +yi_1_5_34b_chat,HFv2 GPQA,15.32,,hf_open_llm_v2_240829.csv +yi_1_5_34b_chat,HFv2 IFEval,60.67,,hf_open_llm_v2_240829.csv +yi_1_5_34b_chat,HFv2 MMLU Pro,39.12,,hf_open_llm_v2_240829.csv +yi_1_5_34b_chat,HFv2 Math Level 5,23.34,,hf_open_llm_v2_240829.csv +yi_1_5_34b_chat,HFv2 MuSR,13.06,,hf_open_llm_v2_240829.csv +yi_1_5_34b_chat_16k,HF OpenLLM v2,28.98,,hf_open_llm_v2_240829.csv +yi_1_5_34b_chat_16k,HFv2 BBH,44.54,,hf_open_llm_v2_240829.csv +yi_1_5_34b_chat_16k,HFv2 GPQA,11.74,,hf_open_llm_v2_240829.csv +yi_1_5_34b_chat_16k,HFv2 IFEval,45.64,,hf_open_llm_v2_240829.csv +yi_1_5_34b_chat_16k,HFv2 MMLU Pro,39.38,,hf_open_llm_v2_240829.csv +yi_1_5_34b_chat_16k,HFv2 Math Level 5,18.81,,hf_open_llm_v2_240829.csv +yi_1_5_34b_chat_16k,HFv2 MuSR,13.74,,hf_open_llm_v2_240829.csv +yi_1_5_6b,HF OpenLLM v2,16.53,,hf_open_llm_v2_240829.csv +yi_1_5_6b,HFv2 BBH,22.03,,hf_open_llm_v2_240829.csv +yi_1_5_6b,HFv2 GPQA,8.5,,hf_open_llm_v2_240829.csv +yi_1_5_6b,HFv2 IFEval,26.17,,hf_open_llm_v2_240829.csv +yi_1_5_6b,HFv2 MMLU Pro,23.82,,hf_open_llm_v2_240829.csv +yi_1_5_6b,HFv2 Math Level 5,5.36,,hf_open_llm_v2_240829.csv +yi_1_5_6b,HFv2 MuSR,13.31,,hf_open_llm_v2_240829.csv +yi_1_5_6b_chat,HF OpenLLM v2,22.05,,hf_open_llm_v2_240829.csv +yi_1_5_6b_chat,HFv2 BBH,23.55,,hf_open_llm_v2_240829.csv +yi_1_5_6b_chat,HFv2 GPQA,9.06,,hf_open_llm_v2_240829.csv +yi_1_5_6b_chat,HFv2 IFEval,48.02,,hf_open_llm_v2_240829.csv +yi_1_5_6b_chat,HFv2 MMLU Pro,24.41,,hf_open_llm_v2_240829.csv +yi_1_5_6b_chat,HFv2 Math Level 5,12.54,,hf_open_llm_v2_240829.csv +yi_1_5_6b_chat,HFv2 MuSR,14.7,,hf_open_llm_v2_240829.csv +yi_1_5_9b,HF OpenLLM v2,21.95,,hf_open_llm_v2_240829.csv +yi_1_5_9b,HFv2 BBH,30.5,,hf_open_llm_v2_240829.csv +yi_1_5_9b,HFv2 GPQA,17.23,,hf_open_llm_v2_240829.csv +yi_1_5_9b,HFv2 IFEval,29.36,,hf_open_llm_v2_240829.csv +yi_1_5_9b,HFv2 MMLU Pro,32.4,,hf_open_llm_v2_240829.csv +yi_1_5_9b,HFv2 Math Level 5,10.2,,hf_open_llm_v2_240829.csv +yi_1_5_9b,HFv2 MuSR,12.03,,hf_open_llm_v2_240829.csv +yi_1_5_9b_32k,HF OpenLLM v2,19.61,,hf_open_llm_v2_240829.csv +yi_1_5_9b_32k,HFv2 BBH,28.94,,hf_open_llm_v2_240829.csv +yi_1_5_9b_32k,HFv2 GPQA,14.54,,hf_open_llm_v2_240829.csv +yi_1_5_9b_32k,HFv2 IFEval,23.03,,hf_open_llm_v2_240829.csv +yi_1_5_9b_32k,HFv2 MMLU Pro,30.72,,hf_open_llm_v2_240829.csv +yi_1_5_9b_32k,HFv2 Math Level 5,9.59,,hf_open_llm_v2_240829.csv +yi_1_5_9b_32k,HFv2 MuSR,10.83,,hf_open_llm_v2_240829.csv +yi_1_5_9b_chat,HF OpenLLM v2,27.71,,hf_open_llm_v2_240829.csv +yi_1_5_9b_chat,HFv2 BBH,36.95,,hf_open_llm_v2_240829.csv +yi_1_5_9b_chat,HFv2 GPQA,11.3,,hf_open_llm_v2_240829.csv +yi_1_5_9b_chat,HFv2 IFEval,60.46,,hf_open_llm_v2_240829.csv +yi_1_5_9b_chat,HFv2 MMLU Pro,33.06,,hf_open_llm_v2_240829.csv +yi_1_5_9b_chat,HFv2 Math Level 5,11.63,,hf_open_llm_v2_240829.csv +yi_1_5_9b_chat,HFv2 MuSR,12.84,,hf_open_llm_v2_240829.csv +yi_1_5_9b_chat_16k,HF OpenLLM v2,22.9,,hf_open_llm_v2_240829.csv +yi_1_5_9b_chat_16k,HFv2 BBH,31.5,,hf_open_llm_v2_240829.csv +yi_1_5_9b_chat_16k,HFv2 GPQA,7.83,,hf_open_llm_v2_240829.csv +yi_1_5_9b_chat_16k,HFv2 IFEval,42.14,,hf_open_llm_v2_240829.csv +yi_1_5_9b_chat_16k,HFv2 MMLU Pro,33.26,,hf_open_llm_v2_240829.csv +yi_1_5_9b_chat_16k,HFv2 Math Level 5,12.61,,hf_open_llm_v2_240829.csv +yi_1_5_9b_chat_16k,HFv2 MuSR,10.04,,hf_open_llm_v2_240829.csv +yi_34b,HF OpenLLM v2,22.26,,hf_open_llm_v2_240829.csv +yi_34b,HFv2 BBH,35.54,,hf_open_llm_v2_240829.csv +yi_34b,HFv2 GPQA,15.55,,hf_open_llm_v2_240829.csv +yi_34b,HFv2 IFEval,30.46,,hf_open_llm_v2_240829.csv +yi_34b,HFv2 MMLU Pro,37.91,,hf_open_llm_v2_240829.csv +yi_34b,HFv2 Math Level 5,4.46,,hf_open_llm_v2_240829.csv +yi_34b,HFv2 MuSR,9.65,,hf_open_llm_v2_240829.csv +yi_34b_200k,HF OpenLLM v2,19.8,,hf_open_llm_v2_240829.csv +yi_34b_200k,HFv2 BBH,36.02,,hf_open_llm_v2_240829.csv +yi_34b_200k,HFv2 GPQA,14.21,,hf_open_llm_v2_240829.csv +yi_34b_200k,HFv2 IFEval,15.42,,hf_open_llm_v2_240829.csv +yi_34b_200k,HFv2 MMLU Pro,39.27,,hf_open_llm_v2_240829.csv +yi_34b_200k,HFv2 Math Level 5,4.46,,hf_open_llm_v2_240829.csv +yi_34b_200k,HFv2 MuSR,9.41,,hf_open_llm_v2_240829.csv +yi_34b_chat,HF OpenLLM v2,23.9,,hf_open_llm_v2_240829.csv +yi_34b_chat,HFv2 BBH,37.62,,hf_open_llm_v2_240829.csv +yi_34b_chat,HFv2 GPQA,11.74,,hf_open_llm_v2_240829.csv +yi_34b_chat,HFv2 IFEval,46.99,,hf_open_llm_v2_240829.csv +yi_34b_chat,HFv2 MMLU Pro,34.37,,hf_open_llm_v2_240829.csv +yi_34b_chat,HFv2 Math Level 5,4.31,,hf_open_llm_v2_240829.csv +yi_34b_chat,HFv2 MuSR,8.36,,hf_open_llm_v2_240829.csv +yi_34bx2_moe_60b_dpo,HF OpenLLM v2,25.91,,hf_open_llm_v2_240829.csv +yi_34bx2_moe_60b_dpo,HFv2 BBH,31.26,,hf_open_llm_v2_240829.csv +yi_34bx2_moe_60b_dpo,HFv2 GPQA,9.62,,hf_open_llm_v2_240829.csv +yi_34bx2_moe_60b_dpo,HFv2 IFEval,53.19,,hf_open_llm_v2_240829.csv +yi_34bx2_moe_60b_dpo,HFv2 MMLU Pro,40.85,,hf_open_llm_v2_240829.csv +yi_34bx2_moe_60b_dpo,HFv2 Math Level 5,6.19,,hf_open_llm_v2_240829.csv +yi_34bx2_moe_60b_dpo,HFv2 MuSR,14.32,,hf_open_llm_v2_240829.csv +yi_6b,HF OpenLLM v2,13.6,,hf_open_llm_v2_240829.csv +yi_6b,HFv2 BBH,19.41,,hf_open_llm_v2_240829.csv +yi_6b,HFv2 GPQA,2.57,,hf_open_llm_v2_240829.csv +yi_6b,HFv2 IFEval,28.93,,hf_open_llm_v2_240829.csv +yi_6b,HFv2 MMLU Pro,22.12,,hf_open_llm_v2_240829.csv +yi_6b,HFv2 Math Level 5,1.51,,hf_open_llm_v2_240829.csv +yi_6b,HFv2 MuSR,7.04,,hf_open_llm_v2_240829.csv +yi_6b_200k,HF OpenLLM v2,11.9,,hf_open_llm_v2_240829.csv +yi_6b_200k,HFv2 BBH,20.15,,hf_open_llm_v2_240829.csv +yi_6b_200k,HFv2 GPQA,4.25,,hf_open_llm_v2_240829.csv +yi_6b_200k,HFv2 IFEval,8.43,,hf_open_llm_v2_240829.csv +yi_6b_200k,HFv2 MMLU Pro,20.49,,hf_open_llm_v2_240829.csv +yi_6b_200k,HFv2 Math Level 5,1.21,,hf_open_llm_v2_240829.csv +yi_6b_200k,HFv2 MuSR,16.84,,hf_open_llm_v2_240829.csv +yi_6b_chat,HF OpenLLM v2,14.0,,hf_open_llm_v2_240829.csv +yi_6b_chat,HFv2 BBH,17.0,,hf_open_llm_v2_240829.csv +yi_6b_chat,HFv2 GPQA,5.93,,hf_open_llm_v2_240829.csv +yi_6b_chat,HFv2 IFEval,33.95,,hf_open_llm_v2_240829.csv +yi_6b_chat,HFv2 MMLU Pro,22.9,,hf_open_llm_v2_240829.csv +yi_6b_chat,HFv2 Math Level 5,0.68,,hf_open_llm_v2_240829.csv +yi_6b_chat,HFv2 MuSR,3.57,,hf_open_llm_v2_240829.csv +yi_9b,HF OpenLLM v2,17.61,,hf_open_llm_v2_240829.csv +yi_9b,HFv2 BBH,27.63,,hf_open_llm_v2_240829.csv +yi_9b,HFv2 GPQA,9.06,,hf_open_llm_v2_240829.csv +yi_9b,HFv2 IFEval,27.09,,hf_open_llm_v2_240829.csv +yi_9b,HFv2 MMLU Pro,28.6,,hf_open_llm_v2_240829.csv +yi_9b,HFv2 Math Level 5,4.38,,hf_open_llm_v2_240829.csv +yi_9b,HFv2 MuSR,8.91,,hf_open_llm_v2_240829.csv +yi_9b_200k,HF OpenLLM v2,17.59,,hf_open_llm_v2_240829.csv +yi_9b_200k,HFv2 BBH,26.49,,hf_open_llm_v2_240829.csv +yi_9b_200k,HFv2 GPQA,8.72,,hf_open_llm_v2_240829.csv +yi_9b_200k,HFv2 IFEval,23.27,,hf_open_llm_v2_240829.csv +yi_9b_200k,HFv2 MMLU Pro,29.13,,hf_open_llm_v2_240829.csv +yi_9b_200k,HFv2 Math Level 5,5.82,,hf_open_llm_v2_240829.csv +yi_9b_200k,HFv2 MuSR,12.11,,hf_open_llm_v2_240829.csv +zephyr_7b_alpha,HF OpenLLM v2,18.53,,hf_open_llm_v2_240829.csv +zephyr_7b_alpha,HFv2 BBH,23.96,,hf_open_llm_v2_240829.csv +zephyr_7b_alpha,HFv2 GPQA,6.38,,hf_open_llm_v2_240829.csv +zephyr_7b_alpha,HFv2 IFEval,51.91,,hf_open_llm_v2_240829.csv +zephyr_7b_alpha,HFv2 MMLU Pro,19.94,,hf_open_llm_v2_240829.csv +zephyr_7b_alpha,HFv2 Math Level 5,1.51,,hf_open_llm_v2_240829.csv +zephyr_7b_alpha,HFv2 MuSR,7.5,,hf_open_llm_v2_240829.csv +zephyr_7b_beta,HF OpenLLM v2,17.72,,hf_open_llm_v2_240829.csv +zephyr_7b_beta,HFv2 BBH,21.49,,hf_open_llm_v2_240829.csv +zephyr_7b_beta,HFv2 GPQA,5.37,,hf_open_llm_v2_240829.csv +zephyr_7b_beta,HFv2 IFEval,49.5,,hf_open_llm_v2_240829.csv +zephyr_7b_beta,HFv2 MMLU Pro,19.79,,hf_open_llm_v2_240829.csv +zephyr_7b_beta,HFv2 Math Level 5,2.42,,hf_open_llm_v2_240829.csv +zephyr_7b_beta,HFv2 MuSR,7.73,,hf_open_llm_v2_240829.csv +zephyr_7b_gemma_v0_1,HF OpenLLM v2,15.78,,hf_open_llm_v2_240829.csv +zephyr_7b_gemma_v0_1,HFv2 BBH,23.75,,hf_open_llm_v2_240829.csv +zephyr_7b_gemma_v0_1,HFv2 GPQA,5.93,,hf_open_llm_v2_240829.csv +zephyr_7b_gemma_v0_1,HFv2 IFEval,33.64,,hf_open_llm_v2_240829.csv +zephyr_7b_gemma_v0_1,HFv2 MMLU Pro,20.53,,hf_open_llm_v2_240829.csv +zephyr_7b_gemma_v0_1,HFv2 Math Level 5,6.65,,hf_open_llm_v2_240829.csv +zephyr_7b_gemma_v0_1,HFv2 MuSR,4.18,,hf_open_llm_v2_240829.csv +zephyr_orpo_141b_a35b_v0_1,HF OpenLLM v2,33.77,,hf_open_llm_v2_240829.csv +zephyr_orpo_141b_a35b_v0_1,HFv2 BBH,47.5,,hf_open_llm_v2_240829.csv +zephyr_orpo_141b_a35b_v0_1,HFv2 GPQA,17.11,,hf_open_llm_v2_240829.csv +zephyr_orpo_141b_a35b_v0_1,HFv2 IFEval,65.11,,hf_open_llm_v2_240829.csv +zephyr_orpo_141b_a35b_v0_1,HFv2 MMLU Pro,39.85,,hf_open_llm_v2_240829.csv +zephyr_orpo_141b_a35b_v0_1,HFv2 Math Level 5,18.35,,hf_open_llm_v2_240829.csv +zephyr_orpo_141b_a35b_v0_1,HFv2 MuSR,14.72,,hf_open_llm_v2_240829.csv +llama2_7b_chat,tablebench_overall_dp,16.98,[],tablebench_241002.csv +codellama_7b_instruct,tablebench_overall_dp,17.01,[],tablebench_241002.csv +gemma_7b_instruct,tablebench_overall_dp,14.82,[],tablebench_241002.csv +mistral_7b_instruct,tablebench_overall_dp,19.15,[],tablebench_241002.csv +deepseek_coder_7b_instruct,tablebench_overall_dp,13.82,[],tablebench_241002.csv +codeqwen1_5_7b_chat,tablebench_overall_dp,16.76,[],tablebench_241002.csv +qwen1_5_7b_chat,tablebench_overall_dp,15.84,[],tablebench_241002.csv +qwen2_7b_instruct,tablebench_overall_dp,21.23,[],tablebench_241002.csv +structlm_7b,tablebench_overall_dp,12.06,[],tablebench_241002.csv +map_neo_7b_instruct,tablebench_overall_dp,12.66,[],tablebench_241002.csv +llama3_8b_chat,tablebench_overall_dp,27.28,[],tablebench_241002.csv +llama3_1_8b_instruct,tablebench_overall_dp,23.47,[],tablebench_241002.csv +llama2_13b_chat,tablebench_overall_dp,18.58,[],tablebench_241002.csv +structlm_13b,tablebench_overall_dp,11.52,[],tablebench_241002.csv +wizardlm_13b,tablebench_overall_dp,20.8,[],tablebench_241002.csv +qwen1_5_14b_chat,tablebench_overall_dp,17.76,[],tablebench_241002.csv +qwen1_5_32b_chat,tablebench_overall_dp,20.21,[],tablebench_241002.csv +deepseek_coder_33b_instruct,tablebench_overall_dp,9.74,[],tablebench_241002.csv +codellama34b_instruct,tablebench_overall_dp,21.6,[],tablebench_241002.csv +structlm_34b,tablebench_overall_dp,0.6,[],tablebench_241002.csv +mixtral_8x7b_instruct,tablebench_overall_dp,24.98,[],tablebench_241002.csv +qwen1_5_72b_chat,tablebench_overall_dp,28.45,[],tablebench_241002.csv +qwen2_72b_instruct,tablebench_overall_dp,32.52,[],tablebench_241002.csv +qwen1_5_110b_chat,tablebench_overall_dp,29.72,[],tablebench_241002.csv +llama3_70b_chat,tablebench_overall_dp,30.91,[],tablebench_241002.csv +llama3_1_70b_instruct,tablebench_overall_dp,33.63,[],tablebench_241002.csv +gpt_3_5_turbo,tablebench_overall_dp,27.75,[],tablebench_241002.csv +qwen_max,tablebench_overall_dp,29.63,[],tablebench_241002.csv +yi_large,tablebench_overall_dp,32.43,[],tablebench_241002.csv +glm_4,tablebench_overall_dp,31.23,[],tablebench_241002.csv +deepseek_chat_v2,tablebench_overall_dp,40.65,[],tablebench_241002.csv +deepseek_coder_v2,tablebench_overall_dp,35.21,[],tablebench_241002.csv +gpt_4_turbo,tablebench_overall_dp,40.38,[],tablebench_241002.csv +gpt_4o,tablebench_overall_dp,42.73,[],tablebench_241002.csv +tablellm_codeqwen_7b,tablebench_overall_dp,26.08,[],tablebench_241002.csv +tablellm_deepseek_coder_7b,tablebench_overall_dp,27.98,[],tablebench_241002.csv +tablellm_llama3_1_8b,tablebench_overall_dp,27.19,[],tablebench_241002.csv +tablellm_llama3_8b,tablebench_overall_dp,26.93,[],tablebench_241002.csv +tablellm_qwen2_7b,tablebench_overall_dp,27.14,[],tablebench_241002.csv +gemma_2b_it,trustworthy_average,67.18,[],llm_trustworthy_241001.csv +gemma_7b_it,trustworthy_average,66.87,[],llm_trustworthy_241001.csv +vicuna_7b_v1_3,trustworthy_average,60.62,[],llm_trustworthy_241001.csv +llama_2_7b_chat,trustworthy_average,74.72,[],llm_trustworthy_241001.csv +llama3_8b_instruct,trustworthy_average,80.61,[],llm_trustworthy_241001.csv +mpt_7b_chat,trustworthy_average,62.29,[],llm_trustworthy_241001.csv +gpt_3_5_turbo_0301,trustworthy_average,72.45,[],llm_trustworthy_241001.csv +gpt_4_0314,trustworthy_average,69.24,[],llm_trustworthy_241001.csv +gpt_4o_2024_05_13,trustworthy_average,82.96,[],llm_trustworthy_241001.csv +gpt_4o_mini_2024_07_18,trustworthy_average,76.31,[],llm_trustworthy_241001.csv +falcon_7b_instruct,trustworthy_average,59.49,[],llm_trustworthy_241001.csv +redpajama_incite_7b_instruct,trustworthy_average,56.58,[],llm_trustworthy_241001.csv +gemini_pro_1_0,trustworthy_average,80.61,[],llm_trustworthy_241001.csv +gemma_2b_it,trustworthy_non_toxicity,77.07,[],llm_trustworthy_241001.csv +gemma_7b_it,trustworthy_non_toxicity,75.52,[],llm_trustworthy_241001.csv +vicuna_7b_v1_3,trustworthy_non_toxicity,28.0,[],llm_trustworthy_241001.csv +llama_2_7b_chat,trustworthy_non_toxicity,80.0,[],llm_trustworthy_241001.csv +llama3_8b_instruct,trustworthy_non_toxicity,77.53,[],llm_trustworthy_241001.csv +mpt_7b_chat,trustworthy_non_toxicity,40.0,[],llm_trustworthy_241001.csv +gpt_3_5_turbo_0301,trustworthy_non_toxicity,47.0,[],llm_trustworthy_241001.csv +gpt_4_0314,trustworthy_non_toxicity,41.0,[],llm_trustworthy_241001.csv +gpt_4o_2024_05_13,trustworthy_non_toxicity,86.46,[],llm_trustworthy_241001.csv +gpt_4o_mini_2024_07_18,trustworthy_non_toxicity,59.02,[],llm_trustworthy_241001.csv +falcon_7b_instruct,trustworthy_non_toxicity,39.0,[],llm_trustworthy_241001.csv +redpajama_incite_7b_instruct,trustworthy_non_toxicity,18.0,[],llm_trustworthy_241001.csv +gemini_pro_1_0,trustworthy_non_toxicity,77.53,[],llm_trustworthy_241001.csv +gemma_2b_it,trustworthy_non_stereotype,73.33,[],llm_trustworthy_241001.csv +gemma_7b_it,trustworthy_non_stereotype,100.0,[],llm_trustworthy_241001.csv +vicuna_7b_v1_3,trustworthy_non_stereotype,81.0,[],llm_trustworthy_241001.csv +llama_2_7b_chat,trustworthy_non_stereotype,97.6,[],llm_trustworthy_241001.csv +llama3_8b_instruct,trustworthy_non_stereotype,98.33,[],llm_trustworthy_241001.csv +mpt_7b_chat,trustworthy_non_stereotype,84.6,[],llm_trustworthy_241001.csv +gpt_3_5_turbo_0301,trustworthy_non_stereotype,87.0,[],llm_trustworthy_241001.csv +gpt_4_0314,trustworthy_non_stereotype,77.0,[],llm_trustworthy_241001.csv +gpt_4o_2024_05_13,trustworthy_non_stereotype,99.67,[],llm_trustworthy_241001.csv +gpt_4o_mini_2024_07_18,trustworthy_non_stereotype,87.34,[],llm_trustworthy_241001.csv +falcon_7b_instruct,trustworthy_non_stereotype,87.0,[],llm_trustworthy_241001.csv +redpajama_incite_7b_instruct,trustworthy_non_stereotype,73.0,[],llm_trustworthy_241001.csv +gemini_pro_1_0,trustworthy_non_stereotype,98.33,[],llm_trustworthy_241001.csv +gemma_2b_it,trustworthy_advglue_pp,43.21,[],llm_trustworthy_241001.csv +gemma_7b_it,trustworthy_advglue_pp,43.43,[],llm_trustworthy_241001.csv +vicuna_7b_v1_3,trustworthy_advglue_pp,52.16,[],llm_trustworthy_241001.csv +llama_2_7b_chat,trustworthy_advglue_pp,51.01,[],llm_trustworthy_241001.csv +llama3_8b_instruct,trustworthy_advglue_pp,67.28,[],llm_trustworthy_241001.csv +mpt_7b_chat,trustworthy_advglue_pp,46.2,[],llm_trustworthy_241001.csv +gpt_3_5_turbo_0301,trustworthy_advglue_pp,56.69,[],llm_trustworthy_241001.csv +gpt_4_0314,trustworthy_advglue_pp,64.04,[],llm_trustworthy_241001.csv +gpt_4o_2024_05_13,trustworthy_advglue_pp,51.36,[],llm_trustworthy_241001.csv +gpt_4o_mini_2024_07_18,trustworthy_advglue_pp,50.25,[],llm_trustworthy_241001.csv +falcon_7b_instruct,trustworthy_advglue_pp,43.98,[],llm_trustworthy_241001.csv +redpajama_incite_7b_instruct,trustworthy_advglue_pp,44.81,[],llm_trustworthy_241001.csv +gemini_pro_1_0,trustworthy_advglue_pp,67.28,[],llm_trustworthy_241001.csv +gemma_2b_it,trustworthy_ood,51.43,[],llm_trustworthy_241001.csv +gemma_7b_it,trustworthy_ood,61.78,[],llm_trustworthy_241001.csv +vicuna_7b_v1_3,trustworthy_ood,59.1,[],llm_trustworthy_241001.csv +llama_2_7b_chat,trustworthy_ood,75.65,[],llm_trustworthy_241001.csv +llama3_8b_instruct,trustworthy_ood,70.85,[],llm_trustworthy_241001.csv +mpt_7b_chat,trustworthy_ood,64.26,[],llm_trustworthy_241001.csv +gpt_3_5_turbo_0301,trustworthy_ood,73.58,[],llm_trustworthy_241001.csv +gpt_4_0314,trustworthy_ood,87.55,[],llm_trustworthy_241001.csv +gpt_4o_2024_05_13,trustworthy_ood,86.59,[],llm_trustworthy_241001.csv +gpt_4o_mini_2024_07_18,trustworthy_ood,79.07,[],llm_trustworthy_241001.csv +falcon_7b_instruct,trustworthy_ood,51.45,[],llm_trustworthy_241001.csv +redpajama_incite_7b_instruct,trustworthy_ood,54.21,[],llm_trustworthy_241001.csv +gemini_pro_1_0,trustworthy_ood,70.85,[],llm_trustworthy_241001.csv +gemma_2b_it,trustworthy_adv_demo,35.55,[],llm_trustworthy_241001.csv +gemma_7b_it,trustworthy_adv_demo,33.33,[],llm_trustworthy_241001.csv +vicuna_7b_v1_3,trustworthy_adv_demo,57.99,[],llm_trustworthy_241001.csv +llama_2_7b_chat,trustworthy_adv_demo,55.54,[],llm_trustworthy_241001.csv +llama3_8b_instruct,trustworthy_adv_demo,75.54,[],llm_trustworthy_241001.csv +mpt_7b_chat,trustworthy_adv_demo,58.25,[],llm_trustworthy_241001.csv +gpt_3_5_turbo_0301,trustworthy_adv_demo,81.28,[],llm_trustworthy_241001.csv +gpt_4_0314,trustworthy_adv_demo,77.94,[],llm_trustworthy_241001.csv +gpt_4o_2024_05_13,trustworthy_adv_demo,88.1,[],llm_trustworthy_241001.csv +gpt_4o_mini_2024_07_18,trustworthy_adv_demo,88.49,[],llm_trustworthy_241001.csv +falcon_7b_instruct,trustworthy_adv_demo,33.95,[],llm_trustworthy_241001.csv +redpajama_incite_7b_instruct,trustworthy_adv_demo,58.51,[],llm_trustworthy_241001.csv +gemini_pro_1_0,trustworthy_adv_demo,75.54,[],llm_trustworthy_241001.csv +gemma_2b_it,trustworthy_privacy,88.77,[],llm_trustworthy_241001.csv +gemma_7b_it,trustworthy_privacy,83.69,[],llm_trustworthy_241001.csv +vicuna_7b_v1_3,trustworthy_privacy,72.96,[],llm_trustworthy_241001.csv +llama_2_7b_chat,trustworthy_privacy,97.39,[],llm_trustworthy_241001.csv +llama3_8b_instruct,trustworthy_privacy,81.59,[],llm_trustworthy_241001.csv +mpt_7b_chat,trustworthy_privacy,78.93,[],llm_trustworthy_241001.csv +gpt_3_5_turbo_0301,trustworthy_privacy,70.13,[],llm_trustworthy_241001.csv +gpt_4_0314,trustworthy_privacy,66.11,[],llm_trustworthy_241001.csv +gpt_4o_2024_05_13,trustworthy_privacy,97.04,[],llm_trustworthy_241001.csv +gpt_4o_mini_2024_07_18,trustworthy_privacy,89.38,[],llm_trustworthy_241001.csv +falcon_7b_instruct,trustworthy_privacy,70.26,[],llm_trustworthy_241001.csv +redpajama_incite_7b_instruct,trustworthy_privacy,76.64,[],llm_trustworthy_241001.csv +gemini_pro_1_0,trustworthy_privacy,81.59,[],llm_trustworthy_241001.csv +gemma_2b_it,trustworthy_ethics,75.03,[],llm_trustworthy_241001.csv +gemma_7b_it,trustworthy_ethics,43.33,[],llm_trustworthy_241001.csv +vicuna_7b_v1_3,trustworthy_ethics,48.22,[],llm_trustworthy_241001.csv +llama_2_7b_chat,trustworthy_ethics,40.58,[],llm_trustworthy_241001.csv +llama3_8b_instruct,trustworthy_ethics,93.74,[],llm_trustworthy_241001.csv +mpt_7b_chat,trustworthy_ethics,26.11,[],llm_trustworthy_241001.csv +gpt_3_5_turbo_0301,trustworthy_ethics,86.38,[],llm_trustworthy_241001.csv +gpt_4_0314,trustworthy_ethics,76.6,[],llm_trustworthy_241001.csv +gpt_4o_2024_05_13,trustworthy_ethics,92.02,[],llm_trustworthy_241001.csv +gpt_4o_mini_2024_07_18,trustworthy_ethics,87.2,[],llm_trustworthy_241001.csv +falcon_7b_instruct,trustworthy_ethics,50.28,[],llm_trustworthy_241001.csv +redpajama_incite_7b_instruct,trustworthy_ethics,27.49,[],llm_trustworthy_241001.csv +gemini_pro_1_0,trustworthy_ethics,93.74,[],llm_trustworthy_241001.csv +gemma_2b_it,trustworthy_fairness,93.02,[],llm_trustworthy_241001.csv +gemma_7b_it,trustworthy_fairness,93.88,[],llm_trustworthy_241001.csv +vicuna_7b_v1_3,trustworthy_fairness,85.53,[],llm_trustworthy_241001.csv +llama_2_7b_chat,trustworthy_fairness,100.0,[],llm_trustworthy_241001.csv +llama3_8b_instruct,trustworthy_fairness,80.05,[],llm_trustworthy_241001.csv +mpt_7b_chat,trustworthy_fairness,100.0,[],llm_trustworthy_241001.csv +gpt_3_5_turbo_0301,trustworthy_fairness,77.57,[],llm_trustworthy_241001.csv +gpt_4_0314,trustworthy_fairness,63.67,[],llm_trustworthy_241001.csv +gpt_4o_2024_05_13,trustworthy_fairness,62.47,[],llm_trustworthy_241001.csv +gpt_4o_mini_2024_07_18,trustworthy_fairness,69.74,[],llm_trustworthy_241001.csv +falcon_7b_instruct,trustworthy_fairness,100.0,[],llm_trustworthy_241001.csv +redpajama_incite_7b_instruct,trustworthy_fairness,100.0,[],llm_trustworthy_241001.csv +gemini_pro_1_0,trustworthy_fairness,80.05,[],llm_trustworthy_241001.csv +gpt_4o_20240513,OpenCompass Academic,77.0,[],opencompass_academic_240829.csv +qwen2_72b_instruct,OpenCompass Academic,73.1,[],opencompass_academic_240829.csv +gpt_4o_mini_20240718,OpenCompass Academic,72.5,[],opencompass_academic_240829.csv +llama3_70b_instruct,OpenCompass Academic,66.6,[],opencompass_academic_240829.csv +qwen1_5_110b_chat,OpenCompass Academic,61.7,[],opencompass_academic_240829.csv +yi_1_5_34b_chat,OpenCompass Academic,60.4,[],opencompass_academic_240829.csv +internlm2_5_chat_7b,OpenCompass Academic,60.3,[],opencompass_academic_240829.csv +glm_4_9b_chat,OpenCompass Academic,59.5,[],opencompass_academic_240829.csv +qwen1_5_32b_chat,OpenCompass Academic,57.1,[],opencompass_academic_240829.csv +qwen1_5_72b_chat,OpenCompass Academic,56.9,[],opencompass_academic_240829.csv +yi_1_5_9b_chat,OpenCompass Academic,56.1,[],opencompass_academic_240829.csv +qwen2_7b_instruct,OpenCompass Academic,52.0,[],opencompass_academic_240829.csv +llama3_8b_instruct,OpenCompass Academic,50.6,[],opencompass_academic_240829.csv +qwen1_5_14b_chat,OpenCompass Academic,49.7,[],opencompass_academic_240829.csv +internlm2_chat_20b,OpenCompass Academic,45.2,[],opencompass_academic_240829.csv +yi_1_5_6b_chat,OpenCompass Academic,43.5,[],opencompass_academic_240829.csv +mixtral_8x7b_instruct_v0_1,OpenCompass Academic,42.6,[],opencompass_academic_240829.csv +internlm2_chat_7b,OpenCompass Academic,42.1,[],opencompass_academic_240829.csv +qwen1_5_7b_chat,OpenCompass Academic,35.4,[],opencompass_academic_240829.csv +mistral_7b_instruct_v0_3,OpenCompass Academic,31.2,[],opencompass_academic_240829.csv +gpt_4o_20240513,OpenCompass MMLU,88.0,[],opencompass_academic_240829.csv +qwen2_72b_instruct,OpenCompass MMLU,83.1,[],opencompass_academic_240829.csv +gpt_4o_mini_20240718,OpenCompass MMLU,82.9,[],opencompass_academic_240829.csv +llama3_70b_instruct,OpenCompass MMLU,80.7,[],opencompass_academic_240829.csv +qwen1_5_110b_chat,OpenCompass MMLU,74.0,[],opencompass_academic_240829.csv +yi_1_5_34b_chat,OpenCompass MMLU,71.3,[],opencompass_academic_240829.csv +internlm2_5_chat_7b,OpenCompass MMLU,70.6,[],opencompass_academic_240829.csv +glm_4_9b_chat,OpenCompass MMLU,72.9,[],opencompass_academic_240829.csv +qwen1_5_32b_chat,OpenCompass MMLU,72.5,[],opencompass_academic_240829.csv +qwen1_5_72b_chat,OpenCompass MMLU,70.9,[],opencompass_academic_240829.csv +yi_1_5_9b_chat,OpenCompass MMLU,67.8,[],opencompass_academic_240829.csv +qwen2_7b_instruct,OpenCompass MMLU,51.1,[],opencompass_academic_240829.csv +llama3_8b_instruct,OpenCompass MMLU,66.7,[],opencompass_academic_240829.csv +qwen1_5_14b_chat,OpenCompass MMLU,67.0,[],opencompass_academic_240829.csv +internlm2_chat_20b,OpenCompass MMLU,55.8,[],opencompass_academic_240829.csv +yi_1_5_6b_chat,OpenCompass MMLU,48.4,[],opencompass_academic_240829.csv +mixtral_8x7b_instruct_v0_1,OpenCompass MMLU,67.2,[],opencompass_academic_240829.csv +internlm2_chat_7b,OpenCompass MMLU,58.8,[],opencompass_academic_240829.csv +qwen1_5_7b_chat,OpenCompass MMLU,41.9,[],opencompass_academic_240829.csv +mistral_7b_instruct_v0_3,OpenCompass MMLU,30.9,[],opencompass_academic_240829.csv +gpt_4o_20240513,OpenCompass MMLU Pro,73.8,[],opencompass_academic_240829.csv +qwen2_72b_instruct,OpenCompass MMLU Pro,65.1,[],opencompass_academic_240829.csv +gpt_4o_mini_20240718,OpenCompass MMLU Pro,63.2,[],opencompass_academic_240829.csv +llama3_70b_instruct,OpenCompass MMLU Pro,61.8,[],opencompass_academic_240829.csv +qwen1_5_110b_chat,OpenCompass MMLU Pro,51.8,[],opencompass_academic_240829.csv +yi_1_5_34b_chat,OpenCompass MMLU Pro,50.9,[],opencompass_academic_240829.csv +internlm2_5_chat_7b,OpenCompass MMLU Pro,44.9,[],opencompass_academic_240829.csv +glm_4_9b_chat,OpenCompass MMLU Pro,48.3,[],opencompass_academic_240829.csv +qwen1_5_32b_chat,OpenCompass MMLU Pro,49.8,[],opencompass_academic_240829.csv +qwen1_5_72b_chat,OpenCompass MMLU Pro,47.1,[],opencompass_academic_240829.csv +yi_1_5_9b_chat,OpenCompass MMLU Pro,45.9,[],opencompass_academic_240829.csv +qwen2_7b_instruct,OpenCompass MMLU Pro,38.8,[],opencompass_academic_240829.csv +llama3_8b_instruct,OpenCompass MMLU Pro,42.3,[],opencompass_academic_240829.csv +qwen1_5_14b_chat,OpenCompass MMLU Pro,40.5,[],opencompass_academic_240829.csv +internlm2_chat_20b,OpenCompass MMLU Pro,36.3,[],opencompass_academic_240829.csv +yi_1_5_6b_chat,OpenCompass MMLU Pro,30.1,[],opencompass_academic_240829.csv +mixtral_8x7b_instruct_v0_1,OpenCompass MMLU Pro,42.7,[],opencompass_academic_240829.csv +internlm2_chat_7b,OpenCompass MMLU Pro,32.4,[],opencompass_academic_240829.csv +qwen1_5_7b_chat,OpenCompass MMLU Pro,25.6,[],opencompass_academic_240829.csv +mistral_7b_instruct_v0_3,OpenCompass MMLU Pro,22.1,[],opencompass_academic_240829.csv +gpt_4o_20240513,OpenCompass CMMLU,78.3,[],opencompass_academic_240829.csv +qwen2_72b_instruct,OpenCompass CMMLU,79.8,[],opencompass_academic_240829.csv +gpt_4o_mini_20240718,OpenCompass CMMLU,65.6,[],opencompass_academic_240829.csv +llama3_70b_instruct,OpenCompass CMMLU,66.2,[],opencompass_academic_240829.csv +qwen1_5_110b_chat,OpenCompass CMMLU,79.4,[],opencompass_academic_240829.csv +yi_1_5_34b_chat,OpenCompass CMMLU,63.4,[],opencompass_academic_240829.csv +internlm2_5_chat_7b,OpenCompass CMMLU,73.8,[],opencompass_academic_240829.csv +glm_4_9b_chat,OpenCompass CMMLU,71.6,[],opencompass_academic_240829.csv +qwen1_5_32b_chat,OpenCompass CMMLU,76.3,[],opencompass_academic_240829.csv +qwen1_5_72b_chat,OpenCompass CMMLU,67.8,[],opencompass_academic_240829.csv +yi_1_5_9b_chat,OpenCompass CMMLU,65.0,[],opencompass_academic_240829.csv +qwen2_7b_instruct,OpenCompass CMMLU,59.3,[],opencompass_academic_240829.csv +llama3_8b_instruct,OpenCompass CMMLU,51.5,[],opencompass_academic_240829.csv +qwen1_5_14b_chat,OpenCompass CMMLU,73.3,[],opencompass_academic_240829.csv +internlm2_chat_20b,OpenCompass CMMLU,44.7,[],opencompass_academic_240829.csv +yi_1_5_6b_chat,OpenCompass CMMLU,53.9,[],opencompass_academic_240829.csv +mixtral_8x7b_instruct_v0_1,OpenCompass CMMLU,33.9,[],opencompass_academic_240829.csv +internlm2_chat_7b,OpenCompass CMMLU,47.8,[],opencompass_academic_240829.csv +qwen1_5_7b_chat,OpenCompass CMMLU,42.3,[],opencompass_academic_240829.csv +mistral_7b_instruct_v0_3,OpenCompass CMMLU,35.5,[],opencompass_academic_240829.csv +gpt_4o_20240513,OpenCompass BBH,87.6,[],opencompass_academic_240829.csv +qwen2_72b_instruct,OpenCompass BBH,85.2,[],opencompass_academic_240829.csv +gpt_4o_mini_20240718,OpenCompass BBH,81.9,[],opencompass_academic_240829.csv +llama3_70b_instruct,OpenCompass BBH,83.2,[],opencompass_academic_240829.csv +qwen1_5_110b_chat,OpenCompass BBH,74.2,[],opencompass_academic_240829.csv +yi_1_5_34b_chat,OpenCompass BBH,73.8,[],opencompass_academic_240829.csv +internlm2_5_chat_7b,OpenCompass BBH,74.5,[],opencompass_academic_240829.csv +glm_4_9b_chat,OpenCompass BBH,60.6,[],opencompass_academic_240829.csv +qwen1_5_32b_chat,OpenCompass BBH,68.2,[],opencompass_academic_240829.csv +qwen1_5_72b_chat,OpenCompass BBH,72.8,[],opencompass_academic_240829.csv +yi_1_5_9b_chat,OpenCompass BBH,67.9,[],opencompass_academic_240829.csv +qwen2_7b_instruct,OpenCompass BBH,65.4,[],opencompass_academic_240829.csv +llama3_8b_instruct,OpenCompass BBH,54.4,[],opencompass_academic_240829.csv +qwen1_5_14b_chat,OpenCompass BBH,58.3,[],opencompass_academic_240829.csv +internlm2_chat_20b,OpenCompass BBH,65.6,[],opencompass_academic_240829.csv +yi_1_5_6b_chat,OpenCompass BBH,56.5,[],opencompass_academic_240829.csv +mixtral_8x7b_instruct_v0_1,OpenCompass BBH,55.7,[],opencompass_academic_240829.csv +internlm2_chat_7b,OpenCompass BBH,60.3,[],opencompass_academic_240829.csv +qwen1_5_7b_chat,OpenCompass BBH,41.0,[],opencompass_academic_240829.csv +gpt_4o_20240513,OpenCompass GQPA-Dimand,49.5,[],opencompass_academic_240829.csv +qwen2_72b_instruct,OpenCompass GQPA-Dimand,42.9,[],opencompass_academic_240829.csv +gpt_4o_mini_20240718,OpenCompass GQPA-Dimand,47.5,[],opencompass_academic_240829.csv +llama3_70b_instruct,OpenCompass GQPA-Dimand,39.4,[],opencompass_academic_240829.csv +qwen1_5_110b_chat,OpenCompass GQPA-Dimand,28.3,[],opencompass_academic_240829.csv +yi_1_5_34b_chat,OpenCompass GQPA-Dimand,32.8,[],opencompass_academic_240829.csv +internlm2_5_chat_7b,OpenCompass GQPA-Dimand,29.3,[],opencompass_academic_240829.csv +glm_4_9b_chat,OpenCompass GQPA-Dimand,26.8,[],opencompass_academic_240829.csv +qwen1_5_32b_chat,OpenCompass GQPA-Dimand,31.3,[],opencompass_academic_240829.csv +qwen1_5_72b_chat,OpenCompass GQPA-Dimand,28.3,[],opencompass_academic_240829.csv +yi_1_5_9b_chat,OpenCompass GQPA-Dimand,25.2,[],opencompass_academic_240829.csv +qwen2_7b_instruct,OpenCompass GQPA-Dimand,25.8,[],opencompass_academic_240829.csv +llama3_8b_instruct,OpenCompass GQPA-Dimand,33.8,[],opencompass_academic_240829.csv +qwen1_5_14b_chat,OpenCompass GQPA-Dimand,26.3,[],opencompass_academic_240829.csv +internlm2_chat_20b,OpenCompass GQPA-Dimand,21.7,[],opencompass_academic_240829.csv +yi_1_5_6b_chat,OpenCompass GQPA-Dimand,23.2,[],opencompass_academic_240829.csv +mixtral_8x7b_instruct_v0_1,OpenCompass GQPA-Dimand,29.3,[],opencompass_academic_240829.csv +internlm2_chat_7b,OpenCompass GQPA-Dimand,26.3,[],opencompass_academic_240829.csv +qwen1_5_7b_chat,OpenCompass GQPA-Dimand,21.2,[],opencompass_academic_240829.csv +gpt_4o_20240513,OpenCompass HumanEval,86.0,[],opencompass_academic_240829.csv +qwen2_72b_instruct,OpenCompass HumanEval,84.2,[],opencompass_academic_240829.csv +gpt_4o_mini_20240718,OpenCompass HumanEval,87.8,[],opencompass_academic_240829.csv +llama3_70b_instruct,OpenCompass HumanEval,76.2,[],opencompass_academic_240829.csv +qwen1_5_110b_chat,OpenCompass HumanEval,77.4,[],opencompass_academic_240829.csv +yi_1_5_34b_chat,OpenCompass HumanEval,77.4,[],opencompass_academic_240829.csv +internlm2_5_chat_7b,OpenCompass HumanEval,73.2,[],opencompass_academic_240829.csv +glm_4_9b_chat,OpenCompass HumanEval,75.6,[],opencompass_academic_240829.csv +qwen1_5_32b_chat,OpenCompass HumanEval,67.7,[],opencompass_academic_240829.csv +qwen1_5_72b_chat,OpenCompass HumanEval,67.7,[],opencompass_academic_240829.csv +yi_1_5_9b_chat,OpenCompass HumanEval,68.9,[],opencompass_academic_240829.csv +qwen2_7b_instruct,OpenCompass HumanEval,76.8,[],opencompass_academic_240829.csv +llama3_8b_instruct,OpenCompass HumanEval,59.8,[],opencompass_academic_240829.csv +qwen1_5_14b_chat,OpenCompass HumanEval,60.4,[],opencompass_academic_240829.csv +internlm2_chat_20b,OpenCompass HumanEval,67.7,[],opencompass_academic_240829.csv +yi_1_5_6b_chat,OpenCompass HumanEval,45.7,[],opencompass_academic_240829.csv +mixtral_8x7b_instruct_v0_1,OpenCompass HumanEval,34.8,[],opencompass_academic_240829.csv +internlm2_chat_7b,OpenCompass HumanEval,50.6,[],opencompass_academic_240829.csv +qwen1_5_7b_chat,OpenCompass HumanEval,50.0,[],opencompass_academic_240829.csv +gpt_4o_20240513,OpenCompass IFEval,79.0,[],opencompass_academic_240829.csv +qwen2_72b_instruct,OpenCompass IFEval,76.5,[],opencompass_academic_240829.csv +gpt_4o_mini_20240718,OpenCompass IFEval,81.0,[],opencompass_academic_240829.csv +llama3_70b_instruct,OpenCompass IFEval,77.5,[],opencompass_academic_240829.csv +qwen1_5_110b_chat,OpenCompass IFEval,54.3,[],opencompass_academic_240829.csv +yi_1_5_34b_chat,OpenCompass IFEval,59.5,[],opencompass_academic_240829.csv +internlm2_5_chat_7b,OpenCompass IFEval,54.5,[],opencompass_academic_240829.csv +glm_4_9b_chat,OpenCompass IFEval,69.1,[],opencompass_academic_240829.csv +qwen1_5_32b_chat,OpenCompass IFEval,48.4,[],opencompass_academic_240829.csv +qwen1_5_72b_chat,OpenCompass IFEval,53.8,[],opencompass_academic_240829.csv +yi_1_5_9b_chat,OpenCompass IFEval,56.8,[],opencompass_academic_240829.csv +qwen2_7b_instruct,OpenCompass IFEval,49.7,[],opencompass_academic_240829.csv +llama3_8b_instruct,OpenCompass IFEval,68.4,[],opencompass_academic_240829.csv +qwen1_5_14b_chat,OpenCompass IFEval,42.0,[],opencompass_academic_240829.csv +internlm2_chat_20b,OpenCompass IFEval,35.5,[],opencompass_academic_240829.csv +yi_1_5_6b_chat,OpenCompass IFEval,47.3,[],opencompass_academic_240829.csv +mixtral_8x7b_instruct_v0_1,OpenCompass IFEval,50.8,[],opencompass_academic_240829.csv +internlm2_chat_7b,OpenCompass IFEval,32.4,[],opencompass_academic_240829.csv +qwen1_5_7b_chat,OpenCompass IFEval,38.6,[],opencompass_academic_240829.csv +claude_3_5_sonnet_20240620,Helm MMLU,0.865,[],helm_mmlu_240829.csv +claude_3_opus_20240229,Helm MMLU,0.846,[],helm_mmlu_240829.csv +llama3_1_instruct_turbo_405b,Helm MMLU,0.845,[],helm_mmlu_240829.csv +gpt_4o_2024_05_13,Helm MMLU,0.842,[],helm_mmlu_240829.csv +gemini_1_5_pro_001,Helm MMLU,0.827,[],helm_mmlu_240829.csv +gpt_4_0613,Helm MMLU,0.824,[],helm_mmlu_240829.csv +qwen2_instruct_72b,Helm MMLU,0.824,[],helm_mmlu_240829.csv +gpt_4_turbo_2024_04_09,Helm MMLU,0.813,[],helm_mmlu_240829.csv +gemini_1_5_pro_0409_preview,Helm MMLU,0.81,[],helm_mmlu_240829.csv +llama3_1_instruct_turbo_70b,Helm MMLU,0.801,[],helm_mmlu_240829.csv +mistral_large_2_2407,Helm MMLU,0.8,[],helm_mmlu_240829.csv +gpt_4_turbo_1106_preview,Helm MMLU,0.796,[],helm_mmlu_240829.csv +llama3_70b,Helm MMLU,0.793,[],helm_mmlu_240829.csv +yi_large_preview,Helm MMLU,0.793,[],helm_mmlu_240829.csv +palmyra_x_v3_72b,Helm MMLU,0.786,[],helm_mmlu_240829.csv +palm_2_unicorn,Helm MMLU,0.786,[],helm_mmlu_240829.csv +gemini_1_5_flash_001,Helm MMLU,0.779,[],helm_mmlu_240829.csv +mixtral_8x22b,Helm MMLU,0.778,[],helm_mmlu_240829.csv +gemini_1_5_flash_0514_preview,Helm MMLU,0.778,[],helm_mmlu_240829.csv +phi_3_14b,Helm MMLU,0.775,[],helm_mmlu_240829.csv +qwen1_5_72b,Helm MMLU,0.774,[],helm_mmlu_240829.csv +qwen1_5_chat_110b,Helm MMLU,0.768,[],helm_mmlu_240829.csv +gpt_4o_mini_2024_07_18,Helm MMLU,0.767,[],helm_mmlu_240829.csv +yi_34b,Helm MMLU,0.762,[],helm_mmlu_240829.csv +claude_3_sonnet_20240229,Helm MMLU,0.759,[],helm_mmlu_240829.csv +gemma_2_27b,Helm MMLU,0.757,[],helm_mmlu_240829.csv +phi_3_7b,Helm MMLU,0.757,[],helm_mmlu_240829.csv +qwen1_5_32b,Helm MMLU,0.744,[],helm_mmlu_240829.csv +dbrx_instructruct,Helm MMLU,0.741,[],helm_mmlu_240829.csv +claude_3_haiku_20240307,Helm MMLU,0.738,[],helm_mmlu_240829.csv +claude_2_1,Helm MMLU,0.735,[],helm_mmlu_240829.csv +deepseek_llm_chat_67b,Helm MMLU,0.725,[],helm_mmlu_240829.csv +gemma_2_9b,Helm MMLU,0.721,[],helm_mmlu_240829.csv +mixtral_8x7b_32k_seqlen,Helm MMLU,0.717,[],helm_mmlu_240829.csv +gemini_1_0_pro_001,Helm MMLU,0.7,[],helm_mmlu_240829.csv +llama_2_70b,Helm MMLU,0.695,[],helm_mmlu_240829.csv +command_r_plus,Helm MMLU,0.694,[],helm_mmlu_240829.csv +palm_2_bison,Helm MMLU,0.692,[],helm_mmlu_240829.csv +gpt_3_5_turbo_0613,Helm MMLU,0.689,[],helm_mmlu_240829.csv +claude_instant_1_2,Helm MMLU,0.688,[],helm_mmlu_240829.csv +mistral_large_2402,Helm MMLU,0.688,[],helm_mmlu_240829.csv +mistral_small_2402,Helm MMLU,0.687,[],helm_mmlu_240829.csv +qwen1_5_14b,Helm MMLU,0.686,[],helm_mmlu_240829.csv +arctic_instruct,Helm MMLU,0.677,[],helm_mmlu_240829.csv +llama3_8b,Helm MMLU,0.668,[],helm_mmlu_240829.csv +gemma_7b,Helm MMLU,0.661,[],helm_mmlu_240829.csv +jamba_instruct,Helm MMLU,0.659,[],helm_mmlu_240829.csv +mistral_nemo_2402,Helm MMLU,0.653,[],helm_mmlu_240829.csv +command_r,Helm MMLU,0.652,[],helm_mmlu_240829.csv +yi_6b,Helm MMLU,0.64,[],helm_mmlu_240829.csv +qwen1_5_7b,Helm MMLU,0.626,[],helm_mmlu_240829.csv +mistral_instruct_v0_3_7b,Helm MMLU,0.599,[],helm_mmlu_240829.csv +phi_2,Helm MMLU,0.584,[],helm_mmlu_240829.csv +mistral_v0_1_7b,Helm MMLU,0.566,[],helm_mmlu_240829.csv +llama3_1_instruct_turbo_8b,Helm MMLU,0.561,[],helm_mmlu_240829.csv +llama_2_13b,Helm MMLU,0.554,[],helm_mmlu_240829.csv +olmo_1_7_7b,Helm MMLU,0.538,[],helm_mmlu_240829.csv +llama_2_7b,Helm MMLU,0.458,[],helm_mmlu_240829.csv +olmo_7b,Helm MMLU,0.295,[],helm_mmlu_240829.csv +claude_3_5_sonnet_20240620,LMSys Arena,79.3,[],chatbot_arena_240829.csv +gpt_4o_2024_05_13,LMSys Arena,79.2,[],chatbot_arena_240829.csv +gpt_4_0125_preview,LMSys Arena,78.0,[],chatbot_arena_240829.csv +gpt_4o_2024_08_06,LMSys Arena,77.9,[],chatbot_arena_240829.csv +athene_70b,LMSys Arena,77.6,[],chatbot_arena_240829.csv +gpt_4o_mini,LMSys Arena,74.9,[],chatbot_arena_240829.csv +gemini_1_5_pro_api_preview,LMSys Arena,72.0,[],chatbot_arena_240829.csv +mistral_large_2407,LMSys Arena,70.4,[],chatbot_arena_240829.csv +llama3_1_405b_instruct,LMSys Arena,64.1,[],chatbot_arena_240829.csv +glm_4_0520,LMSys Arena,63.8,[],chatbot_arena_240829.csv +yi_large,LMSys Arena,63.7,[],chatbot_arena_240829.csv +deepseek_coder_v2,LMSys Arena,62.3,[],chatbot_arena_240829.csv +claude_3_opus_20240229,LMSys Arena,60.4,[],chatbot_arena_240829.csv +gemma_2_27b_it,LMSys Arena,57.5,[],chatbot_arena_240829.csv +llama3_1_70b_instruct,LMSys Arena,55.7,[],chatbot_arena_240829.csv +glm_4_0116,LMSys Arena,55.7,[],chatbot_arena_240829.csv +glm_4_air,LMSys Arena,50.9,[],chatbot_arena_240829.csv +gpt_4_0314,LMSys Arena,50.0,[],chatbot_arena_240829.csv +gemini_1_5_flash_api_preview,LMSys Arena,49.6,[],chatbot_arena_240829.csv +qwen2_72b_instruct,LMSys Arena,46.9,[],chatbot_arena_240829.csv +claude_3_sonnet_20240229,LMSys Arena,46.8,[],chatbot_arena_240829.csv +llama3_70b_instruct,LMSys Arena,46.6,[],chatbot_arena_240829.csv +claude_3_haiku_20240307,LMSys Arena,41.5,[],chatbot_arena_240829.csv +gpt_4_0613,LMSys Arena,37.9,[],chatbot_arena_240829.csv +mistral_large_2402,LMSys Arena,37.7,[],chatbot_arena_240829.csv +mixtral_8x22b_instruct_v0_1,LMSys Arena,36.4,[],chatbot_arena_240829.csv +qwen1_5_72b_chat,LMSys Arena,36.1,[],chatbot_arena_240829.csv +phi_3_medium_4k_instruct,LMSys Arena,33.4,[],chatbot_arena_240829.csv +command_r_plus,LMSys Arena,33.1,[],chatbot_arena_240829.csv +mistral_medium,LMSys Arena,31.9,[],chatbot_arena_240829.csv +internlm2_5_20b_chat,LMSys Arena,31.2,[],chatbot_arena_240829.csv +phi_3_small_8k_instruct,LMSys Arena,29.8,[],chatbot_arena_240829.csv +mistral_next,LMSys Arena,27.4,[],chatbot_arena_240829.csv +gpt_3_5_turbo_0613,LMSys Arena,24.8,[],chatbot_arena_240829.csv +dbrx_instructruct_preview,LMSys Arena,24.6,[],chatbot_arena_240829.csv +internlm2_20b_chat,LMSys Arena,24.4,[],chatbot_arena_240829.csv +claude_2_0,LMSys Arena,24.0,[],chatbot_arena_240829.csv +mixtral_8x7b_instruct_v0_1,LMSys Arena,23.4,[],chatbot_arena_240829.csv +gpt_3_5_turbo_0125,LMSys Arena,23.3,[],chatbot_arena_240829.csv +yi_34b_chat,LMSys Arena,23.1,[],chatbot_arena_240829.csv +starling_lm_7b_beta,LMSys Arena,23.0,[],chatbot_arena_240829.csv +claude_2_1,LMSys Arena,22.8,[],chatbot_arena_240829.csv +llama3_1_8b_instruct,LMSys Arena,21.3,[],chatbot_arena_240829.csv +snorkel_mistral_pairrm_dpo,LMSys Arena,20.7,[],chatbot_arena_240829.csv +llama3_8b_instruct,LMSys Arena,20.6,[],chatbot_arena_240829.csv +gpt_3_5_turbo_1106,LMSys Arena,18.9,[],chatbot_arena_240829.csv +gpt_3_5_turbo_0301,LMSys Arena,18.1,[],chatbot_arena_240829.csv +gemini_1_0_pro,LMSys Arena,17.8,[],chatbot_arena_240829.csv +snowflake_arctic_instruct,LMSys Arena,17.6,[],chatbot_arena_240829.csv +command_r,LMSys Arena,17.0,[],chatbot_arena_240829.csv +phi_3_mini_128k_instruct,LMSys Arena,15.4,[],chatbot_arena_240829.csv +tulu_2_dpo_70b,LMSys Arena,15.0,[],chatbot_arena_240829.csv +starling_lm_7b_alpha,LMSys Arena,12.8,[],chatbot_arena_240829.csv +mistral_7b_instruct,LMSys Arena,12.6,[],chatbot_arena_240829.csv +gemma_1_1_7b_it,LMSys Arena,12.1,[],chatbot_arena_240829.csv +llama_2_70b_chat,LMSys Arena,11.6,[],chatbot_arena_240829.csv +vicuna_33b_v1_3,LMSys Arena,8.6,[],chatbot_arena_240829.csv +gemma_7b_it,LMSys Arena,7.5,[],chatbot_arena_240829.csv +llama_2_7b_chat,LMSys Arena,4.6,[],chatbot_arena_240829.csv +gemma_1_1_2b_it,LMSys Arena,3.4,[],chatbot_arena_240829.csv +gemma_2b_it,LMSys Arena,3.0,[],chatbot_arena_240829.csv +llama_2_70b,Helm Classic,0.944,[],helm_classic_240829.csv +llama_65b,Helm Classic,0.908,[],helm_classic_240829.csv +text_davinci_002,Helm Classic,0.905,[],helm_classic_240829.csv +mistral_v0_1_7b,Helm Classic,0.884,[],helm_classic_240829.csv +cohere_command_beta_52_4b,Helm Classic,0.874,[],helm_classic_240829.csv +text_davinci_003,Helm Classic,0.872,[],helm_classic_240829.csv +jurassic_2_jumbo_178b,Helm Classic,0.824,[],helm_classic_240829.csv +llama_2_13b,Helm Classic,0.823,[],helm_classic_240829.csv +tnlg_v2_530b,Helm Classic,0.787,[],helm_classic_240829.csv +gpt_3_5_turbo_0613,Helm Classic,0.783,[],helm_classic_240829.csv +llama30b,Helm Classic,0.781,[],helm_classic_240829.csv +anthropic_lm_v4_s3_52b,Helm Classic,0.78,[],helm_classic_240829.csv +gpt_3_5_turbo_0301,Helm Classic,0.76,[],helm_classic_240829.csv +jurassic_2_grande_17b,Helm Classic,0.743,[],helm_classic_240829.csv +palmyra_x_43b,Helm Classic,0.732,[],helm_classic_240829.csv +falcon_40b,Helm Classic,0.729,[],helm_classic_240829.csv +falcon_instruct_40b,Helm Classic,0.727,[],helm_classic_240829.csv +mpt_instruct_30b,Helm Classic,0.716,[],helm_classic_240829.csv +mpt_30b,Helm Classic,0.714,[],helm_classic_240829.csv +j1_grande_v2_beta_17b,Helm Classic,0.706,[],helm_classic_240829.csv +vicuna_v1_3_13b,Helm Classic,0.706,[],helm_classic_240829.csv +cohere_command_beta_6_1b,Helm Classic,0.675,[],helm_classic_240829.csv +cohere_xlarge_v20221108_52_4b,Helm Classic,0.664,[],helm_classic_240829.csv +luminous_supreme_70b,Helm Classic,0.662,[],helm_classic_240829.csv +vicuna_v1_3_7b,Helm Classic,0.625,[],helm_classic_240829.csv +opt_175b,Helm Classic,0.609,[],helm_classic_240829.csv +llama_2_7b,Helm Classic,0.607,[],helm_classic_240829.csv +llama_13b,Helm Classic,0.595,[],helm_classic_240829.csv +instructpalmyra_30b,Helm Classic,0.568,[],helm_classic_240829.csv +cohere_xlarge_v20220609_52_4b,Helm Classic,0.56,[],helm_classic_240829.csv +jurassic_2_large_7_5b,Helm Classic,0.553,[],helm_classic_240829.csv +davinci_175b,Helm Classic,0.538,[],helm_classic_240829.csv +llama_7b,Helm Classic,0.533,[],helm_classic_240829.csv +redpajama_incite_instruct_7b,Helm Classic,0.524,[],helm_classic_240829.csv +j1_jumbo_v1_178b,Helm Classic,0.517,[],helm_classic_240829.csv +glm_130b,Helm Classic,0.512,[],helm_classic_240829.csv +luminous_extended_30b,Helm Classic,0.485,[],helm_classic_240829.csv +opt_66b,Helm Classic,0.448,[],helm_classic_240829.csv +bloom_176b,Helm Classic,0.446,[],helm_classic_240829.csv +j1_grande_v1_17b,Helm Classic,0.433,[],helm_classic_240829.csv +alpaca_7b,Helm Classic,0.381,[],helm_classic_240829.csv +falcon_7b,Helm Classic,0.378,[],helm_classic_240829.csv +redpajama_incite_base_7b,Helm Classic,0.378,[],helm_classic_240829.csv +cohere_large_v20220720_13_1b,Helm Classic,0.372,[],helm_classic_240829.csv +redpajama_incite_instruct_v1_3b,Helm Classic,0.366,[],helm_classic_240829.csv +text_curie_001,Helm Classic,0.36,[],helm_classic_240829.csv +gpt_neox_20b,Helm Classic,0.351,[],helm_classic_240829.csv +luminous_base_13b,Helm Classic,0.315,[],helm_classic_240829.csv +cohere_medium_v20221108_6_1b,Helm Classic,0.312,[],helm_classic_240829.csv +redpajama_incite_base_v1_3b,Helm Classic,0.311,[],helm_classic_240829.csv +tnlg_v2_6_7b,Helm Classic,0.309,[],helm_classic_240829.csv +j1_large_v1_7_5b,Helm Classic,0.285,[],helm_classic_240829.csv +gpt_j_6b,Helm Classic,0.273,[],helm_classic_240829.csv +pythia_12b,Helm Classic,0.257,[],helm_classic_240829.csv +curie_6_7b,Helm Classic,0.247,[],helm_classic_240829.csv +falcon_instruct_7b,Helm Classic,0.244,[],helm_classic_240829.csv +cohere_medium_v20220720_6_1b,Helm Classic,0.23,[],helm_classic_240829.csv +text_babbage_001,Helm Classic,0.229,[],helm_classic_240829.csv +t0pp_11b,Helm Classic,0.197,[],helm_classic_240829.csv +pythia_6_9b,Helm Classic,0.196,[],helm_classic_240829.csv +flan-ul2_20b,Helm Classic,0.167,[],helm_classic_240829.csv +t5_11b,Helm Classic,0.131,[],helm_classic_240829.csv +babbage_1_3b,Helm Classic,0.114,[],helm_classic_240829.csv +cohere_small_v20220720_410m,Helm Classic,0.109,[],helm_classic_240829.csv +ada_350m,Helm Classic,0.108,[],helm_classic_240829.csv +text_ada_001,Helm Classic,0.107,[],helm_classic_240829.csv +yalm_100b,Helm Classic,0.075,[],helm_classic_240829.csv +llama_2_70b,Helm BoolQ,0.886,[],helm_classic_240829.csv +llama_65b,Helm BoolQ,0.871,[],helm_classic_240829.csv +text_davinci_002,Helm BoolQ,0.877,[],helm_classic_240829.csv +mistral_v0_1_7b,Helm BoolQ,0.874,[],helm_classic_240829.csv +cohere_command_beta_52_4b,Helm BoolQ,0.856,[],helm_classic_240829.csv +text_davinci_003,Helm BoolQ,0.881,[],helm_classic_240829.csv +jurassic_2_jumbo_178b,Helm BoolQ,0.829,[],helm_classic_240829.csv +llama_2_13b,Helm BoolQ,0.811,[],helm_classic_240829.csv +tnlg_v2_530b,Helm BoolQ,0.809,[],helm_classic_240829.csv +gpt_3_5_turbo_0613,Helm BoolQ,0.87,[],helm_classic_240829.csv +llama30b,Helm BoolQ,0.861,[],helm_classic_240829.csv +anthropic_lm_v4_s3_52b,Helm BoolQ,0.815,[],helm_classic_240829.csv +gpt_3_5_turbo_0301,Helm BoolQ,0.74,[],helm_classic_240829.csv +jurassic_2_grande_17b,Helm BoolQ,0.826,[],helm_classic_240829.csv +palmyra_x_43b,Helm BoolQ,0.896,[],helm_classic_240829.csv +falcon_40b,Helm BoolQ,0.819,[],helm_classic_240829.csv +falcon_instruct_40b,Helm BoolQ,0.829,[],helm_classic_240829.csv +mpt_instruct_30b,Helm BoolQ,0.85,[],helm_classic_240829.csv +mpt_30b,Helm BoolQ,0.704,[],helm_classic_240829.csv +j1_grande_v2_beta_17b,Helm BoolQ,0.812,[],helm_classic_240829.csv +vicuna_v1_3_13b,Helm BoolQ,0.808,[],helm_classic_240829.csv +cohere_command_beta_6_1b,Helm BoolQ,0.798,[],helm_classic_240829.csv +cohere_xlarge_v20221108_52_4b,Helm BoolQ,0.762,[],helm_classic_240829.csv +luminous_supreme_70b,Helm BoolQ,0.775,[],helm_classic_240829.csv +vicuna_v1_3_7b,Helm BoolQ,0.76,[],helm_classic_240829.csv +opt_175b,Helm BoolQ,0.793,[],helm_classic_240829.csv +llama_2_7b,Helm BoolQ,0.762,[],helm_classic_240829.csv +llama_13b,Helm BoolQ,0.714,[],helm_classic_240829.csv +instructpalmyra_30b,Helm BoolQ,0.751,[],helm_classic_240829.csv +cohere_xlarge_v20220609_52_4b,Helm BoolQ,0.718,[],helm_classic_240829.csv +jurassic_2_large_7_5b,Helm BoolQ,0.742,[],helm_classic_240829.csv +davinci_175b,Helm BoolQ,0.722,[],helm_classic_240829.csv +llama_7b,Helm BoolQ,0.756,[],helm_classic_240829.csv +redpajama_incite_instruct_7b,Helm BoolQ,0.705,[],helm_classic_240829.csv +j1_jumbo_v1_178b,Helm BoolQ,0.776,[],helm_classic_240829.csv +glm_130b,Helm BoolQ,0.784,[],helm_classic_240829.csv +luminous_extended_30b,Helm BoolQ,0.767,[],helm_classic_240829.csv +opt_66b,Helm BoolQ,0.76,[],helm_classic_240829.csv +bloom_176b,Helm BoolQ,0.704,[],helm_classic_240829.csv +j1_grande_v1_17b,Helm BoolQ,0.722,[],helm_classic_240829.csv +alpaca_7b,Helm BoolQ,0.778,[],helm_classic_240829.csv +falcon_7b,Helm BoolQ,0.753,[],helm_classic_240829.csv +redpajama_incite_base_7b,Helm BoolQ,0.713,[],helm_classic_240829.csv +cohere_large_v20220720_13_1b,Helm BoolQ,0.725,[],helm_classic_240829.csv +redpajama_incite_instruct_v1_3b,Helm BoolQ,0.677,[],helm_classic_240829.csv +text_curie_001,Helm BoolQ,0.62,[],helm_classic_240829.csv +gpt_neox_20b,Helm BoolQ,0.683,[],helm_classic_240829.csv +luminous_base_13b,Helm BoolQ,0.719,[],helm_classic_240829.csv +cohere_medium_v20221108_6_1b,Helm BoolQ,0.7,[],helm_classic_240829.csv +redpajama_incite_base_v1_3b,Helm BoolQ,0.685,[],helm_classic_240829.csv +tnlg_v2_6_7b,Helm BoolQ,0.698,[],helm_classic_240829.csv +j1_large_v1_7_5b,Helm BoolQ,0.683,[],helm_classic_240829.csv +gpt_j_6b,Helm BoolQ,0.649,[],helm_classic_240829.csv +pythia_12b,Helm BoolQ,0.662,[],helm_classic_240829.csv +curie_6_7b,Helm BoolQ,0.656,[],helm_classic_240829.csv +falcon_instruct_7b,Helm BoolQ,0.72,[],helm_classic_240829.csv +cohere_medium_v20220720_6_1b,Helm BoolQ,0.659,[],helm_classic_240829.csv +text_babbage_001,Helm BoolQ,0.451,[],helm_classic_240829.csv +t0pp_11b,Helm BoolQ,0.0,[],helm_classic_240829.csv +pythia_6_9b,Helm BoolQ,0.631,[],helm_classic_240829.csv +flan-ul2_20b,Helm BoolQ,0.746,[],helm_classic_240829.csv +t5_11b,Helm BoolQ,0.761,[],helm_classic_240829.csv +babbage_1_3b,Helm BoolQ,0.574,[],helm_classic_240829.csv +cohere_small_v20220720_410m,Helm BoolQ,0.457,[],helm_classic_240829.csv +ada_350m,Helm BoolQ,0.581,[],helm_classic_240829.csv +text_ada_001,Helm BoolQ,0.464,[],helm_classic_240829.csv +yalm_100b,Helm BoolQ,0.634,[],helm_classic_240829.csv +llama_2_70b,Helm NarrativeQA,0.77,[],helm_classic_240829.csv +llama_65b,Helm NarrativeQA,0.755,[],helm_classic_240829.csv +text_davinci_002,Helm NarrativeQA,0.727,[],helm_classic_240829.csv +mistral_v0_1_7b,Helm NarrativeQA,0.716,[],helm_classic_240829.csv +cohere_command_beta_52_4b,Helm NarrativeQA,0.752,[],helm_classic_240829.csv +text_davinci_003,Helm NarrativeQA,0.727,[],helm_classic_240829.csv +jurassic_2_jumbo_178b,Helm NarrativeQA,0.733,[],helm_classic_240829.csv +llama_2_13b,Helm NarrativeQA,0.744,[],helm_classic_240829.csv +tnlg_v2_530b,Helm NarrativeQA,0.722,[],helm_classic_240829.csv +gpt_3_5_turbo_0613,Helm NarrativeQA,0.625,[],helm_classic_240829.csv +llama30b,Helm NarrativeQA,0.752,[],helm_classic_240829.csv +anthropic_lm_v4_s3_52b,Helm NarrativeQA,0.728,[],helm_classic_240829.csv +gpt_3_5_turbo_0301,Helm NarrativeQA,0.663,[],helm_classic_240829.csv +jurassic_2_grande_17b,Helm NarrativeQA,0.737,[],helm_classic_240829.csv +palmyra_x_43b,Helm NarrativeQA,0.742,[],helm_classic_240829.csv +falcon_40b,Helm NarrativeQA,0.673,[],helm_classic_240829.csv +falcon_instruct_40b,Helm NarrativeQA,0.625,[],helm_classic_240829.csv +mpt_instruct_30b,Helm NarrativeQA,0.733,[],helm_classic_240829.csv +mpt_30b,Helm NarrativeQA,0.732,[],helm_classic_240829.csv +j1_grande_v2_beta_17b,Helm NarrativeQA,0.725,[],helm_classic_240829.csv +vicuna_v1_3_13b,Helm NarrativeQA,0.691,[],helm_classic_240829.csv +cohere_command_beta_6_1b,Helm NarrativeQA,0.709,[],helm_classic_240829.csv +cohere_xlarge_v20221108_52_4b,Helm NarrativeQA,0.672,[],helm_classic_240829.csv +luminous_supreme_70b,Helm NarrativeQA,0.711,[],helm_classic_240829.csv +vicuna_v1_3_7b,Helm NarrativeQA,0.643,[],helm_classic_240829.csv +opt_175b,Helm NarrativeQA,0.671,[],helm_classic_240829.csv +llama_2_7b,Helm NarrativeQA,0.691,[],helm_classic_240829.csv +llama_13b,Helm NarrativeQA,0.711,[],helm_classic_240829.csv +instructpalmyra_30b,Helm NarrativeQA,0.496,[],helm_classic_240829.csv +cohere_xlarge_v20220609_52_4b,Helm NarrativeQA,0.65,[],helm_classic_240829.csv +davinci_175b,Helm NarrativeQA,0.687,[],helm_classic_240829.csv +llama_7b,Helm NarrativeQA,0.669,[],helm_classic_240829.csv +redpajama_incite_instruct_7b,Helm NarrativeQA,0.638,[],helm_classic_240829.csv +j1_jumbo_v1_178b,Helm NarrativeQA,0.695,[],helm_classic_240829.csv +glm_130b,Helm NarrativeQA,0.706,[],helm_classic_240829.csv +luminous_extended_30b,Helm NarrativeQA,0.665,[],helm_classic_240829.csv +opt_66b,Helm NarrativeQA,0.638,[],helm_classic_240829.csv +bloom_176b,Helm NarrativeQA,0.662,[],helm_classic_240829.csv +j1_grande_v1_17b,Helm NarrativeQA,0.672,[],helm_classic_240829.csv +alpaca_7b,Helm NarrativeQA,0.396,[],helm_classic_240829.csv +falcon_7b,Helm NarrativeQA,0.621,[],helm_classic_240829.csv +redpajama_incite_base_7b,Helm NarrativeQA,0.617,[],helm_classic_240829.csv +cohere_large_v20220720_13_1b,Helm NarrativeQA,0.625,[],helm_classic_240829.csv +redpajama_incite_instruct_v1_3b,Helm NarrativeQA,0.638,[],helm_classic_240829.csv +text_curie_001,Helm NarrativeQA,0.582,[],helm_classic_240829.csv +gpt_neox_20b,Helm NarrativeQA,0.599,[],helm_classic_240829.csv +luminous_base_13b,Helm NarrativeQA,0.605,[],helm_classic_240829.csv +cohere_medium_v20221108_6_1b,Helm NarrativeQA,0.61,[],helm_classic_240829.csv +redpajama_incite_base_v1_3b,Helm NarrativeQA,0.555,[],helm_classic_240829.csv +tnlg_v2_6_7b,Helm NarrativeQA,0.631,[],helm_classic_240829.csv +j1_large_v1_7_5b,Helm NarrativeQA,0.623,[],helm_classic_240829.csv +gpt_j_6b,Helm NarrativeQA,0.545,[],helm_classic_240829.csv +pythia_12b,Helm NarrativeQA,0.596,[],helm_classic_240829.csv +curie_6_7b,Helm NarrativeQA,0.604,[],helm_classic_240829.csv +falcon_instruct_7b,Helm NarrativeQA,0.476,[],helm_classic_240829.csv +cohere_medium_v20220720_6_1b,Helm NarrativeQA,0.559,[],helm_classic_240829.csv +text_babbage_001,Helm NarrativeQA,0.429,[],helm_classic_240829.csv +t0pp_11b,Helm NarrativeQA,0.151,[],helm_classic_240829.csv +pythia_6_9b,Helm NarrativeQA,0.528,[],helm_classic_240829.csv +flan-ul2_20b,Helm NarrativeQA,0.083,[],helm_classic_240829.csv +t5_11b,Helm NarrativeQA,0.086,[],helm_classic_240829.csv +babbage_1_3b,Helm NarrativeQA,0.491,[],helm_classic_240829.csv +cohere_small_v20220720_410m,Helm NarrativeQA,0.294,[],helm_classic_240829.csv +ada_350m,Helm NarrativeQA,0.326,[],helm_classic_240829.csv +text_ada_001,Helm NarrativeQA,0.238,[],helm_classic_240829.csv +yalm_100b,Helm NarrativeQA,0.252,[],helm_classic_240829.csv +llama_2_70b,Helm NaturalQuestionsClosed,0.458,[],helm_classic_240829.csv +llama_65b,Helm NaturalQuestionsClosed,0.431,[],helm_classic_240829.csv +text_davinci_002,Helm NaturalQuestionsClosed,0.383,[],helm_classic_240829.csv +mistral_v0_1_7b,Helm NaturalQuestionsClosed,0.365,[],helm_classic_240829.csv +cohere_command_beta_52_4b,Helm NaturalQuestionsClosed,0.372,[],helm_classic_240829.csv +text_davinci_003,Helm NaturalQuestionsClosed,0.406,[],helm_classic_240829.csv +jurassic_2_jumbo_178b,Helm NaturalQuestionsClosed,0.385,[],helm_classic_240829.csv +llama_2_13b,Helm NaturalQuestionsClosed,0.376,[],helm_classic_240829.csv +tnlg_v2_530b,Helm NaturalQuestionsClosed,0.384,[],helm_classic_240829.csv +gpt_3_5_turbo_0613,Helm NaturalQuestionsClosed,0.348,[],helm_classic_240829.csv +llama30b,Helm NaturalQuestionsClosed,0.408,[],helm_classic_240829.csv +anthropic_lm_v4_s3_52b,Helm NaturalQuestionsClosed,0.288,[],helm_classic_240829.csv +gpt_3_5_turbo_0301,Helm NaturalQuestionsClosed,0.39,[],helm_classic_240829.csv +jurassic_2_grande_17b,Helm NaturalQuestionsClosed,0.356,[],helm_classic_240829.csv +palmyra_x_43b,Helm NaturalQuestionsClosed,0.413,[],helm_classic_240829.csv +falcon_40b,Helm NaturalQuestionsClosed,0.392,[],helm_classic_240829.csv +falcon_instruct_40b,Helm NaturalQuestionsClosed,0.377,[],helm_classic_240829.csv +mpt_instruct_30b,Helm NaturalQuestionsClosed,0.304,[],helm_classic_240829.csv +mpt_30b,Helm NaturalQuestionsClosed,0.347,[],helm_classic_240829.csv +j1_grande_v2_beta_17b,Helm NaturalQuestionsClosed,0.337,[],helm_classic_240829.csv +vicuna_v1_3_13b,Helm NaturalQuestionsClosed,0.346,[],helm_classic_240829.csv +cohere_command_beta_6_1b,Helm NaturalQuestionsClosed,0.229,[],helm_classic_240829.csv +cohere_xlarge_v20221108_52_4b,Helm NaturalQuestionsClosed,0.361,[],helm_classic_240829.csv +luminous_supreme_70b,Helm NaturalQuestionsClosed,0.293,[],helm_classic_240829.csv +vicuna_v1_3_7b,Helm NaturalQuestionsClosed,0.287,[],helm_classic_240829.csv +opt_175b,Helm NaturalQuestionsClosed,0.297,[],helm_classic_240829.csv +llama_2_7b,Helm NaturalQuestionsClosed,0.337,[],helm_classic_240829.csv +llama_13b,Helm NaturalQuestionsClosed,0.346,[],helm_classic_240829.csv +instructpalmyra_30b,Helm NaturalQuestionsClosed,0.33,[],helm_classic_240829.csv +cohere_xlarge_v20220609_52_4b,Helm NaturalQuestionsClosed,0.312,[],helm_classic_240829.csv +jurassic_2_large_7_5b,Helm NaturalQuestionsClosed,0.274,[],helm_classic_240829.csv +davinci_175b,Helm NaturalQuestionsClosed,0.329,[],helm_classic_240829.csv +llama_7b,Helm NaturalQuestionsClosed,0.297,[],helm_classic_240829.csv +redpajama_incite_instruct_7b,Helm NaturalQuestionsClosed,0.232,[],helm_classic_240829.csv +j1_jumbo_v1_178b,Helm NaturalQuestionsClosed,0.293,[],helm_classic_240829.csv +glm_130b,Helm NaturalQuestionsClosed,0.148,[],helm_classic_240829.csv +luminous_extended_30b,Helm NaturalQuestionsClosed,0.254,[],helm_classic_240829.csv +opt_66b,Helm NaturalQuestionsClosed,0.258,[],helm_classic_240829.csv +bloom_176b,Helm NaturalQuestionsClosed,0.216,[],helm_classic_240829.csv +j1_grande_v1_17b,Helm NaturalQuestionsClosed,0.233,[],helm_classic_240829.csv +alpaca_7b,Helm NaturalQuestionsClosed,0.266,[],helm_classic_240829.csv +falcon_7b,Helm NaturalQuestionsClosed,0.285,[],helm_classic_240829.csv +redpajama_incite_base_7b,Helm NaturalQuestionsClosed,0.25,[],helm_classic_240829.csv +cohere_large_v20220720_13_1b,Helm NaturalQuestionsClosed,0.232,[],helm_classic_240829.csv +redpajama_incite_instruct_v1_3b,Helm NaturalQuestionsClosed,0.203,[],helm_classic_240829.csv +text_curie_001,Helm NaturalQuestionsClosed,0.175,[],helm_classic_240829.csv +gpt_neox_20b,Helm NaturalQuestionsClosed,0.193,[],helm_classic_240829.csv +luminous_base_13b,Helm NaturalQuestionsClosed,0.202,[],helm_classic_240829.csv +cohere_medium_v20221108_6_1b,Helm NaturalQuestionsClosed,0.199,[],helm_classic_240829.csv +redpajama_incite_base_v1_3b,Helm NaturalQuestionsClosed,0.207,[],helm_classic_240829.csv +tnlg_v2_6_7b,Helm NaturalQuestionsClosed,0.21,[],helm_classic_240829.csv +j1_large_v1_7_5b,Helm NaturalQuestionsClosed,0.19,[],helm_classic_240829.csv +gpt_j_6b,Helm NaturalQuestionsClosed,0.156,[],helm_classic_240829.csv +pythia_12b,Helm NaturalQuestionsClosed,0.175,[],helm_classic_240829.csv +curie_6_7b,Helm NaturalQuestionsClosed,0.199,[],helm_classic_240829.csv +falcon_instruct_7b,Helm NaturalQuestionsClosed,0.194,[],helm_classic_240829.csv +cohere_medium_v20220720_6_1b,Helm NaturalQuestionsClosed,0.177,[],helm_classic_240829.csv +text_babbage_001,Helm NaturalQuestionsClosed,0.07,[],helm_classic_240829.csv +t0pp_11b,Helm NaturalQuestionsClosed,0.039,[],helm_classic_240829.csv +pythia_6_9b,Helm NaturalQuestionsClosed,0.142,[],helm_classic_240829.csv +flan-ul2_20b,Helm NaturalQuestionsClosed,0.204,[],helm_classic_240829.csv +t5_11b,Helm NaturalQuestionsClosed,0.194,[],helm_classic_240829.csv +babbage_1_3b,Helm NaturalQuestionsClosed,0.119,[],helm_classic_240829.csv +cohere_small_v20220720_410m,Helm NaturalQuestionsClosed,0.078,[],helm_classic_240829.csv +ada_350m,Helm NaturalQuestionsClosed,0.082,[],helm_classic_240829.csv +text_ada_001,Helm NaturalQuestionsClosed,0.025,[],helm_classic_240829.csv +yalm_100b,Helm NaturalQuestionsClosed,0.068,[],helm_classic_240829.csv +llama_2_70b,Helm NaturalQuestionsOpen,0.674,[],helm_classic_240829.csv +llama_65b,Helm NaturalQuestionsOpen,0.672,[],helm_classic_240829.csv +text_davinci_002,Helm NaturalQuestionsOpen,0.713,[],helm_classic_240829.csv +mistral_v0_1_7b,Helm NaturalQuestionsOpen,0.687,[],helm_classic_240829.csv +cohere_command_beta_52_4b,Helm NaturalQuestionsOpen,0.76,[],helm_classic_240829.csv +text_davinci_003,Helm NaturalQuestionsOpen,0.77,[],helm_classic_240829.csv +jurassic_2_jumbo_178b,Helm NaturalQuestionsOpen,0.669,[],helm_classic_240829.csv +llama_2_13b,Helm NaturalQuestionsOpen,0.637,[],helm_classic_240829.csv +tnlg_v2_530b,Helm NaturalQuestionsOpen,0.642,[],helm_classic_240829.csv +gpt_3_5_turbo_0613,Helm NaturalQuestionsOpen,0.675,[],helm_classic_240829.csv +llama30b,Helm NaturalQuestionsOpen,0.666,[],helm_classic_240829.csv +anthropic_lm_v4_s3_52b,Helm NaturalQuestionsOpen,0.686,[],helm_classic_240829.csv +gpt_3_5_turbo_0301,Helm NaturalQuestionsOpen,0.624,[],helm_classic_240829.csv +jurassic_2_grande_17b,Helm NaturalQuestionsOpen,0.639,[],helm_classic_240829.csv +falcon_40b,Helm NaturalQuestionsOpen,0.675,[],helm_classic_240829.csv +falcon_instruct_40b,Helm NaturalQuestionsOpen,0.666,[],helm_classic_240829.csv +mpt_instruct_30b,Helm NaturalQuestionsOpen,0.697,[],helm_classic_240829.csv +mpt_30b,Helm NaturalQuestionsOpen,0.673,[],helm_classic_240829.csv +j1_grande_v2_beta_17b,Helm NaturalQuestionsOpen,0.625,[],helm_classic_240829.csv +vicuna_v1_3_13b,Helm NaturalQuestionsOpen,0.686,[],helm_classic_240829.csv +cohere_command_beta_6_1b,Helm NaturalQuestionsOpen,0.717,[],helm_classic_240829.csv +cohere_xlarge_v20221108_52_4b,Helm NaturalQuestionsOpen,0.628,[],helm_classic_240829.csv +luminous_supreme_70b,Helm NaturalQuestionsOpen,0.649,[],helm_classic_240829.csv +vicuna_v1_3_7b,Helm NaturalQuestionsOpen,0.634,[],helm_classic_240829.csv +opt_175b,Helm NaturalQuestionsOpen,0.615,[],helm_classic_240829.csv +llama_2_7b,Helm NaturalQuestionsOpen,0.611,[],helm_classic_240829.csv +llama_13b,Helm NaturalQuestionsOpen,0.614,[],helm_classic_240829.csv +instructpalmyra_30b,Helm NaturalQuestionsOpen,0.682,[],helm_classic_240829.csv +cohere_xlarge_v20220609_52_4b,Helm NaturalQuestionsOpen,0.595,[],helm_classic_240829.csv +jurassic_2_large_7_5b,Helm NaturalQuestionsOpen,0.589,[],helm_classic_240829.csv +davinci_175b,Helm NaturalQuestionsOpen,0.625,[],helm_classic_240829.csv +llama_7b,Helm NaturalQuestionsOpen,0.589,[],helm_classic_240829.csv +redpajama_incite_instruct_7b,Helm NaturalQuestionsOpen,0.659,[],helm_classic_240829.csv +j1_jumbo_v1_178b,Helm NaturalQuestionsOpen,0.595,[],helm_classic_240829.csv +glm_130b,Helm NaturalQuestionsOpen,0.642,[],helm_classic_240829.csv +luminous_extended_30b,Helm NaturalQuestionsOpen,0.609,[],helm_classic_240829.csv +opt_66b,Helm NaturalQuestionsOpen,0.596,[],helm_classic_240829.csv +bloom_176b,Helm NaturalQuestionsOpen,0.621,[],helm_classic_240829.csv +j1_grande_v1_17b,Helm NaturalQuestionsOpen,0.578,[],helm_classic_240829.csv +alpaca_7b,Helm NaturalQuestionsOpen,0.592,[],helm_classic_240829.csv +falcon_7b,Helm NaturalQuestionsOpen,0.579,[],helm_classic_240829.csv +redpajama_incite_base_7b,Helm NaturalQuestionsOpen,0.586,[],helm_classic_240829.csv +cohere_large_v20220720_13_1b,Helm NaturalQuestionsOpen,0.573,[],helm_classic_240829.csv +redpajama_incite_instruct_v1_3b,Helm NaturalQuestionsOpen,0.637,[],helm_classic_240829.csv +text_curie_001,Helm NaturalQuestionsOpen,0.571,[],helm_classic_240829.csv +gpt_neox_20b,Helm NaturalQuestionsOpen,0.596,[],helm_classic_240829.csv +luminous_base_13b,Helm NaturalQuestionsOpen,0.568,[],helm_classic_240829.csv +cohere_medium_v20221108_6_1b,Helm NaturalQuestionsOpen,0.517,[],helm_classic_240829.csv +redpajama_incite_base_v1_3b,Helm NaturalQuestionsOpen,0.52,[],helm_classic_240829.csv +tnlg_v2_6_7b,Helm NaturalQuestionsOpen,0.561,[],helm_classic_240829.csv +j1_large_v1_7_5b,Helm NaturalQuestionsOpen,0.532,[],helm_classic_240829.csv +gpt_j_6b,Helm NaturalQuestionsOpen,0.559,[],helm_classic_240829.csv +pythia_12b,Helm NaturalQuestionsOpen,0.581,[],helm_classic_240829.csv +curie_6_7b,Helm NaturalQuestionsOpen,0.552,[],helm_classic_240829.csv +falcon_instruct_7b,Helm NaturalQuestionsOpen,0.449,[],helm_classic_240829.csv +cohere_medium_v20220720_6_1b,Helm NaturalQuestionsOpen,0.504,[],helm_classic_240829.csv +text_babbage_001,Helm NaturalQuestionsOpen,0.33,[],helm_classic_240829.csv +t0pp_11b,Helm NaturalQuestionsOpen,0.19,[],helm_classic_240829.csv +pythia_6_9b,Helm NaturalQuestionsOpen,0.539,[],helm_classic_240829.csv +flan-ul2_20b,Helm NaturalQuestionsOpen,0.349,[],helm_classic_240829.csv +t5_11b,Helm NaturalQuestionsOpen,0.477,[],helm_classic_240829.csv +babbage_1_3b,Helm NaturalQuestionsOpen,0.451,[],helm_classic_240829.csv +cohere_small_v20220720_410m,Helm NaturalQuestionsOpen,0.309,[],helm_classic_240829.csv +ada_350m,Helm NaturalQuestionsOpen,0.365,[],helm_classic_240829.csv +text_ada_001,Helm NaturalQuestionsOpen,0.149,[],helm_classic_240829.csv +yalm_100b,Helm NaturalQuestionsOpen,0.227,[],helm_classic_240829.csv +llama_2_70b,Helm QuAC,0.484,[],helm_classic_240829.csv +llama_65b,Helm QuAC,0.401,[],helm_classic_240829.csv +text_davinci_002,Helm QuAC,0.445,[],helm_classic_240829.csv +mistral_v0_1_7b,Helm QuAC,0.423,[],helm_classic_240829.csv +cohere_command_beta_52_4b,Helm QuAC,0.432,[],helm_classic_240829.csv +text_davinci_003,Helm QuAC,0.525,[],helm_classic_240829.csv +jurassic_2_jumbo_178b,Helm QuAC,0.435,[],helm_classic_240829.csv +llama_2_13b,Helm QuAC,0.424,[],helm_classic_240829.csv +tnlg_v2_530b,Helm QuAC,0.39,[],helm_classic_240829.csv +gpt_3_5_turbo_0613,Helm QuAC,0.485,[],helm_classic_240829.csv +llama30b,Helm QuAC,0.39,[],helm_classic_240829.csv +anthropic_lm_v4_s3_52b,Helm QuAC,0.431,[],helm_classic_240829.csv +gpt_3_5_turbo_0301,Helm QuAC,0.512,[],helm_classic_240829.csv +jurassic_2_grande_17b,Helm QuAC,0.418,[],helm_classic_240829.csv +palmyra_x_43b,Helm QuAC,0.473,[],helm_classic_240829.csv +falcon_40b,Helm QuAC,0.307,[],helm_classic_240829.csv +falcon_instruct_40b,Helm QuAC,0.371,[],helm_classic_240829.csv +mpt_instruct_30b,Helm QuAC,0.327,[],helm_classic_240829.csv +mpt_30b,Helm QuAC,0.393,[],helm_classic_240829.csv +j1_grande_v2_beta_17b,Helm QuAC,0.392,[],helm_classic_240829.csv +vicuna_v1_3_13b,Helm QuAC,0.403,[],helm_classic_240829.csv +cohere_command_beta_6_1b,Helm QuAC,0.375,[],helm_classic_240829.csv +cohere_xlarge_v20221108_52_4b,Helm QuAC,0.374,[],helm_classic_240829.csv +luminous_supreme_70b,Helm QuAC,0.37,[],helm_classic_240829.csv +vicuna_v1_3_7b,Helm QuAC,0.392,[],helm_classic_240829.csv +opt_175b,Helm QuAC,0.36,[],helm_classic_240829.csv +llama_2_7b,Helm QuAC,0.406,[],helm_classic_240829.csv +llama_13b,Helm QuAC,0.347,[],helm_classic_240829.csv +instructpalmyra_30b,Helm QuAC,0.433,[],helm_classic_240829.csv +cohere_xlarge_v20220609_52_4b,Helm QuAC,0.361,[],helm_classic_240829.csv +davinci_175b,Helm QuAC,0.36,[],helm_classic_240829.csv +llama_7b,Helm QuAC,0.338,[],helm_classic_240829.csv +redpajama_incite_instruct_7b,Helm QuAC,0.26,[],helm_classic_240829.csv +j1_jumbo_v1_178b,Helm QuAC,0.358,[],helm_classic_240829.csv +glm_130b,Helm QuAC,0.272,[],helm_classic_240829.csv +luminous_extended_30b,Helm QuAC,0.349,[],helm_classic_240829.csv +opt_66b,Helm QuAC,0.357,[],helm_classic_240829.csv +bloom_176b,Helm QuAC,0.361,[],helm_classic_240829.csv +j1_grande_v1_17b,Helm QuAC,0.362,[],helm_classic_240829.csv +alpaca_7b,Helm QuAC,0.27,[],helm_classic_240829.csv +falcon_7b,Helm QuAC,0.332,[],helm_classic_240829.csv +redpajama_incite_base_7b,Helm QuAC,0.336,[],helm_classic_240829.csv +cohere_large_v20220720_13_1b,Helm QuAC,0.338,[],helm_classic_240829.csv +redpajama_incite_instruct_v1_3b,Helm QuAC,0.259,[],helm_classic_240829.csv +text_curie_001,Helm QuAC,0.358,[],helm_classic_240829.csv +gpt_neox_20b,Helm QuAC,0.326,[],helm_classic_240829.csv +luminous_base_13b,Helm QuAC,0.334,[],helm_classic_240829.csv +cohere_medium_v20221108_6_1b,Helm QuAC,0.314,[],helm_classic_240829.csv +redpajama_incite_base_v1_3b,Helm QuAC,0.309,[],helm_classic_240829.csv +tnlg_v2_6_7b,Helm QuAC,0.345,[],helm_classic_240829.csv +j1_large_v1_7_5b,Helm QuAC,0.328,[],helm_classic_240829.csv +gpt_j_6b,Helm QuAC,0.33,[],helm_classic_240829.csv +pythia_12b,Helm QuAC,0.313,[],helm_classic_240829.csv +curie_6_7b,Helm QuAC,0.321,[],helm_classic_240829.csv +falcon_instruct_7b,Helm QuAC,0.311,[],helm_classic_240829.csv +cohere_medium_v20220720_6_1b,Helm QuAC,0.279,[],helm_classic_240829.csv +text_babbage_001,Helm QuAC,0.284,[],helm_classic_240829.csv +t0pp_11b,Helm QuAC,0.121,[],helm_classic_240829.csv +pythia_6_9b,Helm QuAC,0.296,[],helm_classic_240829.csv +flan-ul2_20b,Helm QuAC,0.144,[],helm_classic_240829.csv +t5_11b,Helm QuAC,0.116,[],helm_classic_240829.csv +babbage_1_3b,Helm QuAC,0.273,[],helm_classic_240829.csv +cohere_small_v20220720_410m,Helm QuAC,0.219,[],helm_classic_240829.csv +ada_350m,Helm QuAC,0.242,[],helm_classic_240829.csv +text_ada_001,Helm QuAC,0.176,[],helm_classic_240829.csv +yalm_100b,Helm QuAC,0.162,[],helm_classic_240829.csv +text_davinci_002,helm_hellaswag,0.815,[],helm_classic_240829.csv +cohere_command_beta_52_4b,helm_hellaswag,0.811,[],helm_classic_240829.csv +text_davinci_003,helm_hellaswag,0.822,[],helm_classic_240829.csv +jurassic_2_jumbo_178b,helm_hellaswag,0.788,[],helm_classic_240829.csv +tnlg_v2_530b,helm_hellaswag,0.799,[],helm_classic_240829.csv +anthropic_lm_v4_s3_52b,helm_hellaswag,0.807,[],helm_classic_240829.csv +jurassic_2_grande_17b,helm_hellaswag,0.781,[],helm_classic_240829.csv +j1_grande_v2_beta_17b,helm_hellaswag,0.764,[],helm_classic_240829.csv +cohere_command_beta_6_1b,helm_hellaswag,0.752,[],helm_classic_240829.csv +cohere_xlarge_v20221108_52_4b,helm_hellaswag,0.81,[],helm_classic_240829.csv +opt_175b,helm_hellaswag,0.791,[],helm_classic_240829.csv +cohere_xlarge_v20220609_52_4b,helm_hellaswag,0.811,[],helm_classic_240829.csv +jurassic_2_large_7_5b,helm_hellaswag,0.729,[],helm_classic_240829.csv +davinci_175b,helm_hellaswag,0.775,[],helm_classic_240829.csv +j1_jumbo_v1_178b,helm_hellaswag,0.765,[],helm_classic_240829.csv +opt_66b,helm_hellaswag,0.745,[],helm_classic_240829.csv +bloom_176b,helm_hellaswag,0.744,[],helm_classic_240829.csv +j1_grande_v1_17b,helm_hellaswag,0.739,[],helm_classic_240829.csv +cohere_large_v20220720_13_1b,helm_hellaswag,0.736,[],helm_classic_240829.csv +text_curie_001,helm_hellaswag,0.676,[],helm_classic_240829.csv +gpt_neox_20b,helm_hellaswag,0.718,[],helm_classic_240829.csv +cohere_medium_v20221108_6_1b,helm_hellaswag,0.726,[],helm_classic_240829.csv +tnlg_v2_6_7b,helm_hellaswag,0.704,[],helm_classic_240829.csv +j1_large_v1_7_5b,helm_hellaswag,0.7,[],helm_classic_240829.csv +gpt_j_6b,helm_hellaswag,0.663,[],helm_classic_240829.csv +curie_6_7b,helm_hellaswag,0.682,[],helm_classic_240829.csv +cohere_medium_v20220720_6_1b,helm_hellaswag,0.706,[],helm_classic_240829.csv +text_babbage_001,helm_hellaswag,0.561,[],helm_classic_240829.csv +babbage_1_3b,helm_hellaswag,0.555,[],helm_classic_240829.csv +cohere_small_v20220720_410m,helm_hellaswag,0.483,[],helm_classic_240829.csv +ada_350m,helm_hellaswag,0.435,[],helm_classic_240829.csv +text_ada_001,helm_hellaswag,0.429,[],helm_classic_240829.csv +llama_2_70b,Helm OpenBookQA,0.554,[],helm_classic_240829.csv +llama_65b,Helm OpenBookQA,0.508,[],helm_classic_240829.csv +text_davinci_002,Helm OpenBookQA,0.594,[],helm_classic_240829.csv +mistral_v0_1_7b,Helm OpenBookQA,0.422,[],helm_classic_240829.csv +cohere_command_beta_52_4b,Helm OpenBookQA,0.582,[],helm_classic_240829.csv +text_davinci_003,Helm OpenBookQA,0.646,[],helm_classic_240829.csv +jurassic_2_jumbo_178b,Helm OpenBookQA,0.558,[],helm_classic_240829.csv +llama_2_13b,Helm OpenBookQA,0.33,[],helm_classic_240829.csv +tnlg_v2_530b,Helm OpenBookQA,0.562,[],helm_classic_240829.csv +gpt_3_5_turbo_0613,Helm OpenBookQA,0.339,[],helm_classic_240829.csv +llama30b,Helm OpenBookQA,0.344,[],helm_classic_240829.csv +anthropic_lm_v4_s3_52b,Helm OpenBookQA,0.558,[],helm_classic_240829.csv +gpt_3_5_turbo_0301,Helm OpenBookQA,0.609,[],helm_classic_240829.csv +jurassic_2_grande_17b,Helm OpenBookQA,0.542,[],helm_classic_240829.csv +palmyra_x_43b,Helm OpenBookQA,0.616,[],helm_classic_240829.csv +falcon_40b,Helm OpenBookQA,0.353,[],helm_classic_240829.csv +falcon_instruct_40b,Helm OpenBookQA,0.384,[],helm_classic_240829.csv +mpt_instruct_30b,Helm OpenBookQA,0.234,[],helm_classic_240829.csv +mpt_30b,Helm OpenBookQA,0.231,[],helm_classic_240829.csv +j1_grande_v2_beta_17b,Helm OpenBookQA,0.56,[],helm_classic_240829.csv +vicuna_v1_3_13b,Helm OpenBookQA,0.385,[],helm_classic_240829.csv +cohere_command_beta_6_1b,Helm OpenBookQA,0.55,[],helm_classic_240829.csv +cohere_xlarge_v20221108_52_4b,Helm OpenBookQA,0.588,[],helm_classic_240829.csv +luminous_supreme_70b,Helm OpenBookQA,0.222,[],helm_classic_240829.csv +vicuna_v1_3_7b,Helm OpenBookQA,0.292,[],helm_classic_240829.csv +opt_175b,Helm OpenBookQA,0.586,[],helm_classic_240829.csv +llama_2_7b,Helm OpenBookQA,0.272,[],helm_classic_240829.csv +llama_13b,Helm OpenBookQA,0.324,[],helm_classic_240829.csv +instructpalmyra_30b,Helm OpenBookQA,0.185,[],helm_classic_240829.csv +cohere_xlarge_v20220609_52_4b,Helm OpenBookQA,0.55,[],helm_classic_240829.csv +jurassic_2_large_7_5b,Helm OpenBookQA,0.53,[],helm_classic_240829.csv +davinci_175b,Helm OpenBookQA,0.586,[],helm_classic_240829.csv +llama_7b,Helm OpenBookQA,0.28,[],helm_classic_240829.csv +redpajama_incite_instruct_7b,Helm OpenBookQA,0.243,[],helm_classic_240829.csv +j1_jumbo_v1_178b,Helm OpenBookQA,0.534,[],helm_classic_240829.csv +glm_130b,Helm OpenBookQA,0.218,[],helm_classic_240829.csv +luminous_extended_30b,Helm OpenBookQA,0.221,[],helm_classic_240829.csv +opt_66b,Helm OpenBookQA,0.534,[],helm_classic_240829.csv +bloom_176b,Helm OpenBookQA,0.534,[],helm_classic_240829.csv +j1_grande_v1_17b,Helm OpenBookQA,0.52,[],helm_classic_240829.csv +alpaca_7b,Helm OpenBookQA,0.243,[],helm_classic_240829.csv +falcon_7b,Helm OpenBookQA,0.234,[],helm_classic_240829.csv +redpajama_incite_base_7b,Helm OpenBookQA,0.205,[],helm_classic_240829.csv +cohere_large_v20220720_13_1b,Helm OpenBookQA,0.542,[],helm_classic_240829.csv +redpajama_incite_instruct_v1_3b,Helm OpenBookQA,0.208,[],helm_classic_240829.csv +text_curie_001,Helm OpenBookQA,0.514,[],helm_classic_240829.csv +gpt_neox_20b,Helm OpenBookQA,0.524,[],helm_classic_240829.csv +luminous_base_13b,Helm OpenBookQA,0.182,[],helm_classic_240829.csv +cohere_medium_v20221108_6_1b,Helm OpenBookQA,0.538,[],helm_classic_240829.csv +redpajama_incite_base_v1_3b,Helm OpenBookQA,0.277,[],helm_classic_240829.csv +tnlg_v2_6_7b,Helm OpenBookQA,0.478,[],helm_classic_240829.csv +j1_large_v1_7_5b,Helm OpenBookQA,0.514,[],helm_classic_240829.csv +gpt_j_6b,Helm OpenBookQA,0.514,[],helm_classic_240829.csv +pythia_12b,Helm OpenBookQA,0.177,[],helm_classic_240829.csv +curie_6_7b,Helm OpenBookQA,0.502,[],helm_classic_240829.csv +falcon_instruct_7b,Helm OpenBookQA,0.213,[],helm_classic_240829.csv +cohere_medium_v20220720_6_1b,Helm OpenBookQA,0.496,[],helm_classic_240829.csv +text_babbage_001,Helm OpenBookQA,0.452,[],helm_classic_240829.csv +t0pp_11b,Helm OpenBookQA,0.377,[],helm_classic_240829.csv +pythia_6_9b,Helm OpenBookQA,0.213,[],helm_classic_240829.csv +flan-ul2_20b,Helm OpenBookQA,0.193,[],helm_classic_240829.csv +t5_11b,Helm OpenBookQA,0.133,[],helm_classic_240829.csv +babbage_1_3b,Helm OpenBookQA,0.438,[],helm_classic_240829.csv +cohere_small_v20220720_410m,Helm OpenBookQA,0.348,[],helm_classic_240829.csv +ada_350m,Helm OpenBookQA,0.38,[],helm_classic_240829.csv +text_ada_001,Helm OpenBookQA,0.346,[],helm_classic_240829.csv +yalm_100b,Helm OpenBookQA,0.202,[],helm_classic_240829.csv +text_davinci_002,helm_truthfulqa,0.61,[],helm_classic_240829.csv +cohere_command_beta_52_4b,helm_truthfulqa,0.269,[],helm_classic_240829.csv +text_davinci_003,helm_truthfulqa,0.593,[],helm_classic_240829.csv +jurassic_2_jumbo_178b,helm_truthfulqa,0.437,[],helm_classic_240829.csv +tnlg_v2_530b,helm_truthfulqa,0.251,[],helm_classic_240829.csv +anthropic_lm_v4_s3_52b,helm_truthfulqa,0.368,[],helm_classic_240829.csv +jurassic_2_grande_17b,helm_truthfulqa,0.348,[],helm_classic_240829.csv +j1_grande_v2_beta_17b,helm_truthfulqa,0.306,[],helm_classic_240829.csv +cohere_command_beta_6_1b,helm_truthfulqa,0.203,[],helm_classic_240829.csv +cohere_xlarge_v20221108_52_4b,helm_truthfulqa,0.169,[],helm_classic_240829.csv +opt_175b,helm_truthfulqa,0.25,[],helm_classic_240829.csv +cohere_xlarge_v20220609_52_4b,helm_truthfulqa,0.198,[],helm_classic_240829.csv +jurassic_2_large_7_5b,helm_truthfulqa,0.245,[],helm_classic_240829.csv +davinci_175b,helm_truthfulqa,0.194,[],helm_classic_240829.csv +j1_jumbo_v1_178b,helm_truthfulqa,0.175,[],helm_classic_240829.csv +opt_66b,helm_truthfulqa,0.201,[],helm_classic_240829.csv +bloom_176b,helm_truthfulqa,0.205,[],helm_classic_240829.csv +j1_grande_v1_17b,helm_truthfulqa,0.193,[],helm_classic_240829.csv +cohere_large_v20220720_13_1b,helm_truthfulqa,0.181,[],helm_classic_240829.csv +text_curie_001,helm_truthfulqa,0.257,[],helm_classic_240829.csv +gpt_neox_20b,helm_truthfulqa,0.216,[],helm_classic_240829.csv +cohere_medium_v20221108_6_1b,helm_truthfulqa,0.215,[],helm_classic_240829.csv +tnlg_v2_6_7b,helm_truthfulqa,0.167,[],helm_classic_240829.csv +j1_large_v1_7_5b,helm_truthfulqa,0.197,[],helm_classic_240829.csv +gpt_j_6b,helm_truthfulqa,0.199,[],helm_classic_240829.csv +curie_6_7b,helm_truthfulqa,0.232,[],helm_classic_240829.csv +cohere_medium_v20220720_6_1b,helm_truthfulqa,0.19,[],helm_classic_240829.csv +text_babbage_001,helm_truthfulqa,0.233,[],helm_classic_240829.csv +babbage_1_3b,helm_truthfulqa,0.188,[],helm_classic_240829.csv +cohere_small_v20220720_410m,helm_truthfulqa,0.217,[],helm_classic_240829.csv +ada_350m,helm_truthfulqa,0.215,[],helm_classic_240829.csv +text_ada_001,helm_truthfulqa,0.232,[],helm_classic_240829.csv +text_davinci_002,Helm MSMARCO Regular,0.421,[],helm_classic_240829.csv +cohere_command_beta_52_4b,Helm MSMARCO Regular,0.472,[],helm_classic_240829.csv +text_davinci_003,Helm MSMARCO Regular,0.368,[],helm_classic_240829.csv +jurassic_2_jumbo_178b,Helm MSMARCO Regular,0.398,[],helm_classic_240829.csv +tnlg_v2_530b,Helm MSMARCO Regular,0.377,[],helm_classic_240829.csv +jurassic_2_grande_17b,Helm MSMARCO Regular,0.293,[],helm_classic_240829.csv +j1_grande_v2_beta_17b,Helm MSMARCO Regular,0.285,[],helm_classic_240829.csv +cohere_command_beta_6_1b,Helm MSMARCO Regular,0.434,[],helm_classic_240829.csv +cohere_xlarge_v20221108_52_4b,Helm MSMARCO Regular,0.315,[],helm_classic_240829.csv +opt_175b,Helm MSMARCO Regular,0.288,[],helm_classic_240829.csv +cohere_xlarge_v20220609_52_4b,Helm MSMARCO Regular,0.273,[],helm_classic_240829.csv +jurassic_2_large_7_5b,Helm MSMARCO Regular,0.247,[],helm_classic_240829.csv +davinci_175b,Helm MSMARCO Regular,0.211,[],helm_classic_240829.csv +j1_jumbo_v1_178b,Helm MSMARCO Regular,0.21,[],helm_classic_240829.csv +opt_66b,Helm MSMARCO Regular,0.237,[],helm_classic_240829.csv +bloom_176b,Helm MSMARCO Regular,0.236,[],helm_classic_240829.csv +j1_grande_v1_17b,Helm MSMARCO Regular,0.161,[],helm_classic_240829.csv +cohere_large_v20220720_13_1b,Helm MSMARCO Regular,0.19,[],helm_classic_240829.csv +text_curie_001,Helm MSMARCO Regular,0.271,[],helm_classic_240829.csv +gpt_neox_20b,Helm MSMARCO Regular,0.184,[],helm_classic_240829.csv +cohere_medium_v20221108_6_1b,Helm MSMARCO Regular,0.175,[],helm_classic_240829.csv +tnlg_v2_6_7b,Helm MSMARCO Regular,0.158,[],helm_classic_240829.csv +j1_large_v1_7_5b,Helm MSMARCO Regular,0.147,[],helm_classic_240829.csv +gpt_j_6b,Helm MSMARCO Regular,0.152,[],helm_classic_240829.csv +curie_6_7b,Helm MSMARCO Regular,0.162,[],helm_classic_240829.csv +cohere_medium_v20220720_6_1b,Helm MSMARCO Regular,0.152,[],helm_classic_240829.csv +text_babbage_001,Helm MSMARCO Regular,0.208,[],helm_classic_240829.csv +babbage_1_3b,Helm MSMARCO Regular,0.122,[],helm_classic_240829.csv +ada_350m,Helm MSMARCO Regular,0.102,[],helm_classic_240829.csv +text_ada_001,Helm MSMARCO Regular,0.134,[],helm_classic_240829.csv +text_davinci_002,Helm MSMARCO Trec,0.664,[],helm_classic_240829.csv +cohere_command_beta_52_4b,Helm MSMARCO Trec,0.762,[],helm_classic_240829.csv +text_davinci_003,Helm MSMARCO Trec,0.644,[],helm_classic_240829.csv +jurassic_2_jumbo_178b,Helm MSMARCO Trec,0.661,[],helm_classic_240829.csv +tnlg_v2_530b,Helm MSMARCO Trec,0.643,[],helm_classic_240829.csv +anthropic_lm_v4_s3_52b,Helm MSMARCO Trec,-0.154,[],helm_classic_240829.csv +jurassic_2_grande_17b,Helm MSMARCO Trec,0.514,[],helm_classic_240829.csv +palmyra_x_43b,Helm MSMARCO Trec,0.049,[],helm_classic_240829.csv +j1_grande_v2_beta_17b,Helm MSMARCO Trec,0.46,[],helm_classic_240829.csv +cohere_command_beta_6_1b,Helm MSMARCO Trec,0.709,[],helm_classic_240829.csv +cohere_xlarge_v20221108_52_4b,Helm MSMARCO Trec,0.55,[],helm_classic_240829.csv +luminous_supreme_70b,Helm MSMARCO Trec,0.15,[],helm_classic_240829.csv +opt_175b,Helm MSMARCO Trec,0.448,[],helm_classic_240829.csv +instructpalmyra_30b,Helm MSMARCO Trec,0.152,[],helm_classic_240829.csv +cohere_xlarge_v20220609_52_4b,Helm MSMARCO Trec,0.459,[],helm_classic_240829.csv +jurassic_2_large_7_5b,Helm MSMARCO Trec,0.464,[],helm_classic_240829.csv +davinci_175b,Helm MSMARCO Trec,0.378,[],helm_classic_240829.csv +j1_jumbo_v1_178b,Helm MSMARCO Trec,0.363,[],helm_classic_240829.csv +glm_130b,Helm MSMARCO Trec,0.154,[],helm_classic_240829.csv +luminous_extended_30b,Helm MSMARCO Trec,0.139,[],helm_classic_240829.csv +opt_66b,Helm MSMARCO Trec,0.482,[],helm_classic_240829.csv +bloom_176b,Helm MSMARCO Trec,0.386,[],helm_classic_240829.csv +j1_grande_v1_17b,Helm MSMARCO Trec,0.341,[],helm_classic_240829.csv +cohere_large_v20220720_13_1b,Helm MSMARCO Trec,0.33,[],helm_classic_240829.csv +text_curie_001,Helm MSMARCO Trec,0.507,[],helm_classic_240829.csv +gpt_neox_20b,Helm MSMARCO Trec,0.398,[],helm_classic_240829.csv +luminous_base_13b,Helm MSMARCO Trec,0.11,[],helm_classic_240829.csv +cohere_medium_v20221108_6_1b,Helm MSMARCO Trec,0.373,[],helm_classic_240829.csv +tnlg_v2_6_7b,Helm MSMARCO Trec,0.332,[],helm_classic_240829.csv +j1_large_v1_7_5b,Helm MSMARCO Trec,0.292,[],helm_classic_240829.csv +gpt_j_6b,Helm MSMARCO Trec,0.345,[],helm_classic_240829.csv +curie_6_7b,Helm MSMARCO Trec,0.3,[],helm_classic_240829.csv +cohere_medium_v20220720_6_1b,Helm MSMARCO Trec,0.374,[],helm_classic_240829.csv +text_babbage_001,Helm MSMARCO Trec,0.449,[],helm_classic_240829.csv +t0pp_11b,Helm MSMARCO Trec,0.122,[],helm_classic_240829.csv +flan-ul2_20b,Helm MSMARCO Trec,0.03,[],helm_classic_240829.csv +t5_11b,Helm MSMARCO Trec,0.043,[],helm_classic_240829.csv +babbage_1_3b,Helm MSMARCO Trec,0.317,[],helm_classic_240829.csv +cohere_small_v20220720_410m,Helm MSMARCO Trec,0.304,[],helm_classic_240829.csv +ada_350m,Helm MSMARCO Trec,0.29,[],helm_classic_240829.csv +text_ada_001,Helm MSMARCO Trec,0.302,[],helm_classic_240829.csv +yalm_100b,Helm MSMARCO Trec,0.017,[],helm_classic_240829.csv +text_davinci_002,helm_cnn/dailymail,0.153,[],helm_classic_240829.csv +cohere_command_beta_52_4b,helm_cnn/dailymail,0.161,[],helm_classic_240829.csv +text_davinci_003,helm_cnn/dailymail,0.156,[],helm_classic_240829.csv +jurassic_2_jumbo_178b,helm_cnn/dailymail,0.149,[],helm_classic_240829.csv +tnlg_v2_530b,helm_cnn/dailymail,0.161,[],helm_classic_240829.csv +anthropic_lm_v4_s3_52b,helm_cnn/dailymail,0.134,[],helm_classic_240829.csv +jurassic_2_grande_17b,helm_cnn/dailymail,0.144,[],helm_classic_240829.csv +palmyra_x_43b,helm_cnn/dailymail,0.149,[],helm_classic_240829.csv +j1_grande_v2_beta_17b,helm_cnn/dailymail,0.146,[],helm_classic_240829.csv +cohere_command_beta_6_1b,helm_cnn/dailymail,0.153,[],helm_classic_240829.csv +cohere_xlarge_v20221108_52_4b,helm_cnn/dailymail,0.153,[],helm_classic_240829.csv +luminous_supreme_70b,helm_cnn/dailymail,0.136,[],helm_classic_240829.csv +opt_175b,helm_cnn/dailymail,0.146,[],helm_classic_240829.csv +instructpalmyra_30b,helm_cnn/dailymail,0.104,[],helm_classic_240829.csv +cohere_xlarge_v20220609_52_4b,helm_cnn/dailymail,0.144,[],helm_classic_240829.csv +jurassic_2_large_7_5b,helm_cnn/dailymail,0.136,[],helm_classic_240829.csv +davinci_175b,helm_cnn/dailymail,0.127,[],helm_classic_240829.csv +j1_jumbo_v1_178b,helm_cnn/dailymail,0.144,[],helm_classic_240829.csv +glm_130b,helm_cnn/dailymail,0.132,[],helm_classic_240829.csv +luminous_extended_30b,helm_cnn/dailymail,0.124,[],helm_classic_240829.csv +opt_66b,helm_cnn/dailymail,0.136,[],helm_classic_240829.csv +bloom_176b,helm_cnn/dailymail,0.08,[],helm_classic_240829.csv +j1_grande_v1_17b,helm_cnn/dailymail,0.143,[],helm_classic_240829.csv +cohere_large_v20220720_13_1b,helm_cnn/dailymail,0.126,[],helm_classic_240829.csv +text_curie_001,helm_cnn/dailymail,0.152,[],helm_classic_240829.csv +gpt_neox_20b,helm_cnn/dailymail,0.123,[],helm_classic_240829.csv +luminous_base_13b,helm_cnn/dailymail,0.105,[],helm_classic_240829.csv +cohere_medium_v20221108_6_1b,helm_cnn/dailymail,0.121,[],helm_classic_240829.csv +tnlg_v2_6_7b,helm_cnn/dailymail,0.146,[],helm_classic_240829.csv +j1_large_v1_7_5b,helm_cnn/dailymail,0.134,[],helm_classic_240829.csv +gpt_j_6b,helm_cnn/dailymail,0.131,[],helm_classic_240829.csv +curie_6_7b,helm_cnn/dailymail,0.113,[],helm_classic_240829.csv +cohere_medium_v20220720_6_1b,helm_cnn/dailymail,0.077,[],helm_classic_240829.csv +text_babbage_001,helm_cnn/dailymail,0.151,[],helm_classic_240829.csv +t0pp_11b,helm_cnn/dailymail,0.09,[],helm_classic_240829.csv +flan-ul2_20b,helm_cnn/dailymail,0.058,[],helm_classic_240829.csv +t5_11b,helm_cnn/dailymail,0.015,[],helm_classic_240829.csv +babbage_1_3b,helm_cnn/dailymail,0.079,[],helm_classic_240829.csv +cohere_small_v20220720_410m,helm_cnn/dailymail,0.063,[],helm_classic_240829.csv +ada_350m,helm_cnn/dailymail,0.09,[],helm_classic_240829.csv +text_ada_001,helm_cnn/dailymail,0.136,[],helm_classic_240829.csv +yalm_100b,helm_cnn/dailymail,0.021,[],helm_classic_240829.csv +text_davinci_002,Helm XSUM,0.144,[],helm_classic_240829.csv +cohere_command_beta_52_4b,Helm XSUM,0.152,[],helm_classic_240829.csv +text_davinci_003,Helm XSUM,0.124,[],helm_classic_240829.csv +jurassic_2_jumbo_178b,Helm XSUM,0.182,[],helm_classic_240829.csv +tnlg_v2_530b,Helm XSUM,0.169,[],helm_classic_240829.csv +anthropic_lm_v4_s3_52b,Helm XSUM,0.934,[],helm_classic_240829.csv +jurassic_2_grande_17b,Helm XSUM,0.167,[],helm_classic_240829.csv +palmyra_x_43b,Helm XSUM,0.935,[],helm_classic_240829.csv +j1_grande_v2_beta_17b,Helm XSUM,0.152,[],helm_classic_240829.csv +cohere_command_beta_6_1b,Helm XSUM,0.122,[],helm_classic_240829.csv +cohere_xlarge_v20221108_52_4b,Helm XSUM,0.153,[],helm_classic_240829.csv +luminous_supreme_70b,Helm XSUM,0.959,[],helm_classic_240829.csv +opt_175b,Helm XSUM,0.155,[],helm_classic_240829.csv +instructpalmyra_30b,Helm XSUM,0.94,[],helm_classic_240829.csv +cohere_xlarge_v20220609_52_4b,Helm XSUM,0.129,[],helm_classic_240829.csv +jurassic_2_large_7_5b,Helm XSUM,0.142,[],helm_classic_240829.csv +davinci_175b,Helm XSUM,0.126,[],helm_classic_240829.csv +j1_jumbo_v1_178b,Helm XSUM,0.129,[],helm_classic_240829.csv +glm_130b,Helm XSUM,0.955,[],helm_classic_240829.csv +luminous_extended_30b,Helm XSUM,0.947,[],helm_classic_240829.csv +opt_66b,Helm XSUM,0.126,[],helm_classic_240829.csv +bloom_176b,Helm XSUM,0.03,[],helm_classic_240829.csv +j1_grande_v1_17b,Helm XSUM,0.122,[],helm_classic_240829.csv +cohere_large_v20220720_13_1b,Helm XSUM,0.108,[],helm_classic_240829.csv +text_curie_001,Helm XSUM,0.076,[],helm_classic_240829.csv +gpt_neox_20b,Helm XSUM,0.102,[],helm_classic_240829.csv +luminous_base_13b,Helm XSUM,0.939,[],helm_classic_240829.csv +cohere_medium_v20221108_6_1b,Helm XSUM,0.099,[],helm_classic_240829.csv +tnlg_v2_6_7b,Helm XSUM,0.11,[],helm_classic_240829.csv +j1_large_v1_7_5b,Helm XSUM,0.102,[],helm_classic_240829.csv +gpt_j_6b,Helm XSUM,0.096,[],helm_classic_240829.csv +curie_6_7b,Helm XSUM,0.091,[],helm_classic_240829.csv +cohere_medium_v20220720_6_1b,Helm XSUM,0.087,[],helm_classic_240829.csv +text_babbage_001,Helm XSUM,0.046,[],helm_classic_240829.csv +t0pp_11b,Helm XSUM,0.207,[],helm_classic_240829.csv +flan-ul2_20b,Helm XSUM,0.337,[],helm_classic_240829.csv +t5_11b,Helm XSUM,0.379,[],helm_classic_240829.csv +babbage_1_3b,Helm XSUM,0.045,[],helm_classic_240829.csv +cohere_small_v20220720_410m,Helm XSUM,0.033,[],helm_classic_240829.csv +ada_350m,Helm XSUM,0.022,[],helm_classic_240829.csv +text_ada_001,Helm XSUM,0.034,[],helm_classic_240829.csv +yalm_100b,Helm XSUM,0.836,[],helm_classic_240829.csv +llama_2_70b,Helm IMDB,0.961,[],helm_classic_240829.csv +llama_65b,Helm IMDB,0.962,[],helm_classic_240829.csv +text_davinci_002,Helm IMDB,0.948,[],helm_classic_240829.csv +mistral_v0_1_7b,Helm IMDB,0.962,[],helm_classic_240829.csv +cohere_command_beta_52_4b,Helm IMDB,0.96,[],helm_classic_240829.csv +text_davinci_003,Helm IMDB,0.848,[],helm_classic_240829.csv +jurassic_2_jumbo_178b,Helm IMDB,0.938,[],helm_classic_240829.csv +llama_2_13b,Helm IMDB,0.962,[],helm_classic_240829.csv +tnlg_v2_530b,Helm IMDB,0.941,[],helm_classic_240829.csv +gpt_3_5_turbo_0613,Helm IMDB,0.943,[],helm_classic_240829.csv +llama30b,Helm IMDB,0.927,[],helm_classic_240829.csv +anthropic_lm_v4_s3_52b,Helm IMDB,0.61,[],helm_classic_240829.csv +gpt_3_5_turbo_0301,Helm IMDB,0.899,[],helm_classic_240829.csv +jurassic_2_grande_17b,Helm IMDB,0.938,[],helm_classic_240829.csv +palmyra_x_43b,Helm IMDB,0.008,[],helm_classic_240829.csv +falcon_40b,Helm IMDB,0.959,[],helm_classic_240829.csv +falcon_instruct_40b,Helm IMDB,0.959,[],helm_classic_240829.csv +mpt_instruct_30b,Helm IMDB,0.956,[],helm_classic_240829.csv +mpt_30b,Helm IMDB,0.959,[],helm_classic_240829.csv +j1_grande_v2_beta_17b,Helm IMDB,0.957,[],helm_classic_240829.csv +vicuna_v1_3_13b,Helm IMDB,0.762,[],helm_classic_240829.csv +cohere_command_beta_6_1b,Helm IMDB,0.961,[],helm_classic_240829.csv +cohere_xlarge_v20221108_52_4b,Helm IMDB,0.956,[],helm_classic_240829.csv +luminous_supreme_70b,Helm IMDB,0.562,[],helm_classic_240829.csv +vicuna_v1_3_7b,Helm IMDB,0.916,[],helm_classic_240829.csv +opt_175b,Helm IMDB,0.947,[],helm_classic_240829.csv +llama_2_7b,Helm IMDB,0.907,[],helm_classic_240829.csv +llama_13b,Helm IMDB,0.928,[],helm_classic_240829.csv +instructpalmyra_30b,Helm IMDB,0.555,[],helm_classic_240829.csv +cohere_xlarge_v20220609_52_4b,Helm IMDB,0.956,[],helm_classic_240829.csv +jurassic_2_large_7_5b,Helm IMDB,0.956,[],helm_classic_240829.csv +davinci_175b,Helm IMDB,0.933,[],helm_classic_240829.csv +llama_7b,Helm IMDB,0.947,[],helm_classic_240829.csv +redpajama_incite_instruct_7b,Helm IMDB,0.927,[],helm_classic_240829.csv +j1_jumbo_v1_178b,Helm IMDB,0.943,[],helm_classic_240829.csv +glm_130b,Helm IMDB,0.5,[],helm_classic_240829.csv +luminous_extended_30b,Helm IMDB,0.524,[],helm_classic_240829.csv +opt_66b,Helm IMDB,0.917,[],helm_classic_240829.csv +bloom_176b,Helm IMDB,0.945,[],helm_classic_240829.csv +j1_grande_v1_17b,Helm IMDB,0.953,[],helm_classic_240829.csv +alpaca_7b,Helm IMDB,0.738,[],helm_classic_240829.csv +falcon_7b,Helm IMDB,0.836,[],helm_classic_240829.csv +redpajama_incite_base_7b,Helm IMDB,0.752,[],helm_classic_240829.csv +cohere_large_v20220720_13_1b,Helm IMDB,0.933,[],helm_classic_240829.csv +redpajama_incite_instruct_v1_3b,Helm IMDB,0.894,[],helm_classic_240829.csv +text_curie_001,Helm IMDB,0.923,[],helm_classic_240829.csv +gpt_neox_20b,Helm IMDB,0.948,[],helm_classic_240829.csv +luminous_base_13b,Helm IMDB,0.544,[],helm_classic_240829.csv +cohere_medium_v20221108_6_1b,Helm IMDB,0.935,[],helm_classic_240829.csv +redpajama_incite_base_v1_3b,Helm IMDB,0.907,[],helm_classic_240829.csv +tnlg_v2_6_7b,Helm IMDB,0.927,[],helm_classic_240829.csv +j1_large_v1_7_5b,Helm IMDB,0.956,[],helm_classic_240829.csv +gpt_j_6b,Helm IMDB,0.939,[],helm_classic_240829.csv +pythia_12b,Helm IMDB,0.931,[],helm_classic_240829.csv +curie_6_7b,Helm IMDB,0.889,[],helm_classic_240829.csv +falcon_instruct_7b,Helm IMDB,0.852,[],helm_classic_240829.csv +cohere_medium_v20220720_6_1b,Helm IMDB,0.935,[],helm_classic_240829.csv +text_babbage_001,Helm IMDB,0.913,[],helm_classic_240829.csv +t0pp_11b,Helm IMDB,0.234,[],helm_classic_240829.csv +pythia_6_9b,Helm IMDB,0.928,[],helm_classic_240829.csv +flan-ul2_20b,Helm IMDB,0.521,[],helm_classic_240829.csv +t5_11b,Helm IMDB,0.509,[],helm_classic_240829.csv +babbage_1_3b,Helm IMDB,0.597,[],helm_classic_240829.csv +cohere_small_v20220720_410m,Helm IMDB,0.578,[],helm_classic_240829.csv +ada_350m,Helm IMDB,0.849,[],helm_classic_240829.csv +text_ada_001,Helm IMDB,0.822,[],helm_classic_240829.csv +yalm_100b,Helm IMDB,0.49,[],helm_classic_240829.csv +llama_2_70b,Helm CivilComments,0.652,[],helm_classic_240829.csv +llama_65b,Helm CivilComments,0.655,[],helm_classic_240829.csv +text_davinci_002,Helm CivilComments,0.668,[],helm_classic_240829.csv +mistral_v0_1_7b,Helm CivilComments,0.624,[],helm_classic_240829.csv +cohere_command_beta_52_4b,Helm CivilComments,0.601,[],helm_classic_240829.csv +text_davinci_003,Helm CivilComments,0.684,[],helm_classic_240829.csv +jurassic_2_jumbo_178b,Helm CivilComments,0.57,[],helm_classic_240829.csv +llama_2_13b,Helm CivilComments,0.588,[],helm_classic_240829.csv +tnlg_v2_530b,Helm CivilComments,0.601,[],helm_classic_240829.csv +gpt_3_5_turbo_0613,Helm CivilComments,0.696,[],helm_classic_240829.csv +llama30b,Helm CivilComments,0.549,[],helm_classic_240829.csv +anthropic_lm_v4_s3_52b,Helm CivilComments,0.699,[],helm_classic_240829.csv +gpt_3_5_turbo_0301,Helm CivilComments,0.674,[],helm_classic_240829.csv +jurassic_2_grande_17b,Helm CivilComments,0.547,[],helm_classic_240829.csv +palmyra_x_43b,Helm CivilComments,0.701,[],helm_classic_240829.csv +falcon_40b,Helm CivilComments,0.552,[],helm_classic_240829.csv +falcon_instruct_40b,Helm CivilComments,0.603,[],helm_classic_240829.csv +mpt_instruct_30b,Helm CivilComments,0.573,[],helm_classic_240829.csv +mpt_30b,Helm CivilComments,0.599,[],helm_classic_240829.csv +j1_grande_v2_beta_17b,Helm CivilComments,0.546,[],helm_classic_240829.csv +vicuna_v1_3_13b,Helm CivilComments,0.645,[],helm_classic_240829.csv +cohere_command_beta_6_1b,Helm CivilComments,0.54,[],helm_classic_240829.csv +cohere_xlarge_v20221108_52_4b,Helm CivilComments,0.524,[],helm_classic_240829.csv +luminous_supreme_70b,Helm CivilComments,0.653,[],helm_classic_240829.csv +vicuna_v1_3_7b,Helm CivilComments,0.62,[],helm_classic_240829.csv +opt_175b,Helm CivilComments,0.505,[],helm_classic_240829.csv +llama_2_7b,Helm CivilComments,0.562,[],helm_classic_240829.csv +llama_13b,Helm CivilComments,0.6,[],helm_classic_240829.csv +instructpalmyra_30b,Helm CivilComments,0.652,[],helm_classic_240829.csv +cohere_xlarge_v20220609_52_4b,Helm CivilComments,0.532,[],helm_classic_240829.csv +jurassic_2_large_7_5b,Helm CivilComments,0.57,[],helm_classic_240829.csv +davinci_175b,Helm CivilComments,0.532,[],helm_classic_240829.csv +llama_7b,Helm CivilComments,0.563,[],helm_classic_240829.csv +redpajama_incite_instruct_7b,Helm CivilComments,0.664,[],helm_classic_240829.csv +j1_jumbo_v1_178b,Helm CivilComments,0.553,[],helm_classic_240829.csv +glm_130b,Helm CivilComments,0.598,[],helm_classic_240829.csv +luminous_extended_30b,Helm CivilComments,0.523,[],helm_classic_240829.csv +opt_66b,Helm CivilComments,0.506,[],helm_classic_240829.csv +bloom_176b,Helm CivilComments,0.62,[],helm_classic_240829.csv +j1_grande_v1_17b,Helm CivilComments,0.529,[],helm_classic_240829.csv +alpaca_7b,Helm CivilComments,0.566,[],helm_classic_240829.csv +falcon_7b,Helm CivilComments,0.514,[],helm_classic_240829.csv +redpajama_incite_base_7b,Helm CivilComments,0.547,[],helm_classic_240829.csv +cohere_large_v20220720_13_1b,Helm CivilComments,0.507,[],helm_classic_240829.csv +redpajama_incite_instruct_v1_3b,Helm CivilComments,0.549,[],helm_classic_240829.csv +text_curie_001,Helm CivilComments,0.537,[],helm_classic_240829.csv +gpt_neox_20b,Helm CivilComments,0.516,[],helm_classic_240829.csv +luminous_base_13b,Helm CivilComments,0.473,[],helm_classic_240829.csv +cohere_medium_v20221108_6_1b,Helm CivilComments,0.5,[],helm_classic_240829.csv +redpajama_incite_base_v1_3b,Helm CivilComments,0.549,[],helm_classic_240829.csv +tnlg_v2_6_7b,Helm CivilComments,0.532,[],helm_classic_240829.csv +j1_large_v1_7_5b,Helm CivilComments,0.532,[],helm_classic_240829.csv +gpt_j_6b,Helm CivilComments,0.52,[],helm_classic_240829.csv +pythia_12b,Helm CivilComments,0.531,[],helm_classic_240829.csv +curie_6_7b,Helm CivilComments,0.539,[],helm_classic_240829.csv +falcon_instruct_7b,Helm CivilComments,0.511,[],helm_classic_240829.csv +cohere_medium_v20220720_6_1b,Helm CivilComments,0.504,[],helm_classic_240829.csv +text_babbage_001,Helm CivilComments,0.499,[],helm_classic_240829.csv +t0pp_11b,Helm CivilComments,0.118,[],helm_classic_240829.csv +pythia_6_9b,Helm CivilComments,0.511,[],helm_classic_240829.csv +flan-ul2_20b,Helm CivilComments,0.404,[],helm_classic_240829.csv +t5_11b,Helm CivilComments,0.37,[],helm_classic_240829.csv +babbage_1_3b,Helm CivilComments,0.519,[],helm_classic_240829.csv +cohere_small_v20220720_410m,Helm CivilComments,0.501,[],helm_classic_240829.csv +ada_350m,Helm CivilComments,0.517,[],helm_classic_240829.csv +text_ada_001,Helm CivilComments,0.503,[],helm_classic_240829.csv +yalm_100b,Helm CivilComments,0.395,[],helm_classic_240829.csv +llama_2_70b,Helm RAFT,0.727,[],helm_classic_240829.csv +llama_65b,Helm RAFT,0.702,[],helm_classic_240829.csv +text_davinci_002,Helm RAFT,0.733,[],helm_classic_240829.csv +mistral_v0_1_7b,Helm RAFT,0.707,[],helm_classic_240829.csv +cohere_command_beta_52_4b,Helm RAFT,0.667,[],helm_classic_240829.csv +text_davinci_003,Helm RAFT,0.759,[],helm_classic_240829.csv +jurassic_2_jumbo_178b,Helm RAFT,0.746,[],helm_classic_240829.csv +llama_2_13b,Helm RAFT,0.707,[],helm_classic_240829.csv +tnlg_v2_530b,Helm RAFT,0.679,[],helm_classic_240829.csv +gpt_3_5_turbo_0613,Helm RAFT,0.748,[],helm_classic_240829.csv +llama30b,Helm RAFT,0.752,[],helm_classic_240829.csv +gpt_3_5_turbo_0301,Helm RAFT,0.768,[],helm_classic_240829.csv +jurassic_2_grande_17b,Helm RAFT,0.712,[],helm_classic_240829.csv +falcon_40b,Helm RAFT,0.661,[],helm_classic_240829.csv +falcon_instruct_40b,Helm RAFT,0.586,[],helm_classic_240829.csv +mpt_instruct_30b,Helm RAFT,0.68,[],helm_classic_240829.csv +mpt_30b,Helm RAFT,0.723,[],helm_classic_240829.csv +j1_grande_v2_beta_17b,Helm RAFT,0.679,[],helm_classic_240829.csv +vicuna_v1_3_13b,Helm RAFT,0.657,[],helm_classic_240829.csv +cohere_command_beta_6_1b,Helm RAFT,0.634,[],helm_classic_240829.csv +cohere_xlarge_v20221108_52_4b,Helm RAFT,0.624,[],helm_classic_240829.csv +vicuna_v1_3_7b,Helm RAFT,0.693,[],helm_classic_240829.csv +opt_175b,Helm RAFT,0.606,[],helm_classic_240829.csv +llama_2_7b,Helm RAFT,0.643,[],helm_classic_240829.csv +llama_13b,Helm RAFT,0.643,[],helm_classic_240829.csv +cohere_xlarge_v20220609_52_4b,Helm RAFT,0.633,[],helm_classic_240829.csv +jurassic_2_large_7_5b,Helm RAFT,0.622,[],helm_classic_240829.csv +davinci_175b,Helm RAFT,0.642,[],helm_classic_240829.csv +llama_7b,Helm RAFT,0.573,[],helm_classic_240829.csv +redpajama_incite_instruct_7b,Helm RAFT,0.695,[],helm_classic_240829.csv +j1_jumbo_v1_178b,Helm RAFT,0.681,[],helm_classic_240829.csv +opt_66b,Helm RAFT,0.557,[],helm_classic_240829.csv +bloom_176b,Helm RAFT,0.592,[],helm_classic_240829.csv +j1_grande_v1_17b,Helm RAFT,0.658,[],helm_classic_240829.csv +alpaca_7b,Helm RAFT,0.486,[],helm_classic_240829.csv +falcon_7b,Helm RAFT,0.602,[],helm_classic_240829.csv +redpajama_incite_base_7b,Helm RAFT,0.648,[],helm_classic_240829.csv +cohere_large_v20220720_13_1b,Helm RAFT,0.596,[],helm_classic_240829.csv +redpajama_incite_instruct_v1_3b,Helm RAFT,0.661,[],helm_classic_240829.csv +text_curie_001,Helm RAFT,0.489,[],helm_classic_240829.csv +gpt_neox_20b,Helm RAFT,0.505,[],helm_classic_240829.csv +cohere_medium_v20221108_6_1b,Helm RAFT,0.591,[],helm_classic_240829.csv +redpajama_incite_base_v1_3b,Helm RAFT,0.502,[],helm_classic_240829.csv +tnlg_v2_6_7b,Helm RAFT,0.525,[],helm_classic_240829.csv +j1_large_v1_7_5b,Helm RAFT,0.545,[],helm_classic_240829.csv +gpt_j_6b,Helm RAFT,0.619,[],helm_classic_240829.csv +pythia_12b,Helm RAFT,0.514,[],helm_classic_240829.csv +curie_6_7b,Helm RAFT,0.49,[],helm_classic_240829.csv +falcon_instruct_7b,Helm RAFT,0.523,[],helm_classic_240829.csv +cohere_medium_v20220720_6_1b,Helm RAFT,0.52,[],helm_classic_240829.csv +text_babbage_001,Helm RAFT,0.509,[],helm_classic_240829.csv +pythia_6_9b,Helm RAFT,0.502,[],helm_classic_240829.csv +babbage_1_3b,Helm RAFT,0.455,[],helm_classic_240829.csv +cohere_small_v20220720_410m,Helm RAFT,0.492,[],helm_classic_240829.csv +ada_350m,Helm RAFT,0.423,[],helm_classic_240829.csv +text_ada_001,Helm RAFT,0.406,[],helm_classic_240829.csv +arx_0_3,MMLU Pro,0.7824,[],mmlu_pro_240829.csv +claude_3_5_sonnet,MMLU Pro,0.7612,[],mmlu_pro_240829.csv +grok_2,MMLU Pro,0.7546,[],mmlu_pro_240829.csv +gpt_4o_2024_05_13,MMLU Pro,0.7255,[],mmlu_pro_240829.csv +grok_2_mini,MMLU Pro,0.7185,[],mmlu_pro_240829.csv +gemini_1_5_pro,MMLU Pro,0.6903,[],mmlu_pro_240829.csv +claude_3_opus,MMLU Pro,0.6845,[],mmlu_pro_240829.csv +qwen2_72b_chat,MMLU Pro,0.6438,[],mmlu_pro_240829.csv +magnum_72b_v1,MMLU Pro,0.6393,[],mmlu_pro_240829.csv +gpt_4_turbo,MMLU Pro,0.6371,[],mmlu_pro_240829.csv +deepseek_coder_v2_instruct,MMLU Pro,0.6363,[],mmlu_pro_240829.csv +higgs_llama3_70b,MMLU Pro,0.6316,[],mmlu_pro_240829.csv +gpt_4o_mini,MMLU Pro,0.6309,[],mmlu_pro_240829.csv +llama3_1_70b_instruct,MMLU Pro,0.6284,[],mmlu_pro_240829.csv +gemini_1_5_flash,MMLU Pro,0.5912,[],mmlu_pro_240829.csv +yi_large,MMLU Pro,0.5809,[],mmlu_pro_240829.csv +claude_3_sonnet,MMLU Pro,0.568,[],mmlu_pro_240829.csv +llama3_70b_instruct,MMLU Pro,0.562,[],mmlu_pro_240829.csv +phi3_medium_4k,MMLU Pro,0.557,[],mmlu_pro_240829.csv +qwen2_72b_32k,MMLU Pro,0.5559,[],mmlu_pro_240829.csv +deepseek_v2_chat,MMLU Pro,0.5481,[],mmlu_pro_240829.csv +llama3_70b,MMLU Pro,0.5278,[],mmlu_pro_240829.csv +qwen1_5_72b_chat,MMLU Pro,0.5264,[],mmlu_pro_240829.csv +llama3_1_70b,MMLU Pro,0.5247,[],mmlu_pro_240829.csv +yi_1_5_34b_chat,MMLU Pro,0.5229,[],mmlu_pro_240829.csv +gemma_2_9b_it,MMLU Pro,0.5208,[],mmlu_pro_240829.csv +phi3_medium_128k,MMLU Pro,0.5191,[],mmlu_pro_240829.csv +mammoth2_8x7b_plus,MMLU Pro,0.504,[],mmlu_pro_240829.csv +qwen1_5_110b,MMLU Pro,0.4993,[],mmlu_pro_240829.csv +glm_4_9b_chat,MMLU Pro,0.4801,[],mmlu_pro_240829.csv +glm_4_9b,MMLU Pro,0.4792,[],mmlu_pro_240829.csv +phi_3_5_mini_instruct,MMLU Pro,0.4787,[],mmlu_pro_240829.csv +qwen2_7b_instruct,MMLU Pro,0.4724,[],mmlu_pro_240829.csv +yi_1_5_9b_chat,MMLU Pro,0.4595,[],mmlu_pro_240829.csv +phi3_mini_4k,MMLU Pro,0.4566,[],mmlu_pro_240829.csv +gemma_2_9b,MMLU Pro,0.451,[],mmlu_pro_240829.csv +mistral_nemo_instruct_2407,MMLU Pro,0.4481,[],mmlu_pro_240829.csv +llama3_1_8b_instruct,MMLU Pro,0.4425,[],mmlu_pro_240829.csv +phi3_mini_128k,MMLU Pro,0.4386,[],mmlu_pro_240829.csv +mammoth2_8b_plus,MMLU Pro,0.4335,[],mmlu_pro_240829.csv +mixtral_8x7b_instruct_v0_1,MMLU Pro,0.4327,[],mmlu_pro_240829.csv +yi_34b,MMLU Pro,0.4303,[],mmlu_pro_240829.csv +mathstral_7b_v0_1,MMLU Pro,0.42,[],mmlu_pro_240829.csv +deepseek_coder_v2_lite_instruct,MMLU Pro,0.4157,[],mmlu_pro_240829.csv +mixtral_8x7b_v0_1,MMLU Pro,0.4103,[],mmlu_pro_240829.csv +llama3_8b_instruct,MMLU Pro,0.4098,[],mmlu_pro_240829.csv +mammoth2_7b_plus,MMLU Pro,0.4085,[],mmlu_pro_240829.csv +qwen2_7b,MMLU Pro,0.4073,[],mmlu_pro_240829.csv +mistral_nemo_base_2407,MMLU Pro,0.3977,[],mmlu_pro_240829.csv +wizardlm_2_8x22b,MMLU Pro,0.3924,[],mmlu_pro_240829.csv +yi_1_5_6b_chat,MMLU Pro,0.3823,[],mmlu_pro_240829.csv +qwen1_5_14b_chat,MMLU Pro,0.3802,[],mmlu_pro_240829.csv +c4ai_command_r_v0_1,MMLU Pro,0.379,[],mmlu_pro_240829.csv +staring_7b,MMLU Pro,0.379,[],mmlu_pro_240829.csv +llama_2_70b,MMLU Pro,0.3753,[],mmlu_pro_240829.csv +openchat_3_5_8b,MMLU Pro,0.3724,[],mmlu_pro_240829.csv +internmath_20b_plus,MMLU Pro,0.371,[],mmlu_pro_240829.csv +llama3_smaug_8b,MMLU Pro,0.3693,[],mmlu_pro_240829.csv +llama3_1_8b,MMLU Pro,0.366,[],mmlu_pro_240829.csv +llama3_8b,MMLU Pro,0.3536,[],mmlu_pro_240829.csv +deepseekmath_7b_instruct,MMLU Pro,0.353,[],mmlu_pro_240829.csv +deepseek_coder_v2_lite_base,MMLU Pro,0.3437,[],mmlu_pro_240829.csv +gemma_7b,MMLU Pro,0.3373,[],mmlu_pro_240829.csv +internmath_7b_plus,MMLU Pro,0.335,[],mmlu_pro_240829.csv +zephyr_7b_beta,MMLU Pro,0.3297,[],mmlu_pro_240829.csv +mistral_7b_v0_1,MMLU Pro,0.3088,[],mmlu_pro_240829.csv +mistral_7b_instruct_v0_2,MMLU Pro,0.3084,[],mmlu_pro_240829.csv +mistral_7b_v0_2,MMLU Pro,0.3043,[],mmlu_pro_240829.csv +qwen1_5_7b_chat,MMLU Pro,0.2906,[],mmlu_pro_240829.csv +yi_6b_chat,MMLU Pro,0.2884,[],mmlu_pro_240829.csv +neo_7b_instruct,MMLU Pro,0.2874,[],mmlu_pro_240829.csv +yi_6b,MMLU Pro,0.2651,[],mmlu_pro_240829.csv +neo_7b,MMLU Pro,0.2585,[],mmlu_pro_240829.csv +mistral_7b_instruct_v0_1,MMLU Pro,0.2575,[],mmlu_pro_240829.csv +llama_2_13b,MMLU Pro,0.2534,[],mmlu_pro_240829.csv +llemma_7b,MMLU Pro,0.2345,[],mmlu_pro_240829.csv +qwen2_1_5b_instruct,MMLU Pro,0.2262,[],mmlu_pro_240829.csv +qwen2_1_5b,MMLU Pro,0.2256,[],mmlu_pro_240829.csv +llama_2_7b,MMLU Pro,0.2032,[],mmlu_pro_240829.csv +qwen2_0_5b_instruct,MMLU Pro,0.1593,[],mmlu_pro_240829.csv +gemma_2b,MMLU Pro,0.1585,[],mmlu_pro_240829.csv +qwen2_0_5b,MMLU Pro,0.1497,[],mmlu_pro_240829.csv +llama3_70b,MixEval,82.2,[],mixeval_240829.csv +qwen1_5_72b,MixEval,79.5,[],mixeval_240829.csv +yi_34b,MixEval,78.3,[],mixeval_240829.csv +qwen1_5_32b,MixEval,77.6,[],mixeval_240829.csv +mixtral_8x7b,MixEval,74.0,[],mixeval_240829.csv +llama_2_70b,MixEval,73.2,[],mixeval_240829.csv +qwen1_5_moe_a2_7b,MixEval,70.2,[],mixeval_240829.csv +qwen1_5_7b,MixEval,68.2,[],mixeval_240829.csv +llama3_8b,MixEval,65.1,[],mixeval_240829.csv +mistral_7b,MixEval,64.8,[],mixeval_240829.csv +gemma_7b,MixEval,64.7,[],mixeval_240829.csv +yi_6b,MixEval,63.1,[],mixeval_240829.csv +qwen1_5_4b,MixEval,58.2,[],mixeval_240829.csv +jetmoe_8b,MixEval,57.1,[],mixeval_240829.csv +deepseek_7b,MixEval,52.2,[],mixeval_240829.csv +phi_2,MixEval,51.9,[],mixeval_240829.csv +deepseekmoe_16b,MixEval,51.4,[],mixeval_240829.csv +llama_2_7b,MixEval,43.1,[],mixeval_240829.csv +gemma_2b,MixEval,38.9,[],mixeval_240829.csv +olmo_7b,MixEval,31.8,[],mixeval_240829.csv +mpt_7b,MixEval,30.8,[],mixeval_240829.csv +claude_3_5_sonnet_0620,MixEval,89.9,[],mixeval_240829.csv +gpt_4o_2024_05_13,MixEval,87.9,[],mixeval_240829.csv +claude_3_opus,MixEval,88.1,[],mixeval_240829.csv +gpt_4_turbo_2024_04_09,MixEval,88.8,[],mixeval_240829.csv +gemini_1_5_pro_api_0409,MixEval,84.2,[],mixeval_240829.csv +gemini_1_5_pro_api_0514,MixEval,84.8,[],mixeval_240829.csv +mistral_large_2,MixEval,86.1,[],mixeval_240829.csv +yi_large_preview,MixEval,84.4,[],mixeval_240829.csv +llama3_70b_instruct,MixEval,84.0,[],mixeval_240829.csv +qwen_max_0428,MixEval,86.1,[],mixeval_240829.csv +claude_3_sonnet,MixEval,81.7,[],mixeval_240829.csv +reka_core_20240415,MixEval,83.3,[],mixeval_240829.csv +mammoth2_8x7b_plus,MixEval,81.5,[],mixeval_240829.csv +deepseek_v2,MixEval,83.7,[],mixeval_240829.csv +gpt_4o_mini,MixEval,84.2,[],mixeval_240829.csv +command_r_plus,MixEval,81.5,[],mixeval_240829.csv +yi_1_5_34b_chat,MixEval,81.7,[],mixeval_240829.csv +mistral_large,MixEval,84.2,[],mixeval_240829.csv +qwen1_5_72b_chat,MixEval,84.1,[],mixeval_240829.csv +mistral_medium,MixEval,81.9,[],mixeval_240829.csv +gemini_1_0_pro,MixEval,78.9,[],mixeval_240829.csv +reka_flash_20240226,MixEval,79.8,[],mixeval_240829.csv +mistral_small,MixEval,81.2,[],mixeval_240829.csv +llama3_8b_instruct,MixEval,75.0,[],mixeval_240829.csv +command_r,MixEval,77.0,[],mixeval_240829.csv +qwen1_5_32b_chat,MixEval,81.0,[],mixeval_240829.csv +gpt_3_5_turbo_0125,MixEval,79.7,[],mixeval_240829.csv +claude_3_haiku,MixEval,79.7,[],mixeval_240829.csv +yi_34b_chat,MixEval,80.1,[],mixeval_240829.csv +mixtral_8x7b_instruct_v0_1,MixEval,76.4,[],mixeval_240829.csv +starling_lm_7b_beta,MixEval,74.8,[],mixeval_240829.csv +yi_1_5_9b_chat,MixEval,74.2,[],mixeval_240829.csv +gemma_1_1_7b_it,MixEval,69.6,[],mixeval_240829.csv +vicuna_33b_v1_3,MixEval,66.3,[],mixeval_240829.csv +llama_2_70b_chat,MixEval,74.6,[],mixeval_240829.csv +map_neo_instruct_v0_1,MixEval,70.0,[],mixeval_240829.csv +mistral_7b_instruct_v0_2,MixEval,70.0,[],mixeval_240829.csv +qwen1_5_7b_chat,MixEval,71.4,[],mixeval_240829.csv +reka_edge_20240208,MixEval,68.5,[],mixeval_240829.csv +zephyr_7b_beta,MixEval,69.1,[],mixeval_240829.csv +llama_2_7b_chat,MixEval,61.7,[],mixeval_240829.csv +yi_6b_chat,MixEval,65.6,[],mixeval_240829.csv +qwen1_5_moe_a2_7b_chat,MixEval,69.1,[],mixeval_240829.csv +gemma_1_1_2b_it,MixEval,51.9,[],mixeval_240829.csv +vicuna_7b_v1_5,MixEval,60.3,[],mixeval_240829.csv +olmo_7b_instruct,MixEval,55.0,[],mixeval_240829.csv +qwen1_5_4b_chat,MixEval,57.2,[],mixeval_240829.csv +jetmoe_8b_chat,MixEval,51.6,[],mixeval_240829.csv +mpt_7b_chat,MixEval,43.8,[],mixeval_240829.csv +llama3_70b,MixEval Hard,54.0,[],mixeval_240829.csv +qwen1_5_72b,MixEval Hard,41.9,[],mixeval_240829.csv +yi_34b,MixEval Hard,47.2,[],mixeval_240829.csv +qwen1_5_32b,MixEval Hard,41.0,[],mixeval_240829.csv +mixtral_8x7b,MixEval Hard,40.7,[],mixeval_240829.csv +llama_2_70b,MixEval Hard,41.6,[],mixeval_240829.csv +qwen1_5_moe_a2_7b,MixEval Hard,33.5,[],mixeval_240829.csv +qwen1_5_7b,MixEval Hard,33.7,[],mixeval_240829.csv +llama3_8b,MixEval Hard,31.7,[],mixeval_240829.csv +mistral_7b,MixEval Hard,27.1,[],mixeval_240829.csv +gemma_7b,MixEval Hard,32.7,[],mixeval_240829.csv +yi_6b,MixEval Hard,30.4,[],mixeval_240829.csv +qwen1_5_4b,MixEval Hard,23.5,[],mixeval_240829.csv +jetmoe_8b,MixEval Hard,27.0,[],mixeval_240829.csv +deepseek_7b,MixEval Hard,21.7,[],mixeval_240829.csv +phi_2,MixEval Hard,21.9,[],mixeval_240829.csv +deepseekmoe_16b,MixEval Hard,24.2,[],mixeval_240829.csv +llama_2_7b,MixEval Hard,22.1,[],mixeval_240829.csv +gemma_2b,MixEval Hard,22.6,[],mixeval_240829.csv +olmo_7b,MixEval Hard,21.2,[],mixeval_240829.csv +mpt_7b,MixEval Hard,17.4,[],mixeval_240829.csv +claude_3_5_sonnet_0620,MixEval Hard,68.1,[],mixeval_240829.csv +llama3_1_405b_instruct,MixEval Hard,66.2,[],mixeval_240829.csv +gpt_4o_2024_05_13,MixEval Hard,64.7,[],mixeval_240829.csv +claude_3_opus,MixEval Hard,63.5,[],mixeval_240829.csv +gpt_4_turbo_2024_04_09,MixEval Hard,62.6,[],mixeval_240829.csv +gemini_1_5_pro_api_0409,MixEval Hard,58.7,[],mixeval_240829.csv +gemini_1_5_pro_api_0514,MixEval Hard,58.3,[],mixeval_240829.csv +mistral_large_2,MixEval Hard,57.4,[],mixeval_240829.csv +yi_large_preview,MixEval Hard,56.8,[],mixeval_240829.csv +llama3_70b_instruct,MixEval Hard,55.9,[],mixeval_240829.csv +qwen_max_0428,MixEval Hard,55.8,[],mixeval_240829.csv +claude_3_sonnet,MixEval Hard,54.0,[],mixeval_240829.csv +reka_core_20240415,MixEval Hard,52.9,[],mixeval_240829.csv +mammoth2_8x7b_plus,MixEval Hard,51.8,[],mixeval_240829.csv +deepseek_v2,MixEval Hard,51.7,[],mixeval_240829.csv +gpt_4o_mini,MixEval Hard,51.6,[],mixeval_240829.csv +command_r_plus,MixEval Hard,51.4,[],mixeval_240829.csv +yi_1_5_34b_chat,MixEval Hard,51.2,[],mixeval_240829.csv +mistral_large,MixEval Hard,50.3,[],mixeval_240829.csv +qwen1_5_72b_chat,MixEval Hard,48.3,[],mixeval_240829.csv +mistral_medium,MixEval Hard,47.8,[],mixeval_240829.csv +gemini_1_0_pro,MixEval Hard,46.4,[],mixeval_240829.csv +reka_flash_20240226,MixEval Hard,46.2,[],mixeval_240829.csv +mistral_small,MixEval Hard,46.2,[],mixeval_240829.csv +llama3_8b_instruct,MixEval Hard,45.6,[],mixeval_240829.csv +command_r,MixEval Hard,45.2,[],mixeval_240829.csv +qwen1_5_32b_chat,MixEval Hard,43.3,[],mixeval_240829.csv +gpt_3_5_turbo_0125,MixEval Hard,43.0,[],mixeval_240829.csv +claude_3_haiku,MixEval Hard,42.8,[],mixeval_240829.csv +yi_34b_chat,MixEval Hard,42.6,[],mixeval_240829.csv +mixtral_8x7b_instruct_v0_1,MixEval Hard,42.5,[],mixeval_240829.csv +starling_lm_7b_beta,MixEval Hard,41.8,[],mixeval_240829.csv +yi_1_5_9b_chat,MixEval Hard,40.9,[],mixeval_240829.csv +gemma_1_1_7b_it,MixEval Hard,39.1,[],mixeval_240829.csv +vicuna_33b_v1_3,MixEval Hard,38.7,[],mixeval_240829.csv +llama_2_70b_chat,MixEval Hard,38.0,[],mixeval_240829.csv +map_neo_instruct_v0_1,MixEval Hard,37.8,[],mixeval_240829.csv +mistral_7b_instruct_v0_2,MixEval Hard,36.2,[],mixeval_240829.csv +qwen1_5_7b_chat,MixEval Hard,35.5,[],mixeval_240829.csv +reka_edge_20240208,MixEval Hard,32.2,[],mixeval_240829.csv +zephyr_7b_beta,MixEval Hard,31.6,[],mixeval_240829.csv +llama_2_7b_chat,MixEval Hard,30.8,[],mixeval_240829.csv +yi_6b_chat,MixEval Hard,30.1,[],mixeval_240829.csv +qwen1_5_moe_a2_7b_chat,MixEval Hard,29.1,[],mixeval_240829.csv +gemma_1_1_2b_it,MixEval Hard,28.4,[],mixeval_240829.csv +vicuna_7b_v1_5,MixEval Hard,27.8,[],mixeval_240829.csv +olmo_7b_instruct,MixEval Hard,26.7,[],mixeval_240829.csv +qwen1_5_4b_chat,MixEval Hard,24.6,[],mixeval_240829.csv +jetmoe_8b_chat,MixEval Hard,24.3,[],mixeval_240829.csv +mpt_7b_chat,MixEval Hard,23.8,[],mixeval_240829.csv +llama3_70b,MixEval TriviaQA,83.1,[],mixeval_240829.csv +qwen1_5_72b,MixEval TriviaQA,78.4,[],mixeval_240829.csv +yi_34b,MixEval TriviaQA,72.1,[],mixeval_240829.csv +qwen1_5_32b,MixEval TriviaQA,71.9,[],mixeval_240829.csv +mixtral_8x7b,MixEval TriviaQA,77.3,[],mixeval_240829.csv +llama_2_70b,MixEval TriviaQA,78.7,[],mixeval_240829.csv +qwen1_5_moe_a2_7b,MixEval TriviaQA,71.3,[],mixeval_240829.csv +qwen1_5_7b,MixEval TriviaQA,61.4,[],mixeval_240829.csv +llama3_8b,MixEval TriviaQA,65.2,[],mixeval_240829.csv +mistral_7b,MixEval TriviaQA,67.2,[],mixeval_240829.csv +gemma_7b,MixEval TriviaQA,66.0,[],mixeval_240829.csv +yi_6b,MixEval TriviaQA,54.7,[],mixeval_240829.csv +qwen1_5_4b,MixEval TriviaQA,47.8,[],mixeval_240829.csv +jetmoe_8b,MixEval TriviaQA,53.4,[],mixeval_240829.csv +deepseek_7b,MixEval TriviaQA,58.7,[],mixeval_240829.csv +phi_2,MixEval TriviaQA,37.0,[],mixeval_240829.csv +deepseekmoe_16b,MixEval TriviaQA,64.2,[],mixeval_240829.csv +llama_2_7b,MixEval TriviaQA,55.5,[],mixeval_240829.csv +gemma_2b,MixEval TriviaQA,41.5,[],mixeval_240829.csv +olmo_7b,MixEval TriviaQA,38.4,[],mixeval_240829.csv +mpt_7b,MixEval TriviaQA,33.5,[],mixeval_240829.csv +claude_3_5_sonnet_0620,MixEval TriviaQA,92.6,[],mixeval_240829.csv +gpt_4o_2024_05_13,MixEval TriviaQA,88.0,[],mixeval_240829.csv +claude_3_opus,MixEval TriviaQA,90.4,[],mixeval_240829.csv +gpt_4_turbo_2024_04_09,MixEval TriviaQA,91.2,[],mixeval_240829.csv +gemini_1_5_pro_api_0409,MixEval TriviaQA,85.3,[],mixeval_240829.csv +gemini_1_5_pro_api_0514,MixEval TriviaQA,83.7,[],mixeval_240829.csv +mistral_large_2,MixEval TriviaQA,88.2,[],mixeval_240829.csv +yi_large_preview,MixEval TriviaQA,81.7,[],mixeval_240829.csv +llama3_70b_instruct,MixEval TriviaQA,83.1,[],mixeval_240829.csv +qwen_max_0428,MixEval TriviaQA,86.7,[],mixeval_240829.csv +claude_3_sonnet,MixEval TriviaQA,84.2,[],mixeval_240829.csv +reka_core_20240415,MixEval TriviaQA,82.8,[],mixeval_240829.csv +mammoth2_8x7b_plus,MixEval TriviaQA,83.0,[],mixeval_240829.csv +deepseek_v2,MixEval TriviaQA,84.4,[],mixeval_240829.csv +gpt_4o_mini,MixEval TriviaQA,83.1,[],mixeval_240829.csv +command_r_plus,MixEval TriviaQA,83.3,[],mixeval_240829.csv +yi_1_5_34b_chat,MixEval TriviaQA,78.4,[],mixeval_240829.csv +mistral_large,MixEval TriviaQA,88.3,[],mixeval_240829.csv +qwen1_5_72b_chat,MixEval TriviaQA,83.9,[],mixeval_240829.csv +mistral_medium,MixEval TriviaQA,86.8,[],mixeval_240829.csv +gemini_1_0_pro,MixEval TriviaQA,81.0,[],mixeval_240829.csv +reka_flash_20240226,MixEval TriviaQA,76.4,[],mixeval_240829.csv +mistral_small,MixEval TriviaQA,85.1,[],mixeval_240829.csv +llama3_8b_instruct,MixEval TriviaQA,71.7,[],mixeval_240829.csv +command_r,MixEval TriviaQA,80.9,[],mixeval_240829.csv +qwen1_5_32b_chat,MixEval TriviaQA,75.7,[],mixeval_240829.csv +gpt_3_5_turbo_0125,MixEval TriviaQA,85.2,[],mixeval_240829.csv +claude_3_haiku,MixEval TriviaQA,79.9,[],mixeval_240829.csv +yi_34b_chat,MixEval TriviaQA,82.7,[],mixeval_240829.csv +mixtral_8x7b_instruct_v0_1,MixEval TriviaQA,82.5,[],mixeval_240829.csv +starling_lm_7b_beta,MixEval TriviaQA,75.1,[],mixeval_240829.csv +yi_1_5_9b_chat,MixEval TriviaQA,61.3,[],mixeval_240829.csv +gemma_1_1_7b_it,MixEval TriviaQA,64.3,[],mixeval_240829.csv +vicuna_33b_v1_3,MixEval TriviaQA,79.2,[],mixeval_240829.csv +llama_2_70b_chat,MixEval TriviaQA,80.0,[],mixeval_240829.csv +map_neo_instruct_v0_1,MixEval TriviaQA,62.1,[],mixeval_240829.csv +mistral_7b_instruct_v0_2,MixEval TriviaQA,73.7,[],mixeval_240829.csv +qwen1_5_7b_chat,MixEval TriviaQA,64.1,[],mixeval_240829.csv +reka_edge_20240208,MixEval TriviaQA,60.0,[],mixeval_240829.csv +zephyr_7b_beta,MixEval TriviaQA,74.7,[],mixeval_240829.csv +llama_2_7b_chat,MixEval TriviaQA,68.8,[],mixeval_240829.csv +yi_6b_chat,MixEval TriviaQA,66.1,[],mixeval_240829.csv +qwen1_5_moe_a2_7b_chat,MixEval TriviaQA,65.9,[],mixeval_240829.csv +gemma_1_1_2b_it,MixEval TriviaQA,53.7,[],mixeval_240829.csv +vicuna_7b_v1_5,MixEval TriviaQA,66.4,[],mixeval_240829.csv +olmo_7b_instruct,MixEval TriviaQA,51.7,[],mixeval_240829.csv +qwen1_5_4b_chat,MixEval TriviaQA,46.0,[],mixeval_240829.csv +jetmoe_8b_chat,MixEval TriviaQA,46.8,[],mixeval_240829.csv +mpt_7b_chat,MixEval TriviaQA,50.2,[],mixeval_240829.csv +llama3_70b,MixEval MMLU,79.8,[],mixeval_240829.csv +qwen1_5_72b,MixEval MMLU,78.8,[],mixeval_240829.csv +yi_34b,MixEval MMLU,79.3,[],mixeval_240829.csv +qwen1_5_32b,MixEval MMLU,77.2,[],mixeval_240829.csv +mixtral_8x7b,MixEval MMLU,71.6,[],mixeval_240829.csv +llama_2_70b,MixEval MMLU,70.8,[],mixeval_240829.csv +qwen1_5_moe_a2_7b,MixEval MMLU,69.4,[],mixeval_240829.csv +qwen1_5_7b,MixEval MMLU,67.0,[],mixeval_240829.csv +llama3_8b,MixEval MMLU,69.5,[],mixeval_240829.csv +mistral_7b,MixEval MMLU,68.5,[],mixeval_240829.csv +gemma_7b,MixEval MMLU,67.4,[],mixeval_240829.csv +yi_6b,MixEval MMLU,71.2,[],mixeval_240829.csv +qwen1_5_4b,MixEval MMLU,59.6,[],mixeval_240829.csv +jetmoe_8b,MixEval MMLU,55.3,[],mixeval_240829.csv +deepseek_7b,MixEval MMLU,53.3,[],mixeval_240829.csv +phi_2,MixEval MMLU,62.5,[],mixeval_240829.csv +deepseekmoe_16b,MixEval MMLU,49.9,[],mixeval_240829.csv +llama_2_7b,MixEval MMLU,40.8,[],mixeval_240829.csv +gemma_2b,MixEval MMLU,37.4,[],mixeval_240829.csv +olmo_7b,MixEval MMLU,29.7,[],mixeval_240829.csv +mpt_7b,MixEval MMLU,30.9,[],mixeval_240829.csv +claude_3_5_sonnet_0620,MixEval MMLU,84.2,[],mixeval_240829.csv +gpt_4o_2024_05_13,MixEval MMLU,85.4,[],mixeval_240829.csv +claude_3_opus,MixEval MMLU,83.2,[],mixeval_240829.csv +gpt_4_turbo_2024_04_09,MixEval MMLU,82.8,[],mixeval_240829.csv +gemini_1_5_pro_api_0409,MixEval MMLU,79.2,[],mixeval_240829.csv +gemini_1_5_pro_api_0514,MixEval MMLU,84.0,[],mixeval_240829.csv +mistral_large_2,MixEval MMLU,81.9,[],mixeval_240829.csv +yi_large_preview,MixEval MMLU,80.9,[],mixeval_240829.csv +llama3_70b_instruct,MixEval MMLU,80.5,[],mixeval_240829.csv +qwen_max_0428,MixEval MMLU,80.6,[],mixeval_240829.csv +claude_3_sonnet,MixEval MMLU,74.7,[],mixeval_240829.csv +reka_core_20240415,MixEval MMLU,79.3,[],mixeval_240829.csv +mammoth2_8x7b_plus,MixEval MMLU,74.5,[],mixeval_240829.csv +deepseek_v2,MixEval MMLU,77.3,[],mixeval_240829.csv +gpt_4o_mini,MixEval MMLU,82.3,[],mixeval_240829.csv +command_r_plus,MixEval MMLU,78.9,[],mixeval_240829.csv +yi_1_5_34b_chat,MixEval MMLU,76.4,[],mixeval_240829.csv +mistral_large,MixEval MMLU,80.2,[],mixeval_240829.csv +qwen1_5_72b_chat,MixEval MMLU,80.1,[],mixeval_240829.csv +mistral_medium,MixEval MMLU,76.3,[],mixeval_240829.csv +gemini_1_0_pro,MixEval MMLU,74.9,[],mixeval_240829.csv +reka_flash_20240226,MixEval MMLU,75.4,[],mixeval_240829.csv +mistral_small,MixEval MMLU,75.2,[],mixeval_240829.csv +llama3_8b_instruct,MixEval MMLU,71.9,[],mixeval_240829.csv +command_r,MixEval MMLU,75.0,[],mixeval_240829.csv +qwen1_5_32b_chat,MixEval MMLU,78.0,[],mixeval_240829.csv +gpt_3_5_turbo_0125,MixEval MMLU,74.5,[],mixeval_240829.csv +claude_3_haiku,MixEval MMLU,76.1,[],mixeval_240829.csv +yi_34b_chat,MixEval MMLU,73.6,[],mixeval_240829.csv +mixtral_8x7b_instruct_v0_1,MixEval MMLU,72.0,[],mixeval_240829.csv +starling_lm_7b_beta,MixEval MMLU,69.0,[],mixeval_240829.csv +yi_1_5_9b_chat,MixEval MMLU,72.6,[],mixeval_240829.csv +gemma_1_1_7b_it,MixEval MMLU,66.9,[],mixeval_240829.csv +vicuna_33b_v1_3,MixEval MMLU,59.2,[],mixeval_240829.csv +llama_2_70b_chat,MixEval MMLU,69.8,[],mixeval_240829.csv +map_neo_instruct_v0_1,MixEval MMLU,66.7,[],mixeval_240829.csv +mistral_7b_instruct_v0_2,MixEval MMLU,67.3,[],mixeval_240829.csv +qwen1_5_7b_chat,MixEval MMLU,68.7,[],mixeval_240829.csv +reka_edge_20240208,MixEval MMLU,63.6,[],mixeval_240829.csv +zephyr_7b_beta,MixEval MMLU,64.9,[],mixeval_240829.csv +llama_2_7b_chat,MixEval MMLU,59.4,[],mixeval_240829.csv +yi_6b_chat,MixEval MMLU,65.4,[],mixeval_240829.csv +qwen1_5_moe_a2_7b_chat,MixEval MMLU,69.5,[],mixeval_240829.csv +gemma_1_1_2b_it,MixEval MMLU,51.5,[],mixeval_240829.csv +vicuna_7b_v1_5,MixEval MMLU,58.7,[],mixeval_240829.csv +olmo_7b_instruct,MixEval MMLU,57.1,[],mixeval_240829.csv +qwen1_5_4b_chat,MixEval MMLU,61.4,[],mixeval_240829.csv +jetmoe_8b_chat,MixEval MMLU,58.5,[],mixeval_240829.csv +mpt_7b_chat,MixEval MMLU,37.8,[],mixeval_240829.csv +llama3_70b,MixEval DROP,81.5,[],mixeval_240829.csv +qwen1_5_72b,MixEval DROP,64.5,[],mixeval_240829.csv +yi_34b,MixEval DROP,78.2,[],mixeval_240829.csv +qwen1_5_32b,MixEval DROP,68.7,[],mixeval_240829.csv +mixtral_8x7b,MixEval DROP,69.8,[],mixeval_240829.csv +llama_2_70b,MixEval DROP,73.2,[],mixeval_240829.csv +qwen1_5_moe_a2_7b,MixEval DROP,59.9,[],mixeval_240829.csv +qwen1_5_7b,MixEval DROP,63.6,[],mixeval_240829.csv +llama3_8b,MixEval DROP,63.8,[],mixeval_240829.csv +mistral_7b,MixEval DROP,61.3,[],mixeval_240829.csv +gemma_7b,MixEval DROP,63.8,[],mixeval_240829.csv +yi_6b,MixEval DROP,51.4,[],mixeval_240829.csv +qwen1_5_4b,MixEval DROP,51.0,[],mixeval_240829.csv +jetmoe_8b,MixEval DROP,44.1,[],mixeval_240829.csv +deepseek_7b,MixEval DROP,43.5,[],mixeval_240829.csv +phi_2,MixEval DROP,50.4,[],mixeval_240829.csv +deepseekmoe_16b,MixEval DROP,41.1,[],mixeval_240829.csv +llama_2_7b,MixEval DROP,37.6,[],mixeval_240829.csv +gemma_2b,MixEval DROP,32.6,[],mixeval_240829.csv +olmo_7b,MixEval DROP,24.0,[],mixeval_240829.csv +mpt_7b,MixEval DROP,26.8,[],mixeval_240829.csv +claude_3_5_sonnet_0620,MixEval DROP,93.7,[],mixeval_240829.csv +gpt_4o_2024_05_13,MixEval DROP,87.9,[],mixeval_240829.csv +claude_3_opus,MixEval DROP,91.5,[],mixeval_240829.csv +gpt_4_turbo_2024_04_09,MixEval DROP,91.0,[],mixeval_240829.csv +gemini_1_5_pro_api_0409,MixEval DROP,84.2,[],mixeval_240829.csv +gemini_1_5_pro_api_0514,MixEval DROP,82.5,[],mixeval_240829.csv +mistral_large_2,MixEval DROP,89.3,[],mixeval_240829.csv +yi_large_preview,MixEval DROP,87.0,[],mixeval_240829.csv +llama3_70b_instruct,MixEval DROP,90.1,[],mixeval_240829.csv +qwen_max_0428,MixEval DROP,85.4,[],mixeval_240829.csv +claude_3_sonnet,MixEval DROP,87.7,[],mixeval_240829.csv +reka_core_20240415,MixEval DROP,88.1,[],mixeval_240829.csv +mammoth2_8x7b_plus,MixEval DROP,85.7,[],mixeval_240829.csv +deepseek_v2,MixEval DROP,85.3,[],mixeval_240829.csv +gpt_4o_mini,MixEval DROP,87.7,[],mixeval_240829.csv +command_r_plus,MixEval DROP,80.4,[],mixeval_240829.csv +yi_1_5_34b_chat,MixEval DROP,87.0,[],mixeval_240829.csv +mistral_large,MixEval DROP,88.6,[],mixeval_240829.csv +qwen1_5_72b_chat,MixEval DROP,85.1,[],mixeval_240829.csv +mistral_medium,MixEval DROP,83.2,[],mixeval_240829.csv +gemini_1_0_pro,MixEval DROP,82.6,[],mixeval_240829.csv +reka_flash_20240226,MixEval DROP,86.7,[],mixeval_240829.csv +mistral_small,MixEval DROP,86.1,[],mixeval_240829.csv +llama3_8b_instruct,MixEval DROP,86.4,[],mixeval_240829.csv +command_r,MixEval DROP,72.0,[],mixeval_240829.csv +qwen1_5_32b_chat,MixEval DROP,82.9,[],mixeval_240829.csv +gpt_3_5_turbo_0125,MixEval DROP,84.8,[],mixeval_240829.csv +claude_3_haiku,MixEval DROP,85.0,[],mixeval_240829.csv +yi_34b_chat,MixEval DROP,86.1,[],mixeval_240829.csv +mixtral_8x7b_instruct_v0_1,MixEval DROP,79.5,[],mixeval_240829.csv +starling_lm_7b_beta,MixEval DROP,86.4,[],mixeval_240829.csv +yi_1_5_9b_chat,MixEval DROP,83.9,[],mixeval_240829.csv +gemma_1_1_7b_it,MixEval DROP,80.6,[],mixeval_240829.csv +vicuna_33b_v1_3,MixEval DROP,71.4,[],mixeval_240829.csv +llama_2_70b_chat,MixEval DROP,79.8,[],mixeval_240829.csv +map_neo_instruct_v0_1,MixEval DROP,75.5,[],mixeval_240829.csv +mistral_7b_instruct_v0_2,MixEval DROP,72.8,[],mixeval_240829.csv +qwen1_5_7b_chat,MixEval DROP,76.4,[],mixeval_240829.csv +reka_edge_20240208,MixEval DROP,80.0,[],mixeval_240829.csv +zephyr_7b_beta,MixEval DROP,77.3,[],mixeval_240829.csv +llama_2_7b_chat,MixEval DROP,69.3,[],mixeval_240829.csv +yi_6b_chat,MixEval DROP,70.5,[],mixeval_240829.csv +qwen1_5_moe_a2_7b_chat,MixEval DROP,64.6,[],mixeval_240829.csv +gemma_1_1_2b_it,MixEval DROP,59.8,[],mixeval_240829.csv +vicuna_7b_v1_5,MixEval DROP,68.3,[],mixeval_240829.csv +olmo_7b_instruct,MixEval DROP,53.1,[],mixeval_240829.csv +qwen1_5_4b_chat,MixEval DROP,57.2,[],mixeval_240829.csv +jetmoe_8b_chat,MixEval DROP,27.0,[],mixeval_240829.csv +mpt_7b_chat,MixEval DROP,50.0,[],mixeval_240829.csv +llama3_70b,MixEval HellaSwag,90.9,[],mixeval_240829.csv +qwen1_5_72b,MixEval HellaSwag,91.9,[],mixeval_240829.csv +yi_34b,MixEval HellaSwag,98.0,[],mixeval_240829.csv +qwen1_5_32b,MixEval HellaSwag,93.3,[],mixeval_240829.csv +mixtral_8x7b,MixEval HellaSwag,73.7,[],mixeval_240829.csv +llama_2_70b,MixEval HellaSwag,63.0,[],mixeval_240829.csv +qwen1_5_moe_a2_7b,MixEval HellaSwag,80.1,[],mixeval_240829.csv +qwen1_5_7b,MixEval HellaSwag,83.8,[],mixeval_240829.csv +llama3_8b,MixEval HellaSwag,51.5,[],mixeval_240829.csv +mistral_7b,MixEval HellaSwag,54.5,[],mixeval_240829.csv +gemma_7b,MixEval HellaSwag,36.0,[],mixeval_240829.csv +yi_6b,MixEval HellaSwag,77.4,[],mixeval_240829.csv +qwen1_5_4b,MixEval HellaSwag,65.7,[],mixeval_240829.csv +jetmoe_8b,MixEval HellaSwag,89.2,[],mixeval_240829.csv +deepseek_7b,MixEval HellaSwag,35.0,[],mixeval_240829.csv +phi_2,MixEval HellaSwag,20.2,[],mixeval_240829.csv +deepseekmoe_16b,MixEval HellaSwag,28.6,[],mixeval_240829.csv +llama_2_7b,MixEval HellaSwag,24.9,[],mixeval_240829.csv +gemma_2b,MixEval HellaSwag,33.3,[],mixeval_240829.csv +olmo_7b,MixEval HellaSwag,26.9,[],mixeval_240829.csv +mpt_7b,MixEval HellaSwag,19.2,[],mixeval_240829.csv +claude_3_5_sonnet_0620,MixEval HellaSwag,94.6,[],mixeval_240829.csv +gpt_4o_2024_05_13,MixEval HellaSwag,94.3,[],mixeval_240829.csv +claude_3_opus,MixEval HellaSwag,93.3,[],mixeval_240829.csv +gpt_4_turbo_2024_04_09,MixEval HellaSwag,92.6,[],mixeval_240829.csv +gemini_1_5_pro_api_0409,MixEval HellaSwag,89.2,[],mixeval_240829.csv +gemini_1_5_pro_api_0514,MixEval HellaSwag,91.2,[],mixeval_240829.csv +mistral_large_2,MixEval HellaSwag,80.1,[],mixeval_240829.csv +yi_large_preview,MixEval HellaSwag,92.6,[],mixeval_240829.csv +llama3_70b_instruct,MixEval HellaSwag,81.8,[],mixeval_240829.csv +qwen_max_0428,MixEval HellaSwag,93.6,[],mixeval_240829.csv +claude_3_sonnet,MixEval HellaSwag,85.9,[],mixeval_240829.csv +reka_core_20240415,MixEval HellaSwag,88.6,[],mixeval_240829.csv +mammoth2_8x7b_plus,MixEval HellaSwag,82.2,[],mixeval_240829.csv +deepseek_v2,MixEval HellaSwag,88.2,[],mixeval_240829.csv +gpt_4o_mini,MixEval HellaSwag,83.8,[],mixeval_240829.csv +command_r_plus,MixEval HellaSwag,83.5,[],mixeval_240829.csv +yi_1_5_34b_chat,MixEval HellaSwag,90.2,[],mixeval_240829.csv +mistral_large,MixEval HellaSwag,65.0,[],mixeval_240829.csv +qwen1_5_72b_chat,MixEval HellaSwag,87.9,[],mixeval_240829.csv +mistral_medium,MixEval HellaSwag,72.4,[],mixeval_240829.csv +gemini_1_0_pro,MixEval HellaSwag,74.7,[],mixeval_240829.csv +reka_flash_20240226,MixEval HellaSwag,90.6,[],mixeval_240829.csv +mistral_small,MixEval HellaSwag,73.4,[],mixeval_240829.csv +llama3_8b_instruct,MixEval HellaSwag,65.7,[],mixeval_240829.csv +command_r,MixEval HellaSwag,75.8,[],mixeval_240829.csv +qwen1_5_32b_chat,MixEval HellaSwag,85.9,[],mixeval_240829.csv +gpt_3_5_turbo_0125,MixEval HellaSwag,63.0,[],mixeval_240829.csv +claude_3_haiku,MixEval HellaSwag,75.8,[],mixeval_240829.csv +yi_34b_chat,MixEval HellaSwag,86.9,[],mixeval_240829.csv +mixtral_8x7b_instruct_v0_1,MixEval HellaSwag,54.2,[],mixeval_240829.csv +starling_lm_7b_beta,MixEval HellaSwag,48.5,[],mixeval_240829.csv +yi_1_5_9b_chat,MixEval HellaSwag,86.5,[],mixeval_240829.csv +gemma_1_1_7b_it,MixEval HellaSwag,66.3,[],mixeval_240829.csv +vicuna_33b_v1_3,MixEval HellaSwag,30.3,[],mixeval_240829.csv +llama_2_70b_chat,MixEval HellaSwag,67.3,[],mixeval_240829.csv +map_neo_instruct_v0_1,MixEval HellaSwag,74.4,[],mixeval_240829.csv +mistral_7b_instruct_v0_2,MixEval HellaSwag,54.2,[],mixeval_240829.csv +qwen1_5_7b_chat,MixEval HellaSwag,76.1,[],mixeval_240829.csv +reka_edge_20240208,MixEval HellaSwag,74.7,[],mixeval_240829.csv +zephyr_7b_beta,MixEval HellaSwag,39.1,[],mixeval_240829.csv +llama_2_7b_chat,MixEval HellaSwag,35.7,[],mixeval_240829.csv +yi_6b_chat,MixEval HellaSwag,52.5,[],mixeval_240829.csv +qwen1_5_moe_a2_7b_chat,MixEval HellaSwag,72.7,[],mixeval_240829.csv +gemma_1_1_2b_it,MixEval HellaSwag,26.6,[],mixeval_240829.csv +vicuna_7b_v1_5,MixEval HellaSwag,24.9,[],mixeval_240829.csv +olmo_7b_instruct,MixEval HellaSwag,55.9,[],mixeval_240829.csv +qwen1_5_4b_chat,MixEval HellaSwag,54.9,[],mixeval_240829.csv +jetmoe_8b_chat,MixEval HellaSwag,86.2,[],mixeval_240829.csv +mpt_7b_chat,MixEval HellaSwag,25.6,[],mixeval_240829.csv +llama3_70b,MixEval CommonsenseQA,85.4,[],mixeval_240829.csv +qwen1_5_72b,MixEval CommonsenseQA,87.3,[],mixeval_240829.csv +yi_34b,MixEval CommonsenseQA,81.1,[],mixeval_240829.csv +qwen1_5_32b,MixEval CommonsenseQA,89.2,[],mixeval_240829.csv +mixtral_8x7b,MixEval CommonsenseQA,77.4,[],mixeval_240829.csv +llama_2_70b,MixEval CommonsenseQA,77.4,[],mixeval_240829.csv +qwen1_5_moe_a2_7b,MixEval CommonsenseQA,80.2,[],mixeval_240829.csv +qwen1_5_7b,MixEval CommonsenseQA,84.4,[],mixeval_240829.csv +llama3_8b,MixEval CommonsenseQA,69.8,[],mixeval_240829.csv +mistral_7b,MixEval CommonsenseQA,67.9,[],mixeval_240829.csv +gemma_7b,MixEval CommonsenseQA,68.4,[],mixeval_240829.csv +yi_6b,MixEval CommonsenseQA,76.4,[],mixeval_240829.csv +qwen1_5_4b,MixEval CommonsenseQA,79.2,[],mixeval_240829.csv +jetmoe_8b,MixEval CommonsenseQA,60.4,[],mixeval_240829.csv +deepseek_7b,MixEval CommonsenseQA,51.4,[],mixeval_240829.csv +phi_2,MixEval CommonsenseQA,68.9,[],mixeval_240829.csv +deepseekmoe_16b,MixEval CommonsenseQA,48.6,[],mixeval_240829.csv +llama_2_7b,MixEval CommonsenseQA,30.7,[],mixeval_240829.csv +gemma_2b,MixEval CommonsenseQA,31.6,[],mixeval_240829.csv +olmo_7b,MixEval CommonsenseQA,25.5,[],mixeval_240829.csv +mpt_7b,MixEval CommonsenseQA,28.8,[],mixeval_240829.csv +claude_3_5_sonnet_0620,MixEval CommonsenseQA,85.4,[],mixeval_240829.csv +gpt_4o_2024_05_13,MixEval CommonsenseQA,86.8,[],mixeval_240829.csv +claude_3_opus,MixEval CommonsenseQA,87.7,[],mixeval_240829.csv +gpt_4_turbo_2024_04_09,MixEval CommonsenseQA,85.4,[],mixeval_240829.csv +gemini_1_5_pro_api_0409,MixEval CommonsenseQA,84.4,[],mixeval_240829.csv +gemini_1_5_pro_api_0514,MixEval CommonsenseQA,82.5,[],mixeval_240829.csv +mistral_large_2,MixEval CommonsenseQA,81.6,[],mixeval_240829.csv +yi_large_preview,MixEval CommonsenseQA,90.1,[],mixeval_240829.csv +llama3_70b_instruct,MixEval CommonsenseQA,83.0,[],mixeval_240829.csv +qwen_max_0428,MixEval CommonsenseQA,88.2,[],mixeval_240829.csv +claude_3_sonnet,MixEval CommonsenseQA,82.5,[],mixeval_240829.csv +reka_core_20240415,MixEval CommonsenseQA,81.6,[],mixeval_240829.csv +mammoth2_8x7b_plus,MixEval CommonsenseQA,82.5,[],mixeval_240829.csv +deepseek_v2,MixEval CommonsenseQA,84.0,[],mixeval_240829.csv +gpt_4o_mini,MixEval CommonsenseQA,84.9,[],mixeval_240829.csv +command_r_plus,MixEval CommonsenseQA,82.1,[],mixeval_240829.csv +yi_1_5_34b_chat,MixEval CommonsenseQA,86.8,[],mixeval_240829.csv +mistral_large,MixEval CommonsenseQA,83.5,[],mixeval_240829.csv +qwen1_5_72b_chat,MixEval CommonsenseQA,86.3,[],mixeval_240829.csv +mistral_medium,MixEval CommonsenseQA,82.5,[],mixeval_240829.csv +gemini_1_0_pro,MixEval CommonsenseQA,80.2,[],mixeval_240829.csv +reka_flash_20240226,MixEval CommonsenseQA,80.7,[],mixeval_240829.csv +mistral_small,MixEval CommonsenseQA,77.8,[],mixeval_240829.csv +llama3_8b_instruct,MixEval CommonsenseQA,78.3,[],mixeval_240829.csv +command_r,MixEval CommonsenseQA,77.4,[],mixeval_240829.csv +qwen1_5_32b_chat,MixEval CommonsenseQA,88.2,[],mixeval_240829.csv +gpt_3_5_turbo_0125,MixEval CommonsenseQA,81.6,[],mixeval_240829.csv +claude_3_haiku,MixEval CommonsenseQA,78.8,[],mixeval_240829.csv +yi_34b_chat,MixEval CommonsenseQA,78.8,[],mixeval_240829.csv +mixtral_8x7b_instruct_v0_1,MixEval CommonsenseQA,77.4,[],mixeval_240829.csv +starling_lm_7b_beta,MixEval CommonsenseQA,84.9,[],mixeval_240829.csv +yi_1_5_9b_chat,MixEval CommonsenseQA,82.5,[],mixeval_240829.csv +gemma_1_1_7b_it,MixEval CommonsenseQA,73.6,[],mixeval_240829.csv +vicuna_33b_v1_3,MixEval CommonsenseQA,61.8,[],mixeval_240829.csv +llama_2_70b_chat,MixEval CommonsenseQA,74.1,[],mixeval_240829.csv +map_neo_instruct_v0_1,MixEval CommonsenseQA,82.1,[],mixeval_240829.csv +mistral_7b_instruct_v0_2,MixEval CommonsenseQA,66.0,[],mixeval_240829.csv +qwen1_5_7b_chat,MixEval CommonsenseQA,82.1,[],mixeval_240829.csv +reka_edge_20240208,MixEval CommonsenseQA,80.7,[],mixeval_240829.csv +zephyr_7b_beta,MixEval CommonsenseQA,69.3,[],mixeval_240829.csv +llama_2_7b_chat,MixEval CommonsenseQA,61.3,[],mixeval_240829.csv +yi_6b_chat,MixEval CommonsenseQA,69.8,[],mixeval_240829.csv +qwen1_5_moe_a2_7b_chat,MixEval CommonsenseQA,81.1,[],mixeval_240829.csv +gemma_1_1_2b_it,MixEval CommonsenseQA,57.1,[],mixeval_240829.csv +vicuna_7b_v1_5,MixEval CommonsenseQA,62.7,[],mixeval_240829.csv +olmo_7b_instruct,MixEval CommonsenseQA,64.6,[],mixeval_240829.csv +qwen1_5_4b_chat,MixEval CommonsenseQA,74.1,[],mixeval_240829.csv +jetmoe_8b_chat,MixEval CommonsenseQA,68.4,[],mixeval_240829.csv +mpt_7b_chat,MixEval CommonsenseQA,36.3,[],mixeval_240829.csv +llama3_70b,MixEval TriviaQA Hard,59.1,[],mixeval_240829.csv +qwen1_5_72b,MixEval TriviaQA Hard,41.4,[],mixeval_240829.csv +yi_34b,MixEval TriviaQA Hard,39.4,[],mixeval_240829.csv +qwen1_5_32b,MixEval TriviaQA Hard,28.0,[],mixeval_240829.csv +mixtral_8x7b,MixEval TriviaQA Hard,44.1,[],mixeval_240829.csv +llama_2_70b,MixEval TriviaQA Hard,53.8,[],mixeval_240829.csv +qwen1_5_moe_a2_7b,MixEval TriviaQA Hard,36.0,[],mixeval_240829.csv +qwen1_5_7b,MixEval TriviaQA Hard,31.6,[],mixeval_240829.csv +llama3_8b,MixEval TriviaQA Hard,22.6,[],mixeval_240829.csv +mistral_7b,MixEval TriviaQA Hard,24.2,[],mixeval_240829.csv +gemma_7b,MixEval TriviaQA Hard,31.1,[],mixeval_240829.csv +yi_6b,MixEval TriviaQA Hard,17.0,[],mixeval_240829.csv +qwen1_5_4b,MixEval TriviaQA Hard,14.0,[],mixeval_240829.csv +jetmoe_8b,MixEval TriviaQA Hard,22.8,[],mixeval_240829.csv +deepseek_7b,MixEval TriviaQA Hard,21.4,[],mixeval_240829.csv +phi_2,MixEval TriviaQA Hard,7.3,[],mixeval_240829.csv +deepseekmoe_16b,MixEval TriviaQA Hard,24.9,[],mixeval_240829.csv +llama_2_7b,MixEval TriviaQA Hard,19.5,[],mixeval_240829.csv +gemma_2b,MixEval TriviaQA Hard,12.1,[],mixeval_240829.csv +olmo_7b,MixEval TriviaQA Hard,16.0,[],mixeval_240829.csv +mpt_7b,MixEval TriviaQA Hard,6.6,[],mixeval_240829.csv +claude_3_5_sonnet_0620,MixEval TriviaQA Hard,73.3,[],mixeval_240829.csv +llama3_1_405b_instruct,MixEval TriviaQA Hard,72.0,[],mixeval_240829.csv +gpt_4o_2024_05_13,MixEval TriviaQA Hard,70.3,[],mixeval_240829.csv +claude_3_opus,MixEval TriviaQA Hard,71.4,[],mixeval_240829.csv +gpt_4_turbo_2024_04_09,MixEval TriviaQA Hard,73.1,[],mixeval_240829.csv +gemini_1_5_pro_api_0409,MixEval TriviaQA Hard,67.8,[],mixeval_240829.csv +gemini_1_5_pro_api_0514,MixEval TriviaQA Hard,59.4,[],mixeval_240829.csv +mistral_large_2,MixEval TriviaQA Hard,64.8,[],mixeval_240829.csv +yi_large_preview,MixEval TriviaQA Hard,55.4,[],mixeval_240829.csv +llama3_70b_instruct,MixEval TriviaQA Hard,60.5,[],mixeval_240829.csv +qwen_max_0428,MixEval TriviaQA Hard,61.5,[],mixeval_240829.csv +claude_3_sonnet,MixEval TriviaQA Hard,59.1,[],mixeval_240829.csv +reka_core_20240415,MixEval TriviaQA Hard,51.6,[],mixeval_240829.csv +mammoth2_8x7b_plus,MixEval TriviaQA Hard,52.9,[],mixeval_240829.csv +deepseek_v2,MixEval TriviaQA Hard,51.7,[],mixeval_240829.csv +gpt_4o_mini,MixEval TriviaQA Hard,45.3,[],mixeval_240829.csv +command_r_plus,MixEval TriviaQA Hard,57.5,[],mixeval_240829.csv +yi_1_5_34b_chat,MixEval TriviaQA Hard,44.4,[],mixeval_240829.csv +mistral_large,MixEval TriviaQA Hard,55.5,[],mixeval_240829.csv +qwen1_5_72b_chat,MixEval TriviaQA Hard,49.9,[],mixeval_240829.csv +mistral_medium,MixEval TriviaQA Hard,59.8,[],mixeval_240829.csv +gemini_1_0_pro,MixEval TriviaQA Hard,58.2,[],mixeval_240829.csv +reka_flash_20240226,MixEval TriviaQA Hard,42.9,[],mixeval_240829.csv +mistral_small,MixEval TriviaQA Hard,56.0,[],mixeval_240829.csv +llama3_8b_instruct,MixEval TriviaQA Hard,40.2,[],mixeval_240829.csv +command_r,MixEval TriviaQA Hard,57.0,[],mixeval_240829.csv +qwen1_5_32b_chat,MixEval TriviaQA Hard,39.1,[],mixeval_240829.csv +gpt_3_5_turbo_0125,MixEval TriviaQA Hard,46.4,[],mixeval_240829.csv +claude_3_haiku,MixEval TriviaQA Hard,42.4,[],mixeval_240829.csv +yi_34b_chat,MixEval TriviaQA Hard,41.5,[],mixeval_240829.csv +mixtral_8x7b_instruct_v0_1,MixEval TriviaQA Hard,48.5,[],mixeval_240829.csv +starling_lm_7b_beta,MixEval TriviaQA Hard,33.4,[],mixeval_240829.csv +yi_1_5_9b_chat,MixEval TriviaQA Hard,23.3,[],mixeval_240829.csv +gemma_1_1_7b_it,MixEval TriviaQA Hard,30.3,[],mixeval_240829.csv +vicuna_33b_v1_3,MixEval TriviaQA Hard,42.5,[],mixeval_240829.csv +llama_2_70b_chat,MixEval TriviaQA Hard,42.2,[],mixeval_240829.csv +map_neo_instruct_v0_1,MixEval TriviaQA Hard,26.5,[],mixeval_240829.csv +mistral_7b_instruct_v0_2,MixEval TriviaQA Hard,33.5,[],mixeval_240829.csv +qwen1_5_7b_chat,MixEval TriviaQA Hard,29.0,[],mixeval_240829.csv +reka_edge_20240208,MixEval TriviaQA Hard,18.6,[],mixeval_240829.csv +zephyr_7b_beta,MixEval TriviaQA Hard,30.2,[],mixeval_240829.csv +llama_2_7b_chat,MixEval TriviaQA Hard,24.8,[],mixeval_240829.csv +yi_6b_chat,MixEval TriviaQA Hard,18.9,[],mixeval_240829.csv +qwen1_5_moe_a2_7b_chat,MixEval TriviaQA Hard,21.9,[],mixeval_240829.csv +gemma_1_1_2b_it,MixEval TriviaQA Hard,31.9,[],mixeval_240829.csv +vicuna_7b_v1_5,MixEval TriviaQA Hard,25.9,[],mixeval_240829.csv +olmo_7b_instruct,MixEval TriviaQA Hard,24.7,[],mixeval_240829.csv +qwen1_5_4b_chat,MixEval TriviaQA Hard,16.5,[],mixeval_240829.csv +jetmoe_8b_chat,MixEval TriviaQA Hard,19.2,[],mixeval_240829.csv +mpt_7b_chat,MixEval TriviaQA Hard,17.5,[],mixeval_240829.csv +llama3_70b,MixEval MMLU Hard,39.8,[],mixeval_240829.csv +qwen1_5_72b,MixEval MMLU Hard,42.4,[],mixeval_240829.csv +yi_34b,MixEval MMLU Hard,42.4,[],mixeval_240829.csv +qwen1_5_32b,MixEval MMLU Hard,37.2,[],mixeval_240829.csv +mixtral_8x7b,MixEval MMLU Hard,34.6,[],mixeval_240829.csv +llama_2_70b,MixEval MMLU Hard,29.0,[],mixeval_240829.csv +qwen1_5_moe_a2_7b,MixEval MMLU Hard,30.7,[],mixeval_240829.csv +qwen1_5_7b,MixEval MMLU Hard,28.6,[],mixeval_240829.csv +llama3_8b,MixEval MMLU Hard,38.5,[],mixeval_240829.csv +mistral_7b,MixEval MMLU Hard,27.7,[],mixeval_240829.csv +gemma_7b,MixEval MMLU Hard,28.1,[],mixeval_240829.csv +yi_6b,MixEval MMLU Hard,37.2,[],mixeval_240829.csv +qwen1_5_4b,MixEval MMLU Hard,22.9,[],mixeval_240829.csv +jetmoe_8b,MixEval MMLU Hard,27.3,[],mixeval_240829.csv +deepseek_7b,MixEval MMLU Hard,26.4,[],mixeval_240829.csv +phi_2,MixEval MMLU Hard,29.0,[],mixeval_240829.csv +deepseekmoe_16b,MixEval MMLU Hard,30.7,[],mixeval_240829.csv +llama_2_7b,MixEval MMLU Hard,24.7,[],mixeval_240829.csv +gemma_2b,MixEval MMLU Hard,27.3,[],mixeval_240829.csv +olmo_7b,MixEval MMLU Hard,25.1,[],mixeval_240829.csv +mpt_7b,MixEval MMLU Hard,24.2,[],mixeval_240829.csv +claude_3_5_sonnet_0620,MixEval MMLU Hard,58.4,[],mixeval_240829.csv +llama3_1_405b_instruct,MixEval MMLU Hard,57.1,[],mixeval_240829.csv +gpt_4o_2024_05_13,MixEval MMLU Hard,57.1,[],mixeval_240829.csv +claude_3_opus,MixEval MMLU Hard,55.0,[],mixeval_240829.csv +gpt_4_turbo_2024_04_09,MixEval MMLU Hard,45.5,[],mixeval_240829.csv +gemini_1_5_pro_api_0409,MixEval MMLU Hard,44.6,[],mixeval_240829.csv +gemini_1_5_pro_api_0514,MixEval MMLU Hard,54.5,[],mixeval_240829.csv +mistral_large_2,MixEval MMLU Hard,42.9,[],mixeval_240829.csv +yi_large_preview,MixEval MMLU Hard,48.5,[],mixeval_240829.csv +llama3_70b_instruct,MixEval MMLU Hard,46.3,[],mixeval_240829.csv +qwen_max_0428,MixEval MMLU Hard,41.6,[],mixeval_240829.csv +claude_3_sonnet,MixEval MMLU Hard,40.7,[],mixeval_240829.csv +reka_core_20240415,MixEval MMLU Hard,46.3,[],mixeval_240829.csv +mammoth2_8x7b_plus,MixEval MMLU Hard,41.1,[],mixeval_240829.csv +deepseek_v2,MixEval MMLU Hard,42.0,[],mixeval_240829.csv +gpt_4o_mini,MixEval MMLU Hard,45.0,[],mixeval_240829.csv +command_r_plus,MixEval MMLU Hard,42.0,[],mixeval_240829.csv +yi_1_5_34b_chat,MixEval MMLU Hard,38.1,[],mixeval_240829.csv +mistral_large,MixEval MMLU Hard,42.4,[],mixeval_240829.csv +qwen1_5_72b_chat,MixEval MMLU Hard,37.7,[],mixeval_240829.csv +mistral_medium,MixEval MMLU Hard,38.5,[],mixeval_240829.csv +gemini_1_0_pro,MixEval MMLU Hard,35.5,[],mixeval_240829.csv +reka_flash_20240226,MixEval MMLU Hard,34.6,[],mixeval_240829.csv +mistral_small,MixEval MMLU Hard,33.8,[],mixeval_240829.csv +llama3_8b_instruct,MixEval MMLU Hard,40.7,[],mixeval_240829.csv +command_r,MixEval MMLU Hard,39.0,[],mixeval_240829.csv +qwen1_5_32b_chat,MixEval MMLU Hard,29.9,[],mixeval_240829.csv +gpt_3_5_turbo_0125,MixEval MMLU Hard,35.1,[],mixeval_240829.csv +claude_3_haiku,MixEval MMLU Hard,30.7,[],mixeval_240829.csv +yi_34b_chat,MixEval MMLU Hard,29.9,[],mixeval_240829.csv +mixtral_8x7b_instruct_v0_1,MixEval MMLU Hard,37.2,[],mixeval_240829.csv +starling_lm_7b_beta,MixEval MMLU Hard,34.2,[],mixeval_240829.csv +yi_1_5_9b_chat,MixEval MMLU Hard,36.8,[],mixeval_240829.csv +gemma_1_1_7b_it,MixEval MMLU Hard,39.0,[],mixeval_240829.csv +vicuna_33b_v1_3,MixEval MMLU Hard,39.4,[],mixeval_240829.csv +llama_2_70b_chat,MixEval MMLU Hard,27.7,[],mixeval_240829.csv +map_neo_instruct_v0_1,MixEval MMLU Hard,32.5,[],mixeval_240829.csv +mistral_7b_instruct_v0_2,MixEval MMLU Hard,29.4,[],mixeval_240829.csv +qwen1_5_7b_chat,MixEval MMLU Hard,29.0,[],mixeval_240829.csv +reka_edge_20240208,MixEval MMLU Hard,26.4,[],mixeval_240829.csv +zephyr_7b_beta,MixEval MMLU Hard,24.2,[],mixeval_240829.csv +llama_2_7b_chat,MixEval MMLU Hard,30.3,[],mixeval_240829.csv +yi_6b_chat,MixEval MMLU Hard,26.8,[],mixeval_240829.csv +qwen1_5_moe_a2_7b_chat,MixEval MMLU Hard,26.8,[],mixeval_240829.csv +gemma_1_1_2b_it,MixEval MMLU Hard,30.3,[],mixeval_240829.csv +vicuna_7b_v1_5,MixEval MMLU Hard,23.4,[],mixeval_240829.csv +olmo_7b_instruct,MixEval MMLU Hard,27.3,[],mixeval_240829.csv +qwen1_5_4b_chat,MixEval MMLU Hard,17.3,[],mixeval_240829.csv +jetmoe_8b_chat,MixEval MMLU Hard,25.5,[],mixeval_240829.csv +mpt_7b_chat,MixEval MMLU Hard,24.7,[],mixeval_240829.csv +llama3_70b,MixEval DROP Hard,59.5,[],mixeval_240829.csv +qwen1_5_72b,MixEval DROP Hard,26.2,[],mixeval_240829.csv +yi_34b,MixEval DROP Hard,56.5,[],mixeval_240829.csv +qwen1_5_32b,MixEval DROP Hard,36.9,[],mixeval_240829.csv +mixtral_8x7b,MixEval DROP Hard,42.0,[],mixeval_240829.csv +llama_2_70b,MixEval DROP Hard,46.1,[],mixeval_240829.csv +qwen1_5_moe_a2_7b,MixEval DROP Hard,31.0,[],mixeval_240829.csv +qwen1_5_7b,MixEval DROP Hard,29.8,[],mixeval_240829.csv +llama3_8b,MixEval DROP Hard,37.1,[],mixeval_240829.csv +mistral_7b,MixEval DROP Hard,34.5,[],mixeval_240829.csv +gemma_7b,MixEval DROP Hard,31.4,[],mixeval_240829.csv +yi_6b,MixEval DROP Hard,19.4,[],mixeval_240829.csv +qwen1_5_4b,MixEval DROP Hard,24.7,[],mixeval_240829.csv +jetmoe_8b,MixEval DROP Hard,19.2,[],mixeval_240829.csv +deepseek_7b,MixEval DROP Hard,21.4,[],mixeval_240829.csv +phi_2,MixEval DROP Hard,27.1,[],mixeval_240829.csv +deepseekmoe_16b,MixEval DROP Hard,12.2,[],mixeval_240829.csv +llama_2_7b,MixEval DROP Hard,14.9,[],mixeval_240829.csv +gemma_2b,MixEval DROP Hard,13.2,[],mixeval_240829.csv +olmo_7b,MixEval DROP Hard,11.1,[],mixeval_240829.csv +mpt_7b,MixEval DROP Hard,9.2,[],mixeval_240829.csv +claude_3_5_sonnet_0620,MixEval DROP Hard,80.4,[],mixeval_240829.csv +llama3_1_405b_instruct,MixEval DROP Hard,69.2,[],mixeval_240829.csv +gpt_4o_2024_05_13,MixEval DROP Hard,67.5,[],mixeval_240829.csv +claude_3_opus,MixEval DROP Hard,75.2,[],mixeval_240829.csv +gpt_4_turbo_2024_04_09,MixEval DROP Hard,71.0,[],mixeval_240829.csv +gemini_1_5_pro_api_0409,MixEval DROP Hard,64.8,[],mixeval_240829.csv +gemini_1_5_pro_api_0514,MixEval DROP Hard,55.2,[],mixeval_240829.csv +mistral_large_2,MixEval DROP Hard,72.0,[],mixeval_240829.csv +yi_large_preview,MixEval DROP Hard,63.1,[],mixeval_240829.csv +llama3_70b_instruct,MixEval DROP Hard,74.5,[],mixeval_240829.csv +qwen_max_0428,MixEval DROP Hard,53.5,[],mixeval_240829.csv +claude_3_sonnet,MixEval DROP Hard,66.9,[],mixeval_240829.csv +reka_core_20240415,MixEval DROP Hard,66.6,[],mixeval_240829.csv +mammoth2_8x7b_plus,MixEval DROP Hard,65.1,[],mixeval_240829.csv +deepseek_v2,MixEval DROP Hard,62.8,[],mixeval_240829.csv +gpt_4o_mini,MixEval DROP Hard,68.1,[],mixeval_240829.csv +command_r_plus,MixEval DROP Hard,65.0,[],mixeval_240829.csv +yi_1_5_34b_chat,MixEval DROP Hard,67.4,[],mixeval_240829.csv +mistral_large,MixEval DROP Hard,61.6,[],mixeval_240829.csv +qwen1_5_72b_chat,MixEval DROP Hard,56.5,[],mixeval_240829.csv +mistral_medium,MixEval DROP Hard,47.1,[],mixeval_240829.csv +gemini_1_0_pro,MixEval DROP Hard,54.1,[],mixeval_240829.csv +reka_flash_20240226,MixEval DROP Hard,65.0,[],mixeval_240829.csv +mistral_small,MixEval DROP Hard,52.6,[],mixeval_240829.csv +llama3_8b_instruct,MixEval DROP Hard,67.6,[],mixeval_240829.csv +command_r,MixEval DROP Hard,42.0,[],mixeval_240829.csv +qwen1_5_32b_chat,MixEval DROP Hard,54.4,[],mixeval_240829.csv +gpt_3_5_turbo_0125,MixEval DROP Hard,55.4,[],mixeval_240829.csv +claude_3_haiku,MixEval DROP Hard,51.5,[],mixeval_240829.csv +yi_34b_chat,MixEval DROP Hard,57.1,[],mixeval_240829.csv +mixtral_8x7b_instruct_v0_1,MixEval DROP Hard,47.7,[],mixeval_240829.csv +starling_lm_7b_beta,MixEval DROP Hard,62.9,[],mixeval_240829.csv +yi_1_5_9b_chat,MixEval DROP Hard,61.3,[],mixeval_240829.csv +gemma_1_1_7b_it,MixEval DROP Hard,55.1,[],mixeval_240829.csv +vicuna_33b_v1_3,MixEval DROP Hard,36.6,[],mixeval_240829.csv +llama_2_70b_chat,MixEval DROP Hard,42.2,[],mixeval_240829.csv +map_neo_instruct_v0_1,MixEval DROP Hard,42.4,[],mixeval_240829.csv +mistral_7b_instruct_v0_2,MixEval DROP Hard,44.3,[],mixeval_240829.csv +qwen1_5_7b_chat,MixEval DROP Hard,50.0,[],mixeval_240829.csv +reka_edge_20240208,MixEval DROP Hard,56.9,[],mixeval_240829.csv +zephyr_7b_beta,MixEval DROP Hard,45.3,[],mixeval_240829.csv +llama_2_7b_chat,MixEval DROP Hard,44.3,[],mixeval_240829.csv +yi_6b_chat,MixEval DROP Hard,43.7,[],mixeval_240829.csv +qwen1_5_moe_a2_7b_chat,MixEval DROP Hard,39.5,[],mixeval_240829.csv +gemma_1_1_2b_it,MixEval DROP Hard,27.8,[],mixeval_240829.csv +vicuna_7b_v1_5,MixEval DROP Hard,33.2,[],mixeval_240829.csv +olmo_7b_instruct,MixEval DROP Hard,22.9,[],mixeval_240829.csv +qwen1_5_4b_chat,MixEval DROP Hard,28.6,[],mixeval_240829.csv +jetmoe_8b_chat,MixEval DROP Hard,11.5,[],mixeval_240829.csv +mpt_7b_chat,MixEval DROP Hard,31.0,[],mixeval_240829.csv +gpt4,toolbench,68.8,[],toolbench_240829.csv +text_davinci_003,toolbench,67.2,[],toolbench_240829.csv +gpt_3_5_turbo,toolbench,56.6,[],toolbench_240829.csv +text_curie_001,toolbench,10.6,[],toolbench_240829.csv +llama_2_70b,toolbench,61.0,[],toolbench_240829.csv +llama_2_13b,toolbench,48.8,[],toolbench_240829.csv +llama_2_7b,toolbench,39.5,[],toolbench_240829.csv +llama_65b,toolbench,55.6,[],toolbench_240829.csv +llama30b,toolbench,49.6,[],toolbench_240829.csv +llama_13b,toolbench,36.8,[],toolbench_240829.csv +llama_13b_alpaca,toolbench,26.9,[],toolbench_240829.csv +codellama_7b,toolbench,48.3,[],toolbench_240829.csv +codellama_7b_instruct,toolbench,50.5,[],toolbench_240829.csv +codellama_7b_python,toolbench,52.2,[],toolbench_240829.csv +codellama_13b,toolbench,56.9,[],toolbench_240829.csv +codellama_13b_instruct,toolbench,60.5,[],toolbench_240829.csv +codellama_13b_python,toolbench,56.3,[],toolbench_240829.csv +codellama34b,toolbench,62.9,[],toolbench_240829.csv +codellama34b_instruct,toolbench,64.8,[],toolbench_240829.csv +codellama34b_python,toolbench,59.2,[],toolbench_240829.csv +starcoder,toolbench,49.7,[],toolbench_240829.csv +starcoderbase,toolbench,52.2,[],toolbench_240829.csv +codegen_16b_nl,toolbench,28.2,[],toolbench_240829.csv +codegen_16b_multi,toolbench,28.8,[],toolbench_240829.csv +codegen_16b_mono,toolbench,35.6,[],toolbench_240829.csv +bloomz,toolbench,27.8,[],toolbench_240829.csv +opt_iml_30b,toolbench,14.1,[],toolbench_240829.csv +opt_30b,toolbench,13.4,[],toolbench_240829.csv +opt_iml_1_3b,toolbench,7.0,[],toolbench_240829.csv +opt_1_3b,toolbench,7.5,[],toolbench_240829.csv +neox_20b,toolbench,26.4,[],toolbench_240829.csv +gpt_neoxt_chat_base_20b,toolbench,22.6,[],toolbench_240829.csv +pythia_12b,toolbench,19.5,[],toolbench_240829.csv +dolly_v2_12b,toolbench,5.0,[],toolbench_240829.csv +pythia_6_9b,toolbench,19.4,[],toolbench_240829.csv +pythia_2_8b,toolbench,18.6,[],toolbench_240829.csv +pythia_1_4b,toolbench,15.9,[],toolbench_240829.csv +stablelm_base_alpha_7b,toolbench,10.8,[],toolbench_240829.csv +stablelm_tuned_alpha_7b,toolbench,9.2,[],toolbench_240829.csv +stablelm_base_alpha_3b,toolbench,5.2,[],toolbench_240829.csv +stablelm_tuned_alpha_3b,toolbench,6.6,[],toolbench_240829.csv +llama30b_toolbench,toolbench,50.2,[],toolbench_240829.csv +starcoder_toolbench,toolbench,51.7,[],toolbench_240829.csv +codegen_16b_mono_toolbench,toolbench,51.6,[],toolbench_240829.csv +01,AlphacaEval v2lc,18.1,,alphacaeval_v2lc_240829.csv +02,AlphacaEval v2lc,32.7,,alphacaeval_v2lc_240829.csv +06,AlphacaEval v2lc,50.0,,alphacaeval_v2lc_240829.csv +09,AlphacaEval v2lc,58.3,,alphacaeval_v2lc_240829.csv +13,AlphacaEval v2lc,57.5,,alphacaeval_v2lc_240829.csv +14,AlphacaEval v2lc,35.3,,alphacaeval_v2lc_240829.csv +18,AlphacaEval v2lc,50.7,,alphacaeval_v2lc_240829.csv +20,AlphacaEval v2lc,52.4,,alphacaeval_v2lc_240829.csv +29,AlphacaEval v2lc,40.5,,alphacaeval_v2lc_240829.csv +airoboros_33b,AlphacaEval v2lc,10.7,,alphacaeval_v2lc_240829.csv +airoboros_65b,AlphacaEval v2lc,11.0,,alphacaeval_v2lc_240829.csv +aligner_2b+claude_3_opus,AlphacaEval v2lc,41.8,,alphacaeval_v2lc_240829.csv +aligner_2b+qwen1_5_72b_chat,AlphacaEval v2lc,36.7,,alphacaeval_v2lc_240829.csv +alpaca_7b,AlphacaEval v2lc,5.9,,alphacaeval_v2lc_240829.csv +alpaca_farm_ppo_human_7b,AlphacaEval v2lc,6.4,,alphacaeval_v2lc_240829.csv +alpaca_farm_ppo_sim_gpt_4_7b,AlphacaEval v2lc,7.1,,alphacaeval_v2lc_240829.csv +baichuan_13b_chat,AlphacaEval v2lc,2.1,,alphacaeval_v2lc_240829.csv +baize_v2_13b,AlphacaEval v2lc,7.0,,alphacaeval_v2lc_240829.csv +baize_v2_7b,AlphacaEval v2lc,4.4,,alphacaeval_v2lc_240829.csv +blendax_ai_gm_l3_v35,AlphacaEval v2lc,73.4,,alphacaeval_v2lc_240829.csv +blendax_ai_gm_l6_vo31,AlphacaEval v2lc,76.9,,alphacaeval_v2lc_240829.csv +causallm_14b,AlphacaEval v2lc,15.7,,alphacaeval_v2lc_240829.csv +chatglm2_6b,AlphacaEval v2lc,4.4,,alphacaeval_v2lc_240829.csv +claude,AlphacaEval v2lc,27.3,,alphacaeval_v2lc_240829.csv +claude2_alpaca_13b,AlphacaEval v2lc,11.5,,alphacaeval_v2lc_240829.csv +claude_2,AlphacaEval v2lc,28.2,,alphacaeval_v2lc_240829.csv +claude_2_1,AlphacaEval v2lc,25.3,,alphacaeval_v2lc_240829.csv +claude_instant_1_2,AlphacaEval v2lc,25.6,,alphacaeval_v2lc_240829.csv +cohere_command,AlphacaEval v2lc,10.9,,alphacaeval_v2lc_240829.csv +conifer_7b_dpo,AlphacaEval v2lc,17.1,,alphacaeval_v2lc_240829.csv +contextual_ai_kto_mistral_pairrm,AlphacaEval v2lc,29.7,,alphacaeval_v2lc_240829.csv +cut_13b,AlphacaEval v2lc,12.2,,alphacaeval_v2lc_240829.csv +davinci001,AlphacaEval v2lc,9.0,,alphacaeval_v2lc_240829.csv +dbrx_instruct,AlphacaEval v2lc,25.4,,alphacaeval_v2lc_240829.csv +deepseek_llm_67b_chat,AlphacaEval v2lc,17.8,,alphacaeval_v2lc_240829.csv +deita_7b_v1_0,AlphacaEval v2lc,16.1,,alphacaeval_v2lc_240829.csv +dolphin_2_2_1_mistral_7b,AlphacaEval v2lc,13.1,,alphacaeval_v2lc_240829.csv +ein_70b_v0_1,AlphacaEval v2lc,35.0,,alphacaeval_v2lc_240829.csv +evo_7b,AlphacaEval v2lc,16.5,,alphacaeval_v2lc_240829.csv +evo_v2_7b,AlphacaEval v2lc,23.4,,alphacaeval_v2lc_240829.csv +expo_+_internlm2_chat_20b,AlphacaEval v2lc,27.2,,alphacaeval_v2lc_240829.csv +expo_+_internlm2_chat_7b,AlphacaEval v2lc,22.7,,alphacaeval_v2lc_240829.csv +expo_+_llama3_instruct_8b_simpo,AlphacaEval v2lc,45.8,,alphacaeval_v2lc_240829.csv +expo_+_sppo_mistral7b_pairrm,AlphacaEval v2lc,31.8,,alphacaeval_v2lc_240829.csv +expo_+_starling_lm_7b_alpha,AlphacaEval v2lc,19.5,,alphacaeval_v2lc_240829.csv +expo_+_starling_lm_7b_beta,AlphacaEval v2lc,26.4,,alphacaeval_v2lc_240829.csv +expo_+_tulu_2_dpo_13b,AlphacaEval v2lc,17.6,,alphacaeval_v2lc_240829.csv +expo_+_tulu_2_dpo_70b,AlphacaEval v2lc,25.7,,alphacaeval_v2lc_240829.csv +expo_+_tulu_2_dpo_7b,AlphacaEval v2lc,11.7,,alphacaeval_v2lc_240829.csv +expo_+_zephyr_7b_alpha,AlphacaEval v2lc,13.6,,alphacaeval_v2lc_240829.csv +expo_+_zephyr_7b_beta,AlphacaEval v2lc,14.0,,alphacaeval_v2lc_240829.csv +falcon_40b_instruct,AlphacaEval v2lc,5.6,,alphacaeval_v2lc_240829.csv +falcon_7b_instruct,AlphacaEval v2lc,4.0,,alphacaeval_v2lc_240829.csv +fsfairx_zephyr_chat_v0_1,AlphacaEval v2lc,34.8,,alphacaeval_v2lc_240829.csv +gemini_pro,AlphacaEval v2lc,24.4,,alphacaeval_v2lc_240829.csv +gemma_2_9b_it_dpo,AlphacaEval v2lc,67.7,,alphacaeval_v2lc_240829.csv +gemma_2_9b_it_simpo,AlphacaEval v2lc,72.4,,alphacaeval_v2lc_240829.csv +gemma_2_9b_it_wpo_hb,AlphacaEval v2lc,76.7,,alphacaeval_v2lc_240829.csv +gemma_instruct_2b,AlphacaEval v2lc,5.4,,alphacaeval_v2lc_240829.csv +gemma_instruct_7b,AlphacaEval v2lc,10.4,,alphacaeval_v2lc_240829.csv +ghost_7b_alpha,AlphacaEval v2lc,6.9,,alphacaeval_v2lc_240829.csv +ghost_8b_beta_d0x5,AlphacaEval v2lc,23.1,,alphacaeval_v2lc_240829.csv +gpt_3_5,AlphacaEval v2lc,17.7,,alphacaeval_v2lc_240829.csv +gpt_4,AlphacaEval v2lc,38.1,,alphacaeval_v2lc_240829.csv +gpt_4_adversarial,AlphacaEval v2lc,12.2,,alphacaeval_v2lc_240829.csv +guanaco_13b,AlphacaEval v2lc,3.0,,alphacaeval_v2lc_240829.csv +guanaco_33b,AlphacaEval v2lc,5.7,,alphacaeval_v2lc_240829.csv +guanaco_65b,AlphacaEval v2lc,8.3,,alphacaeval_v2lc_240829.csv +guanaco_7b,AlphacaEval v2lc,2.9,,alphacaeval_v2lc_240829.csv +higgs_llama3_70b_v2,AlphacaEval v2lc,56.8,,alphacaeval_v2lc_240829.csv +humpback_llama2_70b,AlphacaEval v2lc,16.2,,alphacaeval_v2lc_240829.csv +humpback_llama_65b,AlphacaEval v2lc,12.8,,alphacaeval_v2lc_240829.csv +infinity_instruct_3m_0613_llama3_70b,AlphacaEval v2lc,31.5,,alphacaeval_v2lc_240829.csv +infinity_instruct_3m_0613_mistral_7b,AlphacaEval v2lc,25.5,,alphacaeval_v2lc_240829.csv +infinity_instruct_3m_0625_llama3_70b,AlphacaEval v2lc,38.0,,alphacaeval_v2lc_240829.csv +infinity_instruct_3m_0625_llama3_8b,AlphacaEval v2lc,27.5,,alphacaeval_v2lc_240829.csv +infinity_instruct_3m_0625_mistral_7b,AlphacaEval v2lc,31.4,,alphacaeval_v2lc_240829.csv +infinity_instruct_3m_0625_qwen2_7b,AlphacaEval v2lc,21.9,,alphacaeval_v2lc_240829.csv +infinity_instruct_3m_0625_yi_1_5_9b,AlphacaEval v2lc,20.5,,alphacaeval_v2lc_240829.csv +infinity_instruct_7m_gen_llama3_1_70b,AlphacaEval v2lc,46.1,,alphacaeval_v2lc_240829.csv +infinity_instruct_7m_gen_llama3_1_8b,AlphacaEval v2lc,33.9,,alphacaeval_v2lc_240829.csv +infinity_instruct_7m_gen_mistral_7b,AlphacaEval v2lc,39.7,,alphacaeval_v2lc_240829.csv +internlm2_chat_20b,AlphacaEval v2lc,18.7,,alphacaeval_v2lc_240829.csv +jinachat,AlphacaEval v2lc,15.9,,alphacaeval_v2lc_240829.csv +llama2_chat_13b,AlphacaEval v2lc,8.4,,alphacaeval_v2lc_240829.csv +llama2_chat_70b,AlphacaEval v2lc,14.7,,alphacaeval_v2lc_240829.csv +llama2_chat_7b,AlphacaEval v2lc,5.4,,alphacaeval_v2lc_240829.csv +llama2_chat_7b_evol70k_neft,AlphacaEval v2lc,7.5,,alphacaeval_v2lc_240829.csv +llama33b_oasst_rlhf,AlphacaEval v2lc,8.0,,alphacaeval_v2lc_240829.csv +llama33b_oasst_sft,AlphacaEval v2lc,9.9,,alphacaeval_v2lc_240829.csv +llama3_1_405b_instruct,AlphacaEval v2lc,39.3,,alphacaeval_v2lc_240829.csv +llama3_1_70b_instruct,AlphacaEval v2lc,38.1,,alphacaeval_v2lc_240829.csv +llama3_1_8b_instruct,AlphacaEval v2lc,20.9,,alphacaeval_v2lc_240829.csv +llama3_70b_instruct,AlphacaEval v2lc,34.4,,alphacaeval_v2lc_240829.csv +llama3_8b_instruct,AlphacaEval v2lc,22.9,,alphacaeval_v2lc_240829.csv +llama3_instruct_8b_simpo,AlphacaEval v2lc,44.7,,alphacaeval_v2lc_240829.csv +llama3_instruct_8b_wpo_hb_v2,AlphacaEval v2lc,53.4,,alphacaeval_v2lc_240829.csv +llama3_pbm_nova_70b,AlphacaEval v2lc,62.4,,alphacaeval_v2lc_240829.csv +lmcocktail_10_7b_v1,AlphacaEval v2lc,19.0,,alphacaeval_v2lc_240829.csv +merlinite_7b_aot,AlphacaEval v2lc,31.7,,alphacaeval_v2lc_240829.csv +minichat_1_5_3b,AlphacaEval v2lc,7.7,,alphacaeval_v2lc_240829.csv +minichat_3b,AlphacaEval v2lc,5.7,,alphacaeval_v2lc_240829.csv +minotaur_13b,AlphacaEval v2lc,11.5,,alphacaeval_v2lc_240829.csv +mistral_7b_remax_v0_1,AlphacaEval v2lc,20.6,,alphacaeval_v2lc_240829.csv +mistral_7b_v0_2,AlphacaEval v2lc,17.1,,alphacaeval_v2lc_240829.csv +mistral_7b_v0_3,AlphacaEval v2lc,20.6,,alphacaeval_v2lc_240829.csv +mistral_medium,AlphacaEval v2lc,28.6,,alphacaeval_v2lc_240829.csv +mistral_orpo_beta,AlphacaEval v2lc,14.7,,alphacaeval_v2lc_240829.csv +mixtral_8x22b_v0_1,AlphacaEval v2lc,30.9,,alphacaeval_v2lc_240829.csv +mixtral_8x7b_v0_1,AlphacaEval v2lc,23.7,,alphacaeval_v2lc_240829.csv +nanbeige2_16b_chat,AlphacaEval v2lc,40.6,,alphacaeval_v2lc_240829.csv +nanbeige2_8b_chat,AlphacaEval v2lc,25.2,,alphacaeval_v2lc_240829.csv +nanbeige_plus_chat_v0_1,AlphacaEval v2lc,44.5,,alphacaeval_v2lc_240829.csv +nous_hermes_13b,AlphacaEval v2lc,9.7,,alphacaeval_v2lc_240829.csv +openbudddy_llama2_13b_v11_1,AlphacaEval v2lc,9.2,,alphacaeval_v2lc_240829.csv +openbudddy_llama2_70b_v10_1,AlphacaEval v2lc,12.6,,alphacaeval_v2lc_240829.csv +openbuddy_falcon_40b_v9,AlphacaEval v2lc,9.0,,alphacaeval_v2lc_240829.csv +openbuddy_falcon_7b_v6,AlphacaEval v2lc,4.8,,alphacaeval_v2lc_240829.csv +openbuddy_llama30b_v7_1,AlphacaEval v2lc,10.2,,alphacaeval_v2lc_240829.csv +openbuddy_llama_65b_v8,AlphacaEval v2lc,12.5,,alphacaeval_v2lc_240829.csv +openchat8192_13b,AlphacaEval v2lc,7.9,,alphacaeval_v2lc_240829.csv +openchat_13b,AlphacaEval v2lc,8.8,,alphacaeval_v2lc_240829.csv +openchat_v2_13b,AlphacaEval v2lc,10.4,,alphacaeval_v2lc_240829.csv +openchat_v2_w_13b,AlphacaEval v2lc,12.0,,alphacaeval_v2lc_240829.csv +openchat_v3_1_13b,AlphacaEval v2lc,14.5,,alphacaeval_v2lc_240829.csv +opencoderplus_15b,AlphacaEval v2lc,8.2,,alphacaeval_v2lc_240829.csv +openhermes_2_5_mistral_7b,AlphacaEval v2lc,16.2,,alphacaeval_v2lc_240829.csv +openpipe_moa_gpt_4_turbo,AlphacaEval v2lc,68.4,,alphacaeval_v2lc_240829.csv +pairrm_0_4b+tulu_2+dpo_13b_best_of_16,AlphacaEval v2lc,17.4,,alphacaeval_v2lc_240829.csv +pairrm_0_4b+tulu_2+dpo_70b_best_of_16,AlphacaEval v2lc,21.4,,alphacaeval_v2lc_240829.csv +pairrm_0_4b+yi_34b_chat_best_of_16,AlphacaEval v2lc,28.8,,alphacaeval_v2lc_240829.csv +pairrm_0_4b+zephyr_7b_beta_best_of_16,AlphacaEval v2lc,15.5,,alphacaeval_v2lc_240829.csv +phi_2,AlphacaEval v2lc,4.4,,alphacaeval_v2lc_240829.csv +phi_2_dpo,AlphacaEval v2lc,7.8,,alphacaeval_v2lc_240829.csv +phi_2_sft,AlphacaEval v2lc,5.9,,alphacaeval_v2lc_240829.csv +platolm_7b,AlphacaEval v2lc,10.5,,alphacaeval_v2lc_240829.csv +pythia_12b_oasst_sft,AlphacaEval v2lc,3.3,,alphacaeval_v2lc_240829.csv +pythia_12b_sft,AlphacaEval v2lc,4.2,,alphacaeval_v2lc_240829.csv +qwen1_5_110b_chat,AlphacaEval v2lc,43.9,,alphacaeval_v2lc_240829.csv +qwen1_5_14b_chat,AlphacaEval v2lc,23.9,,alphacaeval_v2lc_240829.csv +qwen1_5_1_8b_chat,AlphacaEval v2lc,2.6,,alphacaeval_v2lc_240829.csv +qwen1_5_72b_chat,AlphacaEval v2lc,36.6,,alphacaeval_v2lc_240829.csv +qwen1_5_7b_chat,AlphacaEval v2lc,14.7,,alphacaeval_v2lc_240829.csv +qwen2_72b_instruct,AlphacaEval v2lc,38.1,,alphacaeval_v2lc_240829.csv +qwen_14b_chat,AlphacaEval v2lc,12.4,,alphacaeval_v2lc_240829.csv +rebel_llama3_8b_instruct,AlphacaEval v2lc,31.4,,alphacaeval_v2lc_240829.csv +recycled_wizardlm_7b_v1_0,AlphacaEval v2lc,6.9,,alphacaeval_v2lc_240829.csv +recycled_wizardlm_7b_v2_0,AlphacaEval v2lc,7.5,,alphacaeval_v2lc_240829.csv +samba_coe_v0_1,AlphacaEval v2lc,22.9,,alphacaeval_v2lc_240829.csv +samba_coe_v0_2,AlphacaEval v2lc,27.6,,alphacaeval_v2lc_240829.csv +samba_coe_v0_2_best_of_16,AlphacaEval v2lc,31.5,,alphacaeval_v2lc_240829.csv +shopee_slimmoa_v1,AlphacaEval v2lc,77.5,,alphacaeval_v2lc_240829.csv +snorkel_mistral_pairrm_dpo,AlphacaEval v2lc,26.4,,alphacaeval_v2lc_240829.csv +snorkel_mistral_pairrm_dpo+best_of_16,AlphacaEval v2lc,30.0,,alphacaeval_v2lc_240829.csv +sppo_gemma_2_9b_it_pairrm,AlphacaEval v2lc,54.0,,alphacaeval_v2lc_240829.csv +sppo_llama3_instruct_8b_pairrm,AlphacaEval v2lc,38.6,,alphacaeval_v2lc_240829.csv +sppo_mistral7b_pairrm,AlphacaEval v2lc,30.5,,alphacaeval_v2lc_240829.csv +starling_lm_7b_alpha,AlphacaEval v2lc,14.7,,alphacaeval_v2lc_240829.csv +storm_7b,AlphacaEval v2lc,50.5,,alphacaeval_v2lc_240829.csv +storm_7b_best_of_64,AlphacaEval v2lc,61.6,,alphacaeval_v2lc_240829.csv +tempnet_llama2_chat_13b_v0_1,AlphacaEval v2lc,8.6,,alphacaeval_v2lc_240829.csv +tempnet_llama2_chat_70b_v0_1,AlphacaEval v2lc,15.8,,alphacaeval_v2lc_240829.csv +tempnet_llama2_chat_7b_v0_1,AlphacaEval v2lc,5.7,,alphacaeval_v2lc_240829.csv +together_moa,AlphacaEval v2lc,65.4,,alphacaeval_v2lc_240829.csv +together_moa_lite,AlphacaEval v2lc,59.1,,alphacaeval_v2lc_240829.csv +tulu_2+dpo_13b,AlphacaEval v2lc,11.6,,alphacaeval_v2lc_240829.csv +tulu_2+dpo_70b,AlphacaEval v2lc,21.2,,alphacaeval_v2lc_240829.csv +tulu_2+dpo_7b,AlphacaEval v2lc,9.2,,alphacaeval_v2lc_240829.csv +ultralm_13b,AlphacaEval v2lc,7.1,,alphacaeval_v2lc_240829.csv +ultralm_13b_best_of_16,AlphacaEval v2lc,9.9,,alphacaeval_v2lc_240829.csv +ultralm_13b_v2_0,AlphacaEval v2lc,9.1,,alphacaeval_v2lc_240829.csv +ultralm_13b_v2_0_best_of_16,AlphacaEval v2lc,14.2,,alphacaeval_v2lc_240829.csv +vicuna_13b,AlphacaEval v2lc,9.2,,alphacaeval_v2lc_240829.csv +vicuna_13b_v1_3,AlphacaEval v2lc,10.8,,alphacaeval_v2lc_240829.csv +vicuna_13b_v1_5,AlphacaEval v2lc,10.5,,alphacaeval_v2lc_240829.csv +vicuna_13b_v1_5_together,AlphacaEval v2lc,11.7,,alphacaeval_v2lc_240829.csv +vicuna_33b_v1_3,AlphacaEval v2lc,17.6,,alphacaeval_v2lc_240829.csv +vicuna_7b,AlphacaEval v2lc,6.3,,alphacaeval_v2lc_240829.csv +vicuna_7b_v1_3,AlphacaEval v2lc,7.2,,alphacaeval_v2lc_240829.csv +vicuna_7b_v1_5,AlphacaEval v2lc,7.6,,alphacaeval_v2lc_240829.csv +wizardlm_13b,AlphacaEval v2lc,9.8,,alphacaeval_v2lc_240829.csv +wizardlm_13b_v1_1,AlphacaEval v2lc,13.9,,alphacaeval_v2lc_240829.csv +wizardlm_13b_v1_2,AlphacaEval v2lc,14.5,,alphacaeval_v2lc_240829.csv +wizardlm_70b,AlphacaEval v2lc,17.6,,alphacaeval_v2lc_240829.csv +xwinlm_13b_v0_1,AlphacaEval v2lc,17.9,,alphacaeval_v2lc_240829.csv +xwinlm_70b_v0_1,AlphacaEval v2lc,24.6,,alphacaeval_v2lc_240829.csv +xwinlm_7b_v0_1,AlphacaEval v2lc,10.8,,alphacaeval_v2lc_240829.csv +yi_34b_chat,AlphacaEval v2lc,27.2,,alphacaeval_v2lc_240829.csv +yi_large_preview,AlphacaEval v2lc,51.9,,alphacaeval_v2lc_240829.csv +zephyr_7b_alpha,AlphacaEval v2lc,10.3,,alphacaeval_v2lc_240829.csv +zephyr_7b_beta,AlphacaEval v2lc,13.2,,alphacaeval_v2lc_240829.csv +claude_3_haiku_20240307,HELM AirBench Security Risks,0.005,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Security Risks,0.009,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Security Risks,0.065,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Security Risks,0.065,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Security Risks,0.097,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Security Risks,0.124,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Security Risks,0.137,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Security Risks,0.142,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Security Risks,0.158,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Security Risks,0.275,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Security Risks,0.297,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Security Risks,0.405,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Security Risks,0.453,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Security Risks,0.457,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Security Risks,0.509,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Security Risks,0.671,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Security Risks,0.777,[],helm_airbench_240916.csv +command_r,HELM AirBench Security Risks,0.782,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Security Risks,0.829,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Security Risks,0.932,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Security Risks,0.955,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Operational Misuses,0.572,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Operational Misuses,0.473,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Operational Misuses,0.694,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Operational Misuses,0.477,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Operational Misuses,0.338,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Operational Misuses,0.371,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Operational Misuses,0.551,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Operational Misuses,0.636,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Operational Misuses,0.726,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Operational Misuses,0.636,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Operational Misuses,0.813,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Operational Misuses,0.768,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Operational Misuses,0.772,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Operational Misuses,0.709,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Operational Misuses,0.691,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Operational Misuses,0.744,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Operational Misuses,0.818,[],helm_airbench_240916.csv +command_r,HELM AirBench Operational Misuses,0.878,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Operational Misuses,0.881,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Operational Misuses,0.841,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Operational Misuses,0.874,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Violence & Extremism,0.159,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Violence & Extremism,0.156,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Violence & Extremism,0.214,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Violence & Extremism,0.187,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Violence & Extremism,0.253,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Violence & Extremism,0.289,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Violence & Extremism,0.455,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Violence & Extremism,0.329,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Violence & Extremism,0.351,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Violence & Extremism,0.589,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Violence & Extremism,0.527,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Violence & Extremism,0.664,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Violence & Extremism,0.579,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Violence & Extremism,0.541,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Violence & Extremism,0.558,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Violence & Extremism,0.726,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Violence & Extremism,0.733,[],helm_airbench_240916.csv +command_r,HELM AirBench Violence & Extremism,0.775,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Violence & Extremism,0.816,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Violence & Extremism,0.806,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Violence & Extremism,0.841,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Hate/Toxicity,0.057,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Hate/Toxicity,0.071,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Hate/Toxicity,0.188,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Hate/Toxicity,0.091,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Hate/Toxicity,0.135,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Hate/Toxicity,0.164,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Hate/Toxicity,0.274,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Hate/Toxicity,0.144,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Hate/Toxicity,0.329,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Hate/Toxicity,0.433,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Hate/Toxicity,0.327,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Hate/Toxicity,0.51,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Hate/Toxicity,0.371,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Hate/Toxicity,0.365,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Hate/Toxicity,0.377,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Hate/Toxicity,0.417,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Hate/Toxicity,0.504,[],helm_airbench_240916.csv +command_r,HELM AirBench Hate/Toxicity,0.586,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Hate/Toxicity,0.653,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Hate/Toxicity,0.501,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Hate/Toxicity,0.624,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Sexual Content,0.219,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Sexual Content,0.184,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Sexual Content,0.208,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Sexual Content,0.34,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Sexual Content,0.288,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Sexual Content,0.302,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Sexual Content,0.549,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Sexual Content,0.378,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Sexual Content,0.49,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Sexual Content,0.559,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Sexual Content,0.524,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Sexual Content,0.667,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Sexual Content,0.635,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Sexual Content,0.622,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Sexual Content,0.576,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Sexual Content,0.569,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Sexual Content,0.632,[],helm_airbench_240916.csv +command_r,HELM AirBench Sexual Content,0.712,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Sexual Content,0.729,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Sexual Content,0.597,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Sexual Content,0.684,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Child Harm,0.152,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Child Harm,0.133,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Child Harm,0.276,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Child Harm,0.19,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Child Harm,0.233,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Child Harm,0.286,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Child Harm,0.429,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Child Harm,0.381,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Child Harm,0.267,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Child Harm,0.629,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Child Harm,0.552,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Child Harm,0.752,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Child Harm,0.686,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Child Harm,0.643,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Child Harm,0.624,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Child Harm,0.767,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Child Harm,0.848,[],helm_airbench_240916.csv +command_r,HELM AirBench Child Harm,0.824,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Child Harm,0.819,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Child Harm,0.924,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Child Harm,0.924,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Self Harm,0.022,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Self Harm,0.0,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Self Harm,0.089,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Self Harm,0.022,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Self Harm,0.078,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Self Harm,0.022,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Self Harm,0.089,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Self Harm,0.156,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Self Harm,0.078,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Self Harm,0.322,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Self Harm,0.189,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Self Harm,0.422,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Self Harm,0.356,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Self Harm,0.344,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Self Harm,0.289,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Self Harm,0.322,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Self Harm,0.533,[],helm_airbench_240916.csv +command_r,HELM AirBench Self Harm,0.578,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Self Harm,0.578,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Self Harm,0.522,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Self Harm,0.722,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Political Usage,0.041,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Political Usage,0.031,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Political Usage,0.177,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Political Usage,0.063,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Political Usage,0.161,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Political Usage,0.195,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Political Usage,0.463,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Political Usage,0.323,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Political Usage,0.339,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Political Usage,0.609,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Political Usage,0.601,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Political Usage,0.725,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Political Usage,0.616,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Political Usage,0.532,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Political Usage,0.52,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Political Usage,0.747,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Political Usage,0.808,[],helm_airbench_240916.csv +command_r,HELM AirBench Political Usage,0.861,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Political Usage,0.895,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Political Usage,0.909,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Political Usage,0.963,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Economic Harm,0.04,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Economic Harm,0.02,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Economic Harm,0.14,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Economic Harm,0.06,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Economic Harm,0.09,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Economic Harm,0.153,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Economic Harm,0.433,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Economic Harm,0.293,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Economic Harm,0.34,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Economic Harm,0.623,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Economic Harm,0.587,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Economic Harm,0.71,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Economic Harm,0.623,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Economic Harm,0.567,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Economic Harm,0.503,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Economic Harm,0.647,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Economic Harm,0.74,[],helm_airbench_240916.csv +command_r,HELM AirBench Economic Harm,0.82,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Economic Harm,0.897,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Economic Harm,0.91,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Economic Harm,0.953,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Deception,0.089,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Deception,0.096,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Deception,0.259,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Deception,0.126,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Deception,0.215,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Deception,0.3,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Deception,0.522,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Deception,0.304,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Deception,0.385,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Deception,0.659,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Deception,0.504,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Deception,0.748,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Deception,0.733,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Deception,0.648,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Deception,0.681,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Deception,0.726,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Deception,0.822,[],helm_airbench_240916.csv +command_r,HELM AirBench Deception,0.822,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Deception,0.867,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Deception,0.889,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Deception,0.926,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Manipulation,0.053,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Manipulation,0.127,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Manipulation,0.353,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Manipulation,0.1,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Manipulation,0.22,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Manipulation,0.293,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Manipulation,0.433,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Manipulation,0.34,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Manipulation,0.427,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Manipulation,0.573,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Manipulation,0.54,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Manipulation,0.7,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Manipulation,0.633,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Manipulation,0.573,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Manipulation,0.533,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Manipulation,0.66,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Manipulation,0.687,[],helm_airbench_240916.csv +command_r,HELM AirBench Manipulation,0.813,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Manipulation,0.853,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Manipulation,0.853,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Manipulation,0.953,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Defamation,0.037,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Defamation,0.074,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Defamation,0.38,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Defamation,0.074,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Defamation,0.194,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Defamation,0.278,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Defamation,0.463,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Defamation,0.167,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Defamation,0.574,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Defamation,0.481,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Defamation,0.426,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Defamation,0.593,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Defamation,0.63,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Defamation,0.407,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Defamation,0.491,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Defamation,0.463,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Defamation,0.602,[],helm_airbench_240916.csv +command_r,HELM AirBench Defamation,0.648,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Defamation,0.815,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Defamation,0.648,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Defamation,0.75,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Fundamental Rights,0.0,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Fundamental Rights,0.0,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Fundamental Rights,0.027,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Fundamental Rights,0.0,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Fundamental Rights,0.06,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Fundamental Rights,0.053,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Fundamental Rights,0.213,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Fundamental Rights,0.08,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Fundamental Rights,0.147,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Fundamental Rights,0.333,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Fundamental Rights,0.267,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Fundamental Rights,0.52,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Fundamental Rights,0.467,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Fundamental Rights,0.373,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Fundamental Rights,0.227,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Fundamental Rights,0.573,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Fundamental Rights,0.627,[],helm_airbench_240916.csv +command_r,HELM AirBench Fundamental Rights,0.773,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Fundamental Rights,0.8,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Fundamental Rights,0.893,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Fundamental Rights,0.947,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Discrimination/Bias,0.382,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Discrimination/Bias,0.332,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Discrimination/Bias,0.521,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Discrimination/Bias,0.27,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Discrimination/Bias,0.24,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Discrimination/Bias,0.325,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Discrimination/Bias,0.516,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Discrimination/Bias,0.461,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Discrimination/Bias,0.502,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Discrimination/Bias,0.589,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Discrimination/Bias,0.575,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Discrimination/Bias,0.624,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Discrimination/Bias,0.571,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Discrimination/Bias,0.584,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Discrimination/Bias,0.559,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Discrimination/Bias,0.593,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Discrimination/Bias,0.592,[],helm_airbench_240916.csv +command_r,HELM AirBench Discrimination/Bias,0.678,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Discrimination/Bias,0.68,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Discrimination/Bias,0.624,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Discrimination/Bias,0.675,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Privacy,0.086,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Privacy,0.089,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Privacy,0.225,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Privacy,0.096,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Privacy,0.123,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Privacy,0.14,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Privacy,0.316,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Privacy,0.207,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Privacy,0.274,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Privacy,0.39,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Privacy,0.45,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Privacy,0.471,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Privacy,0.546,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Privacy,0.515,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Privacy,0.436,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Privacy,0.593,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Privacy,0.579,[],helm_airbench_240916.csv +command_r,HELM AirBench Privacy,0.699,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Privacy,0.709,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Privacy,0.717,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Privacy,0.817,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench Criminal Activities,0.008,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench Criminal Activities,0.0,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench Criminal Activities,0.0,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench Criminal Activities,0.017,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench Criminal Activities,0.042,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench Criminal Activities,0.033,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench Criminal Activities,0.108,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench Criminal Activities,0.058,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench Criminal Activities,0.025,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench Criminal Activities,0.267,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench Criminal Activities,0.233,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench Criminal Activities,0.45,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench Criminal Activities,0.35,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench Criminal Activities,0.3,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench Criminal Activities,0.275,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench Criminal Activities,0.646,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench Criminal Activities,0.742,[],helm_airbench_240916.csv +command_r,HELM AirBench Criminal Activities,0.717,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench Criminal Activities,0.817,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench Criminal Activities,0.942,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench Criminal Activities,0.967,[],helm_airbench_240916.csv +claude_3_haiku_20240307,HELM AirBench AIR Score,0.198,[],helm_airbench_240916.csv +claude_3_sonnet_20240229,HELM AirBench AIR Score,0.177,[],helm_airbench_240916.csv +llama3_8b_chat,HELM AirBench AIR Score,0.386,[],helm_airbench_240916.csv +claude_3_opus_20240229,HELM AirBench AIR Score,0.177,[],helm_airbench_240916.csv +gemini_1_5_pro_001_safety_default,HELM AirBench AIR Score,0.189,[],helm_airbench_240916.csv +gemini_1_5_flash_001_safety_default,HELM AirBench AIR Score,0.233,[],helm_airbench_240916.csv +gpt_3_5_turbo_0613,HELM AirBench AIR Score,0.407,[],helm_airbench_240916.csv +gpt_4_turbo_2024_04_09,HELM AirBench AIR Score,0.322,[],helm_airbench_240916.csv +llama3_70b_chat,HELM AirBench AIR Score,0.386,[],helm_airbench_240916.csv +gpt_3_5_turbo_1106,HELM AirBench AIR Score,0.511,[],helm_airbench_240916.csv +gpt_4o_2024_05_13,HELM AirBench AIR Score,0.506,[],helm_airbench_240916.csv +gpt_3_5_turbo_0125,HELM AirBench AIR Score,0.593,[],helm_airbench_240916.csv +qwen1_5_72b_chat,HELM AirBench AIR Score,0.558,[],helm_airbench_240916.csv +deepseek_llm_67b_chat,HELM AirBench AIR Score,0.533,[],helm_airbench_240916.csv +yi_34b_chat,HELM AirBench AIR Score,0.507,[],helm_airbench_240916.csv +mixtral_8x22b_instruct_v0_1,HELM AirBench AIR Score,0.611,[],helm_airbench_240916.csv +mixtral_8x7b_instruct_v0_1,HELM AirBench AIR Score,0.645,[],helm_airbench_240916.csv +command_r,HELM AirBench AIR Score,0.722,[],helm_airbench_240916.csv +command_r_plus,HELM AirBench AIR Score,0.747,[],helm_airbench_240916.csv +mistral_7b_instruct_v0_3,HELM AirBench AIR Score,0.718,[],helm_airbench_240916.csv +dbrx_instructruct,HELM AirBench AIR Score,0.786,[],helm_airbench_240916.csv +claude_3_5_sonnet,OpenCompass,67.9,[],opencompass_240829.csv +gpt_4o_20240513,OpenCompass,67.7,[],opencompass_240829.csv +mistral_large,OpenCompass,63.2,[],opencompass_240829.csv +mistral_large_instruct_2407,OpenCompass,62.5,[],opencompass_240829.csv +deepseek_v2_chat0618,OpenCompass,61.7,[],opencompass_240829.csv +gpt_4o_mini_20240718,OpenCompass,60.4,[],opencompass_240829.csv +qwen_max_0428,OpenCompass,57.8,[],opencompass_240829.csv +yi_large,OpenCompass,56.3,[],opencompass_240829.csv +qwen2_72b_instruct,OpenCompass,55.4,[],opencompass_240829.csv +glm_4,OpenCompass,55.2,[],opencompass_240829.csv +llama3_1_70b_instruct,OpenCompass,53.9,[],opencompass_240829.csv +gemma_2_27b_it,OpenCompass,53.5,[],opencompass_240829.csv +qwen1_5_110b_chat,OpenCompass,51.9,[],opencompass_240829.csv +240615,OpenCompass,51.0,[],opencompass_240829.csv +baichuan4,OpenCompass,50.4,[],opencompass_240829.csv +step_1_8k,OpenCompass,49.9,[],opencompass_240829.csv +abab6_5,OpenCompass,49.9,[],opencompass_240829.csv +ernie_4_0_8k_preview_0518,OpenCompass,48.8,[],opencompass_240829.csv +moonshot_v1_8k,OpenCompass,48.6,[],opencompass_240829.csv +glm_4_9b_chat,OpenCompass,47.9,[],opencompass_240829.csv +yi_1_5_34b_chat,OpenCompass,46.9,[],opencompass_240829.csv +hunyuan_standard_256k,OpenCompass,46.9,[],opencompass_240829.csv +mixtral_8x22b_instruct_v0_1,OpenCompass,46.3,[],opencompass_240829.csv +gemma_2_9b_it,OpenCompass,45.5,[],opencompass_240829.csv +qwen2_7b_instruct,OpenCompass,45.1,[],opencompass_240829.csv +internlm2_5_7b_chat,OpenCompass,44.5,[],opencompass_240829.csv +yi_1_5_9b_chat,OpenCompass,42.6,[],opencompass_240829.csv +nanbeige2_16b_chat,OpenCompass,42.3,[],opencompass_240829.csv +llama3_1_8b_instruct,OpenCompass,42.1,[],opencompass_240829.csv +dbrx_instructruct,OpenCompass,37.6,[],opencompass_240829.csv +yi_1_5_6b_chat,OpenCompass,36.5,[],opencompass_240829.csv +internlm2_chat_20b,OpenCompass,36.0,[],opencompass_240829.csv +mixtral_8x7b_instruct_v0_1,OpenCompass,34.5,[],opencompass_240829.csv +mistral_7b_instruct_v0_3,OpenCompass,30.7,[],opencompass_240829.csv +deepseek_v2_lite_chat,OpenCompass,30.0,[],opencompass_240829.csv +claude_3_5_sonnet,OpenCompass Language,50.9,[],opencompass_240829.csv +gpt_4o_20240513,OpenCompass Language,55.5,[],opencompass_240829.csv +mistral_large,OpenCompass Language,50.9,[],opencompass_240829.csv +mistral_large_instruct_2407,OpenCompass Language,50.3,[],opencompass_240829.csv +deepseek_v2_chat0618,OpenCompass Language,46.3,[],opencompass_240829.csv +gpt_4o_mini_20240718,OpenCompass Language,50.1,[],opencompass_240829.csv +qwen_max_0428,OpenCompass Language,56.5,[],opencompass_240829.csv +yi_large,OpenCompass Language,48.7,[],opencompass_240829.csv +qwen2_72b_instruct,OpenCompass Language,45.8,[],opencompass_240829.csv +glm_4,OpenCompass Language,45.8,[],opencompass_240829.csv +llama3_1_70b_instruct,OpenCompass Language,38.4,[],opencompass_240829.csv +gemma_2_27b_it,OpenCompass Language,45.2,[],opencompass_240829.csv +qwen1_5_110b_chat,OpenCompass Language,53.4,[],opencompass_240829.csv +240615,OpenCompass Language,31.1,[],opencompass_240829.csv +baichuan4,OpenCompass Language,37.2,[],opencompass_240829.csv +step_1_8k,OpenCompass Language,40.6,[],opencompass_240829.csv +abab6_5,OpenCompass Language,44.9,[],opencompass_240829.csv +ernie_4_0_8k_preview_0518,OpenCompass Language,36.7,[],opencompass_240829.csv +moonshot_v1_8k,OpenCompass Language,46.3,[],opencompass_240829.csv +glm_4_9b_chat,OpenCompass Language,44.3,[],opencompass_240829.csv +yi_1_5_34b_chat,OpenCompass Language,50.5,[],opencompass_240829.csv +hunyuan_standard_256k,OpenCompass Language,30.6,[],opencompass_240829.csv +mixtral_8x22b_instruct_v0_1,OpenCompass Language,33.0,[],opencompass_240829.csv +gemma_2_9b_it,OpenCompass Language,40.8,[],opencompass_240829.csv +qwen2_7b_instruct,OpenCompass Language,43.5,[],opencompass_240829.csv +internlm2_5_7b_chat,OpenCompass Language,44.6,[],opencompass_240829.csv +yi_1_5_9b_chat,OpenCompass Language,46.1,[],opencompass_240829.csv +nanbeige2_16b_chat,OpenCompass Language,50.5,[],opencompass_240829.csv +llama3_1_8b_instruct,OpenCompass Language,33.7,[],opencompass_240829.csv +dbrx_instructruct,OpenCompass Language,25.6,[],opencompass_240829.csv +yi_1_5_6b_chat,OpenCompass Language,43.6,[],opencompass_240829.csv +internlm2_chat_20b,OpenCompass Language,36.7,[],opencompass_240829.csv +mixtral_8x7b_instruct_v0_1,OpenCompass Language,36.6,[],opencompass_240829.csv +mistral_7b_instruct_v0_3,OpenCompass Language,30.3,[],opencompass_240829.csv +deepseek_v2_lite_chat,OpenCompass Language,31.4,[],opencompass_240829.csv +claude_3_5_sonnet,OpenCompass Knowledge,85.0,[],opencompass_240829.csv +gpt_4o_20240513,OpenCompass Knowledge,85.2,[],opencompass_240829.csv +mistral_large,OpenCompass Knowledge,83.4,[],opencompass_240829.csv +mistral_large_instruct_2407,OpenCompass Knowledge,83.3,[],opencompass_240829.csv +deepseek_v2_chat0618,OpenCompass Knowledge,78.8,[],opencompass_240829.csv +gpt_4o_mini_20240718,OpenCompass Knowledge,78.7,[],opencompass_240829.csv +qwen_max_0428,OpenCompass Knowledge,79.0,[],opencompass_240829.csv +yi_large,OpenCompass Knowledge,75.3,[],opencompass_240829.csv +qwen2_72b_instruct,OpenCompass Knowledge,84.0,[],opencompass_240829.csv +glm_4,OpenCompass Knowledge,77.7,[],opencompass_240829.csv +llama3_1_70b_instruct,OpenCompass Knowledge,81.4,[],opencompass_240829.csv +gemma_2_27b_it,OpenCompass Knowledge,58.5,[],opencompass_240829.csv +qwen1_5_110b_chat,OpenCompass Knowledge,79.3,[],opencompass_240829.csv +240615,OpenCompass Knowledge,78.3,[],opencompass_240829.csv +baichuan4,OpenCompass Knowledge,74.2,[],opencompass_240829.csv +step_1_8k,OpenCompass Knowledge,72.0,[],opencompass_240829.csv +abab6_5,OpenCompass Knowledge,69.8,[],opencompass_240829.csv +ernie_4_0_8k_preview_0518,OpenCompass Knowledge,76.4,[],opencompass_240829.csv +moonshot_v1_8k,OpenCompass Knowledge,61.0,[],opencompass_240829.csv +glm_4_9b_chat,OpenCompass Knowledge,68.9,[],opencompass_240829.csv +yi_1_5_34b_chat,OpenCompass Knowledge,65.0,[],opencompass_240829.csv +hunyuan_standard_256k,OpenCompass Knowledge,69.7,[],opencompass_240829.csv +mixtral_8x22b_instruct_v0_1,OpenCompass Knowledge,72.2,[],opencompass_240829.csv +gemma_2_9b_it,OpenCompass Knowledge,53.7,[],opencompass_240829.csv +qwen2_7b_instruct,OpenCompass Knowledge,64.1,[],opencompass_240829.csv +internlm2_5_7b_chat,OpenCompass Knowledge,64.8,[],opencompass_240829.csv +yi_1_5_9b_chat,OpenCompass Knowledge,56.0,[],opencompass_240829.csv +nanbeige2_16b_chat,OpenCompass Knowledge,53.8,[],opencompass_240829.csv +llama3_1_8b_instruct,OpenCompass Knowledge,63.2,[],opencompass_240829.csv +dbrx_instructruct,OpenCompass Knowledge,66.3,[],opencompass_240829.csv +yi_1_5_6b_chat,OpenCompass Knowledge,41.3,[],opencompass_240829.csv +internlm2_chat_20b,OpenCompass Knowledge,60.0,[],opencompass_240829.csv +mixtral_8x7b_instruct_v0_1,OpenCompass Knowledge,50.4,[],opencompass_240829.csv +mistral_7b_instruct_v0_3,OpenCompass Knowledge,47.8,[],opencompass_240829.csv +deepseek_v2_lite_chat,OpenCompass Knowledge,41.3,[],opencompass_240829.csv +claude_3_5_sonnet,OpenCompass Reasoning,57.0,[],opencompass_240829.csv +gpt_4o_20240513,OpenCompass Reasoning,55.8,[],opencompass_240829.csv +mistral_large,OpenCompass Reasoning,50.1,[],opencompass_240829.csv +mistral_large_instruct_2407,OpenCompass Reasoning,50.0,[],opencompass_240829.csv +deepseek_v2_chat0618,OpenCompass Reasoning,47.4,[],opencompass_240829.csv +gpt_4o_mini_20240718,OpenCompass Reasoning,45.4,[],opencompass_240829.csv +qwen_max_0428,OpenCompass Reasoning,47.9,[],opencompass_240829.csv +yi_large,OpenCompass Reasoning,47.6,[],opencompass_240829.csv +qwen2_72b_instruct,OpenCompass Reasoning,44.7,[],opencompass_240829.csv +glm_4,OpenCompass Reasoning,46.1,[],opencompass_240829.csv +llama3_1_70b_instruct,OpenCompass Reasoning,31.6,[],opencompass_240829.csv +gemma_2_27b_it,OpenCompass Reasoning,45.4,[],opencompass_240829.csv +qwen1_5_110b_chat,OpenCompass Reasoning,45.8,[],opencompass_240829.csv +240615,OpenCompass Reasoning,27.8,[],opencompass_240829.csv +baichuan4,OpenCompass Reasoning,38.5,[],opencompass_240829.csv +step_1_8k,OpenCompass Reasoning,35.8,[],opencompass_240829.csv +abab6_5,OpenCompass Reasoning,47.0,[],opencompass_240829.csv +ernie_4_0_8k_preview_0518,OpenCompass Reasoning,41.3,[],opencompass_240829.csv +moonshot_v1_8k,OpenCompass Reasoning,46.0,[],opencompass_240829.csv +glm_4_9b_chat,OpenCompass Reasoning,40.0,[],opencompass_240829.csv +yi_1_5_34b_chat,OpenCompass Reasoning,42.7,[],opencompass_240829.csv +hunyuan_standard_256k,OpenCompass Reasoning,36.8,[],opencompass_240829.csv +mixtral_8x22b_instruct_v0_1,OpenCompass Reasoning,28.6,[],opencompass_240829.csv +gemma_2_9b_it,OpenCompass Reasoning,41.9,[],opencompass_240829.csv +qwen2_7b_instruct,OpenCompass Reasoning,36.2,[],opencompass_240829.csv +internlm2_5_7b_chat,OpenCompass Reasoning,39.3,[],opencompass_240829.csv +yi_1_5_9b_chat,OpenCompass Reasoning,39.8,[],opencompass_240829.csv +nanbeige2_16b_chat,OpenCompass Reasoning,40.5,[],opencompass_240829.csv +llama3_1_8b_instruct,OpenCompass Reasoning,24.9,[],opencompass_240829.csv +dbrx_instructruct,OpenCompass Reasoning,20.8,[],opencompass_240829.csv +yi_1_5_6b_chat,OpenCompass Reasoning,36.5,[],opencompass_240829.csv +internlm2_chat_20b,OpenCompass Reasoning,18.9,[],opencompass_240829.csv +mixtral_8x7b_instruct_v0_1,OpenCompass Reasoning,28.1,[],opencompass_240829.csv +mistral_7b_instruct_v0_3,OpenCompass Reasoning,20.7,[],opencompass_240829.csv +deepseek_v2_lite_chat,OpenCompass Reasoning,28.1,[],opencompass_240829.csv +claude_3_5_sonnet,OpenCompass Math,71.1,[],opencompass_240829.csv +gpt_4o_20240513,OpenCompass Math,71.1,[],opencompass_240829.csv +mistral_large,OpenCompass Math,66.4,[],opencompass_240829.csv +mistral_large_instruct_2407,OpenCompass Math,72.8,[],opencompass_240829.csv +deepseek_v2_chat0618,OpenCompass Math,68.2,[],opencompass_240829.csv +gpt_4o_mini_20240718,OpenCompass Math,58.2,[],opencompass_240829.csv +qwen_max_0428,OpenCompass Math,55.1,[],opencompass_240829.csv +yi_large,OpenCompass Math,54.8,[],opencompass_240829.csv +qwen2_72b_instruct,OpenCompass Math,57.7,[],opencompass_240829.csv +glm_4,OpenCompass Math,53.2,[],opencompass_240829.csv +llama3_1_70b_instruct,OpenCompass Math,58.0,[],opencompass_240829.csv +gemma_2_27b_it,OpenCompass Math,50.1,[],opencompass_240829.csv +qwen1_5_110b_chat,OpenCompass Math,39.6,[],opencompass_240829.csv +240615,OpenCompass Math,67.5,[],opencompass_240829.csv +baichuan4,OpenCompass Math,51.8,[],opencompass_240829.csv +step_1_8k,OpenCompass Math,51.4,[],opencompass_240829.csv +abab6_5,OpenCompass Math,47.2,[],opencompass_240829.csv +ernie_4_0_8k_preview_0518,OpenCompass Math,44.7,[],opencompass_240829.csv +moonshot_v1_8k,OpenCompass Math,46.6,[],opencompass_240829.csv +glm_4_9b_chat,OpenCompass Math,38.7,[],opencompass_240829.csv +yi_1_5_34b_chat,OpenCompass Math,38.1,[],opencompass_240829.csv +hunyuan_standard_256k,OpenCompass Math,53.9,[],opencompass_240829.csv +mixtral_8x22b_instruct_v0_1,OpenCompass Math,47.2,[],opencompass_240829.csv +gemma_2_9b_it,OpenCompass Math,40.7,[],opencompass_240829.csv +qwen2_7b_instruct,OpenCompass Math,37.7,[],opencompass_240829.csv +internlm2_5_7b_chat,OpenCompass Math,40.8,[],opencompass_240829.csv +yi_1_5_9b_chat,OpenCompass Math,38.2,[],opencompass_240829.csv +nanbeige2_16b_chat,OpenCompass Math,25.8,[],opencompass_240829.csv +llama3_1_8b_instruct,OpenCompass Math,38.0,[],opencompass_240829.csv +dbrx_instructruct,OpenCompass Math,35.3,[],opencompass_240829.csv +yi_1_5_6b_chat,OpenCompass Math,28.4,[],opencompass_240829.csv +internlm2_chat_20b,OpenCompass Math,27.4,[],opencompass_240829.csv +mixtral_8x7b_instruct_v0_1,OpenCompass Math,24.8,[],opencompass_240829.csv +mistral_7b_instruct_v0_3,OpenCompass Math,18.1,[],opencompass_240829.csv +deepseek_v2_lite_chat,OpenCompass Math,22.8,[],opencompass_240829.csv +claude_3_5_sonnet,OpenCompass Code,69.6,[],opencompass_240829.csv +gpt_4o_20240513,OpenCompass Code,69.1,[],opencompass_240829.csv +mistral_large,OpenCompass Code,65.1,[],opencompass_240829.csv +mistral_large_instruct_2407,OpenCompass Code,55.6,[],opencompass_240829.csv +deepseek_v2_chat0618,OpenCompass Code,66.2,[],opencompass_240829.csv +gpt_4o_mini_20240718,OpenCompass Code,63.3,[],opencompass_240829.csv +qwen_max_0428,OpenCompass Code,52.4,[],opencompass_240829.csv +yi_large,OpenCompass Code,54.3,[],opencompass_240829.csv +qwen2_72b_instruct,OpenCompass Code,49.5,[],opencompass_240829.csv +glm_4,OpenCompass Code,56.3,[],opencompass_240829.csv +llama3_1_70b_instruct,OpenCompass Code,53.7,[],opencompass_240829.csv +gemma_2_27b_it,OpenCompass Code,54.6,[],opencompass_240829.csv +qwen1_5_110b_chat,OpenCompass Code,49.5,[],opencompass_240829.csv +240615,OpenCompass Code,50.2,[],opencompass_240829.csv +baichuan4,OpenCompass Code,44.1,[],opencompass_240829.csv +step_1_8k,OpenCompass Code,44.2,[],opencompass_240829.csv +abab6_5,OpenCompass Code,50.5,[],opencompass_240829.csv +ernie_4_0_8k_preview_0518,OpenCompass Code,50.6,[],opencompass_240829.csv +moonshot_v1_8k,OpenCompass Code,47.0,[],opencompass_240829.csv +glm_4_9b_chat,OpenCompass Code,45.1,[],opencompass_240829.csv +yi_1_5_34b_chat,OpenCompass Code,44.8,[],opencompass_240829.csv +hunyuan_standard_256k,OpenCompass Code,46.1,[],opencompass_240829.csv +mixtral_8x22b_instruct_v0_1,OpenCompass Code,44.7,[],opencompass_240829.csv +gemma_2_9b_it,OpenCompass Code,42.2,[],opencompass_240829.csv +qwen2_7b_instruct,OpenCompass Code,44.0,[],opencompass_240829.csv +internlm2_5_7b_chat,OpenCompass Code,34.8,[],opencompass_240829.csv +yi_1_5_9b_chat,OpenCompass Code,41.8,[],opencompass_240829.csv +nanbeige2_16b_chat,OpenCompass Code,33.3,[],opencompass_240829.csv +llama3_1_8b_instruct,OpenCompass Code,39.3,[],opencompass_240829.csv +dbrx_instructruct,OpenCompass Code,32.2,[],opencompass_240829.csv +yi_1_5_6b_chat,OpenCompass Code,34.4,[],opencompass_240829.csv +internlm2_chat_20b,OpenCompass Code,36.2,[],opencompass_240829.csv +mixtral_8x7b_instruct_v0_1,OpenCompass Code,26.7,[],opencompass_240829.csv +mistral_7b_instruct_v0_3,OpenCompass Code,23.6,[],opencompass_240829.csv +deepseek_v2_lite_chat,OpenCompass Code,16.3,[],opencompass_240829.csv +claude_3_5_sonnet,OpenCompass Instruction,66.2,[],opencompass_240829.csv +gpt_4o_20240513,OpenCompass Instruction,60.3,[],opencompass_240829.csv +mistral_large,OpenCompass Instruction,51.1,[],opencompass_240829.csv +mistral_large_instruct_2407,OpenCompass Instruction,50.3,[],opencompass_240829.csv +deepseek_v2_chat0618,OpenCompass Instruction,44.1,[],opencompass_240829.csv +gpt_4o_mini_20240718,OpenCompass Instruction,56.0,[],opencompass_240829.csv +qwen_max_0428,OpenCompass Instruction,47.4,[],opencompass_240829.csv +yi_large,OpenCompass Instruction,40.0,[],opencompass_240829.csv +qwen2_72b_instruct,OpenCompass Instruction,34.0,[],opencompass_240829.csv +glm_4,OpenCompass Instruction,36.9,[],opencompass_240829.csv +llama3_1_70b_instruct,OpenCompass Instruction,46.2,[],opencompass_240829.csv +gemma_2_27b_it,OpenCompass Instruction,45.2,[],opencompass_240829.csv +qwen1_5_110b_chat,OpenCompass Instruction,36.8,[],opencompass_240829.csv +240615,OpenCompass Instruction,30.6,[],opencompass_240829.csv +baichuan4,OpenCompass Instruction,39.4,[],opencompass_240829.csv +step_1_8k,OpenCompass Instruction,38.9,[],opencompass_240829.csv +abab6_5,OpenCompass Instruction,32.0,[],opencompass_240829.csv +ernie_4_0_8k_preview_0518,OpenCompass Instruction,28.5,[],opencompass_240829.csv +moonshot_v1_8k,OpenCompass Instruction,35.9,[],opencompass_240829.csv +glm_4_9b_chat,OpenCompass Instruction,36.0,[],opencompass_240829.csv +yi_1_5_34b_chat,OpenCompass Instruction,38.8,[],opencompass_240829.csv +hunyuan_standard_256k,OpenCompass Instruction,29.2,[],opencompass_240829.csv +mixtral_8x22b_instruct_v0_1,OpenCompass Instruction,31.2,[],opencompass_240829.csv +gemma_2_9b_it,OpenCompass Instruction,40.9,[],opencompass_240829.csv +qwen2_7b_instruct,OpenCompass Instruction,27.5,[],opencompass_240829.csv +internlm2_5_7b_chat,OpenCompass Instruction,26.5,[],opencompass_240829.csv +yi_1_5_9b_chat,OpenCompass Instruction,29.8,[],opencompass_240829.csv +nanbeige2_16b_chat,OpenCompass Instruction,33.2,[],opencompass_240829.csv +llama3_1_8b_instruct,OpenCompass Instruction,39.1,[],opencompass_240829.csv +dbrx_instructruct,OpenCompass Instruction,32.5,[],opencompass_240829.csv +yi_1_5_6b_chat,OpenCompass Instruction,26.3,[],opencompass_240829.csv +internlm2_chat_20b,OpenCompass Instruction,18.5,[],opencompass_240829.csv +mixtral_8x7b_instruct_v0_1,OpenCompass Instruction,28.2,[],opencompass_240829.csv +mistral_7b_instruct_v0_3,OpenCompass Instruction,28.5,[],opencompass_240829.csv +deepseek_v2_lite_chat,OpenCompass Instruction,20.6,[],opencompass_240829.csv +claude_3_5_sonnet,OpenCompass Agent,81.7,[],opencompass_240829.csv +gpt_4o_20240513,OpenCompass Agent,84.4,[],opencompass_240829.csv +mistral_large,OpenCompass Agent,83.5,[],opencompass_240829.csv +mistral_large_instruct_2407,OpenCompass Agent,84.5,[],opencompass_240829.csv +deepseek_v2_chat0618,OpenCompass Agent,83.7,[],opencompass_240829.csv +gpt_4o_mini_20240718,OpenCompass Agent,85.7,[],opencompass_240829.csv +qwen_max_0428,OpenCompass Agent,83.8,[],opencompass_240829.csv +yi_large,OpenCompass Agent,86.1,[],opencompass_240829.csv +qwen2_72b_instruct,OpenCompass Agent,85.9,[],opencompass_240829.csv +glm_4,OpenCompass Agent,80.4,[],opencompass_240829.csv +llama3_1_70b_instruct,OpenCompass Agent,86.5,[],opencompass_240829.csv +gemma_2_27b_it,OpenCompass Agent,85.5,[],opencompass_240829.csv +qwen1_5_110b_chat,OpenCompass Agent,79.6,[],opencompass_240829.csv +240615,OpenCompass Agent,79.3,[],opencompass_240829.csv +baichuan4,OpenCompass Agent,84.5,[],opencompass_240829.csv +step_1_8k,OpenCompass Agent,84.2,[],opencompass_240829.csv +abab6_5,OpenCompass Agent,62.5,[],opencompass_240829.csv +ernie_4_0_8k_preview_0518,OpenCompass Agent,72.7,[],opencompass_240829.csv +moonshot_v1_8k,OpenCompass Agent,63.5,[],opencompass_240829.csv +glm_4_9b_chat,OpenCompass Agent,81.9,[],opencompass_240829.csv +yi_1_5_34b_chat,OpenCompass Agent,63.5,[],opencompass_240829.csv +hunyuan_standard_256k,OpenCompass Agent,65.6,[],opencompass_240829.csv +mixtral_8x22b_instruct_v0_1,OpenCompass Agent,86.0,[],opencompass_240829.csv +gemma_2_9b_it,OpenCompass Agent,69.9,[],opencompass_240829.csv +qwen2_7b_instruct,OpenCompass Agent,79.7,[],opencompass_240829.csv +internlm2_5_7b_chat,OpenCompass Agent,79.0,[],opencompass_240829.csv +yi_1_5_9b_chat,OpenCompass Agent,54.3,[],opencompass_240829.csv +nanbeige2_16b_chat,OpenCompass Agent,85.8,[],opencompass_240829.csv +llama3_1_8b_instruct,OpenCompass Agent,80.1,[],opencompass_240829.csv +dbrx_instructruct,OpenCompass Agent,75.3,[],opencompass_240829.csv +yi_1_5_6b_chat,OpenCompass Agent,55.4,[],opencompass_240829.csv +internlm2_chat_20b,OpenCompass Agent,80.3,[],opencompass_240829.csv +mixtral_8x7b_instruct_v0_1,OpenCompass Agent,71.0,[],opencompass_240829.csv +mistral_7b_instruct_v0_3,OpenCompass Agent,75.4,[],opencompass_240829.csv +deepseek_v2_lite_chat,OpenCompass Agent,72.4,[],opencompass_240829.csv +240615,OpenCompass Arena,1011.0,,opencompass_arena_240829.csv +abab6_5_chat,OpenCompass Arena,1027.0,,opencompass_arena_240829.csv +baichuan4,OpenCompass Arena,1007.0,,opencompass_arena_240829.csv +claude_3_5_sonnet_20240620,OpenCompass Arena,1055.0,,opencompass_arena_240829.csv +command_r_plus,OpenCompass Arena,977.0,,opencompass_arena_240829.csv +dbrx_instructruct,OpenCompass Arena,879.0,,opencompass_arena_240829.csv +deepseek_llm_67b_chat,OpenCompass Arena,937.0,,opencompass_arena_240829.csv +deepseek_moe_16b_chat,OpenCompass Arena,895.0,,opencompass_arena_240829.csv +deepseek_v2,OpenCompass Arena,1027.0,,opencompass_arena_240829.csv +deepseek_v2_chat,OpenCompass Arena,1048.0,,opencompass_arena_240829.csv +ernie_4_0_8k_preview_0518,OpenCompass Arena,1051.0,,opencompass_arena_240829.csv +glm_4_0520,OpenCompass Arena,1033.0,,opencompass_arena_240829.csv +gpt_40_20240513,OpenCompass Arena,1090.0,,opencompass_arena_240829.csv +gpt_4_turbo_20240409,OpenCompass Arena,1044.0,,opencompass_arena_240829.csv +hunyuan_pro,OpenCompass Arena,1069.0,,opencompass_arena_240829.csv +internlm2_5_7b_chat,OpenCompass Arena,958.0,,opencompass_arena_240829.csv +internlm2_chat_20b,OpenCompass Arena,992.0,,opencompass_arena_240829.csv +internlm2_chat_7b,OpenCompass Arena,968.0,,opencompass_arena_240829.csv +llama3_70b_instruct,OpenCompass Arena,926.0,,opencompass_arena_240829.csv +llama3_8b_instruct,OpenCompass Arena,920.0,,opencompass_arena_240829.csv +mixtral_8x22b_instruct_v0_1,OpenCompass Arena,933.0,,opencompass_arena_240829.csv +moonshot_v1_32k,OpenCompass Arena,994.0,,opencompass_arena_240829.csv +qwen1_5_14b_chat,OpenCompass Arena,968.0,,opencompass_arena_240829.csv +qwen1_5_32b_chat,OpenCompass Arena,1007.0,,opencompass_arena_240829.csv +qwen1_5_72b_chat,OpenCompass Arena,1007.0,,opencompass_arena_240829.csv +qwen1_5_7b_chat,OpenCompass Arena,970.0,,opencompass_arena_240829.csv +qwen2_72b_instruct,OpenCompass Arena,1085.0,,opencompass_arena_240829.csv +qwen_max_0428,OpenCompass Arena,1071.0,,opencompass_arena_240829.csv +yi_1_5_34b_chat,OpenCompass Arena,1016.0,,opencompass_arena_240829.csv +yi_34b_chat,OpenCompass Arena,983.0,,opencompass_arena_240829.csv +yi_large,OpenCompass Arena,1051.0,,opencompass_arena_240829.csv +claude_3_5_sonnet_20240620,LiveBench 240725,59.87,[],livebench_240829.csv +gpt_4o_2024_08_06,LiveBench 240725,56.71,[],livebench_240829.csv +chatgpt_4o_latest,LiveBench 240725,54.71,[],livebench_240829.csv +gpt_4o_2024_05_13,LiveBench 240725,54.63,[],livebench_240829.csv +llama3_1_405b_instruct_turbo,LiveBench 240725,54.25,[],livebench_240829.csv +gemini_1_5_pro_exp_0827,LiveBench 240725,53.78,[],livebench_240829.csv +gpt_4_turbo_2024_04_09,LiveBench 240725,52.88,[],livebench_240829.csv +gemini_1_5_pro_exp_0801,LiveBench 240725,52.22,[],livebench_240829.csv +claude_3_opus_20240229,LiveBench 240725,50.56,[],livebench_240829.csv +gpt_4_0125_preview,LiveBench 240725,48.9,[],livebench_240829.csv +dracarys_llama3_1_70b_instruct,LiveBench 240725,48.67,[],livebench_240829.csv +llama3_1_70b_instruct_turbo,LiveBench 240725,48.44,[],livebench_240829.csv +mistral_large_2407,LiveBench 240725,47.97,[],livebench_240829.csv +gemini_1_5_flash_exp_0827,LiveBench 240725,46.87,[],livebench_240829.csv +deepseek_coder_v2,LiveBench 240725,46.31,[],livebench_240829.csv +deepseek_chat_v2,LiveBench 240725,46.04,[],livebench_240829.csv +gpt_4_0613,LiveBench 240725,45.6,[],livebench_240829.csv +gemini_1_5_pro_api_0514,LiveBench 240725,44.72,[],livebench_240829.csv +gpt_4o_mini_2024_07_18,LiveBench 240725,44.26,[],livebench_240829.csv +gemma_2_27b_it,LiveBench 240725,41.26,[],livebench_240829.csv +dracarys_72b_instruct,LiveBench 240725,41.2,[],livebench_240829.csv +qwen2_72b_instruct,LiveBench 240725,40.15,[],livebench_240829.csv +hermes_3_llama3_1_70b,LiveBench 240725,40.05,[],livebench_240829.csv +gemini_1_5_flash_api_0514,LiveBench 240725,40.04,[],livebench_240829.csv +smaug_qwen2_72b_instruct,LiveBench 240725,39.32,[],livebench_240829.csv +mistral_large_2402,LiveBench 240725,39.18,[],livebench_240829.csv +claude_3_sonnet_20240229,LiveBench 240725,38.72,[],livebench_240829.csv +llama3_70b_instruct,LiveBench 240725,37.73,[],livebench_240829.csv +claude_3_haiku_20240307,LiveBench 240725,35.86,[],livebench_240829.csv +mixtral_8x22b_instruct_v0_1,LiveBench 240725,35.17,[],livebench_240829.csv +phi_3_5_moe_instruct,LiveBench 240725,35.16,[],livebench_240829.csv +gpt_3_5_turbo_0125,LiveBench 240725,34.54,[],livebench_240829.csv +mistral_small_2402,LiveBench 240725,32.19,[],livebench_240829.csv +command_r_plus,LiveBench 240725,32.17,[],livebench_240829.csv +gemma_2_9b_it,LiveBench 240725,31.34,[],livebench_240829.csv +phi_3_medium_4k_instruct,LiveBench 240725,31.22,[],livebench_240829.csv +phi_3_medium_128k_instruct,LiveBench 240725,30.3,[],livebench_240829.csv +phi_3_small_128k_instruct,LiveBench 240725,29.97,[],livebench_240829.csv +qwen1_5_110b_chat,LiveBench 240725,29.78,[],livebench_240829.csv +deepseek_coder_v2_lite_instruct,LiveBench 240725,29.53,[],livebench_240829.csv +qwen1_5_72b_chat,LiveBench 240725,29.26,[],livebench_240829.csv +open_mistral_nemo,LiveBench 240725,29.17,[],livebench_240829.csv +phi_3_5_mini_instruct,LiveBench 240725,28.3,[],livebench_240829.csv +llama3_1_8b_instruct_turbo,LiveBench 240725,28.03,[],livebench_240829.csv +phi_3_small_8k_instruct,LiveBench 240725,27.98,[],livebench_240829.csv +llama3_8b_instruct,LiveBench 240725,27.56,[],livebench_240829.csv +command_r,LiveBench 240725,26.83,[],livebench_240829.csv +qwen2_7b_instruct,LiveBench 240725,26.58,[],livebench_240829.csv +phi_3_mini_128k_instruct,LiveBench 240725,25.55,[],livebench_240829.csv +phi_3_mini_4k_instruct,LiveBench 240725,25.46,[],livebench_240829.csv +mathstral_7b_v0_1,LiveBench 240725,24.48,[],livebench_240829.csv +openhermes_2_5_mistral_7b,LiveBench 240725,24.13,[],livebench_240829.csv +mixtral_8x7b_instruct_v0_1,LiveBench 240725,22.73,[],livebench_240829.csv +mistral_7b_instruct_v0_3,LiveBench 240725,21.25,[],livebench_240829.csv +mistral_7b_instruct_v0_2,LiveBench 240725,20.05,[],livebench_240829.csv +gemma_1_1_7b_it,LiveBench 240725,18.78,[],livebench_240829.csv +zephyr_7b_alpha,LiveBench 240725,18.6,[],livebench_240829.csv +qwen1_5_7b_chat,LiveBench 240725,17.98,[],livebench_240829.csv +deepseek_v2_lite_chat,LiveBench 240725,17.74,[],livebench_240829.csv +zephyr_7b_beta,LiveBench 240725,16.72,[],livebench_240829.csv +starling_lm_7b_beta,LiveBench 240725,16.6,[],livebench_240829.csv +vicuna_7b_v1_5_16k,LiveBench 240725,14.5,[],livebench_240829.csv +vicuna_7b_v1_5,LiveBench 240725,12.57,[],livebench_240829.csv +llama_2_7b_chat,LiveBench 240725,11.63,[],livebench_240829.csv +qwen1_5_4b_chat,LiveBench 240725,11.28,[],livebench_240829.csv +qwen2_1_5b_instruct,LiveBench 240725,10.35,[],livebench_240829.csv +yi_6b_chat,LiveBench 240725,9.58,[],livebench_240829.csv +qwen2_0_5b_instruct,LiveBench 240725,7.68,[],livebench_240829.csv +qwen1_5_1_8b_chat,LiveBench 240725,6.04,[],livebench_240829.csv +qwen1_5_0_5b_chat,LiveBench 240725,5.21,[],livebench_240829.csv +claude_3_5_sonnet_20240620,LiveBench Reasoning,58.67,[],livebench_240829.csv +gpt_4o_2024_08_06,LiveBench Reasoning,54.67,[],livebench_240829.csv +chatgpt_4o_latest,LiveBench Reasoning,52.0,[],livebench_240829.csv +gpt_4o_2024_05_13,LiveBench Reasoning,50.0,[],livebench_240829.csv +llama3_1_405b_instruct_turbo,LiveBench Reasoning,53.33,[],livebench_240829.csv +gemini_1_5_pro_exp_0827,LiveBench Reasoning,49.33,[],livebench_240829.csv +gpt_4_turbo_2024_04_09,LiveBench Reasoning,51.33,[],livebench_240829.csv +gemini_1_5_pro_exp_0801,LiveBench Reasoning,48.67,[],livebench_240829.csv +claude_3_opus_20240229,LiveBench Reasoning,41.33,[],livebench_240829.csv +gpt_4_0125_preview,LiveBench Reasoning,47.33,[],livebench_240829.csv +dracarys_llama3_1_70b_instruct,LiveBench Reasoning,44.0,[],livebench_240829.csv +llama3_1_70b_instruct_turbo,LiveBench Reasoning,40.67,[],livebench_240829.csv +mistral_large_2407,LiveBench Reasoning,42.0,[],livebench_240829.csv +gemini_1_5_flash_exp_0827,LiveBench Reasoning,47.33,[],livebench_240829.csv +deepseek_coder_v2,LiveBench Reasoning,45.33,[],livebench_240829.csv +deepseek_chat_v2,LiveBench Reasoning,40.0,[],livebench_240829.csv +gpt_4_0613,LiveBench Reasoning,34.67,[],livebench_240829.csv +gemini_1_5_pro_api_0514,LiveBench Reasoning,35.33,[],livebench_240829.csv +gpt_4o_mini_2024_07_18,LiveBench Reasoning,35.33,[],livebench_240829.csv +gemma_2_27b_it,LiveBench Reasoning,32.0,[],livebench_240829.csv +dracarys_72b_instruct,LiveBench Reasoning,40.0,[],livebench_240829.csv +qwen2_72b_instruct,LiveBench Reasoning,41.33,[],livebench_240829.csv +hermes_3_llama3_1_70b,LiveBench Reasoning,33.33,[],livebench_240829.csv +gemini_1_5_flash_api_0514,LiveBench Reasoning,29.33,[],livebench_240829.csv +smaug_qwen2_72b_instruct,LiveBench Reasoning,36.0,[],livebench_240829.csv +mistral_large_2402,LiveBench Reasoning,36.0,[],livebench_240829.csv +claude_3_sonnet_20240229,LiveBench Reasoning,28.67,[],livebench_240829.csv +llama3_70b_instruct,LiveBench Reasoning,30.67,[],livebench_240829.csv +claude_3_haiku_20240307,LiveBench Reasoning,29.33,[],livebench_240829.csv +mixtral_8x22b_instruct_v0_1,LiveBench Reasoning,29.33,[],livebench_240829.csv +phi_3_5_moe_instruct,LiveBench Reasoning,38.67,[],livebench_240829.csv +gpt_3_5_turbo_0125,LiveBench Reasoning,26.67,[],livebench_240829.csv +mistral_small_2402,LiveBench Reasoning,26.0,[],livebench_240829.csv +command_r_plus,LiveBench Reasoning,28.67,[],livebench_240829.csv +gemma_2_9b_it,LiveBench Reasoning,17.33,[],livebench_240829.csv +phi_3_medium_4k_instruct,LiveBench Reasoning,36.67,[],livebench_240829.csv +phi_3_medium_128k_instruct,LiveBench Reasoning,34.0,[],livebench_240829.csv +phi_3_small_128k_instruct,LiveBench Reasoning,30.0,[],livebench_240829.csv +qwen1_5_110b_chat,LiveBench Reasoning,30.67,[],livebench_240829.csv +deepseek_coder_v2_lite_instruct,LiveBench Reasoning,26.0,[],livebench_240829.csv +qwen1_5_72b_chat,LiveBench Reasoning,23.33,[],livebench_240829.csv +open_mistral_nemo,LiveBench Reasoning,25.33,[],livebench_240829.csv +phi_3_5_mini_instruct,LiveBench Reasoning,33.33,[],livebench_240829.csv +llama3_1_8b_instruct_turbo,LiveBench Reasoning,15.33,[],livebench_240829.csv +phi_3_small_8k_instruct,LiveBench Reasoning,23.33,[],livebench_240829.csv +llama3_8b_instruct,LiveBench Reasoning,24.0,[],livebench_240829.csv +command_r,LiveBench Reasoning,25.33,[],livebench_240829.csv +qwen2_7b_instruct,LiveBench Reasoning,20.0,[],livebench_240829.csv +phi_3_mini_128k_instruct,LiveBench Reasoning,28.0,[],livebench_240829.csv +phi_3_mini_4k_instruct,LiveBench Reasoning,28.0,[],livebench_240829.csv +mathstral_7b_v0_1,LiveBench Reasoning,18.0,[],livebench_240829.csv +openhermes_2_5_mistral_7b,LiveBench Reasoning,20.0,[],livebench_240829.csv +mixtral_8x7b_instruct_v0_1,LiveBench Reasoning,17.33,[],livebench_240829.csv +mistral_7b_instruct_v0_3,LiveBench Reasoning,16.0,[],livebench_240829.csv +mistral_7b_instruct_v0_2,LiveBench Reasoning,14.0,[],livebench_240829.csv +gemma_1_1_7b_it,LiveBench Reasoning,14.67,[],livebench_240829.csv +zephyr_7b_alpha,LiveBench Reasoning,12.0,[],livebench_240829.csv +qwen1_5_7b_chat,LiveBench Reasoning,16.0,[],livebench_240829.csv +deepseek_v2_lite_chat,LiveBench Reasoning,16.0,[],livebench_240829.csv +zephyr_7b_beta,LiveBench Reasoning,12.67,[],livebench_240829.csv +starling_lm_7b_beta,LiveBench Reasoning,18.67,[],livebench_240829.csv +vicuna_7b_v1_5_16k,LiveBench Reasoning,15.33,[],livebench_240829.csv +vicuna_7b_v1_5,LiveBench Reasoning,12.67,[],livebench_240829.csv +llama_2_7b_chat,LiveBench Reasoning,12.0,[],livebench_240829.csv +qwen1_5_4b_chat,LiveBench Reasoning,10.67,[],livebench_240829.csv +qwen2_1_5b_instruct,LiveBench Reasoning,8.0,[],livebench_240829.csv +yi_6b_chat,LiveBench Reasoning,10.67,[],livebench_240829.csv +qwen2_0_5b_instruct,LiveBench Reasoning,6.0,[],livebench_240829.csv +qwen1_5_1_8b_chat,LiveBench Reasoning,3.33,[],livebench_240829.csv +qwen1_5_0_5b_chat,LiveBench Reasoning,2.67,[],livebench_240829.csv +claude_3_5_sonnet_20240620,LiveBench Coding,60.85,[],livebench_240829.csv +gpt_4o_2024_08_06,LiveBench Coding,51.44,[],livebench_240829.csv +chatgpt_4o_latest,LiveBench Coding,47.15,[],livebench_240829.csv +gpt_4o_2024_05_13,LiveBench Coding,49.36,[],livebench_240829.csv +llama3_1_405b_instruct_turbo,LiveBench Coding,43.8,[],livebench_240829.csv +gemini_1_5_pro_exp_0827,LiveBench Coding,40.95,[],livebench_240829.csv +gpt_4_turbo_2024_04_09,LiveBench Coding,49.0,[],livebench_240829.csv +gemini_1_5_pro_exp_0801,LiveBench Coding,41.23,[],livebench_240829.csv +claude_3_opus_20240229,LiveBench Coding,38.59,[],livebench_240829.csv +gpt_4_0125_preview,LiveBench Coding,41.8,[],livebench_240829.csv +dracarys_llama3_1_70b_instruct,LiveBench Coding,35.23,[],livebench_240829.csv +llama3_1_70b_instruct_turbo,LiveBench Coding,32.67,[],livebench_240829.csv +mistral_large_2407,LiveBench Coding,47.08,[],livebench_240829.csv +gemini_1_5_flash_exp_0827,LiveBench Coding,40.59,[],livebench_240829.csv +deepseek_coder_v2,LiveBench Coding,41.51,[],livebench_240829.csv +deepseek_chat_v2,LiveBench Coding,41.15,[],livebench_240829.csv +gpt_4_0613,LiveBench Coding,37.31,[],livebench_240829.csv +gemini_1_5_pro_api_0514,LiveBench Coding,32.31,[],livebench_240829.csv +gpt_4o_mini_2024_07_18,LiveBench Coding,43.15,[],livebench_240829.csv +gemma_2_27b_it,LiveBench Coding,35.95,[],livebench_240829.csv +dracarys_72b_instruct,LiveBench Coding,38.95,[],livebench_240829.csv +qwen2_72b_instruct,LiveBench Coding,32.38,[],livebench_240829.csv +hermes_3_llama3_1_70b,LiveBench Coding,31.38,[],livebench_240829.csv +gemini_1_5_flash_api_0514,LiveBench Coding,34.31,[],livebench_240829.csv +smaug_qwen2_72b_instruct,LiveBench Coding,38.03,[],livebench_240829.csv +mistral_large_2402,LiveBench Coding,27.38,[],livebench_240829.csv +claude_3_sonnet_20240229,LiveBench Coding,26.38,[],livebench_240829.csv +llama3_70b_instruct,LiveBench Coding,22.03,[],livebench_240829.csv +claude_3_haiku_20240307,LiveBench Coding,24.46,[],livebench_240829.csv +mixtral_8x22b_instruct_v0_1,LiveBench Coding,32.03,[],livebench_240829.csv +phi_3_5_moe_instruct,LiveBench Coding,21.74,[],livebench_240829.csv +gpt_3_5_turbo_0125,LiveBench Coding,27.74,[],livebench_240829.csv +mistral_small_2402,LiveBench Coding,21.18,[],livebench_240829.csv +command_r_plus,LiveBench Coding,19.46,[],livebench_240829.csv +gemma_2_9b_it,LiveBench Coding,22.46,[],livebench_240829.csv +phi_3_medium_4k_instruct,LiveBench Coding,20.46,[],livebench_240829.csv +phi_3_medium_128k_instruct,LiveBench Coding,21.1,[],livebench_240829.csv +phi_3_small_128k_instruct,LiveBench Coding,24.57,[],livebench_240829.csv +qwen1_5_110b_chat,LiveBench Coding,21.82,[],livebench_240829.csv +deepseek_coder_v2_lite_instruct,LiveBench Coding,24.74,[],livebench_240829.csv +qwen1_5_72b_chat,LiveBench Coding,22.82,[],livebench_240829.csv +open_mistral_nemo,LiveBench Coding,28.74,[],livebench_240829.csv +phi_3_5_mini_instruct,LiveBench Coding,15.9,[],livebench_240829.csv +llama3_1_8b_instruct_turbo,LiveBench Coding,19.74,[],livebench_240829.csv +phi_3_small_8k_instruct,LiveBench Coding,20.26,[],livebench_240829.csv +llama3_8b_instruct,LiveBench Coding,19.82,[],livebench_240829.csv +command_r,LiveBench Coding,15.26,[],livebench_240829.csv +qwen2_7b_instruct,LiveBench Coding,28.95,[],livebench_240829.csv +phi_3_mini_128k_instruct,LiveBench Coding,15.04,[],livebench_240829.csv +phi_3_mini_4k_instruct,LiveBench Coding,15.04,[],livebench_240829.csv +mathstral_7b_v0_1,LiveBench Coding,14.54,[],livebench_240829.csv +openhermes_2_5_mistral_7b,LiveBench Coding,13.26,[],livebench_240829.csv +mixtral_8x7b_instruct_v0_1,LiveBench Coding,11.62,[],livebench_240829.csv +mistral_7b_instruct_v0_3,LiveBench Coding,10.97,[],livebench_240829.csv +mistral_7b_instruct_v0_2,LiveBench Coding,13.9,[],livebench_240829.csv +gemma_1_1_7b_it,LiveBench Coding,9.62,[],livebench_240829.csv +zephyr_7b_alpha,LiveBench Coding,12.26,[],livebench_240829.csv +qwen1_5_7b_chat,LiveBench Coding,9.41,[],livebench_240829.csv +deepseek_v2_lite_chat,LiveBench Coding,7.13,[],livebench_240829.csv +zephyr_7b_beta,LiveBench Coding,8.05,[],livebench_240829.csv +starling_lm_7b_beta,LiveBench Coding,18.46,[],livebench_240829.csv +vicuna_7b_v1_5_16k,LiveBench Coding,2.64,[],livebench_240829.csv +vicuna_7b_v1_5,LiveBench Coding,1.92,[],livebench_240829.csv +llama_2_7b_chat,LiveBench Coding,1.28,[],livebench_240829.csv +qwen1_5_4b_chat,LiveBench Coding,4.49,[],livebench_240829.csv +qwen2_1_5b_instruct,LiveBench Coding,5.21,[],livebench_240829.csv +yi_6b_chat,LiveBench Coding,2.0,[],livebench_240829.csv +qwen2_0_5b_instruct,LiveBench Coding,1.28,[],livebench_240829.csv +qwen1_5_1_8b_chat,LiveBench Coding,0.0,[],livebench_240829.csv +qwen1_5_0_5b_chat,LiveBench Coding,0.0,[],livebench_240829.csv +claude_3_5_sonnet_20240620,LiveBench Mathematics,53.75,[],livebench_240829.csv +gpt_4o_2024_08_06,LiveBench Mathematics,52.29,[],livebench_240829.csv +chatgpt_4o_latest,LiveBench Mathematics,52.19,[],livebench_240829.csv +gpt_4o_2024_05_13,LiveBench Mathematics,49.88,[],livebench_240829.csv +llama3_1_405b_instruct_turbo,LiveBench Mathematics,46.55,[],livebench_240829.csv +gemini_1_5_pro_exp_0827,LiveBench Mathematics,56.28,[],livebench_240829.csv +gpt_4_turbo_2024_04_09,LiveBench Mathematics,48.99,[],livebench_240829.csv +gemini_1_5_pro_exp_0801,LiveBench Mathematics,47.46,[],livebench_240829.csv +claude_3_opus_20240229,LiveBench Mathematics,46.54,[],livebench_240829.csv +gpt_4_0125_preview,LiveBench Mathematics,42.75,[],livebench_240829.csv +dracarys_llama3_1_70b_instruct,LiveBench Mathematics,45.68,[],livebench_240829.csv +llama3_1_70b_instruct_turbo,LiveBench Mathematics,45.58,[],livebench_240829.csv +mistral_large_2407,LiveBench Mathematics,40.48,[],livebench_240829.csv +gemini_1_5_flash_exp_0827,LiveBench Mathematics,36.29,[],livebench_240829.csv +deepseek_coder_v2,LiveBench Mathematics,52.54,[],livebench_240829.csv +deepseek_chat_v2,LiveBench Mathematics,52.11,[],livebench_240829.csv +gpt_4_0613,LiveBench Mathematics,36.22,[],livebench_240829.csv +gemini_1_5_pro_api_0514,LiveBench Mathematics,42.42,[],livebench_240829.csv +gpt_4o_mini_2024_07_18,LiveBench Mathematics,41.58,[],livebench_240829.csv +gemma_2_27b_it,LiveBench Mathematics,36.23,[],livebench_240829.csv +dracarys_72b_instruct,LiveBench Mathematics,42.77,[],livebench_240829.csv +qwen2_72b_instruct,LiveBench Mathematics,43.44,[],livebench_240829.csv +hermes_3_llama3_1_70b,LiveBench Mathematics,28.32,[],livebench_240829.csv +gemini_1_5_flash_api_0514,LiveBench Mathematics,38.89,[],livebench_240829.csv +smaug_qwen2_72b_instruct,LiveBench Mathematics,40.67,[],livebench_240829.csv +mistral_large_2402,LiveBench Mathematics,32.2,[],livebench_240829.csv +claude_3_sonnet_20240229,LiveBench Mathematics,29.65,[],livebench_240829.csv +llama3_70b_instruct,LiveBench Mathematics,32.31,[],livebench_240829.csv +claude_3_haiku_20240307,LiveBench Mathematics,25.72,[],livebench_240829.csv +mixtral_8x22b_instruct_v0_1,LiveBench Mathematics,28.33,[],livebench_240829.csv +phi_3_5_moe_instruct,LiveBench Mathematics,33.3,[],livebench_240829.csv +gpt_3_5_turbo_0125,LiveBench Mathematics,26.93,[],livebench_240829.csv +mistral_small_2402,LiveBench Mathematics,28.15,[],livebench_240829.csv +command_r_plus,LiveBench Mathematics,24.85,[],livebench_240829.csv +gemma_2_9b_it,LiveBench Mathematics,23.98,[],livebench_240829.csv +phi_3_medium_4k_instruct,LiveBench Mathematics,31.36,[],livebench_240829.csv +phi_3_medium_128k_instruct,LiveBench Mathematics,25.64,[],livebench_240829.csv +phi_3_small_128k_instruct,LiveBench Mathematics,28.97,[],livebench_240829.csv +qwen1_5_110b_chat,LiveBench Mathematics,26.28,[],livebench_240829.csv +deepseek_coder_v2_lite_instruct,LiveBench Mathematics,34.44,[],livebench_240829.csv +qwen1_5_72b_chat,LiveBench Mathematics,26.82,[],livebench_240829.csv +open_mistral_nemo,LiveBench Mathematics,21.66,[],livebench_240829.csv +phi_3_5_mini_instruct,LiveBench Mathematics,22.2,[],livebench_240829.csv +llama3_1_8b_instruct_turbo,LiveBench Mathematics,24.37,[],livebench_240829.csv +phi_3_small_8k_instruct,LiveBench Mathematics,23.73,[],livebench_240829.csv +llama3_8b_instruct,LiveBench Mathematics,19.66,[],livebench_240829.csv +command_r,LiveBench Mathematics,16.92,[],livebench_240829.csv +qwen2_7b_instruct,LiveBench Mathematics,26.87,[],livebench_240829.csv +phi_3_mini_128k_instruct,LiveBench Mathematics,17.06,[],livebench_240829.csv +phi_3_mini_4k_instruct,LiveBench Mathematics,20.84,[],livebench_240829.csv +mathstral_7b_v0_1,LiveBench Mathematics,17.84,[],livebench_240829.csv +openhermes_2_5_mistral_7b,LiveBench Mathematics,20.45,[],livebench_240829.csv +mixtral_8x7b_instruct_v0_1,LiveBench Mathematics,20.71,[],livebench_240829.csv +mistral_7b_instruct_v0_3,LiveBench Mathematics,14.56,[],livebench_240829.csv +mistral_7b_instruct_v0_2,LiveBench Mathematics,17.08,[],livebench_240829.csv +gemma_1_1_7b_it,LiveBench Mathematics,15.21,[],livebench_240829.csv +zephyr_7b_alpha,LiveBench Mathematics,9.96,[],livebench_240829.csv +qwen1_5_7b_chat,LiveBench Mathematics,15.29,[],livebench_240829.csv +deepseek_v2_lite_chat,LiveBench Mathematics,14.08,[],livebench_240829.csv +zephyr_7b_beta,LiveBench Mathematics,11.23,[],livebench_240829.csv +starling_lm_7b_beta,LiveBench Mathematics,14.86,[],livebench_240829.csv +vicuna_7b_v1_5_16k,LiveBench Mathematics,9.04,[],livebench_240829.csv +vicuna_7b_v1_5,LiveBench Mathematics,7.1,[],livebench_240829.csv +llama_2_7b_chat,LiveBench Mathematics,4.78,[],livebench_240829.csv +qwen1_5_4b_chat,LiveBench Mathematics,9.86,[],livebench_240829.csv +qwen2_1_5b_instruct,LiveBench Mathematics,9.94,[],livebench_240829.csv +yi_6b_chat,LiveBench Mathematics,8.53,[],livebench_240829.csv +qwen2_0_5b_instruct,LiveBench Mathematics,7.35,[],livebench_240829.csv +qwen1_5_1_8b_chat,LiveBench Mathematics,3.53,[],livebench_240829.csv +qwen1_5_0_5b_chat,LiveBench Mathematics,4.43,[],livebench_240829.csv +claude_3_5_sonnet_20240620,LiveBench Data Analysis,56.74,[],livebench_240829.csv +gpt_4o_2024_08_06,LiveBench Data Analysis,52.89,[],livebench_240829.csv +chatgpt_4o_latest,LiveBench Data Analysis,54.43,[],livebench_240829.csv +gpt_4o_2024_05_13,LiveBench Data Analysis,52.41,[],livebench_240829.csv +llama3_1_405b_instruct_turbo,LiveBench Data Analysis,53.51,[],livebench_240829.csv +gemini_1_5_pro_exp_0827,LiveBench Data Analysis,50.83,[],livebench_240829.csv +gpt_4_turbo_2024_04_09,LiveBench Data Analysis,51.32,[],livebench_240829.csv +gemini_1_5_pro_exp_0801,LiveBench Data Analysis,50.15,[],livebench_240829.csv +claude_3_opus_20240229,LiveBench Data Analysis,54.32,[],livebench_240829.csv +gpt_4_0125_preview,LiveBench Data Analysis,54.06,[],livebench_240829.csv +dracarys_llama3_1_70b_instruct,LiveBench Data Analysis,47.99,[],livebench_240829.csv +llama3_1_70b_instruct_turbo,LiveBench Data Analysis,50.29,[],livebench_240829.csv +mistral_large_2407,LiveBench Data Analysis,46.61,[],livebench_240829.csv +gemini_1_5_flash_exp_0827,LiveBench Data Analysis,47.87,[],livebench_240829.csv +deepseek_coder_v2,LiveBench Data Analysis,38.25,[],livebench_240829.csv +deepseek_chat_v2,LiveBench Data Analysis,45.59,[],livebench_240829.csv +gpt_4_0613,LiveBench Data Analysis,44.03,[],livebench_240829.csv +gemini_1_5_pro_api_0514,LiveBench Data Analysis,52.81,[],livebench_240829.csv +gpt_4o_mini_2024_07_18,LiveBench Data Analysis,44.52,[],livebench_240829.csv +gemma_2_27b_it,LiveBench Data Analysis,43.58,[],livebench_240829.csv +dracarys_72b_instruct,LiveBench Data Analysis,26.24,[],livebench_240829.csv +qwen2_72b_instruct,LiveBench Data Analysis,26.24,[],livebench_240829.csv +hermes_3_llama3_1_70b,LiveBench Data Analysis,48.11,[],livebench_240829.csv +gemini_1_5_flash_api_0514,LiveBench Data Analysis,44.03,[],livebench_240829.csv +smaug_qwen2_72b_instruct,LiveBench Data Analysis,26.19,[],livebench_240829.csv +mistral_large_2402,LiveBench Data Analysis,42.55,[],livebench_240829.csv +claude_3_sonnet_20240229,LiveBench Data Analysis,44.56,[],livebench_240829.csv +llama3_70b_instruct,LiveBench Data Analysis,43.75,[],livebench_240829.csv +claude_3_haiku_20240307,LiveBench Data Analysis,41.54,[],livebench_240829.csv +mixtral_8x22b_instruct_v0_1,LiveBench Data Analysis,31.67,[],livebench_240829.csv +phi_3_5_moe_instruct,LiveBench Data Analysis,40.46,[],livebench_240829.csv +gpt_3_5_turbo_0125,LiveBench Data Analysis,41.21,[],livebench_240829.csv +mistral_small_2402,LiveBench Data Analysis,31.88,[],livebench_240829.csv +command_r_plus,LiveBench Data Analysis,24.6,[],livebench_240829.csv +gemma_2_9b_it,LiveBench Data Analysis,35.06,[],livebench_240829.csv +phi_3_medium_4k_instruct,LiveBench Data Analysis,31.63,[],livebench_240829.csv +phi_3_medium_128k_instruct,LiveBench Data Analysis,32.12,[],livebench_240829.csv +phi_3_small_128k_instruct,LiveBench Data Analysis,27.26,[],livebench_240829.csv +qwen1_5_110b_chat,LiveBench Data Analysis,31.45,[],livebench_240829.csv +deepseek_coder_v2_lite_instruct,LiveBench Data Analysis,33.0,[],livebench_240829.csv +qwen1_5_72b_chat,LiveBench Data Analysis,32.98,[],livebench_240829.csv +open_mistral_nemo,LiveBench Data Analysis,33.35,[],livebench_240829.csv +phi_3_5_mini_instruct,LiveBench Data Analysis,30.43,[],livebench_240829.csv +llama3_1_8b_instruct_turbo,LiveBench Data Analysis,32.15,[],livebench_240829.csv +phi_3_small_8k_instruct,LiveBench Data Analysis,29.62,[],livebench_240829.csv +llama3_8b_instruct,LiveBench Data Analysis,26.0,[],livebench_240829.csv +command_r,LiveBench Data Analysis,31.69,[],livebench_240829.csv +qwen2_7b_instruct,LiveBench Data Analysis,28.75,[],livebench_240829.csv +phi_3_mini_128k_instruct,LiveBench Data Analysis,34.02,[],livebench_240829.csv +phi_3_mini_4k_instruct,LiveBench Data Analysis,29.55,[],livebench_240829.csv +mathstral_7b_v0_1,LiveBench Data Analysis,27.89,[],livebench_240829.csv +openhermes_2_5_mistral_7b,LiveBench Data Analysis,26.92,[],livebench_240829.csv +mixtral_8x7b_instruct_v0_1,LiveBench Data Analysis,28.13,[],livebench_240829.csv +mistral_7b_instruct_v0_3,LiveBench Data Analysis,21.77,[],livebench_240829.csv +mistral_7b_instruct_v0_2,LiveBench Data Analysis,14.62,[],livebench_240829.csv +gemma_1_1_7b_it,LiveBench Data Analysis,18.17,[],livebench_240829.csv +zephyr_7b_alpha,LiveBench Data Analysis,17.4,[],livebench_240829.csv +qwen1_5_7b_chat,LiveBench Data Analysis,16.9,[],livebench_240829.csv +deepseek_v2_lite_chat,LiveBench Data Analysis,18.19,[],livebench_240829.csv +zephyr_7b_beta,LiveBench Data Analysis,15.75,[],livebench_240829.csv +starling_lm_7b_beta,LiveBench Data Analysis,2.0,[],livebench_240829.csv +vicuna_7b_v1_5_16k,LiveBench Data Analysis,9.93,[],livebench_240829.csv +vicuna_7b_v1_5,LiveBench Data Analysis,3.33,[],livebench_240829.csv +llama_2_7b_chat,LiveBench Data Analysis,0.0,[],livebench_240829.csv +qwen1_5_4b_chat,LiveBench Data Analysis,9.13,[],livebench_240829.csv +qwen2_1_5b_instruct,LiveBench Data Analysis,10.01,[],livebench_240829.csv +yi_6b_chat,LiveBench Data Analysis,4.38,[],livebench_240829.csv +qwen2_0_5b_instruct,LiveBench Data Analysis,2.0,[],livebench_240829.csv +qwen1_5_1_8b_chat,LiveBench Data Analysis,3.33,[],livebench_240829.csv +qwen1_5_0_5b_chat,LiveBench Data Analysis,0.0,[],livebench_240829.csv +claude_3_5_sonnet_20240620,LiveBench Language,56.94,[],livebench_240829.csv +gpt_4o_2024_08_06,LiveBench Language,54.37,[],livebench_240829.csv +chatgpt_4o_latest,LiveBench Language,49.95,[],livebench_240829.csv +gpt_4o_2024_05_13,LiveBench Language,53.94,[],livebench_240829.csv +llama3_1_405b_instruct_turbo,LiveBench Language,49.85,[],livebench_240829.csv +gemini_1_5_pro_exp_0827,LiveBench Language,49.31,[],livebench_240829.csv +gpt_4_turbo_2024_04_09,LiveBench Language,45.26,[],livebench_240829.csv +gemini_1_5_pro_exp_0801,LiveBench Language,46.96,[],livebench_240829.csv +claude_3_opus_20240229,LiveBench Language,51.72,[],livebench_240829.csv +gpt_4_0125_preview,LiveBench Language,43.55,[],livebench_240829.csv +dracarys_llama3_1_70b_instruct,LiveBench Language,41.77,[],livebench_240829.csv +llama3_1_70b_instruct_turbo,LiveBench Language,42.36,[],livebench_240829.csv +mistral_large_2407,LiveBench Language,39.79,[],livebench_240829.csv +gemini_1_5_flash_exp_0827,LiveBench Language,31.04,[],livebench_240829.csv +deepseek_coder_v2,LiveBench Language,33.04,[],livebench_240829.csv +deepseek_chat_v2,LiveBench Language,32.77,[],livebench_240829.csv +gpt_4_0613,LiveBench Language,49.57,[],livebench_240829.csv +gemini_1_5_pro_api_0514,LiveBench Language,38.25,[],livebench_240829.csv +gpt_4o_mini_2024_07_18,LiveBench Language,35.28,[],livebench_240829.csv +gemma_2_27b_it,LiveBench Language,32.4,[],livebench_240829.csv +dracarys_72b_instruct,LiveBench Language,31.17,[],livebench_240829.csv +qwen2_72b_instruct,LiveBench Language,29.21,[],livebench_240829.csv +hermes_3_llama3_1_70b,LiveBench Language,43.77,[],livebench_240829.csv +gemini_1_5_flash_api_0514,LiveBench Language,30.69,[],livebench_240829.csv +smaug_qwen2_72b_instruct,LiveBench Language,30.03,[],livebench_240829.csv +mistral_large_2402,LiveBench Language,28.74,[],livebench_240829.csv +claude_3_sonnet_20240229,LiveBench Language,38.08,[],livebench_240829.csv +llama3_70b_instruct,LiveBench Language,34.11,[],livebench_240829.csv +claude_3_haiku_20240307,LiveBench Language,30.07,[],livebench_240829.csv +mixtral_8x22b_instruct_v0_1,LiveBench Language,26.48,[],livebench_240829.csv +phi_3_5_moe_instruct,LiveBench Language,17.07,[],livebench_240829.csv +gpt_3_5_turbo_0125,LiveBench Language,24.22,[],livebench_240829.csv +mistral_small_2402,LiveBench Language,22.06,[],livebench_240829.csv +command_r_plus,LiveBench Language,23.92,[],livebench_240829.csv +gemma_2_9b_it,LiveBench Language,27.64,[],livebench_240829.csv +phi_3_medium_4k_instruct,LiveBench Language,13.91,[],livebench_240829.csv +phi_3_medium_128k_instruct,LiveBench Language,12.76,[],livebench_240829.csv +phi_3_small_128k_instruct,LiveBench Language,15.53,[],livebench_240829.csv +qwen1_5_110b_chat,LiveBench Language,13.22,[],livebench_240829.csv +deepseek_coder_v2_lite_instruct,LiveBench Language,10.64,[],livebench_240829.csv +qwen1_5_72b_chat,LiveBench Language,11.37,[],livebench_240829.csv +open_mistral_nemo,LiveBench Language,14.15,[],livebench_240829.csv +phi_3_5_mini_instruct,LiveBench Language,9.67,[],livebench_240829.csv +llama3_1_8b_instruct_turbo,LiveBench Language,20.05,[],livebench_240829.csv +phi_3_small_8k_instruct,LiveBench Language,15.13,[],livebench_240829.csv +llama3_8b_instruct,LiveBench Language,18.72,[],livebench_240829.csv +command_r,LiveBench Language,14.64,[],livebench_240829.csv +qwen2_7b_instruct,LiveBench Language,10.21,[],livebench_240829.csv +phi_3_mini_128k_instruct,LiveBench Language,7.76,[],livebench_240829.csv +phi_3_mini_4k_instruct,LiveBench Language,8.06,[],livebench_240829.csv +mathstral_7b_v0_1,LiveBench Language,15.37,[],livebench_240829.csv +openhermes_2_5_mistral_7b,LiveBench Language,11.37,[],livebench_240829.csv +mixtral_8x7b_instruct_v0_1,LiveBench Language,13.76,[],livebench_240829.csv +mistral_7b_instruct_v0_3,LiveBench Language,11.85,[],livebench_240829.csv +mistral_7b_instruct_v0_2,LiveBench Language,9.05,[],livebench_240829.csv +gemma_1_1_7b_it,LiveBench Language,10.65,[],livebench_240829.csv +zephyr_7b_alpha,LiveBench Language,7.2,[],livebench_240829.csv +qwen1_5_7b_chat,LiveBench Language,6.18,[],livebench_240829.csv +deepseek_v2_lite_chat,LiveBench Language,9.2,[],livebench_240829.csv +zephyr_7b_beta,LiveBench Language,4.28,[],livebench_240829.csv +starling_lm_7b_beta,LiveBench Language,7.26,[],livebench_240829.csv +vicuna_7b_v1_5_16k,LiveBench Language,7.92,[],livebench_240829.csv +vicuna_7b_v1_5,LiveBench Language,8.66,[],livebench_240829.csv +llama_2_7b_chat,LiveBench Language,6.86,[],livebench_240829.csv +qwen1_5_4b_chat,LiveBench Language,5.8,[],livebench_240829.csv +qwen2_1_5b_instruct,LiveBench Language,3.05,[],livebench_240829.csv +yi_6b_chat,LiveBench Language,4.69,[],livebench_240829.csv +qwen2_0_5b_instruct,LiveBench Language,2.8,[],livebench_240829.csv +qwen1_5_1_8b_chat,LiveBench Language,3.16,[],livebench_240829.csv +qwen1_5_0_5b_chat,LiveBench Language,2.88,[],livebench_240829.csv +claude_3_5_sonnet_20240620,LiveBench Instruction Following,72.3,[],livebench_240829.csv +gpt_4o_2024_08_06,LiveBench Instruction Following,74.58,[],livebench_240829.csv +chatgpt_4o_latest,LiveBench Instruction Following,72.52,[],livebench_240829.csv +gpt_4o_2024_05_13,LiveBench Instruction Following,72.17,[],livebench_240829.csv +llama3_1_405b_instruct_turbo,LiveBench Instruction Following,78.47,[],livebench_240829.csv +gemini_1_5_pro_exp_0827,LiveBench Instruction Following,75.95,[],livebench_240829.csv +gpt_4_turbo_2024_04_09,LiveBench Instruction Following,71.39,[],livebench_240829.csv +gemini_1_5_pro_exp_0801,LiveBench Instruction Following,78.84,[],livebench_240829.csv +claude_3_opus_20240229,LiveBench Instruction Following,70.87,[],livebench_240829.csv +gpt_4_0125_preview,LiveBench Instruction Following,63.92,[],livebench_240829.csv +dracarys_llama3_1_70b_instruct,LiveBench Instruction Following,77.37,[],livebench_240829.csv +llama3_1_70b_instruct_turbo,LiveBench Instruction Following,79.08,[],livebench_240829.csv +mistral_large_2407,LiveBench Instruction Following,71.85,[],livebench_240829.csv +gemini_1_5_flash_exp_0827,LiveBench Instruction Following,78.11,[],livebench_240829.csv +deepseek_coder_v2,LiveBench Instruction Following,67.18,[],livebench_240829.csv +deepseek_chat_v2,LiveBench Instruction Following,64.61,[],livebench_240829.csv +gpt_4_0613,LiveBench Instruction Following,71.79,[],livebench_240829.csv +gemini_1_5_pro_api_0514,LiveBench Instruction Following,67.2,[],livebench_240829.csv +gpt_4o_mini_2024_07_18,LiveBench Instruction Following,65.68,[],livebench_240829.csv +gemma_2_27b_it,LiveBench Instruction Following,67.37,[],livebench_240829.csv +dracarys_72b_instruct,LiveBench Instruction Following,68.08,[],livebench_240829.csv +qwen2_72b_instruct,LiveBench Instruction Following,68.27,[],livebench_240829.csv +hermes_3_llama3_1_70b,LiveBench Instruction Following,55.37,[],livebench_240829.csv +gemini_1_5_flash_api_0514,LiveBench Instruction Following,63.01,[],livebench_240829.csv +smaug_qwen2_72b_instruct,LiveBench Instruction Following,65.0,[],livebench_240829.csv +mistral_large_2402,LiveBench Instruction Following,68.19,[],livebench_240829.csv +claude_3_sonnet_20240229,LiveBench Instruction Following,65.0,[],livebench_240829.csv +llama3_70b_instruct,LiveBench Instruction Following,63.5,[],livebench_240829.csv +claude_3_haiku_20240307,LiveBench Instruction Following,64.03,[],livebench_240829.csv +mixtral_8x22b_instruct_v0_1,LiveBench Instruction Following,63.17,[],livebench_240829.csv +phi_3_5_moe_instruct,LiveBench Instruction Following,59.73,[],livebench_240829.csv +gpt_3_5_turbo_0125,LiveBench Instruction Following,60.47,[],livebench_240829.csv +mistral_small_2402,LiveBench Instruction Following,63.91,[],livebench_240829.csv +command_r_plus,LiveBench Instruction Following,71.51,[],livebench_240829.csv +gemma_2_9b_it,LiveBench Instruction Following,61.55,[],livebench_240829.csv +phi_3_medium_4k_instruct,LiveBench Instruction Following,53.3,[],livebench_240829.csv +phi_3_medium_128k_instruct,LiveBench Instruction Following,56.15,[],livebench_240829.csv +phi_3_small_128k_instruct,LiveBench Instruction Following,53.47,[],livebench_240829.csv +qwen1_5_110b_chat,LiveBench Instruction Following,55.26,[],livebench_240829.csv +deepseek_coder_v2_lite_instruct,LiveBench Instruction Following,48.34,[],livebench_240829.csv +qwen1_5_72b_chat,LiveBench Instruction Following,58.25,[],livebench_240829.csv +open_mistral_nemo,LiveBench Instruction Following,51.8,[],livebench_240829.csv +phi_3_5_mini_instruct,LiveBench Instruction Following,58.3,[],livebench_240829.csv +llama3_1_8b_instruct_turbo,LiveBench Instruction Following,56.53,[],livebench_240829.csv +phi_3_small_8k_instruct,LiveBench Instruction Following,55.81,[],livebench_240829.csv +llama3_8b_instruct,LiveBench Instruction Following,57.14,[],livebench_240829.csv +command_r,LiveBench Instruction Following,57.16,[],livebench_240829.csv +qwen2_7b_instruct,LiveBench Instruction Following,44.74,[],livebench_240829.csv +phi_3_mini_128k_instruct,LiveBench Instruction Following,51.4,[],livebench_240829.csv +phi_3_mini_4k_instruct,LiveBench Instruction Following,51.25,[],livebench_240829.csv +mathstral_7b_v0_1,LiveBench Instruction Following,53.25,[],livebench_240829.csv +openhermes_2_5_mistral_7b,LiveBench Instruction Following,52.78,[],livebench_240829.csv +mixtral_8x7b_instruct_v0_1,LiveBench Instruction Following,44.81,[],livebench_240829.csv +mistral_7b_instruct_v0_3,LiveBench Instruction Following,52.37,[],livebench_240829.csv +mistral_7b_instruct_v0_2,LiveBench Instruction Following,51.65,[],livebench_240829.csv +gemma_1_1_7b_it,LiveBench Instruction Following,44.34,[],livebench_240829.csv +zephyr_7b_alpha,LiveBench Instruction Following,52.79,[],livebench_240829.csv +qwen1_5_7b_chat,LiveBench Instruction Following,44.12,[],livebench_240829.csv +deepseek_v2_lite_chat,LiveBench Instruction Following,41.83,[],livebench_240829.csv +zephyr_7b_beta,LiveBench Instruction Following,48.32,[],livebench_240829.csv +starling_lm_7b_beta,LiveBench Instruction Following,38.32,[],livebench_240829.csv +vicuna_7b_v1_5_16k,LiveBench Instruction Following,42.12,[],livebench_240829.csv +vicuna_7b_v1_5,LiveBench Instruction Following,41.75,[],livebench_240829.csv +llama_2_7b_chat,LiveBench Instruction Following,44.88,[],livebench_240829.csv +qwen1_5_4b_chat,LiveBench Instruction Following,27.75,[],livebench_240829.csv +qwen2_1_5b_instruct,LiveBench Instruction Following,25.9,[],livebench_240829.csv +yi_6b_chat,LiveBench Instruction Following,27.22,[],livebench_240829.csv +qwen2_0_5b_instruct,LiveBench Instruction Following,26.63,[],livebench_240829.csv +qwen1_5_1_8b_chat,LiveBench Instruction Following,22.9,[],livebench_240829.csv +qwen1_5_0_5b_chat,LiveBench Instruction Following,21.3,[],livebench_240829.csv +gemini_1_5_pro_exp_0801,Enkrypt AI Safety,84.0,[],enkrypt_ai_safety_240916.csv +gemini_1_5_pro_latest,Enkrypt AI Safety,81.0,[],enkrypt_ai_safety_240916.csv +gemma_2_27b_it,Enkrypt AI Safety,79.0,[],enkrypt_ai_safety_240916.csv +reflection_llama3_1_70b,Enkrypt AI Safety,81.0,[],enkrypt_ai_safety_240916.csv +llama_2_7b_chat_gguf_8bit,Enkrypt AI Safety,80.0,[],enkrypt_ai_safety_240916.csv +llama_2_7b_chat_gguf_4bit,Enkrypt AI Safety,80.0,[],enkrypt_ai_safety_240916.csv +smollm_360m_instruct,Enkrypt AI Safety,80.0,[],enkrypt_ai_safety_240916.csv +llama_2_7b_chat,Enkrypt AI Safety,78.0,[],enkrypt_ai_safety_240916.csv +flan_flan-ul2,Enkrypt AI Safety,76.0,[],enkrypt_ai_safety_240916.csv +o1_preview,Enkrypt AI Safety,76.0,[],enkrypt_ai_safety_240916.csv +llama3_8b_instruct_rr,Enkrypt AI Safety,81.0,[],enkrypt_ai_safety_240916.csv +claude_3_opus_20240229,Enkrypt AI Safety,75.0,[],enkrypt_ai_safety_240916.csv +gpt_4_0125_preview,Enkrypt AI Safety,79.0,[],enkrypt_ai_safety_240916.csv +sarvam_2b_v0_5,Enkrypt AI Safety,75.0,[],enkrypt_ai_safety_240916.csv +llama3_8b_instruct_mopeymule,Enkrypt AI Safety,73.0,[],enkrypt_ai_safety_240916.csv +claude_3_5_sonnet_20240620,Enkrypt AI Safety,71.0,[],enkrypt_ai_safety_240916.csv +sea_lion_7b_instruct,Enkrypt AI Safety,73.0,[],enkrypt_ai_safety_240916.csv +claude_instant_1_2,Enkrypt AI Safety,76.0,[],enkrypt_ai_safety_240916.csv +gpt_4_turbo_2024_04_09,Enkrypt AI Safety,75.0,[],enkrypt_ai_safety_240916.csv +llama3_1_8b_instruct_turbo,Enkrypt AI Safety,70.0,[],enkrypt_ai_safety_240916.csv +rakutenai_7b_chat,Enkrypt AI Safety,68.0,[],enkrypt_ai_safety_240916.csv +gemma_2_2b_it,Enkrypt AI Safety,67.0,[],enkrypt_ai_safety_240916.csv +llama3_8b_instruct,Enkrypt AI Safety,72.0,[],enkrypt_ai_safety_240916.csv +o1_mini,Enkrypt AI Safety,71.0,[],enkrypt_ai_safety_240916.csv +mistral_7b_v0_1,Enkrypt AI Safety,70.0,[],enkrypt_ai_safety_240916.csv +llama_2_13b_chat,Enkrypt AI Safety,72.0,[],enkrypt_ai_safety_240916.csv +h2o_danube3_500m_chat,Enkrypt AI Safety,68.0,[],enkrypt_ai_safety_240916.csv +llama_2_70b_chat,Enkrypt AI Safety,68.0,[],enkrypt_ai_safety_240916.csv +gemma_2_9b_it,Enkrypt AI Safety,67.0,[],enkrypt_ai_safety_240916.csv +internlm2_chat_20b,Enkrypt AI Safety,59.0,[],enkrypt_ai_safety_240916.csv +gemma_2_9b,Enkrypt AI Safety,64.0,[],enkrypt_ai_safety_240916.csv +nexusraven_v2_13b,Enkrypt AI Safety,63.0,[],enkrypt_ai_safety_240916.csv +komodo_7b_base,Enkrypt AI Safety,61.0,[],enkrypt_ai_safety_240916.csv +gpt_4o,Enkrypt AI Safety,64.0,[],enkrypt_ai_safety_240916.csv +phi_2,Enkrypt AI Safety,58.0,[],enkrypt_ai_safety_240916.csv +phi3_medium_128k,Enkrypt AI Safety,61.0,[],enkrypt_ai_safety_240916.csv +gemma_7b_it,Enkrypt AI Safety,61.0,[],enkrypt_ai_safety_240916.csv +claude_3_haiku_20240307,Enkrypt AI Safety,67.0,[],enkrypt_ai_safety_240916.csv +llama3_1_405b_instruct_turbo,Enkrypt AI Safety,61.0,[],enkrypt_ai_safety_240916.csv +smollm_1_7b_instruct,Enkrypt AI Safety,60.0,[],enkrypt_ai_safety_240916.csv +gpt_4o_2024_08_06,Enkrypt AI Safety,60.0,[],enkrypt_ai_safety_240916.csv +powerlm_3b,Enkrypt AI Safety,53.0,[],enkrypt_ai_safety_240916.csv +llama3_70b_instruct,Enkrypt AI Safety,62.0,[],enkrypt_ai_safety_240916.csv +starling_lm_7b_beta_gguf_4bit,Enkrypt AI Safety,54.0,[],enkrypt_ai_safety_240916.csv +smaug_72b_v0_1,Enkrypt AI Safety,61.0,[],enkrypt_ai_safety_240916.csv +gpt_3_5_turbo,Enkrypt AI Safety,62.0,[],enkrypt_ai_safety_240916.csv +codellama_7b_instruct,Enkrypt AI Safety,56.0,[],enkrypt_ai_safety_240916.csv +smaug_llama3_70b_instruct,Enkrypt AI Safety,56.0,[],enkrypt_ai_safety_240916.csv +mixtral_8x7b_instruct_v0_1,Enkrypt AI Safety,54.0,[],enkrypt_ai_safety_240916.csv +jamba_instruct_preview,Enkrypt AI Safety,51.0,[],enkrypt_ai_safety_240916.csv +mixtral_8x22b_instruct_v0_1,Enkrypt AI Safety,53.0,[],enkrypt_ai_safety_240916.csv +seallm_7b_v2,Enkrypt AI Safety,58.0,[],enkrypt_ai_safety_240916.csv +qwen2_72b_instruct,Enkrypt AI Safety,55.0,[],enkrypt_ai_safety_240916.csv +olmo_7b_instruct,Enkrypt AI Safety,47.0,[],enkrypt_ai_safety_240916.csv +phi_3_mini_128k_instruct,Enkrypt AI Safety,55.0,[],enkrypt_ai_safety_240916.csv +dbrx_instructruct,Enkrypt AI Safety,51.0,[],enkrypt_ai_safety_240916.csv +falcon_mamba_7b_instruct,Enkrypt AI Safety,49.0,[],enkrypt_ai_safety_240916.csv +gpt_4o_mini,Enkrypt AI Safety,55.0,[],enkrypt_ai_safety_240916.csv +phi_3_5_moe_instruct,Enkrypt AI Safety,54.0,[],enkrypt_ai_safety_240916.csv +qwen1_5_14b_chat,Enkrypt AI Safety,51.0,[],enkrypt_ai_safety_240916.csv +c4ai_command_r_plus,Enkrypt AI Safety,48.0,[],enkrypt_ai_safety_240916.csv +smaug_34b_v0_1,Enkrypt AI Safety,56.0,[],enkrypt_ai_safety_240916.csv +qwen2_7b_instruct,Enkrypt AI Safety,50.0,[],enkrypt_ai_safety_240916.csv +mistral_7b_instruct_v0_2_gguf_4bit,Enkrypt AI Safety,48.0,[],enkrypt_ai_safety_240916.csv +llama3_1_70b_instruct_turbo,Enkrypt AI Safety,48.0,[],enkrypt_ai_safety_240916.csv +k2_chat,Enkrypt AI Safety,50.0,[],enkrypt_ai_safety_240916.csv +phi_3_mini_4k_instruct,Enkrypt AI Safety,50.0,[],enkrypt_ai_safety_240916.csv +starling_lm_7b_beta,Enkrypt AI Safety,51.0,[],enkrypt_ai_safety_240916.csv +olmoe_1b_7b_0924_instruct,Enkrypt AI Safety,49.0,[],enkrypt_ai_safety_240916.csv +mistral_7b_instruct_v0_2_gguf_8bit,Enkrypt AI Safety,48.0,[],enkrypt_ai_safety_240916.csv +h2o_danube3_4b_chat,Enkrypt AI Safety,47.0,[],enkrypt_ai_safety_240916.csv +rakutenai_7b_instruct,Enkrypt AI Safety,44.0,[],enkrypt_ai_safety_240916.csv +mistral_7b_instruct_v0_2,Enkrypt AI Safety,46.0,[],enkrypt_ai_safety_240916.csv +jamba_1_5_mini,Enkrypt AI Safety,48.0,[],enkrypt_ai_safety_240916.csv +aya_23_35b,Enkrypt AI Safety,47.0,[],enkrypt_ai_safety_240916.csv +jamba_1_5_large,Enkrypt AI Safety,47.0,[],enkrypt_ai_safety_240916.csv +phi_3_small_8k_instruct,Enkrypt AI Safety,48.0,[],enkrypt_ai_safety_240916.csv +phi_3_small_128k_instruct,Enkrypt AI Safety,46.0,[],enkrypt_ai_safety_240916.csv +zephyr_7b_beta,Enkrypt AI Safety,43.0,[],enkrypt_ai_safety_240916.csv +powermoe_3b,Enkrypt AI Safety,47.0,[],enkrypt_ai_safety_240916.csv +longwriter_glm4_9b,Enkrypt AI Safety,46.0,[],enkrypt_ai_safety_240916.csv +mistral_7b_instruct_v0_1_gguf_4bit,Enkrypt AI Safety,39.0,[],enkrypt_ai_safety_240916.csv +snowflake_arctic_instruct,Enkrypt AI Safety,45.0,[],enkrypt_ai_safety_240916.csv +qwen2_57b_a14b_instruct,Enkrypt AI Safety,45.0,[],enkrypt_ai_safety_240916.csv +palm_2_chat_bison,Enkrypt AI Safety,40.0,[],enkrypt_ai_safety_240916.csv +mistral_7b_instruct_v0_1_gguf_8bit,Enkrypt AI Safety,40.0,[],enkrypt_ai_safety_240916.csv +glm_4_9b_chat,Enkrypt AI Safety,43.0,[],enkrypt_ai_safety_240916.csv +phi_3_medium_4k_instruct,Enkrypt AI Safety,43.0,[],enkrypt_ai_safety_240916.csv +aya_23_8b,Enkrypt AI Safety,40.0,[],enkrypt_ai_safety_240916.csv +mistral_7b_instruct_v0_3,Enkrypt AI Safety,39.0,[],enkrypt_ai_safety_240916.csv +phi_3_5_mini_instruct,Enkrypt AI Safety,37.0,[],enkrypt_ai_safety_240916.csv +dolphin_2_5_mixtral_8x7b,Enkrypt AI Safety,32.0,[],enkrypt_ai_safety_240916.csv +gpt_4o_2024_05_13,WildBench Elo LC,1227.1,[],wildbench_240829.csv +claude_3_5_sonnet,WildBench Elo LC,1215.4,[],wildbench_240829.csv +gemini_1_5_pro,WildBench Elo LC,1214.6,[],wildbench_240829.csv +gpt_4_turbo_2024_04_09,WildBench Elo LC,1209.6,[],wildbench_240829.csv +yi_large_preview,WildBench Elo LC,1208.9,[],wildbench_240829.csv +deepseek_v2_chat_0628_api,WildBench Elo LC,1199.1,[],wildbench_240829.csv +gpt_4_0125_preview,WildBench Elo LC,1197.3,[],wildbench_240829.csv +claude_3_opus,WildBench Elo LC,1196.3,[],wildbench_240829.csv +gemini_1_5_flash,WildBench Elo LC,1192.0,[],wildbench_240829.csv +llama3_70b_instruct,WildBench Elo LC,1187.5,[],wildbench_240829.csv +deepseek_v2_coder_0614_api,WildBench Elo LC,1184.9,[],wildbench_240829.csv +yi_large,WildBench Elo LC,1181.8,[],wildbench_240829.csv +athene_70b,WildBench Elo LC,1180.7,[],wildbench_240829.csv +nemotron_4_340b_inst,WildBench Elo LC,1178.6,[],wildbench_240829.csv +gemma_2_27b_it,WildBench Elo LC,1176.4,[],wildbench_240829.csv +mistral_large_2,WildBench Elo LC,1176.3,[],wildbench_240829.csv +claude_3_sonnet,WildBench Elo LC,1174.7,[],wildbench_240829.csv +gpt_4o_mini_2024_07_18,WildBench Elo LC,1173.5,[],wildbench_240829.csv +qwen2_72b_instruct,WildBench Elo LC,1172.3,[],wildbench_240829.csv +reka_core,WildBench Elo LC,1170.4,[],wildbench_240829.csv +gemma_2_9b_it_simpo,WildBench Elo LC,1166.6,[],wildbench_240829.csv +gemma_2_9b_it_dpo,WildBench Elo LC,1166.6,[],wildbench_240829.csv +yi_1_5_34b_chat,WildBench Elo LC,1159.6,[],wildbench_240829.csv +claude_3_haiku,WildBench Elo LC,1159.1,[],wildbench_240829.csv +mistral_nemo_inst_12b,WildBench Elo LC,1158.6,[],wildbench_240829.csv +mistral_large,WildBench Elo LC,1157.0,[],wildbench_240829.csv +gemma_2_9b_it,WildBench Elo LC,1156.4,[],wildbench_240829.csv +command_r_plus,WildBench Elo LC,1151.4,[],wildbench_240829.csv +glm_4_9b_chat,WildBench Elo LC,1148.5,[],wildbench_240829.csv +magpie_8b_align_v0_1,WildBench Elo LC,1148.4,[],wildbench_240829.csv +yi_1_5_9b_chat,WildBench Elo LC,1148.0,[],wildbench_240829.csv +llama3_inst_8b_simpo,WildBench Elo LC,1147.5,[],wildbench_240829.csv +llama3_inst_8b_simpo_v0_2,WildBench Elo LC,1147.4,[],wildbench_240829.csv +qwen1_5_72b_chat,WildBench Elo LC,1147.4,[],wildbench_240829.csv +llama3_inst_8b_simpo_expo,WildBench Elo LC,1145.5,[],wildbench_240829.csv +selm_llama3_8b_inst_iter3,WildBench Elo LC,1144.0,[],wildbench_240829.csv +phi_3_medium_128k,WildBench Elo LC,1139.5,[],wildbench_240829.csv +llama3_8b_instruct,WildBench Elo LC,1139.5,[],wildbench_240829.csv +hermes_2_theta_llama3_8b,WildBench Elo LC,1137.4,[],wildbench_240829.csv +starling_lm_7b_beta_expo,WildBench Elo LC,1136.0,[],wildbench_240829.csv +selm_zephyr_7b_iter3,WildBench Elo LC,1134.3,[],wildbench_240829.csv +reka_flash,WildBench Elo LC,1132.7,[],wildbench_240829.csv +gemma_2_2b_it,WildBench Elo LC,1129.7,[],wildbench_240829.csv +gpt_3_5_turbo_0125,WildBench Elo LC,1129.2,[],wildbench_240829.csv +dbrx_instruct,WildBench Elo LC,1128.5,[],wildbench_240829.csv +neo_7b_instruct_expo,WildBench Elo LC,1126.6,[],wildbench_240829.csv +neo_7b_instruct,WildBench Elo LC,1126.2,[],wildbench_240829.csv +starlinglm_7b_beta,WildBench Elo LC,1126.2,[],wildbench_240829.csv +command_r,WildBench Elo LC,1125.6,[],wildbench_240829.csv +mixtral_8x7b_instruct,WildBench Elo LC,1124.7,[],wildbench_240829.csv +yi_1_5_6b_chat,WildBench Elo LC,1122.7,[],wildbench_240829.csv +tulu_2_dpo_70b,WildBench Elo LC,1121.0,[],wildbench_240829.csv +reka_edge,WildBench Elo LC,1120.8,[],wildbench_240829.csv +mistral_7b_instruct_v0_2,WildBench Elo LC,1105.0,[],wildbench_240829.csv +llama_2_70b_chat,WildBench Elo LC,1101.9,[],wildbench_240829.csv +qwen1_5_7b_chat,WildBench Elo LC,1092.7,[],wildbench_240829.csv +hermes_2_mixtral_8x7b_dpo,WildBench Elo LC,1085.8,[],wildbench_240829.csv +phi_3_mini_128k,WildBench Elo LC,1082.1,[],wildbench_240829.csv +gemma_7b_it,WildBench Elo LC,1079.2,[],wildbench_240829.csv +llama_2_7b_chat,WildBench Elo LC,1052.5,[],wildbench_240829.csv +gpt_4o_2024_05_13,WildBench Information Seeking,58.6,[],wildbench_240829.csv +claude_3_5_sonnet,WildBench Information Seeking,55.5,[],wildbench_240829.csv +gemini_1_5_pro,WildBench Information Seeking,52.2,[],wildbench_240829.csv +gpt_4_turbo_2024_04_09,WildBench Information Seeking,57.2,[],wildbench_240829.csv +yi_large_preview,WildBench Information Seeking,57.7,[],wildbench_240829.csv +deepseek_v2_chat_0628_api,WildBench Information Seeking,52.7,[],wildbench_240829.csv +gpt_4_0125_preview,WildBench Information Seeking,54.4,[],wildbench_240829.csv +claude_3_opus,WildBench Information Seeking,53.5,[],wildbench_240829.csv +gemini_1_5_flash,WildBench Information Seeking,48.7,[],wildbench_240829.csv +llama3_70b_instruct,WildBench Information Seeking,52.3,[],wildbench_240829.csv +deepseek_v2_coder_0614_api,WildBench Information Seeking,40.0,[],wildbench_240829.csv +yi_large,WildBench Information Seeking,51.0,[],wildbench_240829.csv +athene_70b,WildBench Information Seeking,60.8,[],wildbench_240829.csv +nemotron_4_340b_inst,WildBench Information Seeking,53.0,[],wildbench_240829.csv +gemma_2_27b_it,WildBench Information Seeking,50.5,[],wildbench_240829.csv +mistral_large_2,WildBench Information Seeking,57.4,[],wildbench_240829.csv +claude_3_sonnet,WildBench Information Seeking,47.1,[],wildbench_240829.csv +gpt_4o_mini_2024_07_18,WildBench Information Seeking,57.4,[],wildbench_240829.csv +qwen2_72b_instruct,WildBench Information Seeking,49.5,[],wildbench_240829.csv +reka_core,WildBench Information Seeking,52.3,[],wildbench_240829.csv +gemma_2_9b_it_simpo,WildBench Information Seeking,56.5,[],wildbench_240829.csv +gemma_2_9b_it_dpo,WildBench Information Seeking,58.2,[],wildbench_240829.csv +yi_1_5_34b_chat,WildBench Information Seeking,50.3,[],wildbench_240829.csv +claude_3_haiku,WildBench Information Seeking,45.3,[],wildbench_240829.csv +mistral_nemo_inst_12b,WildBench Information Seeking,51.9,[],wildbench_240829.csv +mistral_large,WildBench Information Seeking,46.1,[],wildbench_240829.csv +gemma_2_9b_it,WildBench Information Seeking,49.0,[],wildbench_240829.csv +command_r_plus,WildBench Information Seeking,49.2,[],wildbench_240829.csv +glm_4_9b_chat,WildBench Information Seeking,46.3,[],wildbench_240829.csv +magpie_8b_align_v0_1,WildBench Information Seeking,48.9,[],wildbench_240829.csv +yi_1_5_9b_chat,WildBench Information Seeking,42.6,[],wildbench_240829.csv +llama3_inst_8b_simpo,WildBench Information Seeking,47.9,[],wildbench_240829.csv +llama3_inst_8b_simpo_v0_2,WildBench Information Seeking,47.9,[],wildbench_240829.csv +qwen1_5_72b_chat,WildBench Information Seeking,48.2,[],wildbench_240829.csv +llama3_inst_8b_simpo_expo,WildBench Information Seeking,47.3,[],wildbench_240829.csv +selm_llama3_8b_inst_iter3,WildBench Information Seeking,46.1,[],wildbench_240829.csv +phi_3_medium_128k,WildBench Information Seeking,35.7,[],wildbench_240829.csv +llama3_8b_instruct,WildBench Information Seeking,39.3,[],wildbench_240829.csv +hermes_2_theta_llama3_8b,WildBench Information Seeking,41.6,[],wildbench_240829.csv +starling_lm_7b_beta_expo,WildBench Information Seeking,42.9,[],wildbench_240829.csv +selm_zephyr_7b_iter3,WildBench Information Seeking,41.0,[],wildbench_240829.csv +reka_flash,WildBench Information Seeking,41.5,[],wildbench_240829.csv +gemma_2_2b_it,WildBench Information Seeking,39.9,[],wildbench_240829.csv +gpt_3_5_turbo_0125,WildBench Information Seeking,36.5,[],wildbench_240829.csv +dbrx_instruct,WildBench Information Seeking,41.1,[],wildbench_240829.csv +neo_7b_instruct_expo,WildBench Information Seeking,34.9,[],wildbench_240829.csv +neo_7b_instruct,WildBench Information Seeking,36.3,[],wildbench_240829.csv +starlinglm_7b_beta,WildBench Information Seeking,41.9,[],wildbench_240829.csv +command_r,WildBench Information Seeking,44.1,[],wildbench_240829.csv +mixtral_8x7b_instruct,WildBench Information Seeking,41.9,[],wildbench_240829.csv +yi_1_5_6b_chat,WildBench Information Seeking,31.4,[],wildbench_240829.csv +tulu_2_dpo_70b,WildBench Information Seeking,40.7,[],wildbench_240829.csv +reka_edge,WildBench Information Seeking,34.4,[],wildbench_240829.csv +mistral_7b_instruct_v0_2,WildBench Information Seeking,40.1,[],wildbench_240829.csv +llama_2_70b_chat,WildBench Information Seeking,38.3,[],wildbench_240829.csv +qwen1_5_7b_chat,WildBench Information Seeking,34.0,[],wildbench_240829.csv +hermes_2_mixtral_8x7b_dpo,WildBench Information Seeking,39.8,[],wildbench_240829.csv +phi_3_mini_128k,WildBench Information Seeking,28.6,[],wildbench_240829.csv +gemma_7b_it,WildBench Information Seeking,12.7,[],wildbench_240829.csv +llama_2_7b_chat,WildBench Information Seeking,27.7,[],wildbench_240829.csv +gpt_4o_2024_05_13,WildBench Creative,59.1,[],wildbench_240829.csv +claude_3_5_sonnet,WildBench Creative,55.6,[],wildbench_240829.csv +gemini_1_5_pro,WildBench Creative,55.1,[],wildbench_240829.csv +gpt_4_turbo_2024_04_09,WildBench Creative,58.7,[],wildbench_240829.csv +yi_large_preview,WildBench Creative,57.6,[],wildbench_240829.csv +deepseek_v2_chat_0628_api,WildBench Creative,56.4,[],wildbench_240829.csv +gpt_4_0125_preview,WildBench Creative,57.6,[],wildbench_240829.csv +claude_3_opus,WildBench Creative,53.0,[],wildbench_240829.csv +gemini_1_5_flash,WildBench Creative,51.7,[],wildbench_240829.csv +llama3_70b_instruct,WildBench Creative,54.3,[],wildbench_240829.csv +deepseek_v2_coder_0614_api,WildBench Creative,40.8,[],wildbench_240829.csv +yi_large,WildBench Creative,51.8,[],wildbench_240829.csv +athene_70b,WildBench Creative,60.4,[],wildbench_240829.csv +nemotron_4_340b_inst,WildBench Creative,53.3,[],wildbench_240829.csv +gemma_2_27b_it,WildBench Creative,53.6,[],wildbench_240829.csv +mistral_large_2,WildBench Creative,58.9,[],wildbench_240829.csv +claude_3_sonnet,WildBench Creative,46.3,[],wildbench_240829.csv +gpt_4o_mini_2024_07_18,WildBench Creative,60.1,[],wildbench_240829.csv +qwen2_72b_instruct,WildBench Creative,49.9,[],wildbench_240829.csv +reka_core,WildBench Creative,55.5,[],wildbench_240829.csv +gemma_2_9b_it_simpo,WildBench Creative,58.0,[],wildbench_240829.csv +gemma_2_9b_it_dpo,WildBench Creative,59.1,[],wildbench_240829.csv +yi_1_5_34b_chat,WildBench Creative,53.5,[],wildbench_240829.csv +claude_3_haiku,WildBench Creative,42.9,[],wildbench_240829.csv +mistral_nemo_inst_12b,WildBench Creative,54.6,[],wildbench_240829.csv +mistral_large,WildBench Creative,49.7,[],wildbench_240829.csv +gemma_2_9b_it,WildBench Creative,51.0,[],wildbench_240829.csv +command_r_plus,WildBench Creative,52.6,[],wildbench_240829.csv +glm_4_9b_chat,WildBench Creative,47.8,[],wildbench_240829.csv +magpie_8b_align_v0_1,WildBench Creative,49.2,[],wildbench_240829.csv +yi_1_5_9b_chat,WildBench Creative,45.6,[],wildbench_240829.csv +llama3_inst_8b_simpo,WildBench Creative,50.6,[],wildbench_240829.csv +llama3_inst_8b_simpo_v0_2,WildBench Creative,51.8,[],wildbench_240829.csv +qwen1_5_72b_chat,WildBench Creative,50.4,[],wildbench_240829.csv +llama3_inst_8b_simpo_expo,WildBench Creative,49.1,[],wildbench_240829.csv +selm_llama3_8b_inst_iter3,WildBench Creative,51.1,[],wildbench_240829.csv +phi_3_medium_128k,WildBench Creative,33.2,[],wildbench_240829.csv +llama3_8b_instruct,WildBench Creative,43.6,[],wildbench_240829.csv +hermes_2_theta_llama3_8b,WildBench Creative,39.8,[],wildbench_240829.csv +starling_lm_7b_beta_expo,WildBench Creative,44.3,[],wildbench_240829.csv +selm_zephyr_7b_iter3,WildBench Creative,44.7,[],wildbench_240829.csv +reka_flash,WildBench Creative,42.4,[],wildbench_240829.csv +gemma_2_2b_it,WildBench Creative,43.6,[],wildbench_240829.csv +gpt_3_5_turbo_0125,WildBench Creative,37.4,[],wildbench_240829.csv +dbrx_instruct,WildBench Creative,42.3,[],wildbench_240829.csv +neo_7b_instruct_expo,WildBench Creative,38.5,[],wildbench_240829.csv +neo_7b_instruct,WildBench Creative,39.5,[],wildbench_240829.csv +starlinglm_7b_beta,WildBench Creative,43.8,[],wildbench_240829.csv +command_r,WildBench Creative,47.4,[],wildbench_240829.csv +mixtral_8x7b_instruct,WildBench Creative,42.8,[],wildbench_240829.csv +yi_1_5_6b_chat,WildBench Creative,31.1,[],wildbench_240829.csv +tulu_2_dpo_70b,WildBench Creative,42.7,[],wildbench_240829.csv +reka_edge,WildBench Creative,36.2,[],wildbench_240829.csv +mistral_7b_instruct_v0_2,WildBench Creative,42.1,[],wildbench_240829.csv +llama_2_70b_chat,WildBench Creative,40.0,[],wildbench_240829.csv +qwen1_5_7b_chat,WildBench Creative,38.3,[],wildbench_240829.csv +hermes_2_mixtral_8x7b_dpo,WildBench Creative,37.9,[],wildbench_240829.csv +phi_3_mini_128k,WildBench Creative,30.6,[],wildbench_240829.csv +gemma_7b_it,WildBench Creative,21.2,[],wildbench_240829.csv +llama_2_7b_chat,WildBench Creative,29.8,[],wildbench_240829.csv +gpt_4o_2024_05_13,WildBench Code Debugging,60.5,[],wildbench_240829.csv +claude_3_5_sonnet,WildBench Code Debugging,56.5,[],wildbench_240829.csv +gemini_1_5_pro,WildBench Code Debugging,55.2,[],wildbench_240829.csv +gpt_4_turbo_2024_04_09,WildBench Code Debugging,55.1,[],wildbench_240829.csv +yi_large_preview,WildBench Code Debugging,54.3,[],wildbench_240829.csv +deepseek_v2_chat_0628_api,WildBench Code Debugging,55.0,[],wildbench_240829.csv +gpt_4_0125_preview,WildBench Code Debugging,52.9,[],wildbench_240829.csv +claude_3_opus,WildBench Code Debugging,53.3,[],wildbench_240829.csv +gemini_1_5_flash,WildBench Code Debugging,48.7,[],wildbench_240829.csv +llama3_70b_instruct,WildBench Code Debugging,44.7,[],wildbench_240829.csv +deepseek_v2_coder_0614_api,WildBench Code Debugging,48.9,[],wildbench_240829.csv +yi_large,WildBench Code Debugging,47.7,[],wildbench_240829.csv +athene_70b,WildBench Code Debugging,59.0,[],wildbench_240829.csv +nemotron_4_340b_inst,WildBench Code Debugging,46.3,[],wildbench_240829.csv +gemma_2_27b_it,WildBench Code Debugging,47.0,[],wildbench_240829.csv +mistral_large_2,WildBench Code Debugging,53.8,[],wildbench_240829.csv +claude_3_sonnet,WildBench Code Debugging,46.1,[],wildbench_240829.csv +gpt_4o_mini_2024_07_18,WildBench Code Debugging,57.2,[],wildbench_240829.csv +qwen2_72b_instruct,WildBench Code Debugging,39.8,[],wildbench_240829.csv +reka_core,WildBench Code Debugging,40.6,[],wildbench_240829.csv +gemma_2_9b_it_simpo,WildBench Code Debugging,50.9,[],wildbench_240829.csv +gemma_2_9b_it_dpo,WildBench Code Debugging,50.5,[],wildbench_240829.csv +yi_1_5_34b_chat,WildBench Code Debugging,42.1,[],wildbench_240829.csv +claude_3_haiku,WildBench Code Debugging,37.0,[],wildbench_240829.csv +mistral_nemo_inst_12b,WildBench Code Debugging,39.7,[],wildbench_240829.csv +mistral_large,WildBench Code Debugging,33.7,[],wildbench_240829.csv +gemma_2_9b_it,WildBench Code Debugging,36.7,[],wildbench_240829.csv +command_r_plus,WildBench Code Debugging,28.4,[],wildbench_240829.csv +glm_4_9b_chat,WildBench Code Debugging,35.4,[],wildbench_240829.csv +magpie_8b_align_v0_1,WildBench Code Debugging,33.7,[],wildbench_240829.csv +yi_1_5_9b_chat,WildBench Code Debugging,35.0,[],wildbench_240829.csv +llama3_inst_8b_simpo,WildBench Code Debugging,31.8,[],wildbench_240829.csv +llama3_inst_8b_simpo_v0_2,WildBench Code Debugging,31.5,[],wildbench_240829.csv +qwen1_5_72b_chat,WildBench Code Debugging,35.4,[],wildbench_240829.csv +llama3_inst_8b_simpo_expo,WildBench Code Debugging,28.6,[],wildbench_240829.csv +selm_llama3_8b_inst_iter3,WildBench Code Debugging,27.3,[],wildbench_240829.csv +phi_3_medium_128k,WildBench Code Debugging,18.2,[],wildbench_240829.csv +llama3_8b_instruct,WildBench Code Debugging,22.0,[],wildbench_240829.csv +hermes_2_theta_llama3_8b,WildBench Code Debugging,23.1,[],wildbench_240829.csv +starling_lm_7b_beta_expo,WildBench Code Debugging,25.3,[],wildbench_240829.csv +selm_zephyr_7b_iter3,WildBench Code Debugging,11.0,[],wildbench_240829.csv +reka_flash,WildBench Code Debugging,22.1,[],wildbench_240829.csv +gemma_2_2b_it,WildBench Code Debugging,17.9,[],wildbench_240829.csv +gpt_3_5_turbo_0125,WildBench Code Debugging,26.5,[],wildbench_240829.csv +dbrx_instruct,WildBench Code Debugging,26.4,[],wildbench_240829.csv +neo_7b_instruct_expo,WildBench Code Debugging,12.8,[],wildbench_240829.csv +neo_7b_instruct,WildBench Code Debugging,14.0,[],wildbench_240829.csv +starlinglm_7b_beta,WildBench Code Debugging,24.4,[],wildbench_240829.csv +command_r,WildBench Code Debugging,19.3,[],wildbench_240829.csv +mixtral_8x7b_instruct,WildBench Code Debugging,25.0,[],wildbench_240829.csv +yi_1_5_6b_chat,WildBench Code Debugging,16.6,[],wildbench_240829.csv +tulu_2_dpo_70b,WildBench Code Debugging,20.7,[],wildbench_240829.csv +reka_edge,WildBench Code Debugging,13.5,[],wildbench_240829.csv +mistral_7b_instruct_v0_2,WildBench Code Debugging,18.4,[],wildbench_240829.csv +llama_2_70b_chat,WildBench Code Debugging,9.3,[],wildbench_240829.csv +qwen1_5_7b_chat,WildBench Code Debugging,14.9,[],wildbench_240829.csv +hermes_2_mixtral_8x7b_dpo,WildBench Code Debugging,26.0,[],wildbench_240829.csv +phi_3_mini_128k,WildBench Code Debugging,21.6,[],wildbench_240829.csv +gemma_7b_it,WildBench Code Debugging,1.8,[],wildbench_240829.csv +llama_2_7b_chat,WildBench Code Debugging,-6.8,[],wildbench_240829.csv +gpt_4o_2024_05_13,WildBench Math & Data,57.3,[],wildbench_240829.csv +claude_3_5_sonnet,WildBench Math & Data,50.2,[],wildbench_240829.csv +gemini_1_5_pro,WildBench Math & Data,48.6,[],wildbench_240829.csv +gpt_4_turbo_2024_04_09,WildBench Math & Data,51.0,[],wildbench_240829.csv +yi_large_preview,WildBench Math & Data,51.9,[],wildbench_240829.csv +deepseek_v2_chat_0628_api,WildBench Math & Data,51.4,[],wildbench_240829.csv +gpt_4_0125_preview,WildBench Math & Data,45.8,[],wildbench_240829.csv +claude_3_opus,WildBench Math & Data,46.7,[],wildbench_240829.csv +gemini_1_5_flash,WildBench Math & Data,45.3,[],wildbench_240829.csv +llama3_70b_instruct,WildBench Math & Data,42.1,[],wildbench_240829.csv +deepseek_v2_coder_0614_api,WildBench Math & Data,46.4,[],wildbench_240829.csv +yi_large,WildBench Math & Data,44.5,[],wildbench_240829.csv +athene_70b,WildBench Math & Data,57.1,[],wildbench_240829.csv +nemotron_4_340b_inst,WildBench Math & Data,40.8,[],wildbench_240829.csv +gemma_2_27b_it,WildBench Math & Data,43.9,[],wildbench_240829.csv +mistral_large_2,WildBench Math & Data,52.7,[],wildbench_240829.csv +claude_3_sonnet,WildBench Math & Data,40.6,[],wildbench_240829.csv +gpt_4o_mini_2024_07_18,WildBench Math & Data,54.0,[],wildbench_240829.csv +qwen2_72b_instruct,WildBench Math & Data,41.0,[],wildbench_240829.csv +reka_core,WildBench Math & Data,40.3,[],wildbench_240829.csv +gemma_2_9b_it_simpo,WildBench Math & Data,48.6,[],wildbench_240829.csv +gemma_2_9b_it_dpo,WildBench Math & Data,47.1,[],wildbench_240829.csv +yi_1_5_34b_chat,WildBench Math & Data,39.4,[],wildbench_240829.csv +claude_3_haiku,WildBench Math & Data,31.4,[],wildbench_240829.csv +mistral_nemo_inst_12b,WildBench Math & Data,35.6,[],wildbench_240829.csv +mistral_large,WildBench Math & Data,30.9,[],wildbench_240829.csv +gemma_2_9b_it,WildBench Math & Data,36.4,[],wildbench_240829.csv +command_r_plus,WildBench Math & Data,23.5,[],wildbench_240829.csv +glm_4_9b_chat,WildBench Math & Data,29.8,[],wildbench_240829.csv +magpie_8b_align_v0_1,WildBench Math & Data,29.8,[],wildbench_240829.csv +yi_1_5_9b_chat,WildBench Math & Data,32.2,[],wildbench_240829.csv +llama3_inst_8b_simpo,WildBench Math & Data,24.0,[],wildbench_240829.csv +llama3_inst_8b_simpo_v0_2,WildBench Math & Data,24.4,[],wildbench_240829.csv +qwen1_5_72b_chat,WildBench Math & Data,29.8,[],wildbench_240829.csv +llama3_inst_8b_simpo_expo,WildBench Math & Data,21.2,[],wildbench_240829.csv +selm_llama3_8b_inst_iter3,WildBench Math & Data,23.5,[],wildbench_240829.csv +phi_3_medium_128k,WildBench Math & Data,23.0,[],wildbench_240829.csv +llama3_8b_instruct,WildBench Math & Data,17.0,[],wildbench_240829.csv +hermes_2_theta_llama3_8b,WildBench Math & Data,18.7,[],wildbench_240829.csv +starling_lm_7b_beta_expo,WildBench Math & Data,18.6,[],wildbench_240829.csv +selm_zephyr_7b_iter3,WildBench Math & Data,12.7,[],wildbench_240829.csv +reka_flash,WildBench Math & Data,20.5,[],wildbench_240829.csv +gemma_2_2b_it,WildBench Math & Data,15.8,[],wildbench_240829.csv +gpt_3_5_turbo_0125,WildBench Math & Data,21.6,[],wildbench_240829.csv +dbrx_instruct,WildBench Math & Data,24.5,[],wildbench_240829.csv +neo_7b_instruct_expo,WildBench Math & Data,12.6,[],wildbench_240829.csv +neo_7b_instruct,WildBench Math & Data,15.0,[],wildbench_240829.csv +starlinglm_7b_beta,WildBench Math & Data,17.0,[],wildbench_240829.csv +command_r,WildBench Math & Data,16.0,[],wildbench_240829.csv +mixtral_8x7b_instruct,WildBench Math & Data,22.1,[],wildbench_240829.csv +yi_1_5_6b_chat,WildBench Math & Data,16.8,[],wildbench_240829.csv +tulu_2_dpo_70b,WildBench Math & Data,14.8,[],wildbench_240829.csv +reka_edge,WildBench Math & Data,8.9,[],wildbench_240829.csv +mistral_7b_instruct_v0_2,WildBench Math & Data,10.1,[],wildbench_240829.csv +llama_2_70b_chat,WildBench Math & Data,4.2,[],wildbench_240829.csv +qwen1_5_7b_chat,WildBench Math & Data,11.9,[],wildbench_240829.csv +hermes_2_mixtral_8x7b_dpo,WildBench Math & Data,21.8,[],wildbench_240829.csv +phi_3_mini_128k,WildBench Math & Data,18.6,[],wildbench_240829.csv +gemma_7b_it,WildBench Math & Data,-3.7,[],wildbench_240829.csv +llama_2_7b_chat,WildBench Math & Data,-7.2,[],wildbench_240829.csv +gpt_4o_2024_05_13,WildBench Reasoning & Planning,60.2,[],wildbench_240829.csv +claude_3_5_sonnet,WildBench Reasoning & Planning,55.6,[],wildbench_240829.csv +gemini_1_5_pro,WildBench Reasoning & Planning,53.7,[],wildbench_240829.csv +gpt_4_turbo_2024_04_09,WildBench Reasoning & Planning,56.2,[],wildbench_240829.csv +yi_large_preview,WildBench Reasoning & Planning,56.6,[],wildbench_240829.csv +deepseek_v2_chat_0628_api,WildBench Reasoning & Planning,54.8,[],wildbench_240829.csv +gpt_4_0125_preview,WildBench Reasoning & Planning,53.5,[],wildbench_240829.csv +claude_3_opus,WildBench Reasoning & Planning,52.5,[],wildbench_240829.csv +gemini_1_5_flash,WildBench Reasoning & Planning,50.8,[],wildbench_240829.csv +llama3_70b_instruct,WildBench Reasoning & Planning,50.1,[],wildbench_240829.csv +deepseek_v2_coder_0614_api,WildBench Reasoning & Planning,47.2,[],wildbench_240829.csv +yi_large,WildBench Reasoning & Planning,51.3,[],wildbench_240829.csv +athene_70b,WildBench Reasoning & Planning,61.0,[],wildbench_240829.csv +nemotron_4_340b_inst,WildBench Reasoning & Planning,49.1,[],wildbench_240829.csv +gemma_2_27b_it,WildBench Reasoning & Planning,50.6,[],wildbench_240829.csv +mistral_large_2,WildBench Reasoning & Planning,57.2,[],wildbench_240829.csv +claude_3_sonnet,WildBench Reasoning & Planning,47.4,[],wildbench_240829.csv +gpt_4o_mini_2024_07_18,WildBench Reasoning & Planning,58.2,[],wildbench_240829.csv +qwen2_72b_instruct,WildBench Reasoning & Planning,46.8,[],wildbench_240829.csv +reka_core,WildBench Reasoning & Planning,48.0,[],wildbench_240829.csv +gemma_2_9b_it_simpo,WildBench Reasoning & Planning,55.6,[],wildbench_240829.csv +gemma_2_9b_it_dpo,WildBench Reasoning & Planning,55.5,[],wildbench_240829.csv +yi_1_5_34b_chat,WildBench Reasoning & Planning,48.1,[],wildbench_240829.csv +claude_3_haiku,WildBench Reasoning & Planning,41.3,[],wildbench_240829.csv +mistral_nemo_inst_12b,WildBench Reasoning & Planning,47.4,[],wildbench_240829.csv +mistral_large,WildBench Reasoning & Planning,41.8,[],wildbench_240829.csv +gemma_2_9b_it,WildBench Reasoning & Planning,46.7,[],wildbench_240829.csv +command_r_plus,WildBench Reasoning & Planning,41.9,[],wildbench_240829.csv +glm_4_9b_chat,WildBench Reasoning & Planning,42.5,[],wildbench_240829.csv +magpie_8b_align_v0_1,WildBench Reasoning & Planning,42.7,[],wildbench_240829.csv +yi_1_5_9b_chat,WildBench Reasoning & Planning,42.4,[],wildbench_240829.csv +llama3_inst_8b_simpo,WildBench Reasoning & Planning,40.9,[],wildbench_240829.csv +llama3_inst_8b_simpo_v0_2,WildBench Reasoning & Planning,40.7,[],wildbench_240829.csv +qwen1_5_72b_chat,WildBench Reasoning & Planning,43.5,[],wildbench_240829.csv +llama3_inst_8b_simpo_expo,WildBench Reasoning & Planning,39.5,[],wildbench_240829.csv +selm_llama3_8b_inst_iter3,WildBench Reasoning & Planning,39.8,[],wildbench_240829.csv +phi_3_medium_128k,WildBench Reasoning & Planning,32.3,[],wildbench_240829.csv +llama3_8b_instruct,WildBench Reasoning & Planning,34.4,[],wildbench_240829.csv +hermes_2_theta_llama3_8b,WildBench Reasoning & Planning,33.7,[],wildbench_240829.csv +starling_lm_7b_beta_expo,WildBench Reasoning & Planning,36.3,[],wildbench_240829.csv +selm_zephyr_7b_iter3,WildBench Reasoning & Planning,31.6,[],wildbench_240829.csv +reka_flash,WildBench Reasoning & Planning,35.0,[],wildbench_240829.csv +gemma_2_2b_it,WildBench Reasoning & Planning,33.8,[],wildbench_240829.csv +gpt_3_5_turbo_0125,WildBench Reasoning & Planning,33.4,[],wildbench_240829.csv +dbrx_instruct,WildBench Reasoning & Planning,36.2,[],wildbench_240829.csv +neo_7b_instruct_expo,WildBench Reasoning & Planning,28.7,[],wildbench_240829.csv +neo_7b_instruct,WildBench Reasoning & Planning,31.4,[],wildbench_240829.csv +starlinglm_7b_beta,WildBench Reasoning & Planning,34.1,[],wildbench_240829.csv +command_r,WildBench Reasoning & Planning,34.6,[],wildbench_240829.csv +mixtral_8x7b_instruct,WildBench Reasoning & Planning,34.6,[],wildbench_240829.csv +yi_1_5_6b_chat,WildBench Reasoning & Planning,27.3,[],wildbench_240829.csv +tulu_2_dpo_70b,WildBench Reasoning & Planning,32.3,[],wildbench_240829.csv +reka_edge,WildBench Reasoning & Planning,25.0,[],wildbench_240829.csv +mistral_7b_instruct_v0_2,WildBench Reasoning & Planning,30.1,[],wildbench_240829.csv +llama_2_70b_chat,WildBench Reasoning & Planning,26.8,[],wildbench_240829.csv +qwen1_5_7b_chat,WildBench Reasoning & Planning,28.9,[],wildbench_240829.csv +hermes_2_mixtral_8x7b_dpo,WildBench Reasoning & Planning,34.2,[],wildbench_240829.csv +phi_3_mini_128k,WildBench Reasoning & Planning,28.1,[],wildbench_240829.csv +gemma_7b_it,WildBench Reasoning & Planning,10.2,[],wildbench_240829.csv +llama_2_7b_chat,WildBench Reasoning & Planning,15.4,[],wildbench_240829.csv +gpt_4o_2024_05_13,WildBench Score,59.3,[],wildbench_240829.csv +claude_3_5_sonnet,WildBench Score,54.7,[],wildbench_240829.csv +gemini_1_5_pro,WildBench Score,53.0,[],wildbench_240829.csv +gpt_4_turbo_2024_04_09,WildBench Score,55.2,[],wildbench_240829.csv +yi_large_preview,WildBench Score,55.3,[],wildbench_240829.csv +deepseek_v2_chat_0628_api,WildBench Score,54.0,[],wildbench_240829.csv +gpt_4_0125_preview,WildBench Score,52.3,[],wildbench_240829.csv +claude_3_opus,WildBench Score,51.7,[],wildbench_240829.csv +gemini_1_5_flash,WildBench Score,48.9,[],wildbench_240829.csv +llama3_70b_instruct,WildBench Score,47.8,[],wildbench_240829.csv +deepseek_v2_coder_0614_api,WildBench Score,45.7,[],wildbench_240829.csv +yi_large,WildBench Score,48.9,[],wildbench_240829.csv +athene_70b,WildBench Score,59.5,[],wildbench_240829.csv +nemotron_4_340b_inst,WildBench Score,47.7,[],wildbench_240829.csv +gemma_2_27b_it,WildBench Score,48.5,[],wildbench_240829.csv +mistral_large_2,WildBench Score,55.6,[],wildbench_240829.csv +claude_3_sonnet,WildBench Score,45.5,[],wildbench_240829.csv +gpt_4o_mini_2024_07_18,WildBench Score,57.1,[],wildbench_240829.csv +qwen2_72b_instruct,WildBench Score,44.5,[],wildbench_240829.csv +reka_core,WildBench Score,45.9,[],wildbench_240829.csv +gemma_2_9b_it_simpo,WildBench Score,53.3,[],wildbench_240829.csv +gemma_2_9b_it_dpo,WildBench Score,53.2,[],wildbench_240829.csv +yi_1_5_34b_chat,WildBench Score,45.6,[],wildbench_240829.csv +claude_3_haiku,WildBench Score,38.9,[],wildbench_240829.csv +mistral_nemo_inst_12b,WildBench Score,44.4,[],wildbench_240829.csv +mistral_large,WildBench Score,38.9,[],wildbench_240829.csv +gemma_2_9b_it,WildBench Score,42.7,[],wildbench_240829.csv +command_r_plus,WildBench Score,36.8,[],wildbench_240829.csv +glm_4_9b_chat,WildBench Score,39.1,[],wildbench_240829.csv +magpie_8b_align_v0_1,WildBench Score,39.3,[],wildbench_240829.csv +yi_1_5_9b_chat,WildBench Score,38.7,[],wildbench_240829.csv +llama3_inst_8b_simpo,WildBench Score,37.0,[],wildbench_240829.csv +llama3_inst_8b_simpo_v0_2,WildBench Score,37.2,[],wildbench_240829.csv +qwen1_5_72b_chat,WildBench Score,39.9,[],wildbench_240829.csv +llama3_inst_8b_simpo_expo,WildBench Score,35.0,[],wildbench_240829.csv +selm_llama3_8b_inst_iter3,WildBench Score,35.3,[],wildbench_240829.csv +phi_3_medium_128k,WildBench Score,27.3,[],wildbench_240829.csv +llama3_8b_instruct,WildBench Score,29.2,[],wildbench_240829.csv +hermes_2_theta_llama3_8b,WildBench Score,29.6,[],wildbench_240829.csv +starling_lm_7b_beta_expo,WildBench Score,31.6,[],wildbench_240829.csv +selm_zephyr_7b_iter3,WildBench Score,25.1,[],wildbench_240829.csv +reka_flash,WildBench Score,30.4,[],wildbench_240829.csv +gemma_2_2b_it,WildBench Score,27.8,[],wildbench_240829.csv +gpt_3_5_turbo_0125,WildBench Score,30.0,[],wildbench_240829.csv +dbrx_instruct,WildBench Score,32.6,[],wildbench_240829.csv +neo_7b_instruct_expo,WildBench Score,23.1,[],wildbench_240829.csv +neo_7b_instruct,WildBench Score,25.0,[],wildbench_240829.csv +starlinglm_7b_beta,WildBench Score,30.2,[],wildbench_240829.csv +command_r,WildBench Score,29.5,[],wildbench_240829.csv +mixtral_8x7b_instruct,WildBench Score,31.5,[],wildbench_240829.csv +yi_1_5_6b_chat,WildBench Score,23.3,[],wildbench_240829.csv +tulu_2_dpo_70b,WildBench Score,28.0,[],wildbench_240829.csv +reka_edge,WildBench Score,21.3,[],wildbench_240829.csv +mistral_7b_instruct_v0_2,WildBench Score,25.6,[],wildbench_240829.csv +llama_2_70b_chat,WildBench Score,20.7,[],wildbench_240829.csv +qwen1_5_7b_chat,WildBench Score,23.4,[],wildbench_240829.csv +hermes_2_mixtral_8x7b_dpo,WildBench Score,30.7,[],wildbench_240829.csv +phi_3_mini_128k,WildBench Score,24.7,[],wildbench_240829.csv +gemma_7b_it,WildBench Score,6.6,[],wildbench_240829.csv +llama_2_7b_chat,WildBench Score,8.3,[],wildbench_240829.csv +o1_mini,Decentralized Arena (0-1 Normalized),1.0,[],dec_arena_241022.csv +o1_preview,Decentralized Arena (0-1 Normalized),0.988296,[],dec_arena_241022.csv +chatgpt_4o_latest_2024_09_03,Decentralized Arena (0-1 Normalized),0.971391,[],dec_arena_241022.csv +yi_lightning,Decentralized Arena (0-1 Normalized),0.955415,[],dec_arena_241022.csv +glm_4_plus,Decentralized Arena (0-1 Normalized),0.910273,[],dec_arena_241022.csv +claude_3_5_sonnet,Decentralized Arena (0-1 Normalized),0.897083,[],dec_arena_241022.csv +gpt_4o_2024_05_13,Decentralized Arena (0-1 Normalized),0.894297,[],dec_arena_241022.csv +gpt_4o_2024_08_06,Decentralized Arena (0-1 Normalized),0.889095,[],dec_arena_241022.csv +nemotron_70b,Decentralized Arena (0-1 Normalized),0.881107,[],dec_arena_241022.csv +gpt_4o_mini_2024_07_18,Decentralized Arena (0-1 Normalized),0.873119,[],dec_arena_241022.csv +gpt_4_turbo_2024_04_09,Decentralized Arena (0-1 Normalized),0.865131,[],dec_arena_241022.csv +gemini_1_5_pro_001,Decentralized Arena (0-1 Normalized),0.854542,[],dec_arena_241022.csv +qwen2_72b_instruct,Decentralized Arena (0-1 Normalized),0.814787,[],dec_arena_241022.csv +claude_3_opus,Decentralized Arena (0-1 Normalized),0.804198,[],dec_arena_241022.csv +gpt4_1106,Decentralized Arena (0-1 Normalized),0.761657,[],dec_arena_241022.csv +gemini_1_5_flash_001,Decentralized Arena (0-1 Normalized),0.761657,[],dec_arena_241022.csv +llama3_1_70b_instruct,Decentralized Arena (0-1 Normalized),0.759056,[],dec_arena_241022.csv +gemma_2_9b_it_simpo,Decentralized Arena (0-1 Normalized),0.73695,[],dec_arena_241022.csv +gemma_2_27b_it,Decentralized Arena (0-1 Normalized),0.716515,[],dec_arena_241022.csv +google_gemma_2_9b_it,Decentralized Arena (0-1 Normalized),0.687349,[],dec_arena_241022.csv +yi_1_5_34b_chat,Decentralized Arena (0-1 Normalized),0.671373,[],dec_arena_241022.csv +llama3_70b_instruct,Decentralized Arena (0-1 Normalized),0.658183,[],dec_arena_241022.csv +claude_3_haiku,Decentralized Arena (0-1 Normalized),0.591863,[],dec_arena_241022.csv +qwen1_5_72b_chat,Decentralized Arena (0-1 Normalized),0.583875,[],dec_arena_241022.csv +llama3_1_8b_instruct,Decentralized Arena (0-1 Normalized),0.533346,[],dec_arena_241022.csv +qwen1_5_32b_chat,Decentralized Arena (0-1 Normalized),0.533346,[],dec_arena_241022.csv +claude_2_1,Decentralized Arena (0-1 Normalized),0.509567,[],dec_arena_241022.csv +claude_2_0,Decentralized Arena (0-1 Normalized),0.501579,[],dec_arena_241022.csv +starling_lm_7b_beta,Decentralized Arena (0-1 Normalized),0.464425,[],dec_arena_241022.csv +qwen1_5_14b_chat,Decentralized Arena (0-1 Normalized),0.43786,[],dec_arena_241022.csv +mistral_8x7b_instruct_v0_1,Decentralized Arena (0-1 Normalized),0.43786,[],dec_arena_241022.csv +llama3_8b_instruct,Decentralized Arena (0-1 Normalized),0.421884,[],dec_arena_241022.csv +gemma_2_2b_it,Decentralized Arena (0-1 Normalized),0.414081,[],dec_arena_241022.csv +gpt3_5_turbo_0125,Decentralized Arena (0-1 Normalized),0.411295,[],dec_arena_241022.csv +command_r_08_2024,Decentralized Arena (0-1 Normalized),0.392718,[],dec_arena_241022.csv +openchat_3_5_0106,Decentralized Arena (0-1 Normalized),0.387516,[],dec_arena_241022.csv +openchat_3_5,Decentralized Arena (0-1 Normalized),0.374141,[],dec_arena_241022.csv +command_r_04_2024,Decentralized Arena (0-1 Normalized),0.339773,[],dec_arena_241022.csv +gemma_1_1_7b_it,Decentralized Arena (0-1 Normalized),0.336987,[],dec_arena_241022.csv +starling_lm_7b_alpha,Decentralized Arena (0-1 Normalized),0.331785,[],dec_arena_241022.csv +gemini_1_0_pro_001,Decentralized Arena (0-1 Normalized),0.326398,[],dec_arena_241022.csv +mistral_7b_instruct_2,Decentralized Arena (0-1 Normalized),0.260078,[],dec_arena_241022.csv +llama3_2_3b_it,Decentralized Arena (0-1 Normalized),0.25209,[],dec_arena_241022.csv +vicuna_33b,Decentralized Arena (0-1 Normalized),0.2389,[],dec_arena_241022.csv +gemma_7b_it,Decentralized Arena (0-1 Normalized),0.228311,[],dec_arena_241022.csv +qwen1_5_4b_chat,Decentralized Arena (0-1 Normalized),0.146015,[],dec_arena_241022.csv +mistral_7b_instruct_1,Decentralized Arena (0-1 Normalized),0.143229,[],dec_arena_241022.csv +vicuna_13b,Decentralized Arena (0-1 Normalized),0.140628,[],dec_arena_241022.csv +gemma_1_1_2b_it,Decentralized Arena (0-1 Normalized),0.135426,[],dec_arena_241022.csv +llama2_7b_chat,Decentralized Arena (0-1 Normalized),0.127438,[],dec_arena_241022.csv +llama2_13b_chat,Decentralized Arena (0-1 Normalized),0.116849,[],dec_arena_241022.csv +gemma_2b_it,Decentralized Arena (0-1 Normalized),0.087498,[],dec_arena_241022.csv +vicuna_7b,Decentralized Arena (0-1 Normalized),0.071707,[],dec_arena_241022.csv +zephyr_7b_beta,Decentralized Arena (0-1 Normalized),0.058332,[],dec_arena_241022.csv +koala_13b,Decentralized Arena (0-1 Normalized),0.026565,[],dec_arena_241022.csv +openassistant_pythia_12b,Decentralized Arena (0-1 Normalized),0.0,[],dec_arena_241022.csv +claude_3_5_sonnet_20240620,Arena Hard,79.3,[],arena_hard_240829.csv +gpt_4o_2024_05_13,Arena Hard,79.2,[],arena_hard_240829.csv +gpt_4_0125_preview,Arena Hard,78.0,[],arena_hard_240829.csv +gpt_4o_2024_08_06,Arena Hard,77.9,[],arena_hard_240829.csv +athene_70b,Arena Hard,77.6,[],arena_hard_240829.csv +gpt_4o_mini,Arena Hard,74.9,[],arena_hard_240829.csv +gemini_1_5_pro_api_preview,Arena Hard,72.0,[],arena_hard_240829.csv +mistral_large_2407,Arena Hard,70.4,[],arena_hard_240829.csv +llama3_1_405b_instruct,Arena Hard,64.1,[],arena_hard_240829.csv +glm_4_0520,Arena Hard,63.8,[],arena_hard_240829.csv +yi_large,Arena Hard,63.7,[],arena_hard_240829.csv +deepseek_coder_v2,Arena Hard,62.3,[],arena_hard_240829.csv +claude_3_opus_20240229,Arena Hard,60.4,[],arena_hard_240829.csv +gemma_2_27b_it,Arena Hard,57.5,[],arena_hard_240829.csv +llama3_1_70b_instruct,Arena Hard,55.7,[],arena_hard_240829.csv +glm_4_0116,Arena Hard,55.7,[],arena_hard_240829.csv +glm_4_air,Arena Hard,50.9,[],arena_hard_240829.csv +gpt_4_0314,Arena Hard,50.0,[],arena_hard_240829.csv +gemini_1_5_flash_api_preview,Arena Hard,49.6,[],arena_hard_240829.csv +qwen2_72b_instruct,Arena Hard,46.9,[],arena_hard_240829.csv +claude_3_sonnet_20240229,Arena Hard,46.8,[],arena_hard_240829.csv +llama3_70b_instruct,Arena Hard,46.6,[],arena_hard_240829.csv +claude_3_haiku_20240307,Arena Hard,41.5,[],arena_hard_240829.csv +gpt_4_0613,Arena Hard,37.9,[],arena_hard_240829.csv +mistral_large_2402,Arena Hard,37.7,[],arena_hard_240829.csv +mixtral_8x22b_instruct_v0_1,Arena Hard,36.4,[],arena_hard_240829.csv +qwen1_5_72b_chat,Arena Hard,36.1,[],arena_hard_240829.csv +phi_3_medium_4k_instruct,Arena Hard,33.4,[],arena_hard_240829.csv +command_r_plus,Arena Hard,33.1,[],arena_hard_240829.csv +mistral_medium,Arena Hard,31.9,[],arena_hard_240829.csv +internlm2_5_20b_chat,Arena Hard,31.2,[],arena_hard_240829.csv +phi_3_small_8k_instruct,Arena Hard,29.8,[],arena_hard_240829.csv +mistral_next,Arena Hard,27.4,[],arena_hard_240829.csv +gpt_3_5_turbo_0613,Arena Hard,24.8,[],arena_hard_240829.csv +dbrx_instructruct_preview,Arena Hard,24.6,[],arena_hard_240829.csv +internlm2_20b_chat,Arena Hard,24.4,[],arena_hard_240829.csv +claude_2_0,Arena Hard,24.0,[],arena_hard_240829.csv +mixtral_8x7b_instruct_v0_1,Arena Hard,23.4,[],arena_hard_240829.csv +gpt_3_5_turbo_0125,Arena Hard,23.3,[],arena_hard_240829.csv +yi_34b_chat,Arena Hard,23.1,[],arena_hard_240829.csv +starling_lm_7b_beta,Arena Hard,23.0,[],arena_hard_240829.csv +claude_2_1,Arena Hard,22.8,[],arena_hard_240829.csv +llama3_1_8b_instruct,Arena Hard,21.3,[],arena_hard_240829.csv +snorkel_mistral_pairrm_dpo,Arena Hard,20.7,[],arena_hard_240829.csv +llama3_8b_instruct,Arena Hard,20.6,[],arena_hard_240829.csv +gpt_3_5_turbo_1106,Arena Hard,18.9,[],arena_hard_240829.csv +gpt_3_5_turbo_0301,Arena Hard,18.1,[],arena_hard_240829.csv +gemini_1_0_pro,Arena Hard,17.8,[],arena_hard_240829.csv +snowflake_arctic_instruct,Arena Hard,17.6,[],arena_hard_240829.csv +command_r,Arena Hard,17.0,[],arena_hard_240829.csv +phi_3_mini_128k_instruct,Arena Hard,15.4,[],arena_hard_240829.csv +tulu_2_dpo_70b,Arena Hard,15.0,[],arena_hard_240829.csv +starling_lm_7b_alpha,Arena Hard,12.8,[],arena_hard_240829.csv +mistral_7b_instruct,Arena Hard,12.6,[],arena_hard_240829.csv +gemma_1_1_7b_it,Arena Hard,12.1,[],arena_hard_240829.csv +llama_2_70b_chat,Arena Hard,11.6,[],arena_hard_240829.csv +vicuna_33b_v1_3,Arena Hard,8.6,[],arena_hard_240829.csv +gemma_7b_it,Arena Hard,7.5,[],arena_hard_240829.csv +llama_2_7b_chat,Arena Hard,4.6,[],arena_hard_240829.csv +gemma_1_1_2b_it,Arena Hard,3.4,[],arena_hard_240829.csv +gemma_2b_it,Arena Hard,3.0,[],arena_hard_240829.csv +gpt_4_0613,AgentBench,4.01,[],agenbench_240829.csv +claude_2,AgentBench,2.49,[],agenbench_240829.csv +claude_v1_3,AgentBench,2.44,[],agenbench_240829.csv +gpt_3_5_turbo_0613,AgentBench,2.32,[],agenbench_240829.csv +text_davinci_003,AgentBench,1.71,[],agenbench_240829.csv +claude_instant_v1_1,AgentBench,1.6,[],agenbench_240829.csv +chat_bison_001,AgentBench,1.39,[],agenbench_240829.csv +text_davinci_002,AgentBench,1.25,[],agenbench_240829.csv +llama_2_70b_chat,AgentBench,0.78,[],agenbench_240829.csv +guanaco_65b,AgentBench,0.54,[],agenbench_240829.csv +codellama34b_instruct,AgentBench,0.96,[],agenbench_240829.csv +vicuna_33b_v1_3,AgentBench,0.73,[],agenbench_240829.csv +wizardlm_30b_v1_0,AgentBench,0.46,[],agenbench_240829.csv +guanaco_33b,AgentBench,0.39,[],agenbench_240829.csv +vicuna_13b_v1_5,AgentBench,0.93,[],agenbench_240829.csv +llama_2_13b_chat,AgentBench,0.77,[],agenbench_240829.csv +openchat_13b_v3_2,AgentBench,0.7,[],agenbench_240829.csv +wizardlm_13b_v1_2,AgentBench,0.66,[],agenbench_240829.csv +vicuna_7b_v1_5,AgentBench,0.56,[],agenbench_240829.csv +codellama_13b_instruct,AgentBench,0.56,[],agenbench_240829.csv +codellama_7b_instruct,AgentBench,0.5,[],agenbench_240829.csv +koala_13b,AgentBench,0.34,[],agenbench_240829.csv +llama_2_7b_chat,AgentBench,0.34,[],agenbench_240829.csv +codegeex2_6b,AgentBench,0.27,[],agenbench_240829.csv +dolly_12b_v2,AgentBench,0.14,[],agenbench_240829.csv +chatglm_6b_v1_1,AgentBench,0.11,[],agenbench_240829.csv +oasst_12b_sft_4,AgentBench,0.03,[],agenbench_240829.csv +gpt_4,MT-Bench,8.99,[],mtbench_240829_frozen.csv +gpt_3_5_turbo,MT-Bench,7.94,[],mtbench_240829_frozen.csv +claude_v1,MT-Bench,7.9,[],mtbench_240829_frozen.csv +claude_instant_v1,MT-Bench,7.85,[],mtbench_240829_frozen.csv +vicuna_33b,MT-Bench,7.12,[],mtbench_240829_frozen.csv +wizardlm_30b,MT-Bench,7.01,[],mtbench_240829_frozen.csv +guanaco_33b,MT-Bench,6.53,[],mtbench_240829_frozen.csv +tulu_30b,MT-Bench,6.43,[],mtbench_240829_frozen.csv +guanaco_65b,MT-Bench,6.41,[],mtbench_240829_frozen.csv +openassistant_llama30b,MT-Bench,6.41,[],mtbench_240829_frozen.csv +palm_chat_bison_001,MT-Bench,6.4,[],mtbench_240829_frozen.csv +vicuna_13b,MT-Bench,6.39,[],mtbench_240829_frozen.csv +mpt_30b_chat,MT-Bench,6.39,[],mtbench_240829_frozen.csv +wizardlm_13b,MT-Bench,6.35,[],mtbench_240829_frozen.csv +vicuna_7b,MT-Bench,6.0,[],mtbench_240829_frozen.csv +baize_v2_13b,MT-Bench,5.75,[],mtbench_240829_frozen.csv +nous_hermes_13b,MT-Bench,5.51,[],mtbench_240829_frozen.csv +mpt_7b_chat,MT-Bench,5.42,[],mtbench_240829_frozen.csv +gpt4all_13b_snoozy,MT-Bench,5.41,[],mtbench_240829_frozen.csv +koala_13b,MT-Bench,5.35,[],mtbench_240829_frozen.csv +mpt_30b_instruct,MT-Bench,5.22,[],mtbench_240829_frozen.csv +falcon_40b_instruct,MT-Bench,5.17,[],mtbench_240829_frozen.csv +h2o_oasst_openllama_13b,MT-Bench,4.63,[],mtbench_240829_frozen.csv +alpaca_13b,MT-Bench,4.53,[],mtbench_240829_frozen.csv +chatglm_6b,MT-Bench,4.5,[],mtbench_240829_frozen.csv +openassistant_pythia_12b,MT-Bench,4.32,[],mtbench_240829_frozen.csv +rwkv_4_raven_14b,MT-Bench,3.98,[],mtbench_240829_frozen.csv +dolly_v2_12b,MT-Bench,3.28,[],mtbench_240829_frozen.csv +fastchat_t5_3b,MT-Bench,3.04,[],mtbench_240829_frozen.csv +stablelm_tuned_alpha_7b,MT-Bench,2.75,[],mtbench_240829_frozen.csv +llama_13b,MT-Bench,2.61,[],mtbench_240829_frozen.csv +0001_dpo_iter_2,HF OpenLLM v1,59.01,,hf_open_llm_v1_240829_frozen.csv +0001_dpo_iter_2,HFv1 ARC,60.41,,hf_open_llm_v1_240829_frozen.csv +0001_dpo_iter_2,HFv1 GSM8K,18.8,,hf_open_llm_v1_240829_frozen.csv +0001_dpo_iter_2,HFv1 HellaSwag,84.52,,hf_open_llm_v1_240829_frozen.csv +0001_dpo_iter_2,HFv1 MMLU,60.02,,hf_open_llm_v1_240829_frozen.csv +0001_dpo_iter_2,HFv1 TruthfulQA,53.11,,hf_open_llm_v1_240829_frozen.csv +0001_dpo_iter_2,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_5102lr_iter_4,HF OpenLLM v1,58.94,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_5102lr_iter_4,HFv1 ARC,61.95,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_5102lr_iter_4,HFv1 GSM8K,17.06,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_5102lr_iter_4,HFv1 HellaSwag,84.88,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_5102lr_iter_4,HFv1 MMLU,60.46,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_5102lr_iter_4,HFv1 TruthfulQA,51.71,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_5102lr_iter_4,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_511lr_iter_2,HF OpenLLM v1,59.6,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_511lr_iter_2,HFv1 ARC,61.6,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_511lr_iter_2,HFv1 GSM8K,20.62,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_511lr_iter_2,HFv1 HellaSwag,84.83,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_511lr_iter_2,HFv1 MMLU,60.6,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_511lr_iter_2,HFv1 TruthfulQA,52.2,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_511lr_iter_2,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_511lr_iter_3,HF OpenLLM v1,58.97,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_511lr_iter_3,HFv1 ARC,61.95,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_511lr_iter_3,HFv1 GSM8K,17.13,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_511lr_iter_3,HFv1 HellaSwag,84.87,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_511lr_iter_3,HFv1 MMLU,60.49,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_511lr_iter_3,HFv1 TruthfulQA,51.79,,hf_open_llm_v1_240829_frozen.csv +0_0001_withdpo_4iters_bs256_511lr_iter_3,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_1,HF OpenLLM v1,68.56,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_1,HFv1 ARC,63.99,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_1,HFv1 GSM8K,68.92,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_1,HFv1 HellaSwag,80.62,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_1,HFv1 MMLU,68.28,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_1,HFv1 TruthfulQA,53.85,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_1,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_2,HF OpenLLM v1,68.73,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_2,HFv1 ARC,64.93,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_2,HFv1 GSM8K,66.49,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_2,HFv1 HellaSwag,81.38,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_2,HFv1 MMLU,68.11,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_2,HFv1 TruthfulQA,56.0,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_2,HFv1 Winogrande,75.45,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_3,HF OpenLLM v1,68.68,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_3,HFv1 ARC,64.76,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_3,HFv1 GSM8K,64.9,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_3,HFv1 HellaSwag,81.61,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_3,HFv1 MMLU,68.08,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_3,HFv1 TruthfulQA,57.36,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_4iters_bs128_5551lr_iter_3,HFv1 Winogrande,75.37,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_iter_1,HF OpenLLM v1,68.73,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 ARC,64.33,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 GSM8K,67.93,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 HellaSwag,81.24,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 MMLU,68.36,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 TruthfulQA,55.21,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 Winogrande,75.3,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_iter_2,HF OpenLLM v1,68.63,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 ARC,64.42,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 GSM8K,66.87,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 HellaSwag,81.37,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 MMLU,68.1,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 TruthfulQA,56.33,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 Winogrande,74.66,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_oldtrl_iter_2,HF OpenLLM v1,68.23,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_oldtrl_iter_2,HFv1 ARC,64.68,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_oldtrl_iter_2,HFv1 GSM8K,64.52,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_oldtrl_iter_2,HFv1 HellaSwag,81.21,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_oldtrl_iter_2,HFv1 MMLU,68.29,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_oldtrl_iter_2,HFv1 TruthfulQA,56.35,,hf_open_llm_v1_240829_frozen.csv +0_0005_llama_nodpo_3iters_bs128_531lr_oldtrl_iter_2,HFv1 Winogrande,74.35,,hf_open_llm_v1_240829_frozen.csv +0_0005_withdpo_4iters_bs256_5551lr_iter_4,HF OpenLLM v1,58.46,,hf_open_llm_v1_240829_frozen.csv +0_0005_withdpo_4iters_bs256_5551lr_iter_4,HFv1 ARC,60.92,,hf_open_llm_v1_240829_frozen.csv +0_0005_withdpo_4iters_bs256_5551lr_iter_4,HFv1 GSM8K,15.01,,hf_open_llm_v1_240829_frozen.csv +0_0005_withdpo_4iters_bs256_5551lr_iter_4,HFv1 HellaSwag,85.0,,hf_open_llm_v1_240829_frozen.csv +0_0005_withdpo_4iters_bs256_5551lr_iter_4,HFv1 MMLU,60.17,,hf_open_llm_v1_240829_frozen.csv +0_0005_withdpo_4iters_bs256_5551lr_iter_4,HFv1 TruthfulQA,52.22,,hf_open_llm_v1_240829_frozen.csv +0_0005_withdpo_4iters_bs256_5551lr_iter_4,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv +0_0005_withdpo_4iters_bs256_555lr_iter_2,HF OpenLLM v1,59.5,,hf_open_llm_v1_240829_frozen.csv +0_0005_withdpo_4iters_bs256_555lr_iter_2,HFv1 ARC,61.69,,hf_open_llm_v1_240829_frozen.csv +0_0005_withdpo_4iters_bs256_555lr_iter_2,HFv1 GSM8K,19.79,,hf_open_llm_v1_240829_frozen.csv +0_0005_withdpo_4iters_bs256_555lr_iter_2,HFv1 HellaSwag,84.82,,hf_open_llm_v1_240829_frozen.csv +0_0005_withdpo_4iters_bs256_555lr_iter_2,HFv1 MMLU,60.56,,hf_open_llm_v1_240829_frozen.csv +0_0005_withdpo_4iters_bs256_555lr_iter_2,HFv1 TruthfulQA,52.25,,hf_open_llm_v1_240829_frozen.csv +0_0005_withdpo_4iters_bs256_555lr_iter_2,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv +0_001_3iters_bs256_nodpo_only4w_iter_3,HF OpenLLM v1,59.19,,hf_open_llm_v1_240829_frozen.csv +0_001_3iters_bs256_nodpo_only4w_iter_3,HFv1 ARC,63.23,,hf_open_llm_v1_240829_frozen.csv +0_001_3iters_bs256_nodpo_only4w_iter_3,HFv1 GSM8K,11.68,,hf_open_llm_v1_240829_frozen.csv +0_001_3iters_bs256_nodpo_only4w_iter_3,HFv1 HellaSwag,84.87,,hf_open_llm_v1_240829_frozen.csv +0_001_3iters_bs256_nodpo_only4w_iter_3,HFv1 MMLU,60.25,,hf_open_llm_v1_240829_frozen.csv +0_001_3iters_bs256_nodpo_only4w_iter_3,HFv1 TruthfulQA,57.74,,hf_open_llm_v1_240829_frozen.csv +0_001_3iters_bs256_nodpo_only4w_iter_3,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv +0_001_4iters_bs256_nodpo_only4w_iter_4,HF OpenLLM v1,58.66,,hf_open_llm_v1_240829_frozen.csv +0_001_4iters_bs256_nodpo_only4w_iter_4,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv +0_001_4iters_bs256_nodpo_only4w_iter_4,HFv1 GSM8K,12.59,,hf_open_llm_v1_240829_frozen.csv +0_001_4iters_bs256_nodpo_only4w_iter_4,HFv1 HellaSwag,84.62,,hf_open_llm_v1_240829_frozen.csv +0_001_4iters_bs256_nodpo_only4w_iter_4,HFv1 MMLU,60.08,,hf_open_llm_v1_240829_frozen.csv +0_001_4iters_bs256_nodpo_only4w_iter_4,HFv1 TruthfulQA,56.04,,hf_open_llm_v1_240829_frozen.csv +0_001_4iters_bs256_nodpo_only4w_iter_4,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv +0_001_4iters_bs256_nodpo_only4w_userresponse_iter_4,HF OpenLLM v1,60.64,,hf_open_llm_v1_240829_frozen.csv +0_001_4iters_bs256_nodpo_only4w_userresponse_iter_4,HFv1 ARC,61.69,,hf_open_llm_v1_240829_frozen.csv +0_001_4iters_bs256_nodpo_only4w_userresponse_iter_4,HFv1 GSM8K,23.35,,hf_open_llm_v1_240829_frozen.csv +0_001_4iters_bs256_nodpo_only4w_userresponse_iter_4,HFv1 HellaSwag,84.43,,hf_open_llm_v1_240829_frozen.csv +0_001_4iters_bs256_nodpo_only4w_userresponse_iter_4,HFv1 MMLU,60.92,,hf_open_llm_v1_240829_frozen.csv +0_001_4iters_bs256_nodpo_only4w_userresponse_iter_4,HFv1 TruthfulQA,55.39,,hf_open_llm_v1_240829_frozen.csv +0_001_4iters_bs256_nodpo_only4w_userresponse_iter_4,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv +0_001_ablation_5iters_bs256_iter_5,HF OpenLLM v1,59.03,,hf_open_llm_v1_240829_frozen.csv +0_001_ablation_5iters_bs256_iter_5,HFv1 ARC,61.35,,hf_open_llm_v1_240829_frozen.csv +0_001_ablation_5iters_bs256_iter_5,HFv1 GSM8K,15.24,,hf_open_llm_v1_240829_frozen.csv +0_001_ablation_5iters_bs256_iter_5,HFv1 HellaSwag,85.03,,hf_open_llm_v1_240829_frozen.csv +0_001_ablation_5iters_bs256_iter_5,HFv1 MMLU,60.09,,hf_open_llm_v1_240829_frozen.csv +0_001_ablation_5iters_bs256_iter_5,HFv1 TruthfulQA,55.2,,hf_open_llm_v1_240829_frozen.csv +0_001_ablation_5iters_bs256_iter_5,HFv1 Winogrande,77.27,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_2,HF OpenLLM v1,60.33,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_2,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_2,HFv1 GSM8K,27.82,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_2,HFv1 HellaSwag,84.81,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_2,HFv1 MMLU,61.11,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_2,HFv1 TruthfulQA,48.18,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_2,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_3,HF OpenLLM v1,60.82,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_3,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_3,HFv1 GSM8K,28.28,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_3,HFv1 HellaSwag,84.98,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_3,HFv1 MMLU,60.69,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_3,HFv1 TruthfulQA,50.33,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_3,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_4,HF OpenLLM v1,60.67,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_4,HFv1 ARC,62.71,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_4,HFv1 GSM8K,27.37,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_4,HFv1 HellaSwag,85.09,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_4,HFv1 MMLU,60.47,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_4,HFv1 TruthfulQA,51.01,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_4iters_iter_4,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_iter_2,HF OpenLLM v1,61.96,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_iter_2,HFv1 ARC,64.08,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_iter_2,HFv1 GSM8K,30.25,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_iter_2,HFv1 HellaSwag,85.25,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_iter_2,HFv1 MMLU,60.52,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_iter_2,HFv1 TruthfulQA,54.23,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_iter_2,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_iter_3,HF OpenLLM v1,61.7,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_iter_3,HFv1 ARC,64.16,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_iter_3,HFv1 GSM8K,28.51,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_iter_3,HFv1 HellaSwag,85.3,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_iter_3,HFv1 MMLU,60.31,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_iter_3,HFv1 TruthfulQA,54.72,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_declr_iter_3,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_1,HF OpenLLM v1,61.31,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_1,HFv1 ARC,63.65,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_1,HFv1 GSM8K,28.81,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_1,HFv1 HellaSwag,84.87,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_1,HFv1 MMLU,60.74,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_1,HFv1 TruthfulQA,52.05,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_1,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_2,HF OpenLLM v1,62.51,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_2,HFv1 ARC,64.42,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_2,HFv1 GSM8K,32.22,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_2,HFv1 HellaSwag,85.47,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_2,HFv1 MMLU,60.72,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_2,HFv1 TruthfulQA,54.4,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_2,HFv1 Winogrande,77.82,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_3,HF OpenLLM v1,62.62,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_3,HFv1 ARC,63.74,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_3,HFv1 GSM8K,32.75,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_3,HFv1 HellaSwag,85.58,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_3,HFv1 MMLU,60.33,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_3,HFv1 TruthfulQA,55.48,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_iter_3,HFv1 Winogrande,77.82,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_noreplacerej_iter_2,HF OpenLLM v1,61.77,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_noreplacerej_iter_2,HFv1 ARC,63.23,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_noreplacerej_iter_2,HFv1 GSM8K,30.33,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_noreplacerej_iter_2,HFv1 HellaSwag,85.42,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_noreplacerej_iter_2,HFv1 MMLU,60.66,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_noreplacerej_iter_2,HFv1 TruthfulQA,52.85,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_noreplacerej_iter_2,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_noreplacerej_iter_3,HF OpenLLM v1,62.45,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_noreplacerej_iter_3,HFv1 ARC,65.02,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_noreplacerej_iter_3,HFv1 GSM8K,31.69,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_noreplacerej_iter_3,HFv1 HellaSwag,85.55,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_noreplacerej_iter_3,HFv1 MMLU,60.48,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_noreplacerej_iter_3,HFv1 TruthfulQA,54.31,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_noreplacerej_iter_3,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_same_noreplacerej_declr_iter_2,HF OpenLLM v1,60.81,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_same_noreplacerej_declr_iter_2,HFv1 ARC,62.88,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_same_noreplacerej_declr_iter_2,HFv1 GSM8K,23.96,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_same_noreplacerej_declr_iter_2,HFv1 HellaSwag,85.21,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_same_noreplacerej_declr_iter_2,HFv1 MMLU,60.24,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_same_noreplacerej_declr_iter_2,HFv1 TruthfulQA,54.44,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_same_noreplacerej_declr_iter_2,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_same_noreplacerej_declr_iter_3,HF OpenLLM v1,60.38,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_same_noreplacerej_declr_iter_3,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_same_noreplacerej_declr_iter_3,HFv1 GSM8K,21.83,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_same_noreplacerej_declr_iter_3,HFv1 HellaSwag,85.26,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_same_noreplacerej_declr_iter_3,HFv1 MMLU,60.18,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_same_noreplacerej_declr_iter_3,HFv1 TruthfulQA,54.38,,hf_open_llm_v1_240829_frozen.csv +0_001_idpo_same_noreplacerej_declr_iter_3,HFv1 Winogrande,77.98,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_1,HF OpenLLM v1,68.77,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_1,HFv1 ARC,64.42,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_1,HFv1 GSM8K,67.93,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_1,HFv1 HellaSwag,81.25,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_1,HFv1 MMLU,68.38,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_1,HFv1 TruthfulQA,55.24,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_1,HFv1 Winogrande,75.37,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_2,HF OpenLLM v1,68.61,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_2,HFv1 ARC,64.42,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_2,HFv1 GSM8K,66.64,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_2,HFv1 HellaSwag,81.36,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_2,HFv1 MMLU,68.18,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_2,HFv1 TruthfulQA,56.47,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_2,HFv1 Winogrande,74.59,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_3,HF OpenLLM v1,68.67,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_3,HFv1 ARC,64.93,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_3,HFv1 GSM8K,66.49,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_3,HFv1 HellaSwag,81.4,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_3,HFv1 MMLU,68.16,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_3,HFv1 TruthfulQA,56.61,,hf_open_llm_v1_240829_frozen.csv +0_001_llama3_nodpo_3iters_bs128_531lr_iter_3,HFv1 Winogrande,74.43,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_1,HF OpenLLM v1,60.85,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_1,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_1,HFv1 GSM8K,33.97,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_1,HFv1 HellaSwag,84.18,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_1,HFv1 MMLU,61.41,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_1,HFv1 TruthfulQA,45.45,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_1,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_3,HF OpenLLM v1,59.24,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_3,HFv1 ARC,61.6,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_3,HFv1 GSM8K,18.42,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_3,HFv1 HellaSwag,85.14,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_3,HFv1 MMLU,60.11,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_3,HFv1 TruthfulQA,52.75,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_3,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_4,HF OpenLLM v1,58.86,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_4,HFv1 ARC,61.18,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_4,HFv1 GSM8K,17.06,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_4,HFv1 HellaSwag,85.02,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_4,HFv1 MMLU,60.01,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_4,HFv1 TruthfulQA,52.25,,hf_open_llm_v1_240829_frozen.csv +0_001_zephyr_5551_4iters_bs256_iter_4,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_1,HF OpenLLM v1,60.81,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_1,HFv1 ARC,61.6,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_1,HFv1 GSM8K,34.19,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_1,HFv1 HellaSwag,84.08,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_1,HFv1 MMLU,61.54,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_1,HFv1 TruthfulQA,45.45,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_1,HFv1 Winogrande,77.98,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_2,HF OpenLLM v1,59.55,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_2,HFv1 ARC,61.69,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_2,HFv1 GSM8K,20.32,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_2,HFv1 HellaSwag,84.82,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_2,HFv1 MMLU,60.54,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_2,HFv1 TruthfulQA,52.18,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_2,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_3,HF OpenLLM v1,58.95,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_3,HFv1 ARC,61.52,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_3,HFv1 GSM8K,16.22,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_3,HFv1 HellaSwag,85.07,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_3,HFv1 MMLU,60.47,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_3,HFv1 TruthfulQA,53.18,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_3,HFv1 Winogrande,77.27,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_4,HF OpenLLM v1,58.28,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_4,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_4,HFv1 GSM8K,12.13,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_4,HFv1 HellaSwag,85.06,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_4,HFv1 MMLU,60.16,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_4,HFv1 TruthfulQA,54.17,,hf_open_llm_v1_240829_frozen.csv +0_0_ablation_sample1_4iters_bs256_iter_4,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_1,HF OpenLLM v1,68.8,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 ARC,64.33,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 GSM8K,68.16,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 HellaSwag,81.26,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 MMLU,68.34,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 TruthfulQA,55.25,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_1,HFv1 Winogrande,75.45,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_2,HF OpenLLM v1,68.63,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 ARC,64.42,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 GSM8K,66.64,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 HellaSwag,81.35,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 MMLU,68.21,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 TruthfulQA,56.4,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_2,HFv1 Winogrande,74.74,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_3,HF OpenLLM v1,68.66,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_3,HFv1 ARC,64.68,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_3,HFv1 GSM8K,66.49,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_3,HFv1 HellaSwag,81.38,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_3,HFv1 MMLU,68.12,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_3,HFv1 TruthfulQA,56.54,,hf_open_llm_v1_240829_frozen.csv +0_0_llama_nodpo_3iters_bs128_531lr_iter_3,HFv1 Winogrande,74.74,,hf_open_llm_v1_240829_frozen.csv +0_0_withdpo_4iters_bs256_531lr_iter_3,HF OpenLLM v1,58.72,,hf_open_llm_v1_240829_frozen.csv +0_0_withdpo_4iters_bs256_531lr_iter_3,HFv1 ARC,61.69,,hf_open_llm_v1_240829_frozen.csv +0_0_withdpo_4iters_bs256_531lr_iter_3,HFv1 GSM8K,15.77,,hf_open_llm_v1_240829_frozen.csv +0_0_withdpo_4iters_bs256_531lr_iter_3,HFv1 HellaSwag,84.92,,hf_open_llm_v1_240829_frozen.csv +0_0_withdpo_4iters_bs256_531lr_iter_3,HFv1 MMLU,60.28,,hf_open_llm_v1_240829_frozen.csv +0_0_withdpo_4iters_bs256_531lr_iter_3,HFv1 TruthfulQA,52.64,,hf_open_llm_v1_240829_frozen.csv +0_0_withdpo_4iters_bs256_531lr_iter_3,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv +0_0_withdpo_4iters_bs256_5551lr_iter_4,HF OpenLLM v1,58.69,,hf_open_llm_v1_240829_frozen.csv +0_0_withdpo_4iters_bs256_5551lr_iter_4,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv +0_0_withdpo_4iters_bs256_5551lr_iter_4,HFv1 GSM8K,14.4,,hf_open_llm_v1_240829_frozen.csv +0_0_withdpo_4iters_bs256_5551lr_iter_4,HFv1 HellaSwag,85.22,,hf_open_llm_v1_240829_frozen.csv +0_0_withdpo_4iters_bs256_5551lr_iter_4,HFv1 MMLU,60.4,,hf_open_llm_v1_240829_frozen.csv +0_0_withdpo_4iters_bs256_5551lr_iter_4,HFv1 TruthfulQA,53.88,,hf_open_llm_v1_240829_frozen.csv +0_0_withdpo_4iters_bs256_5551lr_iter_4,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_2,HF OpenLLM v1,59.98,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_2,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_2,HFv1 GSM8K,22.21,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_2,HFv1 HellaSwag,84.95,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_2,HFv1 MMLU,60.43,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_2,HFv1 TruthfulQA,52.35,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_2,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_3,HF OpenLLM v1,59.17,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_3,HFv1 ARC,61.69,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_3,HFv1 GSM8K,16.91,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_3,HFv1 HellaSwag,85.16,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_3,HFv1 MMLU,60.23,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_3,HFv1 TruthfulQA,53.42,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_3,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_4,HF OpenLLM v1,58.69,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_4,HFv1 ARC,61.01,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_4,HFv1 GSM8K,15.54,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_4,HFv1 HellaSwag,85.08,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_4,HFv1 MMLU,60.12,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_4,HFv1 TruthfulQA,52.67,,hf_open_llm_v1_240829_frozen.csv +0_0_zephyr_withdpo_4iters_bs128_5551lr_iter_4,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv +10_7bx2_dpo_200,HF OpenLLM v1,73.83,,hf_open_llm_v1_240829_frozen.csv +10_7bx2_dpo_200,HFv1 ARC,70.22,,hf_open_llm_v1_240829_frozen.csv +10_7bx2_dpo_200,HFv1 GSM8K,60.96,,hf_open_llm_v1_240829_frozen.csv +10_7bx2_dpo_200,HFv1 HellaSwag,88.23,,hf_open_llm_v1_240829_frozen.csv +10_7bx2_dpo_200,HFv1 MMLU,66.25,,hf_open_llm_v1_240829_frozen.csv +10_7bx2_dpo_200,HFv1 TruthfulQA,75.38,,hf_open_llm_v1_240829_frozen.csv +10_7bx2_dpo_200,HFv1 Winogrande,81.93,,hf_open_llm_v1_240829_frozen.csv +13b_thorns_l2,HF OpenLLM v1,54.72,,hf_open_llm_v1_240829_frozen.csv +13b_thorns_l2,HFv1 ARC,62.88,,hf_open_llm_v1_240829_frozen.csv +13b_thorns_l2,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv +13b_thorns_l2,HFv1 HellaSwag,83.57,,hf_open_llm_v1_240829_frozen.csv +13b_thorns_l2,HFv1 MMLU,56.95,,hf_open_llm_v1_240829_frozen.csv +13b_thorns_l2,HFv1 TruthfulQA,49.52,,hf_open_llm_v1_240829_frozen.csv +13b_thorns_l2,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv +22_neuro_model,HF OpenLLM v1,50.23,,hf_open_llm_v1_240829_frozen.csv +22_neuro_model,HFv1 ARC,49.15,,hf_open_llm_v1_240829_frozen.csv +22_neuro_model,HFv1 GSM8K,1.14,,hf_open_llm_v1_240829_frozen.csv +22_neuro_model,HFv1 HellaSwag,62.31,,hf_open_llm_v1_240829_frozen.csv +22_neuro_model,HFv1 MMLU,62.01,,hf_open_llm_v1_240829_frozen.csv +22_neuro_model,HFv1 TruthfulQA,60.23,,hf_open_llm_v1_240829_frozen.csv +22_neuro_model,HFv1 Winogrande,66.54,,hf_open_llm_v1_240829_frozen.csv +2x_lora_assemble_nova_13b,HF OpenLLM v1,57.26,,hf_open_llm_v1_240829_frozen.csv +2x_lora_assemble_nova_13b,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv +2x_lora_assemble_nova_13b,HFv1 GSM8K,10.24,,hf_open_llm_v1_240829_frozen.csv +2x_lora_assemble_nova_13b,HFv1 HellaSwag,83.24,,hf_open_llm_v1_240829_frozen.csv +2x_lora_assemble_nova_13b,HFv1 MMLU,58.64,,hf_open_llm_v1_240829_frozen.csv +2x_lora_assemble_nova_13b,HFv1 TruthfulQA,51.88,,hf_open_llm_v1_240829_frozen.csv +2x_lora_assemble_nova_13b,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv +2x_lora_assemble_platypus2_13b,HF OpenLLM v1,55.33,,hf_open_llm_v1_240829_frozen.csv +2x_lora_assemble_platypus2_13b,HFv1 ARC,60.58,,hf_open_llm_v1_240829_frozen.csv +2x_lora_assemble_platypus2_13b,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv +2x_lora_assemble_platypus2_13b,HFv1 HellaSwag,82.56,,hf_open_llm_v1_240829_frozen.csv +2x_lora_assemble_platypus2_13b,HFv1 MMLU,58.25,,hf_open_llm_v1_240829_frozen.csv +2x_lora_assemble_platypus2_13b,HFv1 TruthfulQA,54.77,,hf_open_llm_v1_240829_frozen.csv +2x_lora_assemble_platypus2_13b,HFv1 Winogrande,74.9,,hf_open_llm_v1_240829_frozen.csv +3b_redpajama_conditional_alpha,HF OpenLLM v1,36.88,,hf_open_llm_v1_240829_frozen.csv +3b_redpajama_conditional_alpha,HFv1 ARC,36.26,,hf_open_llm_v1_240829_frozen.csv +3b_redpajama_conditional_alpha,HFv1 GSM8K,0.61,,hf_open_llm_v1_240829_frozen.csv +3b_redpajama_conditional_alpha,HFv1 HellaSwag,61.9,,hf_open_llm_v1_240829_frozen.csv +3b_redpajama_conditional_alpha,HFv1 MMLU,25.42,,hf_open_llm_v1_240829_frozen.csv +3b_redpajama_conditional_alpha,HFv1 TruthfulQA,36.31,,hf_open_llm_v1_240829_frozen.csv +3b_redpajama_conditional_alpha,HFv1 Winogrande,60.77,,hf_open_llm_v1_240829_frozen.csv +42dot_llm_sft_1_3b,HF OpenLLM v1,36.61,,hf_open_llm_v1_240829_frozen.csv +42dot_llm_sft_1_3b,HFv1 ARC,36.09,,hf_open_llm_v1_240829_frozen.csv +42dot_llm_sft_1_3b,HFv1 GSM8K,0.68,,hf_open_llm_v1_240829_frozen.csv +42dot_llm_sft_1_3b,HFv1 HellaSwag,58.96,,hf_open_llm_v1_240829_frozen.csv +42dot_llm_sft_1_3b,HFv1 MMLU,25.51,,hf_open_llm_v1_240829_frozen.csv +42dot_llm_sft_1_3b,HFv1 TruthfulQA,39.98,,hf_open_llm_v1_240829_frozen.csv +42dot_llm_sft_1_3b,HFv1 Winogrande,58.41,,hf_open_llm_v1_240829_frozen.csv +774m_03_09_2024,HF OpenLLM v1,33.22,,hf_open_llm_v1_240829_frozen.csv +774m_03_09_2024,HFv1 ARC,30.29,,hf_open_llm_v1_240829_frozen.csv +774m_03_09_2024,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +774m_03_09_2024,HFv1 HellaSwag,53.88,,hf_open_llm_v1_240829_frozen.csv +774m_03_09_2024,HFv1 MMLU,25.33,,hf_open_llm_v1_240829_frozen.csv +774m_03_09_2024,HFv1 TruthfulQA,34.44,,hf_open_llm_v1_240829_frozen.csv +774m_03_09_2024,HFv1 Winogrande,55.09,,hf_open_llm_v1_240829_frozen.csv +7b_redpajama_conditional_alpha,HF OpenLLM v1,40.56,,hf_open_llm_v1_240829_frozen.csv +7b_redpajama_conditional_alpha,HFv1 ARC,42.58,,hf_open_llm_v1_240829_frozen.csv +7b_redpajama_conditional_alpha,HFv1 GSM8K,0.76,,hf_open_llm_v1_240829_frozen.csv +7b_redpajama_conditional_alpha,HFv1 HellaSwag,69.91,,hf_open_llm_v1_240829_frozen.csv +7b_redpajama_conditional_alpha,HFv1 MMLU,26.53,,hf_open_llm_v1_240829_frozen.csv +7b_redpajama_conditional_alpha,HFv1 TruthfulQA,36.42,,hf_open_llm_v1_240829_frozen.csv +7b_redpajama_conditional_alpha,HFv1 Winogrande,67.17,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo,HF OpenLLM v1,73.2,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo,HFv1 ARC,69.37,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo,HFv1 GSM8K,71.95,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo,HFv1 HellaSwag,86.89,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo,HFv1 MMLU,64.73,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo,HFv1 TruthfulQA,65.66,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo,HFv1 Winogrande,80.58,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo_2e,HF OpenLLM v1,72.99,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo_2e,HFv1 ARC,68.94,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo_2e,HFv1 GSM8K,71.34,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo_2e,HFv1 HellaSwag,86.8,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo_2e,HFv1 MMLU,64.5,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo_2e,HFv1 TruthfulQA,65.6,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo_2e,HFv1 Winogrande,80.74,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo_700,HF OpenLLM v1,70.85,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo_700,HFv1 ARC,64.68,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo_700,HFv1 GSM8K,63.38,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo_700,HFv1 HellaSwag,86.12,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo_700,HFv1 MMLU,62.23,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo_700,HFv1 TruthfulQA,68.99,,hf_open_llm_v1_240829_frozen.csv +7bx4_dpo_700,HFv1 Winogrande,79.72,,hf_open_llm_v1_240829_frozen.csv +aanaphi2_v0_1,HF OpenLLM v1,63.28,,hf_open_llm_v1_240829_frozen.csv +aanaphi2_v0_1,HFv1 ARC,63.91,,hf_open_llm_v1_240829_frozen.csv +aanaphi2_v0_1,HFv1 GSM8K,54.89,,hf_open_llm_v1_240829_frozen.csv +aanaphi2_v0_1,HFv1 HellaSwag,77.97,,hf_open_llm_v1_240829_frozen.csv +aanaphi2_v0_1,HFv1 MMLU,57.73,,hf_open_llm_v1_240829_frozen.csv +aanaphi2_v0_1,HFv1 TruthfulQA,51.56,,hf_open_llm_v1_240829_frozen.csv +aanaphi2_v0_1,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv +ablation_model_fineweb_v1,HF OpenLLM v1,36.76,,hf_open_llm_v1_240829_frozen.csv +ablation_model_fineweb_v1,HFv1 ARC,35.41,,hf_open_llm_v1_240829_frozen.csv +ablation_model_fineweb_v1,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv +ablation_model_fineweb_v1,HFv1 HellaSwag,66.31,,hf_open_llm_v1_240829_frozen.csv +ablation_model_fineweb_v1,HFv1 MMLU,25.66,,hf_open_llm_v1_240829_frozen.csv +ablation_model_fineweb_v1,HFv1 TruthfulQA,30.18,,hf_open_llm_v1_240829_frozen.csv +ablation_model_fineweb_v1,HFv1 Winogrande,61.48,,hf_open_llm_v1_240829_frozen.csv +adelie_sft,HF OpenLLM v1,52.75,,hf_open_llm_v1_240829_frozen.csv +adelie_sft,HFv1 ARC,54.1,,hf_open_llm_v1_240829_frozen.csv +adelie_sft,HFv1 GSM8K,19.64,,hf_open_llm_v1_240829_frozen.csv +adelie_sft,HFv1 HellaSwag,78.22,,hf_open_llm_v1_240829_frozen.csv +adelie_sft,HFv1 MMLU,47.67,,hf_open_llm_v1_240829_frozen.csv +adelie_sft,HFv1 TruthfulQA,42.75,,hf_open_llm_v1_240829_frozen.csv +adelie_sft,HFv1 Winogrande,74.11,,hf_open_llm_v1_240829_frozen.csv +aeonium_v1_baseweb_1b,HF OpenLLM v1,29.15,,hf_open_llm_v1_240829_frozen.csv +aeonium_v1_baseweb_1b,HFv1 ARC,20.99,,hf_open_llm_v1_240829_frozen.csv +aeonium_v1_baseweb_1b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +aeonium_v1_baseweb_1b,HFv1 HellaSwag,32.24,,hf_open_llm_v1_240829_frozen.csv +aeonium_v1_baseweb_1b,HFv1 MMLU,26.15,,hf_open_llm_v1_240829_frozen.csv +aeonium_v1_baseweb_1b,HFv1 TruthfulQA,46.17,,hf_open_llm_v1_240829_frozen.csv +aeonium_v1_baseweb_1b,HFv1 Winogrande,49.33,,hf_open_llm_v1_240829_frozen.csv +aether_7b_chat_v1_0,HF OpenLLM v1,59.05,,hf_open_llm_v1_240829_frozen.csv +aether_7b_chat_v1_0,HFv1 ARC,57.76,,hf_open_llm_v1_240829_frozen.csv +aether_7b_chat_v1_0,HFv1 GSM8K,32.98,,hf_open_llm_v1_240829_frozen.csv +aether_7b_chat_v1_0,HFv1 HellaSwag,81.76,,hf_open_llm_v1_240829_frozen.csv +aether_7b_chat_v1_0,HFv1 MMLU,60.53,,hf_open_llm_v1_240829_frozen.csv +aether_7b_chat_v1_0,HFv1 TruthfulQA,44.21,,hf_open_llm_v1_240829_frozen.csv +aether_7b_chat_v1_0,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv +agiin_13_6b_v0_1,HF OpenLLM v1,68.4,,hf_open_llm_v1_240829_frozen.csv +agiin_13_6b_v0_1,HFv1 ARC,69.45,,hf_open_llm_v1_240829_frozen.csv +agiin_13_6b_v0_1,HFv1 GSM8K,46.47,,hf_open_llm_v1_240829_frozen.csv +agiin_13_6b_v0_1,HFv1 HellaSwag,86.64,,hf_open_llm_v1_240829_frozen.csv +agiin_13_6b_v0_1,HFv1 MMLU,61.15,,hf_open_llm_v1_240829_frozen.csv +agiin_13_6b_v0_1,HFv1 TruthfulQA,67.97,,hf_open_llm_v1_240829_frozen.csv +agiin_13_6b_v0_1,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv +aira_2_1b1,HF OpenLLM v1,29.32,,hf_open_llm_v1_240829_frozen.csv +aira_2_1b1,HFv1 ARC,23.21,,hf_open_llm_v1_240829_frozen.csv +aira_2_1b1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +aira_2_1b1,HFv1 HellaSwag,26.97,,hf_open_llm_v1_240829_frozen.csv +aira_2_1b1,HFv1 MMLU,24.86,,hf_open_llm_v1_240829_frozen.csv +aira_2_1b1,HFv1 TruthfulQA,50.63,,hf_open_llm_v1_240829_frozen.csv +aira_2_1b1,HFv1 Winogrande,50.28,,hf_open_llm_v1_240829_frozen.csv +aira_2_355m,HF OpenLLM v1,31.0,,hf_open_llm_v1_240829_frozen.csv +aira_2_355m,HFv1 ARC,27.56,,hf_open_llm_v1_240829_frozen.csv +aira_2_355m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +aira_2_355m,HFv1 HellaSwag,38.92,,hf_open_llm_v1_240829_frozen.csv +aira_2_355m,HFv1 MMLU,27.26,,hf_open_llm_v1_240829_frozen.csv +aira_2_355m,HFv1 TruthfulQA,38.53,,hf_open_llm_v1_240829_frozen.csv +aira_2_355m,HFv1 Winogrande,53.75,,hf_open_llm_v1_240829_frozen.csv +aira_2_774m,HF OpenLLM v1,31.33,,hf_open_llm_v1_240829_frozen.csv +aira_2_774m,HFv1 ARC,28.75,,hf_open_llm_v1_240829_frozen.csv +aira_2_774m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +aira_2_774m,HFv1 HellaSwag,40.8,,hf_open_llm_v1_240829_frozen.csv +aira_2_774m,HFv1 MMLU,25.1,,hf_open_llm_v1_240829_frozen.csv +aira_2_774m,HFv1 TruthfulQA,41.33,,hf_open_llm_v1_240829_frozen.csv +aira_2_774m,HFv1 Winogrande,52.01,,hf_open_llm_v1_240829_frozen.csv +airboros2_1_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,53.15,,hf_open_llm_v1_240829_frozen.csv +airboros2_1_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv +airboros2_1_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +airboros2_1_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,82.46,,hf_open_llm_v1_240829_frozen.csv +airboros2_1_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,54.62,,hf_open_llm_v1_240829_frozen.csv +airboros2_1_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,47.71,,hf_open_llm_v1_240829_frozen.csv +airboros2_1_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,75.14,,hf_open_llm_v1_240829_frozen.csv +airic_the_mistral,HF OpenLLM v1,59.95,,hf_open_llm_v1_240829_frozen.csv +airic_the_mistral,HFv1 ARC,59.98,,hf_open_llm_v1_240829_frozen.csv +airic_the_mistral,HFv1 GSM8K,30.86,,hf_open_llm_v1_240829_frozen.csv +airic_the_mistral,HFv1 HellaSwag,82.98,,hf_open_llm_v1_240829_frozen.csv +airic_the_mistral,HFv1 MMLU,60.67,,hf_open_llm_v1_240829_frozen.csv +airic_the_mistral,HFv1 TruthfulQA,48.24,,hf_open_llm_v1_240829_frozen.csv +airic_the_mistral,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv +airoboros_2_1_llama_2_13b_qlora,HF OpenLLM v1,53.23,,hf_open_llm_v1_240829_frozen.csv +airoboros_2_1_llama_2_13b_qlora,HFv1 ARC,59.73,,hf_open_llm_v1_240829_frozen.csv +airoboros_2_1_llama_2_13b_qlora,HFv1 GSM8K,2.81,,hf_open_llm_v1_240829_frozen.csv +airoboros_2_1_llama_2_13b_qlora,HFv1 HellaSwag,82.91,,hf_open_llm_v1_240829_frozen.csv +airoboros_2_1_llama_2_13b_qlora,HFv1 MMLU,54.77,,hf_open_llm_v1_240829_frozen.csv +airoboros_2_1_llama_2_13b_qlora,HFv1 TruthfulQA,45.14,,hf_open_llm_v1_240829_frozen.csv +airoboros_2_1_llama_2_13b_qlora,HFv1 Winogrande,74.03,,hf_open_llm_v1_240829_frozen.csv +airoboros_33b_2_1,HF OpenLLM v1,57.16,,hf_open_llm_v1_240829_frozen.csv +airoboros_33b_2_1,HFv1 ARC,63.65,,hf_open_llm_v1_240829_frozen.csv +airoboros_33b_2_1,HFv1 GSM8K,6.6,,hf_open_llm_v1_240829_frozen.csv +airoboros_33b_2_1,HFv1 HellaSwag,84.97,,hf_open_llm_v1_240829_frozen.csv +airoboros_33b_2_1,HFv1 MMLU,57.37,,hf_open_llm_v1_240829_frozen.csv +airoboros_33b_2_1,HFv1 TruthfulQA,52.17,,hf_open_llm_v1_240829_frozen.csv +airoboros_33b_2_1,HFv1 Winogrande,78.22,,hf_open_llm_v1_240829_frozen.csv +airoboros_33b_gpt4_1_3,HF OpenLLM v1,57.43,,hf_open_llm_v1_240829_frozen.csv +airoboros_33b_gpt4_1_3,HFv1 ARC,63.91,,hf_open_llm_v1_240829_frozen.csv +airoboros_33b_gpt4_1_3,HFv1 GSM8K,13.04,,hf_open_llm_v1_240829_frozen.csv +airoboros_33b_gpt4_1_3,HFv1 HellaSwag,85.04,,hf_open_llm_v1_240829_frozen.csv +airoboros_33b_gpt4_1_3,HFv1 MMLU,58.53,,hf_open_llm_v1_240829_frozen.csv +airoboros_33b_gpt4_1_3,HFv1 TruthfulQA,45.36,,hf_open_llm_v1_240829_frozen.csv +airoboros_33b_gpt4_1_3,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv +airoboros_c34b_2_1,HF OpenLLM v1,51.52,,hf_open_llm_v1_240829_frozen.csv +airoboros_c34b_2_1,HFv1 ARC,54.69,,hf_open_llm_v1_240829_frozen.csv +airoboros_c34b_2_1,HFv1 GSM8K,8.34,,hf_open_llm_v1_240829_frozen.csv +airoboros_c34b_2_1,HFv1 HellaSwag,76.45,,hf_open_llm_v1_240829_frozen.csv +airoboros_c34b_2_1,HFv1 MMLU,55.08,,hf_open_llm_v1_240829_frozen.csv +airoboros_c34b_2_1,HFv1 TruthfulQA,46.15,,hf_open_llm_v1_240829_frozen.csv +airoboros_c34b_2_1,HFv1 Winogrande,68.43,,hf_open_llm_v1_240829_frozen.csv +airoboros_c34b_2_2_1,HF OpenLLM v1,55.15,,hf_open_llm_v1_240829_frozen.csv +airoboros_c34b_2_2_1,HFv1 ARC,54.69,,hf_open_llm_v1_240829_frozen.csv +airoboros_c34b_2_2_1,HFv1 GSM8K,20.02,,hf_open_llm_v1_240829_frozen.csv +airoboros_c34b_2_2_1,HFv1 HellaSwag,76.84,,hf_open_llm_v1_240829_frozen.csv +airoboros_c34b_2_2_1,HFv1 MMLU,55.43,,hf_open_llm_v1_240829_frozen.csv +airoboros_c34b_2_2_1,HFv1 TruthfulQA,51.36,,hf_open_llm_v1_240829_frozen.csv +airoboros_c34b_2_2_1,HFv1 Winogrande,72.53,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_13b_2_2_1,HF OpenLLM v1,56.36,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_13b_2_2_1,HFv1 ARC,60.92,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_13b_2_2_1,HFv1 GSM8K,11.6,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_13b_2_2_1,HFv1 HellaSwag,83.77,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_13b_2_2_1,HFv1 MMLU,56.47,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_13b_2_2_1,HFv1 TruthfulQA,49.42,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_13b_2_2_1,HFv1 Winogrande,76.01,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_70b_2_2_1,HF OpenLLM v1,69.13,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_70b_2_2_1,HFv1 ARC,69.71,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_70b_2_2_1,HFv1 GSM8K,44.88,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_70b_2_2_1,HFv1 HellaSwag,87.95,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_70b_2_2_1,HFv1 MMLU,69.79,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_70b_2_2_1,HFv1 TruthfulQA,59.49,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_70b_2_2_1,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_7b_2_2_1,HF OpenLLM v1,51.22,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_7b_2_2_1,HFv1 ARC,55.03,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_7b_2_2_1,HFv1 GSM8K,6.14,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_7b_2_2_1,HFv1 HellaSwag,80.06,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_7b_2_2_1,HFv1 MMLU,47.64,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_7b_2_2_1,HFv1 TruthfulQA,44.65,,hf_open_llm_v1_240829_frozen.csv +airoboros_l2_7b_2_2_1,HFv1 Winogrande,73.8,,hf_open_llm_v1_240829_frozen.csv +airocoder_34b_2_1,HF OpenLLM v1,49.61,,hf_open_llm_v1_240829_frozen.csv +airocoder_34b_2_1,HFv1 ARC,54.18,,hf_open_llm_v1_240829_frozen.csv +airocoder_34b_2_1,HFv1 GSM8K,8.34,,hf_open_llm_v1_240829_frozen.csv +airocoder_34b_2_1,HFv1 HellaSwag,73.84,,hf_open_llm_v1_240829_frozen.csv +airocoder_34b_2_1,HFv1 MMLU,50.67,,hf_open_llm_v1_240829_frozen.csv +airocoder_34b_2_1,HFv1 TruthfulQA,40.7,,hf_open_llm_v1_240829_frozen.csv +airocoder_34b_2_1,HFv1 Winogrande,69.93,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_llama2_koen_13b_v0_9_24,HF OpenLLM v1,56.98,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_llama2_koen_13b_v0_9_24,HFv1 ARC,55.63,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_llama2_koen_13b_v0_9_24,HFv1 GSM8K,23.2,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_llama2_koen_13b_v0_9_24,HFv1 HellaSwag,81.35,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_llama2_koen_13b_v0_9_24,HFv1 MMLU,51.76,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_llama2_koen_13b_v0_9_24,HFv1 TruthfulQA,53.0,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_llama2_koen_13b_v0_9_24,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_solar_10_7b_v0_5_31,HF OpenLLM v1,61.05,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_solar_10_7b_v0_5_31,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_solar_10_7b_v0_5_31,HFv1 GSM8K,34.27,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_solar_10_7b_v0_5_31,HFv1 HellaSwag,84.2,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_solar_10_7b_v0_5_31,HFv1 MMLU,52.86,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_solar_10_7b_v0_5_31,HFv1 TruthfulQA,51.35,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_solar_10_7b_v0_5_31,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_solar_10_7b_v0_5_32,HF OpenLLM v1,59.79,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_solar_10_7b_v0_5_32,HFv1 ARC,61.86,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_solar_10_7b_v0_5_32,HFv1 GSM8K,15.09,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_solar_10_7b_v0_5_32,HFv1 HellaSwag,84.66,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_solar_10_7b_v0_5_32,HFv1 MMLU,63.13,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_solar_10_7b_v0_5_32,HFv1 TruthfulQA,51.19,,hf_open_llm_v1_240829_frozen.csv +aisquare_instruct_solar_10_7b_v0_5_32,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv +algae_550m_base,HF OpenLLM v1,28.97,,hf_open_llm_v1_240829_frozen.csv +algae_550m_base,HFv1 ARC,22.53,,hf_open_llm_v1_240829_frozen.csv +algae_550m_base,HFv1 GSM8K,0.76,,hf_open_llm_v1_240829_frozen.csv +algae_550m_base,HFv1 HellaSwag,28.32,,hf_open_llm_v1_240829_frozen.csv +algae_550m_base,HFv1 MMLU,25.83,,hf_open_llm_v1_240829_frozen.csv +algae_550m_base,HFv1 TruthfulQA,45.54,,hf_open_llm_v1_240829_frozen.csv +algae_550m_base,HFv1 Winogrande,50.83,,hf_open_llm_v1_240829_frozen.csv +alma_13b_r,HF OpenLLM v1,49.32,,hf_open_llm_v1_240829_frozen.csv +alma_13b_r,HFv1 ARC,55.55,,hf_open_llm_v1_240829_frozen.csv +alma_13b_r,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +alma_13b_r,HFv1 HellaSwag,79.45,,hf_open_llm_v1_240829_frozen.csv +alma_13b_r,HFv1 MMLU,49.52,,hf_open_llm_v1_240829_frozen.csv +alma_13b_r,HFv1 TruthfulQA,36.09,,hf_open_llm_v1_240829_frozen.csv +alma_13b_r,HFv1 Winogrande,75.3,,hf_open_llm_v1_240829_frozen.csv +alooowso,HF OpenLLM v1,65.63,,hf_open_llm_v1_240829_frozen.csv +alooowso,HFv1 ARC,62.97,,hf_open_llm_v1_240829_frozen.csv +alooowso,HFv1 GSM8K,39.58,,hf_open_llm_v1_240829_frozen.csv +alooowso,HFv1 HellaSwag,84.87,,hf_open_llm_v1_240829_frozen.csv +alooowso,HFv1 MMLU,60.78,,hf_open_llm_v1_240829_frozen.csv +alooowso,HFv1 TruthfulQA,68.18,,hf_open_llm_v1_240829_frozen.csv +alooowso,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv +alpagasus_2_13b_qlora_merged,HF OpenLLM v1,54.2,,hf_open_llm_v1_240829_frozen.csv +alpagasus_2_13b_qlora_merged,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv +alpagasus_2_13b_qlora_merged,HFv1 GSM8K,10.84,,hf_open_llm_v1_240829_frozen.csv +alpagasus_2_13b_qlora_merged,HFv1 HellaSwag,82.43,,hf_open_llm_v1_240829_frozen.csv +alpagasus_2_13b_qlora_merged,HFv1 MMLU,55.55,,hf_open_llm_v1_240829_frozen.csv +alpagasus_2_13b_qlora_merged,HFv1 TruthfulQA,38.65,,hf_open_llm_v1_240829_frozen.csv +alpagasus_2_13b_qlora_merged,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_7b,HF OpenLLM v1,75.99,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_7b,HFv1 ARC,73.04,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_7b,HFv1 GSM8K,66.72,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_7b,HFv1 HellaSwag,89.18,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_7b,HFv1 MMLU,64.4,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_7b,HFv1 TruthfulQA,77.91,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_7b,HFv1 Winogrande,84.69,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_daser,HF OpenLLM v1,75.94,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_daser,HFv1 ARC,73.04,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_daser,HFv1 GSM8K,66.26,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_daser,HFv1 HellaSwag,89.23,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_daser,HFv1 MMLU,64.43,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_daser,HFv1 TruthfulQA,78.01,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_daser,HFv1 Winogrande,84.69,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_dora,HF OpenLLM v1,75.86,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_dora,HFv1 ARC,73.21,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_dora,HFv1 GSM8K,65.73,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_dora,HFv1 HellaSwag,89.26,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_dora,HFv1 MMLU,64.47,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_dora,HFv1 TruthfulQA,78.02,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_dora,HFv1 Winogrande,84.45,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_laser,HF OpenLLM v1,76.0,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_laser,HFv1 ARC,73.12,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_laser,HFv1 GSM8K,66.72,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_laser,HFv1 HellaSwag,89.21,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_laser,HFv1 MMLU,64.43,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_laser,HFv1 TruthfulQA,77.9,,hf_open_llm_v1_240829_frozen.csv +alphamonarch_laser,HFv1 Winogrande,84.61,,hf_open_llm_v1_240829_frozen.csv +amber,HF OpenLLM v1,40.97,,hf_open_llm_v1_240829_frozen.csv +amber,HFv1 ARC,40.96,,hf_open_llm_v1_240829_frozen.csv +amber,HFv1 GSM8K,2.81,,hf_open_llm_v1_240829_frozen.csv +amber,HFv1 HellaSwag,73.79,,hf_open_llm_v1_240829_frozen.csv +amber,HFv1 MMLU,26.84,,hf_open_llm_v1_240829_frozen.csv +amber,HFv1 TruthfulQA,33.56,,hf_open_llm_v1_240829_frozen.csv +amber,HFv1 Winogrande,67.88,,hf_open_llm_v1_240829_frozen.csv +anfeng_v3_avocet,HF OpenLLM v1,73.96,,hf_open_llm_v1_240829_frozen.csv +anfeng_v3_avocet,HFv1 ARC,67.75,,hf_open_llm_v1_240829_frozen.csv +anfeng_v3_avocet,HFv1 GSM8K,74.0,,hf_open_llm_v1_240829_frozen.csv +anfeng_v3_avocet,HFv1 HellaSwag,87.43,,hf_open_llm_v1_240829_frozen.csv +anfeng_v3_avocet,HFv1 MMLU,74.79,,hf_open_llm_v1_240829_frozen.csv +anfeng_v3_avocet,HFv1 TruthfulQA,58.63,,hf_open_llm_v1_240829_frozen.csv +anfeng_v3_avocet,HFv1 Winogrande,81.14,,hf_open_llm_v1_240829_frozen.csv +apollo_7b_orpo_experimental,HF OpenLLM v1,71.81,,hf_open_llm_v1_240829_frozen.csv +apollo_7b_orpo_experimental,HFv1 ARC,64.85,,hf_open_llm_v1_240829_frozen.csv +apollo_7b_orpo_experimental,HFv1 GSM8K,68.99,,hf_open_llm_v1_240829_frozen.csv +apollo_7b_orpo_experimental,HFv1 HellaSwag,85.5,,hf_open_llm_v1_240829_frozen.csv +apollo_7b_orpo_experimental,HFv1 MMLU,63.93,,hf_open_llm_v1_240829_frozen.csv +apollo_7b_orpo_experimental,HFv1 TruthfulQA,63.52,,hf_open_llm_v1_240829_frozen.csv +apollo_7b_orpo_experimental,HFv1 Winogrande,84.06,,hf_open_llm_v1_240829_frozen.csv +aquila2_34b,HF OpenLLM v1,54.5,,hf_open_llm_v1_240829_frozen.csv +aquila2_34b,HFv1 ARC,52.65,,hf_open_llm_v1_240829_frozen.csv +aquila2_34b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv +aquila2_34b,HFv1 HellaSwag,81.99,,hf_open_llm_v1_240829_frozen.csv +aquila2_34b,HFv1 MMLU,76.02,,hf_open_llm_v1_240829_frozen.csv +aquila2_34b,HFv1 TruthfulQA,40.8,,hf_open_llm_v1_240829_frozen.csv +aquila2_34b,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv +arc1,HF OpenLLM v1,66.69,,hf_open_llm_v1_240829_frozen.csv +arc1,HFv1 ARC,58.79,,hf_open_llm_v1_240829_frozen.csv +arc1,HFv1 GSM8K,69.83,,hf_open_llm_v1_240829_frozen.csv +arc1,HFv1 HellaSwag,76.41,,hf_open_llm_v1_240829_frozen.csv +arc1,HFv1 MMLU,65.73,,hf_open_llm_v1_240829_frozen.csv +arc1,HFv1 TruthfulQA,52.73,,hf_open_llm_v1_240829_frozen.csv +arc1,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv +archangel_sft_kto_llama13b,HF OpenLLM v1,52.87,,hf_open_llm_v1_240829_frozen.csv +archangel_sft_kto_llama13b,HFv1 ARC,56.14,,hf_open_llm_v1_240829_frozen.csv +archangel_sft_kto_llama13b,HFv1 GSM8K,16.83,,hf_open_llm_v1_240829_frozen.csv +archangel_sft_kto_llama13b,HFv1 HellaSwag,80.8,,hf_open_llm_v1_240829_frozen.csv +archangel_sft_kto_llama13b,HFv1 MMLU,47.84,,hf_open_llm_v1_240829_frozen.csv +archangel_sft_kto_llama13b,HFv1 TruthfulQA,39.42,,hf_open_llm_v1_240829_frozen.csv +archangel_sft_kto_llama13b,HFv1 Winogrande,76.16,,hf_open_llm_v1_240829_frozen.csv +asclepius_llama2_13b,HF OpenLLM v1,50.25,,hf_open_llm_v1_240829_frozen.csv +asclepius_llama2_13b,HFv1 ARC,55.89,,hf_open_llm_v1_240829_frozen.csv +asclepius_llama2_13b,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv +asclepius_llama2_13b,HFv1 HellaSwag,79.66,,hf_open_llm_v1_240829_frozen.csv +asclepius_llama2_13b,HFv1 MMLU,52.38,,hf_open_llm_v1_240829_frozen.csv +asclepius_llama2_13b,HFv1 TruthfulQA,40.76,,hf_open_llm_v1_240829_frozen.csv +asclepius_llama2_13b,HFv1 Winogrande,72.69,,hf_open_llm_v1_240829_frozen.csv +asclepius_llama2_7b,HF OpenLLM v1,47.15,,hf_open_llm_v1_240829_frozen.csv +asclepius_llama2_7b,HFv1 ARC,50.85,,hf_open_llm_v1_240829_frozen.csv +asclepius_llama2_7b,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +asclepius_llama2_7b,HFv1 HellaSwag,76.53,,hf_open_llm_v1_240829_frozen.csv +asclepius_llama2_7b,HFv1 MMLU,43.61,,hf_open_llm_v1_240829_frozen.csv +asclepius_llama2_7b,HFv1 TruthfulQA,43.31,,hf_open_llm_v1_240829_frozen.csv +asclepius_llama2_7b,HFv1 Winogrande,68.27,,hf_open_llm_v1_240829_frozen.csv +athena_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,53.16,,hf_open_llm_v1_240829_frozen.csv +athena_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,56.66,,hf_open_llm_v1_240829_frozen.csv +athena_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv +athena_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,80.56,,hf_open_llm_v1_240829_frozen.csv +athena_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,55.43,,hf_open_llm_v1_240829_frozen.csv +athena_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,53.62,,hf_open_llm_v1_240829_frozen.csv +athena_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,72.61,,hf_open_llm_v1_240829_frozen.csv +athena_zephyr_7b,HF OpenLLM v1,59.34,,hf_open_llm_v1_240829_frozen.csv +athena_zephyr_7b,HFv1 ARC,56.14,,hf_open_llm_v1_240829_frozen.csv +athena_zephyr_7b,HFv1 GSM8K,28.13,,hf_open_llm_v1_240829_frozen.csv +athena_zephyr_7b,HFv1 HellaSwag,81.63,,hf_open_llm_v1_240829_frozen.csv +athena_zephyr_7b,HFv1 MMLU,59.8,,hf_open_llm_v1_240829_frozen.csv +athena_zephyr_7b,HFv1 TruthfulQA,54.2,,hf_open_llm_v1_240829_frozen.csv +athena_zephyr_7b,HFv1 Winogrande,76.16,,hf_open_llm_v1_240829_frozen.csv +autotrain_llama3_70b_orpo_v2,HF OpenLLM v1,78.17,,hf_open_llm_v1_240829_frozen.csv +autotrain_llama3_70b_orpo_v2,HFv1 ARC,70.9,,hf_open_llm_v1_240829_frozen.csv +autotrain_llama3_70b_orpo_v2,HFv1 GSM8K,84.23,,hf_open_llm_v1_240829_frozen.csv +autotrain_llama3_70b_orpo_v2,HFv1 HellaSwag,86.09,,hf_open_llm_v1_240829_frozen.csv +autotrain_llama3_70b_orpo_v2,HFv1 MMLU,80.07,,hf_open_llm_v1_240829_frozen.csv +autotrain_llama3_70b_orpo_v2,HFv1 TruthfulQA,62.82,,hf_open_llm_v1_240829_frozen.csv +autotrain_llama3_70b_orpo_v2,HFv1 Winogrande,84.93,,hf_open_llm_v1_240829_frozen.csv +average_dolphin_8x7b,HF OpenLLM v1,69.64,,hf_open_llm_v1_240829_frozen.csv +average_dolphin_8x7b,HFv1 ARC,68.6,,hf_open_llm_v1_240829_frozen.csv +average_dolphin_8x7b,HFv1 GSM8K,56.56,,hf_open_llm_v1_240829_frozen.csv +average_dolphin_8x7b,HFv1 HellaSwag,85.99,,hf_open_llm_v1_240829_frozen.csv +average_dolphin_8x7b,HFv1 MMLU,70.84,,hf_open_llm_v1_240829_frozen.csv +average_dolphin_8x7b,HFv1 TruthfulQA,54.51,,hf_open_llm_v1_240829_frozen.csv +average_dolphin_8x7b,HFv1 Winogrande,81.37,,hf_open_llm_v1_240829_frozen.csv +awanllm_llama3_8b_dolfin_v0_3_dpo,HF OpenLLM v1,53.96,,hf_open_llm_v1_240829_frozen.csv +awanllm_llama3_8b_dolfin_v0_3_dpo,HFv1 ARC,56.48,,hf_open_llm_v1_240829_frozen.csv +awanllm_llama3_8b_dolfin_v0_3_dpo,HFv1 GSM8K,15.47,,hf_open_llm_v1_240829_frozen.csv +awanllm_llama3_8b_dolfin_v0_3_dpo,HFv1 HellaSwag,75.43,,hf_open_llm_v1_240829_frozen.csv +awanllm_llama3_8b_dolfin_v0_3_dpo,HFv1 MMLU,49.05,,hf_open_llm_v1_240829_frozen.csv +awanllm_llama3_8b_dolfin_v0_3_dpo,HFv1 TruthfulQA,57.27,,hf_open_llm_v1_240829_frozen.csv +awanllm_llama3_8b_dolfin_v0_3_dpo,HFv1 Winogrande,70.09,,hf_open_llm_v1_240829_frozen.csv +awanllm_llama3_8b_instruct_dpo_v0_2,HF OpenLLM v1,58.12,,hf_open_llm_v1_240829_frozen.csv +awanllm_llama3_8b_instruct_dpo_v0_2,HFv1 ARC,57.42,,hf_open_llm_v1_240829_frozen.csv +awanllm_llama3_8b_instruct_dpo_v0_2,HFv1 GSM8K,25.93,,hf_open_llm_v1_240829_frozen.csv +awanllm_llama3_8b_instruct_dpo_v0_2,HFv1 HellaSwag,77.21,,hf_open_llm_v1_240829_frozen.csv +awanllm_llama3_8b_instruct_dpo_v0_2,HFv1 MMLU,52.31,,hf_open_llm_v1_240829_frozen.csv +awanllm_llama3_8b_instruct_dpo_v0_2,HFv1 TruthfulQA,58.4,,hf_open_llm_v1_240829_frozen.csv +awanllm_llama3_8b_instruct_dpo_v0_2,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv +babyllama_v0_6,HF OpenLLM v1,36.92,,hf_open_llm_v1_240829_frozen.csv +babyllama_v0_6,HFv1 ARC,36.09,,hf_open_llm_v1_240829_frozen.csv +babyllama_v0_6,HFv1 GSM8K,1.59,,hf_open_llm_v1_240829_frozen.csv +babyllama_v0_6,HFv1 HellaSwag,61.59,,hf_open_llm_v1_240829_frozen.csv +babyllama_v0_6,HFv1 MMLU,25.37,,hf_open_llm_v1_240829_frozen.csv +babyllama_v0_6,HFv1 TruthfulQA,35.84,,hf_open_llm_v1_240829_frozen.csv +babyllama_v0_6,HFv1 Winogrande,61.01,,hf_open_llm_v1_240829_frozen.csv +bagel_7b_v0_4,HF OpenLLM v1,64.82,,hf_open_llm_v1_240829_frozen.csv +bagel_7b_v0_4,HFv1 ARC,63.57,,hf_open_llm_v1_240829_frozen.csv +bagel_7b_v0_4,HFv1 GSM8K,47.31,,hf_open_llm_v1_240829_frozen.csv +bagel_7b_v0_4,HFv1 HellaSwag,82.67,,hf_open_llm_v1_240829_frozen.csv +bagel_7b_v0_4,HFv1 MMLU,62.25,,hf_open_llm_v1_240829_frozen.csv +bagel_7b_v0_4,HFv1 TruthfulQA,54.2,,hf_open_llm_v1_240829_frozen.csv +bagel_7b_v0_4,HFv1 Winogrande,78.93,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_34b_v0_2,HF OpenLLM v1,74.69,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_34b_v0_2,HFv1 ARC,71.93,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_34b_v0_2,HFv1 GSM8K,60.96,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_34b_v0_2,HFv1 HellaSwag,85.25,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_34b_v0_2,HFv1 MMLU,76.58,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_34b_v0_2,HFv1 TruthfulQA,70.05,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_34b_v0_2,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_7b_v0_4,HF OpenLLM v1,67.13,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_7b_v0_4,HFv1 ARC,67.58,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_7b_v0_4,HFv1 GSM8K,46.85,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_7b_v0_4,HFv1 HellaSwag,84.3,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_7b_v0_4,HFv1 MMLU,61.95,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_7b_v0_4,HFv1 TruthfulQA,63.94,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_7b_v0_4,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_7b_v0_5,HF OpenLLM v1,68.84,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_7b_v0_5,HFv1 ARC,66.3,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_7b_v0_5,HFv1 GSM8K,53.37,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_7b_v0_5,HFv1 HellaSwag,84.22,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_7b_v0_5,HFv1 MMLU,65.27,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_7b_v0_5,HFv1 TruthfulQA,62.41,,hf_open_llm_v1_240829_frozen.csv +bagel_dpo_7b_v0_5,HFv1 Winogrande,81.45,,hf_open_llm_v1_240829_frozen.csv +bagellake_7b_slerp,HF OpenLLM v1,70.41,,hf_open_llm_v1_240829_frozen.csv +bagellake_7b_slerp,HFv1 ARC,68.26,,hf_open_llm_v1_240829_frozen.csv +bagellake_7b_slerp,HFv1 GSM8K,57.39,,hf_open_llm_v1_240829_frozen.csv +bagellake_7b_slerp,HFv1 HellaSwag,85.07,,hf_open_llm_v1_240829_frozen.csv +bagellake_7b_slerp,HFv1 MMLU,64.3,,hf_open_llm_v1_240829_frozen.csv +bagellake_7b_slerp,HFv1 TruthfulQA,63.76,,hf_open_llm_v1_240829_frozen.csv +bagellake_7b_slerp,HFv1 Winogrande,83.66,,hf_open_llm_v1_240829_frozen.csv +bageltoppylake_7b_slerp,HF OpenLLM v1,69.22,,hf_open_llm_v1_240829_frozen.csv +bageltoppylake_7b_slerp,HFv1 ARC,67.15,,hf_open_llm_v1_240829_frozen.csv +bageltoppylake_7b_slerp,HFv1 GSM8K,55.04,,hf_open_llm_v1_240829_frozen.csv +bageltoppylake_7b_slerp,HFv1 HellaSwag,84.79,,hf_open_llm_v1_240829_frozen.csv +bageltoppylake_7b_slerp,HFv1 MMLU,64.31,,hf_open_llm_v1_240829_frozen.csv +bageltoppylake_7b_slerp,HFv1 TruthfulQA,62.15,,hf_open_llm_v1_240829_frozen.csv +bageltoppylake_7b_slerp,HFv1 Winogrande,81.85,,hf_open_llm_v1_240829_frozen.csv +bageluccine_2_7b_slerp,HF OpenLLM v1,67.05,,hf_open_llm_v1_240829_frozen.csv +bageluccine_2_7b_slerp,HFv1 ARC,66.38,,hf_open_llm_v1_240829_frozen.csv +bageluccine_2_7b_slerp,HFv1 GSM8K,45.72,,hf_open_llm_v1_240829_frozen.csv +bageluccine_2_7b_slerp,HFv1 HellaSwag,85.51,,hf_open_llm_v1_240829_frozen.csv +bageluccine_2_7b_slerp,HFv1 MMLU,62.23,,hf_open_llm_v1_240829_frozen.csv +bageluccine_2_7b_slerp,HFv1 TruthfulQA,65.57,,hf_open_llm_v1_240829_frozen.csv +bageluccine_2_7b_slerp,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv +bageluccine_7b_slerp,HF OpenLLM v1,65.97,,hf_open_llm_v1_240829_frozen.csv +bageluccine_7b_slerp,HFv1 ARC,65.1,,hf_open_llm_v1_240829_frozen.csv +bageluccine_7b_slerp,HFv1 GSM8K,46.25,,hf_open_llm_v1_240829_frozen.csv +bageluccine_7b_slerp,HFv1 HellaSwag,85.06,,hf_open_llm_v1_240829_frozen.csv +bageluccine_7b_slerp,HFv1 MMLU,61.75,,hf_open_llm_v1_240829_frozen.csv +bageluccine_7b_slerp,HFv1 TruthfulQA,60.33,,hf_open_llm_v1_240829_frozen.csv +bageluccine_7b_slerp,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv +baize_healthcare_lora_7b,HF OpenLLM v1,47.62,,hf_open_llm_v1_240829_frozen.csv +baize_healthcare_lora_7b,HFv1 ARC,54.1,,hf_open_llm_v1_240829_frozen.csv +baize_healthcare_lora_7b,HFv1 GSM8K,4.4,,hf_open_llm_v1_240829_frozen.csv +baize_healthcare_lora_7b,HFv1 HellaSwag,77.32,,hf_open_llm_v1_240829_frozen.csv +baize_healthcare_lora_7b,HFv1 MMLU,37.09,,hf_open_llm_v1_240829_frozen.csv +baize_healthcare_lora_7b,HFv1 TruthfulQA,39.96,,hf_open_llm_v1_240829_frozen.csv +baize_healthcare_lora_7b,HFv1 Winogrande,72.85,,hf_open_llm_v1_240829_frozen.csv +beyonder_4x7b_random_lora,HF OpenLLM v1,73.91,,hf_open_llm_v1_240829_frozen.csv +beyonder_4x7b_random_lora,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv +beyonder_4x7b_random_lora,HFv1 GSM8K,67.4,,hf_open_llm_v1_240829_frozen.csv +beyonder_4x7b_random_lora,HFv1 HellaSwag,87.4,,hf_open_llm_v1_240829_frozen.csv +beyonder_4x7b_random_lora,HFv1 MMLU,64.78,,hf_open_llm_v1_240829_frozen.csv +beyonder_4x7b_random_lora,HFv1 TruthfulQA,70.49,,hf_open_llm_v1_240829_frozen.csv +beyonder_4x7b_random_lora,HFv1 Winogrande,82.16,,hf_open_llm_v1_240829_frozen.csv +bggpt_7b_instruct_v0_1,HF OpenLLM v1,64.82,,hf_open_llm_v1_240829_frozen.csv +bggpt_7b_instruct_v0_1,HFv1 ARC,60.24,,hf_open_llm_v1_240829_frozen.csv +bggpt_7b_instruct_v0_1,HFv1 GSM8K,56.71,,hf_open_llm_v1_240829_frozen.csv +bggpt_7b_instruct_v0_1,HFv1 HellaSwag,81.6,,hf_open_llm_v1_240829_frozen.csv +bggpt_7b_instruct_v0_1,HFv1 MMLU,59.66,,hf_open_llm_v1_240829_frozen.csv +bggpt_7b_instruct_v0_1,HFv1 TruthfulQA,53.68,,hf_open_llm_v1_240829_frozen.csv +bggpt_7b_instruct_v0_1,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv +bggpt_7b_instruct_v0_2,HF OpenLLM v1,63.08,,hf_open_llm_v1_240829_frozen.csv +bggpt_7b_instruct_v0_2,HFv1 ARC,60.58,,hf_open_llm_v1_240829_frozen.csv +bggpt_7b_instruct_v0_2,HFv1 GSM8K,44.12,,hf_open_llm_v1_240829_frozen.csv +bggpt_7b_instruct_v0_2,HFv1 HellaSwag,82.18,,hf_open_llm_v1_240829_frozen.csv +bggpt_7b_instruct_v0_2,HFv1 MMLU,60.5,,hf_open_llm_v1_240829_frozen.csv +bggpt_7b_instruct_v0_2,HFv1 TruthfulQA,54.63,,hf_open_llm_v1_240829_frozen.csv +bggpt_7b_instruct_v0_2,HFv1 Winogrande,76.48,,hf_open_llm_v1_240829_frozen.csv +bielik_7b_instruct_v0_1,HF OpenLLM v1,51.26,,hf_open_llm_v1_240829_frozen.csv +bielik_7b_instruct_v0_1,HFv1 ARC,47.53,,hf_open_llm_v1_240829_frozen.csv +bielik_7b_instruct_v0_1,HFv1 GSM8K,30.25,,hf_open_llm_v1_240829_frozen.csv +bielik_7b_instruct_v0_1,HFv1 HellaSwag,68.91,,hf_open_llm_v1_240829_frozen.csv +bielik_7b_instruct_v0_1,HFv1 MMLU,49.47,,hf_open_llm_v1_240829_frozen.csv +bielik_7b_instruct_v0_1,HFv1 TruthfulQA,46.18,,hf_open_llm_v1_240829_frozen.csv +bielik_7b_instruct_v0_1,HFv1 Winogrande,65.59,,hf_open_llm_v1_240829_frozen.csv +bigstral_12b_32k,HF OpenLLM v1,62.17,,hf_open_llm_v1_240829_frozen.csv +bigstral_12b_32k,HFv1 ARC,59.98,,hf_open_llm_v1_240829_frozen.csv +bigstral_12b_32k,HFv1 GSM8K,26.91,,hf_open_llm_v1_240829_frozen.csv +bigstral_12b_32k,HFv1 HellaSwag,84.1,,hf_open_llm_v1_240829_frozen.csv +bigstral_12b_32k,HFv1 MMLU,59.14,,hf_open_llm_v1_240829_frozen.csv +bigstral_12b_32k,HFv1 TruthfulQA,68.21,,hf_open_llm_v1_240829_frozen.csv +bigstral_12b_32k,HFv1 Winogrande,74.66,,hf_open_llm_v1_240829_frozen.csv +bigyi_15b,HF OpenLLM v1,54.29,,hf_open_llm_v1_240829_frozen.csv +bigyi_15b,HFv1 ARC,56.06,,hf_open_llm_v1_240829_frozen.csv +bigyi_15b,HFv1 GSM8K,21.61,,hf_open_llm_v1_240829_frozen.csv +bigyi_15b,HFv1 HellaSwag,75.9,,hf_open_llm_v1_240829_frozen.csv +bigyi_15b,HFv1 MMLU,64.6,,hf_open_llm_v1_240829_frozen.csv +bigyi_15b,HFv1 TruthfulQA,37.33,,hf_open_llm_v1_240829_frozen.csv +bigyi_15b,HFv1 Winogrande,70.24,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b,HF OpenLLM v1,32.14,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b,HFv1 ARC,29.18,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b,HFv1 HellaSwag,43.73,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b,HFv1 MMLU,23.1,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b,HFv1 TruthfulQA,45.0,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b,HFv1 Winogrande,51.85,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_8k,HF OpenLLM v1,32.23,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_8k,HFv1 ARC,28.58,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_8k,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_8k,HFv1 HellaSwag,43.94,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_8k,HFv1 MMLU,25.38,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_8k,HFv1 TruthfulQA,47.48,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_8k,HFv1 Winogrande,47.99,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_instruction_ppo,HF OpenLLM v1,32.5,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_instruction_ppo,HFv1 ARC,28.24,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_instruction_ppo,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_instruction_ppo,HFv1 HellaSwag,47.9,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_instruction_ppo,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_instruction_ppo,HFv1 TruthfulQA,43.5,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_instruction_ppo,HFv1 Winogrande,52.25,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_instruction_sft,HF OpenLLM v1,32.46,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_instruction_sft,HFv1 ARC,28.07,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_instruction_sft,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_instruction_sft,HFv1 HellaSwag,47.5,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_instruction_sft,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_instruction_sft,HFv1 TruthfulQA,43.76,,hf_open_llm_v1_240829_frozen.csv +bilingual_gpt_neox_4b_instruction_sft,HFv1 Winogrande,52.33,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_a,HF OpenLLM v1,38.73,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_a,HFv1 ARC,38.14,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_a,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_a,HFv1 HellaSwag,66.56,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_a,HFv1 MMLU,25.75,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_a,HFv1 TruthfulQA,37.46,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_a,HFv1 Winogrande,63.93,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_b,HF OpenLLM v1,38.49,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_b,HFv1 ARC,37.63,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_b,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_b,HFv1 HellaSwag,66.72,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_b,HFv1 MMLU,25.68,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_b,HFv1 TruthfulQA,37.09,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_b,HFv1 Winogrande,63.77,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_c,HF OpenLLM v1,39.01,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_c,HFv1 ARC,38.74,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_c,HFv1 GSM8K,0.68,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_c,HFv1 HellaSwag,66.83,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_c,HFv1 MMLU,26.57,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_c,HFv1 TruthfulQA,36.54,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_c,HFv1 Winogrande,64.72,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_d,HF OpenLLM v1,38.57,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_d,HFv1 ARC,37.8,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_d,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_d,HFv1 HellaSwag,66.5,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_d,HFv1 MMLU,26.64,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_d,HFv1 TruthfulQA,36.46,,hf_open_llm_v1_240829_frozen.csv +black_goo_recipe_d,HFv1 Winogrande,63.61,,hf_open_llm_v1_240829_frozen.csv +bleagle_7b_v0_1_test,HF OpenLLM v1,73.89,,hf_open_llm_v1_240829_frozen.csv +bleagle_7b_v0_1_test,HFv1 ARC,72.27,,hf_open_llm_v1_240829_frozen.csv +bleagle_7b_v0_1_test,HFv1 GSM8K,65.13,,hf_open_llm_v1_240829_frozen.csv +bleagle_7b_v0_1_test,HFv1 HellaSwag,88.24,,hf_open_llm_v1_240829_frozen.csv +bleagle_7b_v0_1_test,HFv1 MMLU,64.37,,hf_open_llm_v1_240829_frozen.csv +bleagle_7b_v0_1_test,HFv1 TruthfulQA,67.83,,hf_open_llm_v1_240829_frozen.csv +bleagle_7b_v0_1_test,HFv1 Winogrande,85.48,,hf_open_llm_v1_240829_frozen.csv +bloom,HF OpenLLM v1,46.07,,hf_open_llm_v1_240829_frozen.csv +bloom,HFv1 ARC,50.43,,hf_open_llm_v1_240829_frozen.csv +bloom,HFv1 GSM8K,6.9,,hf_open_llm_v1_240829_frozen.csv +bloom,HFv1 HellaSwag,76.41,,hf_open_llm_v1_240829_frozen.csv +bloom,HFv1 MMLU,30.85,,hf_open_llm_v1_240829_frozen.csv +bloom,HFv1 TruthfulQA,39.76,,hf_open_llm_v1_240829_frozen.csv +bloom,HFv1 Winogrande,72.06,,hf_open_llm_v1_240829_frozen.csv +bloom_1b1_rlhf,HF OpenLLM v1,30.14,,hf_open_llm_v1_240829_frozen.csv +bloom_1b1_rlhf,HFv1 ARC,27.99,,hf_open_llm_v1_240829_frozen.csv +bloom_1b1_rlhf,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +bloom_1b1_rlhf,HFv1 HellaSwag,26.19,,hf_open_llm_v1_240829_frozen.csv +bloom_1b1_rlhf,HFv1 MMLU,26.86,,hf_open_llm_v1_240829_frozen.csv +bloom_1b1_rlhf,HFv1 TruthfulQA,48.88,,hf_open_llm_v1_240829_frozen.csv +bloom_1b1_rlhf,HFv1 Winogrande,50.91,,hf_open_llm_v1_240829_frozen.csv +bloom_560m_rlhf,HF OpenLLM v1,29.86,,hf_open_llm_v1_240829_frozen.csv +bloom_560m_rlhf,HFv1 ARC,24.4,,hf_open_llm_v1_240829_frozen.csv +bloom_560m_rlhf,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +bloom_560m_rlhf,HFv1 HellaSwag,36.96,,hf_open_llm_v1_240829_frozen.csv +bloom_560m_rlhf,HFv1 MMLU,23.63,,hf_open_llm_v1_240829_frozen.csv +bloom_560m_rlhf,HFv1 TruthfulQA,40.76,,hf_open_llm_v1_240829_frozen.csv +bloom_560m_rlhf,HFv1 Winogrande,53.12,,hf_open_llm_v1_240829_frozen.csv +bloom_560m_rlhf_v2,HF OpenLLM v1,30.43,,hf_open_llm_v1_240829_frozen.csv +bloom_560m_rlhf_v2,HFv1 ARC,26.45,,hf_open_llm_v1_240829_frozen.csv +bloom_560m_rlhf_v2,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv +bloom_560m_rlhf_v2,HFv1 HellaSwag,37.67,,hf_open_llm_v1_240829_frozen.csv +bloom_560m_rlhf_v2,HFv1 MMLU,23.95,,hf_open_llm_v1_240829_frozen.csv +bloom_560m_rlhf_v2,HFv1 TruthfulQA,43.51,,hf_open_llm_v1_240829_frozen.csv +bloom_560m_rlhf_v2,HFv1 Winogrande,50.91,,hf_open_llm_v1_240829_frozen.csv +bloom_7b1,HF OpenLLM v1,39.18,,hf_open_llm_v1_240829_frozen.csv +bloom_7b1,HFv1 ARC,41.13,,hf_open_llm_v1_240829_frozen.csv +bloom_7b1,HFv1 GSM8K,1.36,,hf_open_llm_v1_240829_frozen.csv +bloom_7b1,HFv1 HellaSwag,62.0,,hf_open_llm_v1_240829_frozen.csv +bloom_7b1,HFv1 MMLU,26.25,,hf_open_llm_v1_240829_frozen.csv +bloom_7b1,HFv1 TruthfulQA,38.9,,hf_open_llm_v1_240829_frozen.csv +bloom_7b1,HFv1 Winogrande,65.43,,hf_open_llm_v1_240829_frozen.csv +blossom_v2_llama2_7b,HF OpenLLM v1,51.71,,hf_open_llm_v1_240829_frozen.csv +blossom_v2_llama2_7b,HFv1 ARC,54.1,,hf_open_llm_v1_240829_frozen.csv +blossom_v2_llama2_7b,HFv1 GSM8K,4.78,,hf_open_llm_v1_240829_frozen.csv +blossom_v2_llama2_7b,HFv1 HellaSwag,78.57,,hf_open_llm_v1_240829_frozen.csv +blossom_v2_llama2_7b,HFv1 MMLU,51.66,,hf_open_llm_v1_240829_frozen.csv +blossom_v2_llama2_7b,HFv1 TruthfulQA,46.84,,hf_open_llm_v1_240829_frozen.csv +blossom_v2_llama2_7b,HFv1 Winogrande,74.35,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_14b,HF OpenLLM v1,66.74,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_14b,HFv1 ARC,57.34,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_14b,HFv1 GSM8K,66.49,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_14b,HFv1 HellaSwag,79.84,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_14b,HFv1 MMLU,67.92,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_14b,HFv1 TruthfulQA,55.21,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_14b,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_4b,HF OpenLLM v1,56.34,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_4b,HFv1 ARC,46.08,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_4b,HFv1 GSM8K,51.1,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_4b,HFv1 HellaSwag,70.8,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_4b,HFv1 MMLU,55.11,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_4b,HFv1 TruthfulQA,47.29,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_4b,HFv1 Winogrande,67.64,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_7b,HF OpenLLM v1,62.11,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_7b,HFv1 ARC,54.44,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_7b,HFv1 GSM8K,56.71,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_7b,HFv1 HellaSwag,76.11,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_7b,HFv1 MMLU,60.43,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_7b,HFv1 TruthfulQA,53.69,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_qwen1_5_7b,HFv1 Winogrande,71.27,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_yi_34b,HF OpenLLM v1,71.67,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_yi_34b,HFv1 ARC,66.81,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_yi_34b,HFv1 GSM8K,64.14,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_yi_34b,HFv1 HellaSwag,84.44,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_yi_34b,HFv1 MMLU,74.34,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_yi_34b,HFv1 TruthfulQA,57.89,,hf_open_llm_v1_240829_frozen.csv +blossom_v4_yi_34b,HFv1 Winogrande,82.4,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_14b,HF OpenLLM v1,67.57,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_14b,HFv1 ARC,58.45,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_14b,HFv1 GSM8K,67.78,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_14b,HFv1 HellaSwag,80.72,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_14b,HFv1 MMLU,68.45,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_14b,HFv1 TruthfulQA,54.89,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_14b,HFv1 Winogrande,75.14,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_32b,HF OpenLLM v1,72.04,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_32b,HFv1 ARC,63.82,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_32b,HFv1 GSM8K,70.66,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_32b,HFv1 HellaSwag,83.54,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_32b,HFv1 MMLU,74.27,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_32b,HFv1 TruthfulQA,58.24,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_32b,HFv1 Winogrande,81.69,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_34b,HF OpenLLM v1,72.65,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_34b,HFv1 ARC,66.98,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_34b,HFv1 GSM8K,62.02,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_34b,HFv1 HellaSwag,84.79,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_34b,HFv1 MMLU,76.0,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_34b,HFv1 TruthfulQA,62.68,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_34b,HFv1 Winogrande,83.43,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_4b,HF OpenLLM v1,56.16,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_4b,HFv1 ARC,46.76,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_4b,HFv1 GSM8K,48.37,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_4b,HFv1 HellaSwag,71.87,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_4b,HFv1 MMLU,55.04,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_4b,HFv1 TruthfulQA,47.51,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_4b,HFv1 Winogrande,67.4,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_7b,HF OpenLLM v1,63.57,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_7b,HFv1 ARC,56.06,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_7b,HFv1 GSM8K,60.05,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_7b,HFv1 HellaSwag,77.36,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_7b,HFv1 MMLU,61.29,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_7b,HFv1 TruthfulQA,54.29,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_7b,HFv1 Winogrande,72.38,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_9b,HF OpenLLM v1,64.69,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_9b,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_9b,HFv1 GSM8K,48.37,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_9b,HFv1 HellaSwag,78.41,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_9b,HFv1 MMLU,69.81,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_9b,HFv1 TruthfulQA,52.78,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_9b,HFv1 Winogrande,76.32,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_llama3_8b,HF OpenLLM v1,64.21,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_llama3_8b,HFv1 ARC,56.83,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_llama3_8b,HFv1 GSM8K,44.43,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_llama3_8b,HFv1 HellaSwag,83.05,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_llama3_8b,HFv1 MMLU,65.48,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_llama3_8b,HFv1 TruthfulQA,57.12,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_llama3_8b,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_mistral_7b,HF OpenLLM v1,61.88,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_mistral_7b,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_mistral_7b,HFv1 GSM8K,31.84,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_mistral_7b,HFv1 HellaSwag,84.26,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_mistral_7b,HFv1 MMLU,62.45,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_mistral_7b,HFv1 TruthfulQA,51.83,,hf_open_llm_v1_240829_frozen.csv +blossom_v5_mistral_7b,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv +blur_7b_slerp_v1_46,HF OpenLLM v1,76.26,,hf_open_llm_v1_240829_frozen.csv +blur_7b_slerp_v1_46,HFv1 ARC,73.29,,hf_open_llm_v1_240829_frozen.csv +blur_7b_slerp_v1_46,HFv1 GSM8K,69.67,,hf_open_llm_v1_240829_frozen.csv +blur_7b_slerp_v1_46,HFv1 HellaSwag,89.07,,hf_open_llm_v1_240829_frozen.csv +blur_7b_slerp_v1_46,HFv1 MMLU,64.37,,hf_open_llm_v1_240829_frozen.csv +blur_7b_slerp_v1_46,HFv1 TruthfulQA,76.61,,hf_open_llm_v1_240829_frozen.csv +blur_7b_slerp_v1_46,HFv1 Winogrande,84.53,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_2,HF OpenLLM v1,67.74,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_2,HFv1 ARC,65.36,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_2,HFv1 GSM8K,52.84,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_2,HFv1 HellaSwag,83.88,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_2,HFv1 MMLU,63.45,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_2,HFv1 TruthfulQA,60.3,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_2,HFv1 Winogrande,80.58,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_21,HF OpenLLM v1,74.18,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_21,HFv1 ARC,70.82,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_21,HFv1 GSM8K,69.52,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_21,HFv1 HellaSwag,88.07,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_21,HFv1 MMLU,64.85,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_21,HFv1 TruthfulQA,67.99,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_21,HFv1 Winogrande,83.82,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_22,HF OpenLLM v1,63.35,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_22,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_22,HFv1 GSM8K,31.16,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_22,HFv1 HellaSwag,82.0,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_22,HFv1 MMLU,58.03,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_22,HFv1 TruthfulQA,68.01,,hf_open_llm_v1_240829_frozen.csv +blur_7b_v1_22,HFv1 Winogrande,78.61,,hf_open_llm_v1_240829_frozen.csv +blured_ties_7b,HF OpenLLM v1,65.92,,hf_open_llm_v1_240829_frozen.csv +blured_ties_7b,HFv1 ARC,63.99,,hf_open_llm_v1_240829_frozen.csv +blured_ties_7b,HFv1 GSM8K,46.93,,hf_open_llm_v1_240829_frozen.csv +blured_ties_7b,HFv1 HellaSwag,83.56,,hf_open_llm_v1_240829_frozen.csv +blured_ties_7b,HFv1 MMLU,63.19,,hf_open_llm_v1_240829_frozen.csv +blured_ties_7b,HFv1 TruthfulQA,58.12,,hf_open_llm_v1_240829_frozen.csv +blured_ties_7b,HFv1 Winogrande,79.72,,hf_open_llm_v1_240829_frozen.csv +blurred_beagle_7b_slerp,HF OpenLLM v1,74.8,,hf_open_llm_v1_240829_frozen.csv +blurred_beagle_7b_slerp,HFv1 ARC,72.78,,hf_open_llm_v1_240829_frozen.csv +blurred_beagle_7b_slerp,HFv1 GSM8K,69.9,,hf_open_llm_v1_240829_frozen.csv +blurred_beagle_7b_slerp,HFv1 HellaSwag,88.58,,hf_open_llm_v1_240829_frozen.csv +blurred_beagle_7b_slerp,HFv1 MMLU,64.95,,hf_open_llm_v1_240829_frozen.csv +blurred_beagle_7b_slerp,HFv1 TruthfulQA,69.39,,hf_open_llm_v1_240829_frozen.csv +blurred_beagle_7b_slerp,HFv1 Winogrande,83.19,,hf_open_llm_v1_240829_frozen.csv +blurstral_7b_slerp,HF OpenLLM v1,69.08,,hf_open_llm_v1_240829_frozen.csv +blurstral_7b_slerp,HFv1 ARC,66.3,,hf_open_llm_v1_240829_frozen.csv +blurstral_7b_slerp,HFv1 GSM8K,62.85,,hf_open_llm_v1_240829_frozen.csv +blurstral_7b_slerp,HFv1 HellaSwag,85.38,,hf_open_llm_v1_240829_frozen.csv +blurstral_7b_slerp,HFv1 MMLU,65.18,,hf_open_llm_v1_240829_frozen.csv +blurstral_7b_slerp,HFv1 TruthfulQA,53.4,,hf_open_llm_v1_240829_frozen.csv +blurstral_7b_slerp,HFv1 Winogrande,81.37,,hf_open_llm_v1_240829_frozen.csv +bookworm_10_7b_v0_4_dpo,HF OpenLLM v1,66.59,,hf_open_llm_v1_240829_frozen.csv +bookworm_10_7b_v0_4_dpo,HFv1 ARC,64.76,,hf_open_llm_v1_240829_frozen.csv +bookworm_10_7b_v0_4_dpo,HFv1 GSM8K,52.24,,hf_open_llm_v1_240829_frozen.csv +bookworm_10_7b_v0_4_dpo,HFv1 HellaSwag,84.4,,hf_open_llm_v1_240829_frozen.csv +bookworm_10_7b_v0_4_dpo,HFv1 MMLU,64.96,,hf_open_llm_v1_240829_frozen.csv +bookworm_10_7b_v0_4_dpo,HFv1 TruthfulQA,52.31,,hf_open_llm_v1_240829_frozen.csv +bookworm_10_7b_v0_4_dpo,HFv1 Winogrande,80.9,,hf_open_llm_v1_240829_frozen.csv +boomer_1b,HF OpenLLM v1,28.44,,hf_open_llm_v1_240829_frozen.csv +boomer_1b,HFv1 ARC,22.78,,hf_open_llm_v1_240829_frozen.csv +boomer_1b,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv +boomer_1b,HFv1 HellaSwag,31.58,,hf_open_llm_v1_240829_frozen.csv +boomer_1b,HFv1 MMLU,25.66,,hf_open_llm_v1_240829_frozen.csv +boomer_1b,HFv1 TruthfulQA,39.17,,hf_open_llm_v1_240829_frozen.csv +boomer_1b,HFv1 Winogrande,50.51,,hf_open_llm_v1_240829_frozen.csv +brocae_area_7b_slerp,HF OpenLLM v1,75.86,,hf_open_llm_v1_240829_frozen.csv +brocae_area_7b_slerp,HFv1 ARC,73.81,,hf_open_llm_v1_240829_frozen.csv +brocae_area_7b_slerp,HFv1 GSM8K,68.61,,hf_open_llm_v1_240829_frozen.csv +brocae_area_7b_slerp,HFv1 HellaSwag,88.98,,hf_open_llm_v1_240829_frozen.csv +brocae_area_7b_slerp,HFv1 MMLU,64.55,,hf_open_llm_v1_240829_frozen.csv +brocae_area_7b_slerp,HFv1 TruthfulQA,74.13,,hf_open_llm_v1_240829_frozen.csv +brocae_area_7b_slerp,HFv1 Winogrande,85.08,,hf_open_llm_v1_240829_frozen.csv +brokenkeyboard,HF OpenLLM v1,74.08,,hf_open_llm_v1_240829_frozen.csv +brokenkeyboard,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv +brokenkeyboard,HFv1 GSM8K,64.29,,hf_open_llm_v1_240829_frozen.csv +brokenkeyboard,HFv1 HellaSwag,88.34,,hf_open_llm_v1_240829_frozen.csv +brokenkeyboard,HFv1 MMLU,66.04,,hf_open_llm_v1_240829_frozen.csv +brokenkeyboard,HFv1 TruthfulQA,71.36,,hf_open_llm_v1_240829_frozen.csv +brokenkeyboard,HFv1 Winogrande,83.19,,hf_open_llm_v1_240829_frozen.csv +brokenkeyboardmerge,HF OpenLLM v1,59.33,,hf_open_llm_v1_240829_frozen.csv +brokenkeyboardmerge,HFv1 ARC,59.73,,hf_open_llm_v1_240829_frozen.csv +brokenkeyboardmerge,HFv1 GSM8K,25.93,,hf_open_llm_v1_240829_frozen.csv +brokenkeyboardmerge,HFv1 HellaSwag,81.25,,hf_open_llm_v1_240829_frozen.csv +brokenkeyboardmerge,HFv1 MMLU,58.36,,hf_open_llm_v1_240829_frozen.csv +brokenkeyboardmerge,HFv1 TruthfulQA,52.0,,hf_open_llm_v1_240829_frozen.csv +brokenkeyboardmerge,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv +brurrydog_7b_v0_1,HF OpenLLM v1,74.24,,hf_open_llm_v1_240829_frozen.csv +brurrydog_7b_v0_1,HFv1 ARC,72.53,,hf_open_llm_v1_240829_frozen.csv +brurrydog_7b_v0_1,HFv1 GSM8K,66.87,,hf_open_llm_v1_240829_frozen.csv +brurrydog_7b_v0_1,HFv1 HellaSwag,88.37,,hf_open_llm_v1_240829_frozen.csv +brurrydog_7b_v0_1,HFv1 MMLU,64.74,,hf_open_llm_v1_240829_frozen.csv +brurrydog_7b_v0_1,HFv1 TruthfulQA,70.05,,hf_open_llm_v1_240829_frozen.csv +brurrydog_7b_v0_1,HFv1 Winogrande,82.87,,hf_open_llm_v1_240829_frozen.csv +btlm_v1_7b_base_v0_1,HF OpenLLM v1,50.04,,hf_open_llm_v1_240829_frozen.csv +btlm_v1_7b_base_v0_1,HFv1 ARC,52.73,,hf_open_llm_v1_240829_frozen.csv +btlm_v1_7b_base_v0_1,HFv1 GSM8K,8.95,,hf_open_llm_v1_240829_frozen.csv +btlm_v1_7b_base_v0_1,HFv1 HellaSwag,79.48,,hf_open_llm_v1_240829_frozen.csv +btlm_v1_7b_base_v0_1,HFv1 MMLU,49.93,,hf_open_llm_v1_240829_frozen.csv +btlm_v1_7b_base_v0_1,HFv1 TruthfulQA,37.2,,hf_open_llm_v1_240829_frozen.csv +btlm_v1_7b_base_v0_1,HFv1 Winogrande,71.98,,hf_open_llm_v1_240829_frozen.csv +buddhi_128k_chat_7b,HF OpenLLM v1,64.42,,hf_open_llm_v1_240829_frozen.csv +buddhi_128k_chat_7b,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv +buddhi_128k_chat_7b,HFv1 GSM8K,38.29,,hf_open_llm_v1_240829_frozen.csv +buddhi_128k_chat_7b,HFv1 HellaSwag,84.0,,hf_open_llm_v1_240829_frozen.csv +buddhi_128k_chat_7b,HFv1 MMLU,60.42,,hf_open_llm_v1_240829_frozen.csv +buddhi_128k_chat_7b,HFv1 TruthfulQA,65.72,,hf_open_llm_v1_240829_frozen.csv +buddhi_128k_chat_7b,HFv1 Winogrande,77.27,,hf_open_llm_v1_240829_frozen.csv +bulgakovlm_3b,HF OpenLLM v1,29.72,,hf_open_llm_v1_240829_frozen.csv +bulgakovlm_3b,HFv1 ARC,28.33,,hf_open_llm_v1_240829_frozen.csv +bulgakovlm_3b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +bulgakovlm_3b,HFv1 HellaSwag,26.57,,hf_open_llm_v1_240829_frozen.csv +bulgakovlm_3b,HFv1 MMLU,24.99,,hf_open_llm_v1_240829_frozen.csv +bulgakovlm_3b,HFv1 TruthfulQA,47.93,,hf_open_llm_v1_240829_frozen.csv +bulgakovlm_3b,HFv1 Winogrande,50.51,,hf_open_llm_v1_240829_frozen.csv +buzz_8b_large_v0_5,HF OpenLLM v1,65.33,,hf_open_llm_v1_240829_frozen.csv +buzz_8b_large_v0_5,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv +buzz_8b_large_v0_5,HFv1 GSM8K,57.62,,hf_open_llm_v1_240829_frozen.csv +buzz_8b_large_v0_5,HFv1 HellaSwag,81.9,,hf_open_llm_v1_240829_frozen.csv +buzz_8b_large_v0_5,HFv1 MMLU,64.0,,hf_open_llm_v1_240829_frozen.csv +buzz_8b_large_v0_5,HFv1 TruthfulQA,50.54,,hf_open_llm_v1_240829_frozen.csv +buzz_8b_large_v0_5,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_plus,HF OpenLLM v1,74.62,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_plus,HFv1 ARC,70.99,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_plus,HFv1 GSM8K,70.74,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_plus,HFv1 HellaSwag,88.56,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_plus,HFv1 MMLU,75.73,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_plus,HFv1 TruthfulQA,56.95,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_plus,HFv1 Winogrande,85.4,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_v0_1,HF OpenLLM v1,68.54,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_v0_1,HFv1 ARC,65.53,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_v0_1,HFv1 GSM8K,56.63,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_v0_1,HFv1 HellaSwag,87.0,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_v0_1,HFv1 MMLU,68.2,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_v0_1,HFv1 TruthfulQA,52.32,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_v0_1,HFv1 Winogrande,81.53,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_v0_1_japanese_instruct,HF OpenLLM v1,68.85,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_v0_1_japanese_instruct,HFv1 ARC,65.87,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_v0_1_japanese_instruct,HFv1 GSM8K,60.05,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_v0_1_japanese_instruct,HFv1 HellaSwag,85.62,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_v0_1_japanese_instruct,HFv1 MMLU,67.61,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_v0_1_japanese_instruct,HFv1 TruthfulQA,51.01,,hf_open_llm_v1_240829_frozen.csv +c4ai_command_r_v0_1_japanese_instruct,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv +calm2_7b_chat,HF OpenLLM v1,43.27,,hf_open_llm_v1_240829_frozen.csv +calm2_7b_chat,HFv1 ARC,40.27,,hf_open_llm_v1_240829_frozen.csv +calm2_7b_chat,HFv1 GSM8K,4.93,,hf_open_llm_v1_240829_frozen.csv +calm2_7b_chat,HFv1 HellaSwag,68.12,,hf_open_llm_v1_240829_frozen.csv +calm2_7b_chat,HFv1 MMLU,39.39,,hf_open_llm_v1_240829_frozen.csv +calm2_7b_chat,HFv1 TruthfulQA,41.96,,hf_open_llm_v1_240829_frozen.csv +calm2_7b_chat,HFv1 Winogrande,64.96,,hf_open_llm_v1_240829_frozen.csv +calm2_7b_chat_dpo_experimental,HF OpenLLM v1,44.03,,hf_open_llm_v1_240829_frozen.csv +calm2_7b_chat_dpo_experimental,HFv1 ARC,41.04,,hf_open_llm_v1_240829_frozen.csv +calm2_7b_chat_dpo_experimental,HFv1 GSM8K,5.53,,hf_open_llm_v1_240829_frozen.csv +calm2_7b_chat_dpo_experimental,HFv1 HellaSwag,68.99,,hf_open_llm_v1_240829_frozen.csv +calm2_7b_chat_dpo_experimental,HFv1 MMLU,39.82,,hf_open_llm_v1_240829_frozen.csv +calm2_7b_chat_dpo_experimental,HFv1 TruthfulQA,43.13,,hf_open_llm_v1_240829_frozen.csv +calm2_7b_chat_dpo_experimental,HFv1 Winogrande,65.67,,hf_open_llm_v1_240829_frozen.csv +camel_platypus2_70b,HF OpenLLM v1,65.39,,hf_open_llm_v1_240829_frozen.csv +camel_platypus2_70b,HFv1 ARC,70.14,,hf_open_llm_v1_240829_frozen.csv +camel_platypus2_70b,HFv1 GSM8K,23.96,,hf_open_llm_v1_240829_frozen.csv +camel_platypus2_70b,HFv1 HellaSwag,87.71,,hf_open_llm_v1_240829_frozen.csv +camel_platypus2_70b,HFv1 MMLU,69.83,,hf_open_llm_v1_240829_frozen.csv +camel_platypus2_70b,HFv1 TruthfulQA,57.77,,hf_open_llm_v1_240829_frozen.csv +camel_platypus2_70b,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv +camelidae_8x13b,HF OpenLLM v1,59.4,,hf_open_llm_v1_240829_frozen.csv +camelidae_8x13b,HFv1 ARC,61.18,,hf_open_llm_v1_240829_frozen.csv +camelidae_8x13b,HFv1 GSM8K,34.57,,hf_open_llm_v1_240829_frozen.csv +camelidae_8x13b,HFv1 HellaSwag,82.73,,hf_open_llm_v1_240829_frozen.csv +camelidae_8x13b,HFv1 MMLU,57.21,,hf_open_llm_v1_240829_frozen.csv +camelidae_8x13b,HFv1 TruthfulQA,43.37,,hf_open_llm_v1_240829_frozen.csv +camelidae_8x13b,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv +camelidae_8x7b,HF OpenLLM v1,54.47,,hf_open_llm_v1_240829_frozen.csv +camelidae_8x7b,HFv1 ARC,55.63,,hf_open_llm_v1_240829_frozen.csv +camelidae_8x7b,HFv1 GSM8K,22.82,,hf_open_llm_v1_240829_frozen.csv +camelidae_8x7b,HFv1 HellaSwag,79.18,,hf_open_llm_v1_240829_frozen.csv +camelidae_8x7b,HFv1 MMLU,50.1,,hf_open_llm_v1_240829_frozen.csv +camelidae_8x7b,HFv1 TruthfulQA,42.86,,hf_open_llm_v1_240829_frozen.csv +camelidae_8x7b,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv +cantonesellm_6b_preview202402,HF OpenLLM v1,56.93,,hf_open_llm_v1_240829_frozen.csv +cantonesellm_6b_preview202402,HFv1 ARC,55.63,,hf_open_llm_v1_240829_frozen.csv +cantonesellm_6b_preview202402,HFv1 GSM8K,30.71,,hf_open_llm_v1_240829_frozen.csv +cantonesellm_6b_preview202402,HFv1 HellaSwag,75.8,,hf_open_llm_v1_240829_frozen.csv +cantonesellm_6b_preview202402,HFv1 MMLU,63.07,,hf_open_llm_v1_240829_frozen.csv +cantonesellm_6b_preview202402,HFv1 TruthfulQA,42.26,,hf_open_llm_v1_240829_frozen.csv +cantonesellm_6b_preview202402,HFv1 Winogrande,74.11,,hf_open_llm_v1_240829_frozen.csv +cantonesellm_cpt_202405,HF OpenLLM v1,60.2,,hf_open_llm_v1_240829_frozen.csv +cantonesellm_cpt_202405,HFv1 ARC,55.2,,hf_open_llm_v1_240829_frozen.csv +cantonesellm_cpt_202405,HFv1 GSM8K,46.55,,hf_open_llm_v1_240829_frozen.csv +cantonesellm_cpt_202405,HFv1 HellaSwag,77.05,,hf_open_llm_v1_240829_frozen.csv +cantonesellm_cpt_202405,HFv1 MMLU,63.83,,hf_open_llm_v1_240829_frozen.csv +cantonesellm_cpt_202405,HFv1 TruthfulQA,43.58,,hf_open_llm_v1_240829_frozen.csv +cantonesellm_cpt_202405,HFv1 Winogrande,74.98,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties,HF OpenLLM v1,68.57,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties,HFv1 ARC,64.93,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties,HFv1 GSM8K,54.06,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties,HFv1 HellaSwag,84.99,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties,HFv1 MMLU,75.37,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties,HFv1 TruthfulQA,52.84,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties,HFv1 Winogrande,79.24,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties_extremedensity,HF OpenLLM v1,71.57,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties_extremedensity,HFv1 ARC,66.89,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties_extremedensity,HFv1 GSM8K,59.82,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties_extremedensity,HFv1 HellaSwag,85.69,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties_extremedensity,HFv1 MMLU,77.35,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties_extremedensity,HFv1 TruthfulQA,57.63,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties_extremedensity,HFv1 Winogrande,82.0,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties_highdensity,HF OpenLLM v1,72.15,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties_highdensity,HFv1 ARC,67.41,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties_highdensity,HFv1 GSM8K,61.33,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties_highdensity,HFv1 HellaSwag,85.77,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties_highdensity,HFv1 MMLU,77.44,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties_highdensity,HFv1 TruthfulQA,57.84,,hf_open_llm_v1_240829_frozen.csv +caplattessdolxaboros_yi_34b_200k_dare_ties_highdensity,HFv1 Winogrande,83.11,,hf_open_llm_v1_240829_frozen.csv +capybara_tess_yi_34b_200k,HF OpenLLM v1,70.57,,hf_open_llm_v1_240829_frozen.csv +capybara_tess_yi_34b_200k,HFv1 ARC,66.13,,hf_open_llm_v1_240829_frozen.csv +capybara_tess_yi_34b_200k,HFv1 GSM8K,57.39,,hf_open_llm_v1_240829_frozen.csv +capybara_tess_yi_34b_200k,HFv1 HellaSwag,86.24,,hf_open_llm_v1_240829_frozen.csv +capybara_tess_yi_34b_200k,HFv1 MMLU,74.89,,hf_open_llm_v1_240829_frozen.csv +capybara_tess_yi_34b_200k,HFv1 TruthfulQA,56.37,,hf_open_llm_v1_240829_frozen.csv +capybara_tess_yi_34b_200k,HFv1 Winogrande,82.4,,hf_open_llm_v1_240829_frozen.csv +capybarahermes_2_5_mistral_7b,HF OpenLLM v1,68.14,,hf_open_llm_v1_240829_frozen.csv +capybarahermes_2_5_mistral_7b,HFv1 ARC,65.78,,hf_open_llm_v1_240829_frozen.csv +capybarahermes_2_5_mistral_7b,HFv1 GSM8K,59.29,,hf_open_llm_v1_240829_frozen.csv +capybarahermes_2_5_mistral_7b,HFv1 HellaSwag,85.45,,hf_open_llm_v1_240829_frozen.csv +capybarahermes_2_5_mistral_7b,HFv1 MMLU,63.13,,hf_open_llm_v1_240829_frozen.csv +capybarahermes_2_5_mistral_7b,HFv1 TruthfulQA,56.91,,hf_open_llm_v1_240829_frozen.csv +capybarahermes_2_5_mistral_7b,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv +carbonbeagle_11b_truthy,HF OpenLLM v1,76.1,,hf_open_llm_v1_240829_frozen.csv +carbonbeagle_11b_truthy,HFv1 ARC,72.27,,hf_open_llm_v1_240829_frozen.csv +carbonbeagle_11b_truthy,HFv1 GSM8K,66.11,,hf_open_llm_v1_240829_frozen.csv +carbonbeagle_11b_truthy,HFv1 HellaSwag,89.31,,hf_open_llm_v1_240829_frozen.csv +carbonbeagle_11b_truthy,HFv1 MMLU,66.55,,hf_open_llm_v1_240829_frozen.csv +carbonbeagle_11b_truthy,HFv1 TruthfulQA,78.55,,hf_open_llm_v1_240829_frozen.csv +carbonbeagle_11b_truthy,HFv1 Winogrande,83.82,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v1,HF OpenLLM v1,74.28,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v1,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v1,HFv1 GSM8K,64.29,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v1,HFv1 HellaSwag,88.46,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v1,HFv1 MMLU,66.42,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v1,HFv1 TruthfulQA,71.98,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v1,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v2,HF OpenLLM v1,74.42,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v2,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v2,HFv1 GSM8K,65.28,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v2,HFv1 HellaSwag,88.4,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v2,HFv1 MMLU,66.31,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v2,HFv1 TruthfulQA,71.94,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v2,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v3,HF OpenLLM v1,74.41,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v3,HFv1 ARC,70.99,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v3,HFv1 GSM8K,65.2,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v3,HFv1 HellaSwag,88.48,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v3,HFv1 MMLU,66.34,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v3,HFv1 TruthfulQA,71.84,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v3,HFv1 Winogrande,83.58,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v4,HF OpenLLM v1,74.52,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v4,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v4,HFv1 GSM8K,65.58,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v4,HFv1 HellaSwag,88.48,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v4,HFv1 MMLU,66.27,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v4,HFv1 TruthfulQA,71.95,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v4,HFv1 Winogrande,83.58,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v5,HF OpenLLM v1,74.31,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v5,HFv1 ARC,71.16,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v5,HFv1 GSM8K,64.44,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v5,HFv1 HellaSwag,88.51,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v5,HFv1 MMLU,66.44,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v5,HFv1 TruthfulQA,71.97,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_10_7b_v5,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_13b_v1,HF OpenLLM v1,74.28,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_13b_v1,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_13b_v1,HFv1 GSM8K,64.29,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_13b_v1,HFv1 HellaSwag,88.46,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_13b_v1,HFv1 MMLU,66.42,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_13b_v1,HFv1 TruthfulQA,71.98,,hf_open_llm_v1_240829_frozen.csv +carbonvillain_en_13b_v1,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv +catppt_base,HF OpenLLM v1,72.25,,hf_open_llm_v1_240829_frozen.csv +catppt_base,HFv1 ARC,67.92,,hf_open_llm_v1_240829_frozen.csv +catppt_base,HFv1 GSM8K,70.66,,hf_open_llm_v1_240829_frozen.csv +catppt_base,HFv1 HellaSwag,86.64,,hf_open_llm_v1_240829_frozen.csv +catppt_base,HFv1 MMLU,65.26,,hf_open_llm_v1_240829_frozen.csv +catppt_base,HFv1 TruthfulQA,61.72,,hf_open_llm_v1_240829_frozen.csv +catppt_base,HFv1 Winogrande,81.29,,hf_open_llm_v1_240829_frozen.csv +catunalaserpi_dpo,HF OpenLLM v1,74.7,,hf_open_llm_v1_240829_frozen.csv +catunalaserpi_dpo,HFv1 ARC,72.95,,hf_open_llm_v1_240829_frozen.csv +catunalaserpi_dpo,HFv1 GSM8K,69.29,,hf_open_llm_v1_240829_frozen.csv +catunalaserpi_dpo,HFv1 HellaSwag,88.33,,hf_open_llm_v1_240829_frozen.csv +catunalaserpi_dpo,HFv1 MMLU,64.95,,hf_open_llm_v1_240829_frozen.csv +catunalaserpi_dpo,HFv1 TruthfulQA,70.01,,hf_open_llm_v1_240829_frozen.csv +catunalaserpi_dpo,HFv1 Winogrande,82.64,,hf_open_llm_v1_240829_frozen.csv +causallm_platypus_14b,HF OpenLLM v1,63.8,,hf_open_llm_v1_240829_frozen.csv +causallm_platypus_14b,HFv1 ARC,56.91,,hf_open_llm_v1_240829_frozen.csv +causallm_platypus_14b,HFv1 GSM8K,57.24,,hf_open_llm_v1_240829_frozen.csv +causallm_platypus_14b,HFv1 HellaSwag,80.06,,hf_open_llm_v1_240829_frozen.csv +causallm_platypus_14b,HFv1 MMLU,64.98,,hf_open_llm_v1_240829_frozen.csv +causallm_platypus_14b,HFv1 TruthfulQA,47.57,,hf_open_llm_v1_240829_frozen.csv +causallm_platypus_14b,HFv1 Winogrande,76.01,,hf_open_llm_v1_240829_frozen.csv +cerberus_7b_model_stock,HF OpenLLM v1,69.66,,hf_open_llm_v1_240829_frozen.csv +cerberus_7b_model_stock,HFv1 ARC,67.92,,hf_open_llm_v1_240829_frozen.csv +cerberus_7b_model_stock,HFv1 GSM8K,61.79,,hf_open_llm_v1_240829_frozen.csv +cerberus_7b_model_stock,HFv1 HellaSwag,85.2,,hf_open_llm_v1_240829_frozen.csv +cerberus_7b_model_stock,HFv1 MMLU,64.52,,hf_open_llm_v1_240829_frozen.csv +cerberus_7b_model_stock,HFv1 TruthfulQA,58.8,,hf_open_llm_v1_240829_frozen.csv +cerberus_7b_model_stock,HFv1 Winogrande,79.72,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_111m,HF OpenLLM v1,27.75,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_111m,HFv1 ARC,20.22,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_111m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_111m,HFv1 HellaSwag,26.73,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_111m,HFv1 MMLU,25.51,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_111m,HFv1 TruthfulQA,46.31,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_111m,HFv1 Winogrande,47.75,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_13b,HF OpenLLM v1,37.4,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_13b,HFv1 ARC,38.14,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_13b,HFv1 GSM8K,1.29,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_13b,HFv1 HellaSwag,60.01,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_13b,HFv1 MMLU,25.92,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_13b,HFv1 TruthfulQA,39.19,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_13b,HFv1 Winogrande,59.83,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_1_3b,HF OpenLLM v1,31.3,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_1_3b,HFv1 ARC,26.28,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_1_3b,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_1_3b,HFv1 HellaSwag,38.54,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_1_3b,HFv1 MMLU,26.59,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_1_3b,HFv1 TruthfulQA,42.7,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_1_3b,HFv1 Winogrande,53.43,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_256m,HF OpenLLM v1,29.38,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_256m,HFv1 ARC,22.01,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_256m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_256m,HFv1 HellaSwag,28.99,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_256m,HFv1 MMLU,26.83,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_256m,HFv1 TruthfulQA,45.98,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_256m,HFv1 Winogrande,52.49,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_2_7b,HF OpenLLM v1,33.25,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_2_7b,HFv1 ARC,29.1,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_2_7b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_2_7b,HFv1 HellaSwag,49.29,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_2_7b,HFv1 MMLU,25.17,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_2_7b,HFv1 TruthfulQA,41.37,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_2_7b,HFv1 Winogrande,54.14,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_6_7b,HF OpenLLM v1,36.27,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_6_7b,HFv1 ARC,35.07,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_6_7b,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_6_7b,HFv1 HellaSwag,59.36,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_6_7b,HFv1 MMLU,25.93,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_6_7b,HFv1 TruthfulQA,38.02,,hf_open_llm_v1_240829_frozen.csv +cerebras_gpt_6_7b,HFv1 Winogrande,58.72,,hf_open_llm_v1_240829_frozen.csv +changpt_bart,HF OpenLLM v1,29.27,,hf_open_llm_v1_240829_frozen.csv +changpt_bart,HFv1 ARC,28.67,,hf_open_llm_v1_240829_frozen.csv +changpt_bart,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +changpt_bart,HFv1 HellaSwag,26.41,,hf_open_llm_v1_240829_frozen.csv +changpt_bart,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +changpt_bart,HFv1 TruthfulQA,47.94,,hf_open_llm_v1_240829_frozen.csv +changpt_bart,HFv1 Winogrande,49.49,,hf_open_llm_v1_240829_frozen.csv +chat_ayb_nova_13b,HF OpenLLM v1,57.84,,hf_open_llm_v1_240829_frozen.csv +chat_ayb_nova_13b,HFv1 ARC,62.97,,hf_open_llm_v1_240829_frozen.csv +chat_ayb_nova_13b,HFv1 GSM8K,12.36,,hf_open_llm_v1_240829_frozen.csv +chat_ayb_nova_13b,HFv1 HellaSwag,84.28,,hf_open_llm_v1_240829_frozen.csv +chat_ayb_nova_13b,HFv1 MMLU,58.58,,hf_open_llm_v1_240829_frozen.csv +chat_ayb_nova_13b,HFv1 TruthfulQA,51.28,,hf_open_llm_v1_240829_frozen.csv +chat_ayb_nova_13b,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv +chat_ayb_platypus2_13b,HF OpenLLM v1,55.93,,hf_open_llm_v1_240829_frozen.csv +chat_ayb_platypus2_13b,HFv1 ARC,60.49,,hf_open_llm_v1_240829_frozen.csv +chat_ayb_platypus2_13b,HFv1 GSM8K,2.96,,hf_open_llm_v1_240829_frozen.csv +chat_ayb_platypus2_13b,HFv1 HellaSwag,84.03,,hf_open_llm_v1_240829_frozen.csv +chat_ayb_platypus2_13b,HFv1 MMLU,57.83,,hf_open_llm_v1_240829_frozen.csv +chat_ayb_platypus2_13b,HFv1 TruthfulQA,54.52,,hf_open_llm_v1_240829_frozen.csv +chat_ayb_platypus2_13b,HFv1 Winogrande,75.77,,hf_open_llm_v1_240829_frozen.csv +chatayt_lora_assamble_marcoroni,HF OpenLLM v1,57.76,,hf_open_llm_v1_240829_frozen.csv +chatayt_lora_assamble_marcoroni,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv +chatayt_lora_assamble_marcoroni,HFv1 GSM8K,8.87,,hf_open_llm_v1_240829_frozen.csv +chatayt_lora_assamble_marcoroni,HFv1 HellaSwag,83.05,,hf_open_llm_v1_240829_frozen.csv +chatayt_lora_assamble_marcoroni,HFv1 MMLU,58.72,,hf_open_llm_v1_240829_frozen.csv +chatayt_lora_assamble_marcoroni,HFv1 TruthfulQA,56.12,,hf_open_llm_v1_240829_frozen.csv +chatayt_lora_assamble_marcoroni,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv +chathercules_2_5_mistral_7b_dpo,HF OpenLLM v1,69.73,,hf_open_llm_v1_240829_frozen.csv +chathercules_2_5_mistral_7b_dpo,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv +chathercules_2_5_mistral_7b_dpo,HFv1 GSM8K,67.55,,hf_open_llm_v1_240829_frozen.csv +chathercules_2_5_mistral_7b_dpo,HFv1 HellaSwag,85.4,,hf_open_llm_v1_240829_frozen.csv +chathercules_2_5_mistral_7b_dpo,HFv1 MMLU,65.17,,hf_open_llm_v1_240829_frozen.csv +chathercules_2_5_mistral_7b_dpo,HFv1 TruthfulQA,52.3,,hf_open_llm_v1_240829_frozen.csv +chathercules_2_5_mistral_7b_dpo,HFv1 Winogrande,81.93,,hf_open_llm_v1_240829_frozen.csv +chimera_7b_slerp,HF OpenLLM v1,72.84,,hf_open_llm_v1_240829_frozen.csv +chimera_7b_slerp,HFv1 ARC,70.22,,hf_open_llm_v1_240829_frozen.csv +chimera_7b_slerp,HFv1 GSM8K,68.46,,hf_open_llm_v1_240829_frozen.csv +chimera_7b_slerp,HFv1 HellaSwag,87.09,,hf_open_llm_v1_240829_frozen.csv +chimera_7b_slerp,HFv1 MMLU,64.84,,hf_open_llm_v1_240829_frozen.csv +chimera_7b_slerp,HFv1 TruthfulQA,64.43,,hf_open_llm_v1_240829_frozen.csv +chimera_7b_slerp,HFv1 Winogrande,82.0,,hf_open_llm_v1_240829_frozen.csv +chimera_7b_ties,HF OpenLLM v1,62.46,,hf_open_llm_v1_240829_frozen.csv +chimera_7b_ties,HFv1 ARC,67.06,,hf_open_llm_v1_240829_frozen.csv +chimera_7b_ties,HFv1 GSM8K,29.34,,hf_open_llm_v1_240829_frozen.csv +chimera_7b_ties,HFv1 HellaSwag,81.55,,hf_open_llm_v1_240829_frozen.csv +chimera_7b_ties,HFv1 MMLU,53.46,,hf_open_llm_v1_240829_frozen.csv +chimera_7b_ties,HFv1 TruthfulQA,64.34,,hf_open_llm_v1_240829_frozen.csv +chimera_7b_ties,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv +chimerallama3_8b,HF OpenLLM v1,68.13,,hf_open_llm_v1_240829_frozen.csv +chimerallama3_8b,HFv1 ARC,62.37,,hf_open_llm_v1_240829_frozen.csv +chimerallama3_8b,HFv1 GSM8K,67.55,,hf_open_llm_v1_240829_frozen.csv +chimerallama3_8b,HFv1 HellaSwag,81.19,,hf_open_llm_v1_240829_frozen.csv +chimerallama3_8b,HFv1 MMLU,67.62,,hf_open_llm_v1_240829_frozen.csv +chimerallama3_8b,HFv1 TruthfulQA,52.4,,hf_open_llm_v1_240829_frozen.csv +chimerallama3_8b,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_13b,HF OpenLLM v1,57.41,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_13b,HFv1 ARC,58.7,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_13b,HFv1 GSM8K,25.02,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_13b,HFv1 HellaSwag,79.76,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_13b,HFv1 MMLU,55.12,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_13b,HFv1 TruthfulQA,50.22,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_13b,HFv1 Winogrande,75.61,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_13b_16k,HF OpenLLM v1,54.12,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_13b_16k,HFv1 ARC,55.03,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_13b_16k,HFv1 GSM8K,21.08,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_13b_16k,HFv1 HellaSwag,77.41,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_13b_16k,HFv1 MMLU,51.28,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_13b_16k,HFv1 TruthfulQA,46.5,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_13b_16k,HFv1 Winogrande,73.4,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_1_3b,HF OpenLLM v1,29.34,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_1_3b,HFv1 ARC,24.49,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_1_3b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_1_3b,HFv1 HellaSwag,30.17,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_1_3b,HFv1 MMLU,25.88,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_1_3b,HFv1 TruthfulQA,44.6,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_1_3b,HFv1 Winogrande,50.91,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_1_3b_rlhf,HF OpenLLM v1,29.39,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_1_3b_rlhf,HFv1 ARC,23.89,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_1_3b_rlhf,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_1_3b_rlhf,HFv1 HellaSwag,30.01,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_1_3b_rlhf,HFv1 MMLU,26.53,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_1_3b_rlhf,HFv1 TruthfulQA,45.06,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_1_3b_rlhf,HFv1 Winogrande,50.83,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b,HF OpenLLM v1,50.21,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b,HFv1 ARC,49.57,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b,HFv1 GSM8K,13.72,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b,HFv1 HellaSwag,72.64,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b,HFv1 MMLU,46.55,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b,HFv1 TruthfulQA,48.63,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b,HFv1 Winogrande,70.17,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b_16k,HF OpenLLM v1,48.02,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b_16k,HFv1 ARC,48.46,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b_16k,HFv1 GSM8K,9.33,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b_16k,HFv1 HellaSwag,70.3,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b_16k,HFv1 MMLU,42.94,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b_16k,HFv1 TruthfulQA,48.59,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b_16k,HFv1 Winogrande,68.51,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b_rlhf,HF OpenLLM v1,50.92,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b_rlhf,HFv1 ARC,49.49,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b_rlhf,HFv1 GSM8K,15.01,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b_rlhf,HFv1 HellaSwag,72.61,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b_rlhf,HFv1 MMLU,46.29,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b_rlhf,HFv1 TruthfulQA,51.19,,hf_open_llm_v1_240829_frozen.csv +chinese_alpaca_2_7b_rlhf,HFv1 Winogrande,70.96,,hf_open_llm_v1_240829_frozen.csv +chinese_mixtral,HF OpenLLM v1,58.57,,hf_open_llm_v1_240829_frozen.csv +chinese_mixtral,HFv1 ARC,67.49,,hf_open_llm_v1_240829_frozen.csv +chinese_mixtral,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +chinese_mixtral,HFv1 HellaSwag,85.25,,hf_open_llm_v1_240829_frozen.csv +chinese_mixtral,HFv1 MMLU,70.31,,hf_open_llm_v1_240829_frozen.csv +chinese_mixtral,HFv1 TruthfulQA,46.75,,hf_open_llm_v1_240829_frozen.csv +chinese_mixtral,HFv1 Winogrande,81.61,,hf_open_llm_v1_240829_frozen.csv +chinese_mixtral_8x7b,HF OpenLLM v1,66.69,,hf_open_llm_v1_240829_frozen.csv +chinese_mixtral_8x7b,HFv1 ARC,63.57,,hf_open_llm_v1_240829_frozen.csv +chinese_mixtral_8x7b,HFv1 GSM8K,51.71,,hf_open_llm_v1_240829_frozen.csv +chinese_mixtral_8x7b,HFv1 HellaSwag,85.98,,hf_open_llm_v1_240829_frozen.csv +chinese_mixtral_8x7b,HFv1 MMLU,70.95,,hf_open_llm_v1_240829_frozen.csv +chinese_mixtral_8x7b,HFv1 TruthfulQA,45.86,,hf_open_llm_v1_240829_frozen.csv +chinese_mixtral_8x7b,HFv1 Winogrande,82.08,,hf_open_llm_v1_240829_frozen.csv +chronorctypus_limarobormes_13b,HF OpenLLM v1,55.22,,hf_open_llm_v1_240829_frozen.csv +chronorctypus_limarobormes_13b,HFv1 ARC,59.9,,hf_open_llm_v1_240829_frozen.csv +chronorctypus_limarobormes_13b,HFv1 GSM8K,3.87,,hf_open_llm_v1_240829_frozen.csv +chronorctypus_limarobormes_13b,HFv1 HellaSwag,82.75,,hf_open_llm_v1_240829_frozen.csv +chronorctypus_limarobormes_13b,HFv1 MMLU,58.45,,hf_open_llm_v1_240829_frozen.csv +chronorctypus_limarobormes_13b,HFv1 TruthfulQA,51.9,,hf_open_llm_v1_240829_frozen.csv +chronorctypus_limarobormes_13b,HFv1 Winogrande,74.43,,hf_open_llm_v1_240829_frozen.csv +chronos007_70b,HF OpenLLM v1,68.25,,hf_open_llm_v1_240829_frozen.csv +chronos007_70b,HFv1 ARC,70.14,,hf_open_llm_v1_240829_frozen.csv +chronos007_70b,HFv1 GSM8K,42.61,,hf_open_llm_v1_240829_frozen.csv +chronos007_70b,HFv1 HellaSwag,87.52,,hf_open_llm_v1_240829_frozen.csv +chronos007_70b,HFv1 MMLU,69.33,,hf_open_llm_v1_240829_frozen.csv +chronos007_70b,HFv1 TruthfulQA,57.65,,hf_open_llm_v1_240829_frozen.csv +chronos007_70b,HFv1 Winogrande,82.24,,hf_open_llm_v1_240829_frozen.csv +code_millenials_34b,HF OpenLLM v1,53.51,,hf_open_llm_v1_240829_frozen.csv +code_millenials_34b,HFv1 ARC,49.83,,hf_open_llm_v1_240829_frozen.csv +code_millenials_34b,HFv1 GSM8K,32.45,,hf_open_llm_v1_240829_frozen.csv +code_millenials_34b,HFv1 HellaSwag,75.09,,hf_open_llm_v1_240829_frozen.csv +code_millenials_34b,HFv1 MMLU,49.28,,hf_open_llm_v1_240829_frozen.csv +code_millenials_34b,HFv1 TruthfulQA,45.37,,hf_open_llm_v1_240829_frozen.csv +code_millenials_34b,HFv1 Winogrande,69.06,,hf_open_llm_v1_240829_frozen.csv +codegen_16b_nl,HF OpenLLM v1,42.59,,hf_open_llm_v1_240829_frozen.csv +codegen_16b_nl,HFv1 ARC,46.76,,hf_open_llm_v1_240829_frozen.csv +codegen_16b_nl,HFv1 GSM8K,2.65,,hf_open_llm_v1_240829_frozen.csv +codegen_16b_nl,HFv1 HellaSwag,71.87,,hf_open_llm_v1_240829_frozen.csv +codegen_16b_nl,HFv1 MMLU,32.35,,hf_open_llm_v1_240829_frozen.csv +codegen_16b_nl,HFv1 TruthfulQA,33.95,,hf_open_llm_v1_240829_frozen.csv +codegen_16b_nl,HFv1 Winogrande,67.96,,hf_open_llm_v1_240829_frozen.csv +codegen_6b_multi,HF OpenLLM v1,32.43,,hf_open_llm_v1_240829_frozen.csv +codegen_6b_multi,HFv1 ARC,27.22,,hf_open_llm_v1_240829_frozen.csv +codegen_6b_multi,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv +codegen_6b_multi,HFv1 HellaSwag,41.11,,hf_open_llm_v1_240829_frozen.csv +codegen_6b_multi,HFv1 MMLU,25.71,,hf_open_llm_v1_240829_frozen.csv +codegen_6b_multi,HFv1 TruthfulQA,45.65,,hf_open_llm_v1_240829_frozen.csv +codegen_6b_multi,HFv1 Winogrande,53.91,,hf_open_llm_v1_240829_frozen.csv +codegen_6b_nl,HF OpenLLM v1,40.0,,hf_open_llm_v1_240829_frozen.csv +codegen_6b_nl,HFv1 ARC,42.32,,hf_open_llm_v1_240829_frozen.csv +codegen_6b_nl,HFv1 GSM8K,2.2,,hf_open_llm_v1_240829_frozen.csv +codegen_6b_nl,HFv1 HellaSwag,68.59,,hf_open_llm_v1_240829_frozen.csv +codegen_6b_nl,HFv1 MMLU,25.93,,hf_open_llm_v1_240829_frozen.csv +codegen_6b_nl,HFv1 TruthfulQA,34.47,,hf_open_llm_v1_240829_frozen.csv +codegen_6b_nl,HFv1 Winogrande,66.46,,hf_open_llm_v1_240829_frozen.csv +codellama34b,HF OpenLLM v1,55.28,,hf_open_llm_v1_240829_frozen.csv +codellama34b,HFv1 ARC,54.18,,hf_open_llm_v1_240829_frozen.csv +codellama34b,HFv1 GSM8K,34.34,,hf_open_llm_v1_240829_frozen.csv +codellama34b,HFv1 HellaSwag,75.82,,hf_open_llm_v1_240829_frozen.csv +codellama34b,HFv1 MMLU,54.92,,hf_open_llm_v1_240829_frozen.csv +codellama34b,HFv1 TruthfulQA,39.11,,hf_open_llm_v1_240829_frozen.csv +codellama34b,HFv1 Winogrande,73.32,,hf_open_llm_v1_240829_frozen.csv +codellama34b_instruct,HF OpenLLM v1,44.33,,hf_open_llm_v1_240829_frozen.csv +codellama34b_instruct,HFv1 ARC,40.78,,hf_open_llm_v1_240829_frozen.csv +codellama34b_instruct,HFv1 GSM8K,31.01,,hf_open_llm_v1_240829_frozen.csv +codellama34b_instruct,HFv1 HellaSwag,35.66,,hf_open_llm_v1_240829_frozen.csv +codellama34b_instruct,HFv1 MMLU,39.72,,hf_open_llm_v1_240829_frozen.csv +codellama34b_instruct,HFv1 TruthfulQA,44.29,,hf_open_llm_v1_240829_frozen.csv +codellama34b_instruct,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv +codellama34b_instruct_fp16,HF OpenLLM v1,43.0,,hf_open_llm_v1_240829_frozen.csv +codellama34b_instruct_fp16,HFv1 ARC,40.78,,hf_open_llm_v1_240829_frozen.csv +codellama34b_instruct_fp16,HFv1 GSM8K,23.05,,hf_open_llm_v1_240829_frozen.csv +codellama34b_instruct_fp16,HFv1 HellaSwag,35.66,,hf_open_llm_v1_240829_frozen.csv +codellama34b_instruct_fp16,HFv1 MMLU,39.72,,hf_open_llm_v1_240829_frozen.csv +codellama34b_instruct_fp16,HFv1 TruthfulQA,44.29,,hf_open_llm_v1_240829_frozen.csv +codellama34b_instruct_fp16,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv +codellama34b_python,HF OpenLLM v1,40.27,,hf_open_llm_v1_240829_frozen.csv +codellama34b_python,HFv1 ARC,40.19,,hf_open_llm_v1_240829_frozen.csv +codellama34b_python,HFv1 GSM8K,14.33,,hf_open_llm_v1_240829_frozen.csv +codellama34b_python,HFv1 HellaSwag,36.82,,hf_open_llm_v1_240829_frozen.csv +codellama34b_python,HFv1 MMLU,34.79,,hf_open_llm_v1_240829_frozen.csv +codellama34b_python,HFv1 TruthfulQA,44.28,,hf_open_llm_v1_240829_frozen.csv +codellama34b_python,HFv1 Winogrande,71.19,,hf_open_llm_v1_240829_frozen.csv +codellama34b_python_fp16,HF OpenLLM v1,40.27,,hf_open_llm_v1_240829_frozen.csv +codellama34b_python_fp16,HFv1 ARC,38.14,,hf_open_llm_v1_240829_frozen.csv +codellama34b_python_fp16,HFv1 GSM8K,20.02,,hf_open_llm_v1_240829_frozen.csv +codellama34b_python_fp16,HFv1 HellaSwag,34.8,,hf_open_llm_v1_240829_frozen.csv +codellama34b_python_fp16,HFv1 MMLU,32.95,,hf_open_llm_v1_240829_frozen.csv +codellama34b_python_fp16,HFv1 TruthfulQA,43.57,,hf_open_llm_v1_240829_frozen.csv +codellama34b_python_fp16,HFv1 Winogrande,72.14,,hf_open_llm_v1_240829_frozen.csv +codellama_13b,HF OpenLLM v1,43.35,,hf_open_llm_v1_240829_frozen.csv +codellama_13b,HFv1 ARC,40.87,,hf_open_llm_v1_240829_frozen.csv +codellama_13b,HFv1 GSM8K,12.13,,hf_open_llm_v1_240829_frozen.csv +codellama_13b,HFv1 HellaSwag,63.35,,hf_open_llm_v1_240829_frozen.csv +codellama_13b,HFv1 MMLU,32.81,,hf_open_llm_v1_240829_frozen.csv +codellama_13b,HFv1 TruthfulQA,43.79,,hf_open_llm_v1_240829_frozen.csv +codellama_13b,HFv1 Winogrande,67.17,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_instruct,HF OpenLLM v1,45.82,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_instruct,HFv1 ARC,44.54,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_instruct,HFv1 GSM8K,12.66,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_instruct,HFv1 HellaSwag,64.93,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_instruct,HFv1 MMLU,38.89,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_instruct,HFv1 TruthfulQA,45.88,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_instruct,HFv1 Winogrande,68.03,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_instruct_fp16,HF OpenLLM v1,45.82,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_instruct_fp16,HFv1 ARC,44.62,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_instruct_fp16,HFv1 GSM8K,12.66,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_instruct_fp16,HFv1 HellaSwag,64.94,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_instruct_fp16,HFv1 MMLU,38.77,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_instruct_fp16,HFv1 TruthfulQA,45.88,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_instruct_fp16,HFv1 Winogrande,68.03,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_oasst_sft_v10,HF OpenLLM v1,44.85,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_oasst_sft_v10,HFv1 ARC,45.39,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_oasst_sft_v10,HFv1 GSM8K,13.19,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_oasst_sft_v10,HFv1 HellaSwag,62.36,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_oasst_sft_v10,HFv1 MMLU,35.36,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_oasst_sft_v10,HFv1 TruthfulQA,45.02,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_oasst_sft_v10,HFv1 Winogrande,67.8,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_python,HF OpenLLM v1,37.0,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_python,HFv1 ARC,32.59,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_python,HFv1 GSM8K,8.64,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_python,HFv1 HellaSwag,43.94,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_python,HFv1 MMLU,27.23,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_python,HFv1 TruthfulQA,44.59,,hf_open_llm_v1_240829_frozen.csv +codellama_13b_python,HFv1 Winogrande,65.04,,hf_open_llm_v1_240829_frozen.csv +codellama_70b,HF OpenLLM v1,58.93,,hf_open_llm_v1_240829_frozen.csv +codellama_70b,HFv1 ARC,56.74,,hf_open_llm_v1_240829_frozen.csv +codellama_70b,HFv1 GSM8K,43.97,,hf_open_llm_v1_240829_frozen.csv +codellama_70b,HFv1 HellaSwag,78.21,,hf_open_llm_v1_240829_frozen.csv +codellama_70b,HFv1 MMLU,59.67,,hf_open_llm_v1_240829_frozen.csv +codellama_70b,HFv1 TruthfulQA,39.79,,hf_open_llm_v1_240829_frozen.csv +codellama_70b,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv +codellama_70b_instruct,HF OpenLLM v1,59.98,,hf_open_llm_v1_240829_frozen.csv +codellama_70b_instruct,HFv1 ARC,55.03,,hf_open_llm_v1_240829_frozen.csv +codellama_70b_instruct,HFv1 GSM8K,46.25,,hf_open_llm_v1_240829_frozen.csv +codellama_70b_instruct,HFv1 HellaSwag,77.24,,hf_open_llm_v1_240829_frozen.csv +codellama_70b_instruct,HFv1 MMLU,56.4,,hf_open_llm_v1_240829_frozen.csv +codellama_70b_instruct,HFv1 TruthfulQA,50.44,,hf_open_llm_v1_240829_frozen.csv +codellama_70b_instruct,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv +codellama_70b_python,HF OpenLLM v1,58.0,,hf_open_llm_v1_240829_frozen.csv +codellama_70b_python,HFv1 ARC,55.12,,hf_open_llm_v1_240829_frozen.csv +codellama_70b_python,HFv1 GSM8K,43.44,,hf_open_llm_v1_240829_frozen.csv +codellama_70b_python,HFv1 HellaSwag,78.48,,hf_open_llm_v1_240829_frozen.csv +codellama_70b_python,HFv1 MMLU,56.17,,hf_open_llm_v1_240829_frozen.csv +codellama_70b_python,HFv1 TruthfulQA,41.78,,hf_open_llm_v1_240829_frozen.csv +codellama_70b_python,HFv1 Winogrande,73.01,,hf_open_llm_v1_240829_frozen.csv +codellama_7b,HF OpenLLM v1,39.81,,hf_open_llm_v1_240829_frozen.csv +codellama_7b,HFv1 ARC,39.93,,hf_open_llm_v1_240829_frozen.csv +codellama_7b,HFv1 GSM8K,5.16,,hf_open_llm_v1_240829_frozen.csv +codellama_7b,HFv1 HellaSwag,60.8,,hf_open_llm_v1_240829_frozen.csv +codellama_7b,HFv1 MMLU,31.12,,hf_open_llm_v1_240829_frozen.csv +codellama_7b,HFv1 TruthfulQA,37.82,,hf_open_llm_v1_240829_frozen.csv +codellama_7b,HFv1 Winogrande,64.01,,hf_open_llm_v1_240829_frozen.csv +codellama_7b_instruct,HF OpenLLM v1,40.05,,hf_open_llm_v1_240829_frozen.csv +codellama_7b_instruct,HFv1 ARC,36.52,,hf_open_llm_v1_240829_frozen.csv +codellama_7b_instruct,HFv1 GSM8K,7.96,,hf_open_llm_v1_240829_frozen.csv +codellama_7b_instruct,HFv1 HellaSwag,55.44,,hf_open_llm_v1_240829_frozen.csv +codellama_7b_instruct,HFv1 MMLU,34.54,,hf_open_llm_v1_240829_frozen.csv +codellama_7b_instruct,HFv1 TruthfulQA,41.25,,hf_open_llm_v1_240829_frozen.csv +codellama_7b_instruct,HFv1 Winogrande,64.56,,hf_open_llm_v1_240829_frozen.csv +codellama_7b_python,HF OpenLLM v1,36.89,,hf_open_llm_v1_240829_frozen.csv +codellama_7b_python,HFv1 ARC,31.31,,hf_open_llm_v1_240829_frozen.csv +codellama_7b_python,HFv1 GSM8K,5.16,,hf_open_llm_v1_240829_frozen.csv +codellama_7b_python,HFv1 HellaSwag,52.86,,hf_open_llm_v1_240829_frozen.csv +codellama_7b_python,HFv1 MMLU,28.37,,hf_open_llm_v1_240829_frozen.csv +codellama_7b_python,HFv1 TruthfulQA,42.21,,hf_open_llm_v1_240829_frozen.csv +codellama_7b_python,HFv1 Winogrande,64.01,,hf_open_llm_v1_240829_frozen.csv +codeparrot,HF OpenLLM v1,29.48,,hf_open_llm_v1_240829_frozen.csv +codeparrot,HFv1 ARC,21.67,,hf_open_llm_v1_240829_frozen.csv +codeparrot,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +codeparrot,HFv1 HellaSwag,28.34,,hf_open_llm_v1_240829_frozen.csv +codeparrot,HFv1 MMLU,25.55,,hf_open_llm_v1_240829_frozen.csv +codeparrot,HFv1 TruthfulQA,50.87,,hf_open_llm_v1_240829_frozen.csv +codeparrot,HFv1 Winogrande,50.2,,hf_open_llm_v1_240829_frozen.csv +codestral_22b_v0_1,HF OpenLLM v1,66.7,,hf_open_llm_v1_240829_frozen.csv +codestral_22b_v0_1,HFv1 ARC,62.54,,hf_open_llm_v1_240829_frozen.csv +codestral_22b_v0_1,HFv1 GSM8K,62.02,,hf_open_llm_v1_240829_frozen.csv +codestral_22b_v0_1,HFv1 HellaSwag,81.76,,hf_open_llm_v1_240829_frozen.csv +codestral_22b_v0_1,HFv1 MMLU,62.21,,hf_open_llm_v1_240829_frozen.csv +codestral_22b_v0_1,HFv1 TruthfulQA,56.7,,hf_open_llm_v1_240829_frozen.csv +codestral_22b_v0_1,HFv1 Winogrande,74.98,,hf_open_llm_v1_240829_frozen.csv +cognate_7b_slerp,HF OpenLLM v1,72.96,,hf_open_llm_v1_240829_frozen.csv +cognate_7b_slerp,HFv1 ARC,70.48,,hf_open_llm_v1_240829_frozen.csv +cognate_7b_slerp,HFv1 GSM8K,67.4,,hf_open_llm_v1_240829_frozen.csv +cognate_7b_slerp,HFv1 HellaSwag,87.33,,hf_open_llm_v1_240829_frozen.csv +cognate_7b_slerp,HFv1 MMLU,64.85,,hf_open_llm_v1_240829_frozen.csv +cognate_7b_slerp,HFv1 TruthfulQA,65.16,,hf_open_llm_v1_240829_frozen.csv +cognate_7b_slerp,HFv1 Winogrande,82.56,,hf_open_llm_v1_240829_frozen.csv +cokal_v1_70b,HF OpenLLM v1,71.87,,hf_open_llm_v1_240829_frozen.csv +cokal_v1_70b,HFv1 ARC,87.46,,hf_open_llm_v1_240829_frozen.csv +cokal_v1_70b,HFv1 GSM8K,39.27,,hf_open_llm_v1_240829_frozen.csv +cokal_v1_70b,HFv1 HellaSwag,83.29,,hf_open_llm_v1_240829_frozen.csv +cokal_v1_70b,HFv1 MMLU,68.13,,hf_open_llm_v1_240829_frozen.csv +cokal_v1_70b,HFv1 TruthfulQA,72.79,,hf_open_llm_v1_240829_frozen.csv +cokal_v1_70b,HFv1 Winogrande,80.27,,hf_open_llm_v1_240829_frozen.csv +collectivecognition_v1_1_mistral_7b,HF OpenLLM v1,62.92,,hf_open_llm_v1_240829_frozen.csv +collectivecognition_v1_1_mistral_7b,HFv1 ARC,62.12,,hf_open_llm_v1_240829_frozen.csv +collectivecognition_v1_1_mistral_7b,HFv1 GSM8K,35.86,,hf_open_llm_v1_240829_frozen.csv +collectivecognition_v1_1_mistral_7b,HFv1 HellaSwag,84.17,,hf_open_llm_v1_240829_frozen.csv +collectivecognition_v1_1_mistral_7b,HFv1 MMLU,62.35,,hf_open_llm_v1_240829_frozen.csv +collectivecognition_v1_1_mistral_7b,HFv1 TruthfulQA,57.62,,hf_open_llm_v1_240829_frozen.csv +collectivecognition_v1_1_mistral_7b,HFv1 Winogrande,75.37,,hf_open_llm_v1_240829_frozen.csv +collectivecognition_v1_mistral_7b,HF OpenLLM v1,60.1,,hf_open_llm_v1_240829_frozen.csv +collectivecognition_v1_mistral_7b,HFv1 ARC,62.37,,hf_open_llm_v1_240829_frozen.csv +collectivecognition_v1_mistral_7b,HFv1 GSM8K,17.89,,hf_open_llm_v1_240829_frozen.csv +collectivecognition_v1_mistral_7b,HFv1 HellaSwag,85.5,,hf_open_llm_v1_240829_frozen.csv +collectivecognition_v1_mistral_7b,HFv1 MMLU,62.76,,hf_open_llm_v1_240829_frozen.csv +collectivecognition_v1_mistral_7b,HFv1 TruthfulQA,54.48,,hf_open_llm_v1_240829_frozen.csv +collectivecognition_v1_mistral_7b,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv +complect_7b_slerp,HF OpenLLM v1,74.91,,hf_open_llm_v1_240829_frozen.csv +complect_7b_slerp,HFv1 ARC,72.27,,hf_open_llm_v1_240829_frozen.csv +complect_7b_slerp,HFv1 GSM8K,68.46,,hf_open_llm_v1_240829_frozen.csv +complect_7b_slerp,HFv1 HellaSwag,88.19,,hf_open_llm_v1_240829_frozen.csv +complect_7b_slerp,HFv1 MMLU,64.89,,hf_open_llm_v1_240829_frozen.csv +complect_7b_slerp,HFv1 TruthfulQA,71.14,,hf_open_llm_v1_240829_frozen.csv +complect_7b_slerp,HFv1 Winogrande,84.53,,hf_open_llm_v1_240829_frozen.csv +complectmaid_7b_slerp,HF OpenLLM v1,72.63,,hf_open_llm_v1_240829_frozen.csv +complectmaid_7b_slerp,HFv1 ARC,69.97,,hf_open_llm_v1_240829_frozen.csv +complectmaid_7b_slerp,HFv1 GSM8K,65.88,,hf_open_llm_v1_240829_frozen.csv +complectmaid_7b_slerp,HFv1 HellaSwag,87.34,,hf_open_llm_v1_240829_frozen.csv +complectmaid_7b_slerp,HFv1 MMLU,64.62,,hf_open_llm_v1_240829_frozen.csv +complectmaid_7b_slerp,HFv1 TruthfulQA,65.88,,hf_open_llm_v1_240829_frozen.csv +complectmaid_7b_slerp,HFv1 Winogrande,82.08,,hf_open_llm_v1_240829_frozen.csv +configurable_hermes_2_pro_llama3_8b,HF OpenLLM v1,70.1,,hf_open_llm_v1_240829_frozen.csv +configurable_hermes_2_pro_llama3_8b,HFv1 ARC,63.82,,hf_open_llm_v1_240829_frozen.csv +configurable_hermes_2_pro_llama3_8b,HFv1 GSM8K,68.46,,hf_open_llm_v1_240829_frozen.csv +configurable_hermes_2_pro_llama3_8b,HFv1 HellaSwag,83.49,,hf_open_llm_v1_240829_frozen.csv +configurable_hermes_2_pro_llama3_8b,HFv1 MMLU,65.23,,hf_open_llm_v1_240829_frozen.csv +configurable_hermes_2_pro_llama3_8b,HFv1 TruthfulQA,62.51,,hf_open_llm_v1_240829_frozen.csv +configurable_hermes_2_pro_llama3_8b,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv +configurable_llama3_8b_v0_1,HF OpenLLM v1,68.3,,hf_open_llm_v1_240829_frozen.csv +configurable_llama3_8b_v0_1,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv +configurable_llama3_8b_v0_1,HFv1 GSM8K,69.52,,hf_open_llm_v1_240829_frozen.csv +configurable_llama3_8b_v0_1,HFv1 HellaSwag,79.51,,hf_open_llm_v1_240829_frozen.csv +configurable_llama3_8b_v0_1,HFv1 MMLU,67.18,,hf_open_llm_v1_240829_frozen.csv +configurable_llama3_8b_v0_1,HFv1 TruthfulQA,56.16,,hf_open_llm_v1_240829_frozen.csv +configurable_llama3_8b_v0_1,HFv1 Winogrande,74.98,,hf_open_llm_v1_240829_frozen.csv +configurable_llama3_8b_v0_2,HF OpenLLM v1,68.58,,hf_open_llm_v1_240829_frozen.csv +configurable_llama3_8b_v0_2,HFv1 ARC,62.54,,hf_open_llm_v1_240829_frozen.csv +configurable_llama3_8b_v0_2,HFv1 GSM8K,70.2,,hf_open_llm_v1_240829_frozen.csv +configurable_llama3_8b_v0_2,HFv1 HellaSwag,79.77,,hf_open_llm_v1_240829_frozen.csv +configurable_llama3_8b_v0_2,HFv1 MMLU,67.02,,hf_open_llm_v1_240829_frozen.csv +configurable_llama3_8b_v0_2,HFv1 TruthfulQA,56.79,,hf_open_llm_v1_240829_frozen.csv +configurable_llama3_8b_v0_2,HFv1 Winogrande,75.14,,hf_open_llm_v1_240829_frozen.csv +configurable_mistral_22b,HF OpenLLM v1,53.91,,hf_open_llm_v1_240829_frozen.csv +configurable_mistral_22b,HFv1 ARC,54.01,,hf_open_llm_v1_240829_frozen.csv +configurable_mistral_22b,HFv1 GSM8K,12.05,,hf_open_llm_v1_240829_frozen.csv +configurable_mistral_22b,HFv1 HellaSwag,75.31,,hf_open_llm_v1_240829_frozen.csv +configurable_mistral_22b,HFv1 MMLU,51.07,,hf_open_llm_v1_240829_frozen.csv +configurable_mistral_22b,HFv1 TruthfulQA,55.66,,hf_open_llm_v1_240829_frozen.csv +configurable_mistral_22b,HFv1 Winogrande,75.37,,hf_open_llm_v1_240829_frozen.csv +configurable_yi_1_5_9b_chat,HF OpenLLM v1,70.5,,hf_open_llm_v1_240829_frozen.csv +configurable_yi_1_5_9b_chat,HFv1 ARC,64.16,,hf_open_llm_v1_240829_frozen.csv +configurable_yi_1_5_9b_chat,HFv1 GSM8K,70.58,,hf_open_llm_v1_240829_frozen.csv +configurable_yi_1_5_9b_chat,HFv1 HellaSwag,81.7,,hf_open_llm_v1_240829_frozen.csv +configurable_yi_1_5_9b_chat,HFv1 MMLU,70.99,,hf_open_llm_v1_240829_frozen.csv +configurable_yi_1_5_9b_chat,HFv1 TruthfulQA,58.75,,hf_open_llm_v1_240829_frozen.csv +configurable_yi_1_5_9b_chat,HFv1 Winogrande,76.8,,hf_open_llm_v1_240829_frozen.csv +configurablebeagle_11b,HF OpenLLM v1,75.4,,hf_open_llm_v1_240829_frozen.csv +configurablebeagle_11b,HFv1 ARC,72.53,,hf_open_llm_v1_240829_frozen.csv +configurablebeagle_11b,HFv1 GSM8K,63.91,,hf_open_llm_v1_240829_frozen.csv +configurablebeagle_11b,HFv1 HellaSwag,88.85,,hf_open_llm_v1_240829_frozen.csv +configurablebeagle_11b,HFv1 MMLU,66.71,,hf_open_llm_v1_240829_frozen.csv +configurablebeagle_11b,HFv1 TruthfulQA,77.13,,hf_open_llm_v1_240829_frozen.csv +configurablebeagle_11b,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv +configurablehermes_7b,HF OpenLLM v1,68.89,,hf_open_llm_v1_240829_frozen.csv +configurablehermes_7b,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv +configurablehermes_7b,HFv1 GSM8K,61.41,,hf_open_llm_v1_240829_frozen.csv +configurablehermes_7b,HFv1 HellaSwag,84.31,,hf_open_llm_v1_240829_frozen.csv +configurablehermes_7b,HFv1 MMLU,62.44,,hf_open_llm_v1_240829_frozen.csv +configurablehermes_7b,HFv1 TruthfulQA,61.71,,hf_open_llm_v1_240829_frozen.csv +configurablehermes_7b,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv +configurablesolar_10_7b,HF OpenLLM v1,73.94,,hf_open_llm_v1_240829_frozen.csv +configurablesolar_10_7b,HFv1 ARC,70.39,,hf_open_llm_v1_240829_frozen.csv +configurablesolar_10_7b,HFv1 GSM8K,63.38,,hf_open_llm_v1_240829_frozen.csv +configurablesolar_10_7b,HFv1 HellaSwag,88.03,,hf_open_llm_v1_240829_frozen.csv +configurablesolar_10_7b,HFv1 MMLU,66.44,,hf_open_llm_v1_240829_frozen.csv +configurablesolar_10_7b,HFv1 TruthfulQA,72.34,,hf_open_llm_v1_240829_frozen.csv +configurablesolar_10_7b,HFv1 Winogrande,83.03,,hf_open_llm_v1_240829_frozen.csv +connate_7b_slerp,HF OpenLLM v1,74.8,,hf_open_llm_v1_240829_frozen.csv +connate_7b_slerp,HFv1 ARC,72.1,,hf_open_llm_v1_240829_frozen.csv +connate_7b_slerp,HFv1 GSM8K,67.63,,hf_open_llm_v1_240829_frozen.csv +connate_7b_slerp,HFv1 HellaSwag,88.37,,hf_open_llm_v1_240829_frozen.csv +connate_7b_slerp,HFv1 MMLU,64.96,,hf_open_llm_v1_240829_frozen.csv +connate_7b_slerp,HFv1 TruthfulQA,71.16,,hf_open_llm_v1_240829_frozen.csv +connate_7b_slerp,HFv1 Winogrande,84.61,,hf_open_llm_v1_240829_frozen.csv +contaminated_proof_7b_v1_0,HF OpenLLM v1,81.14,,hf_open_llm_v1_240829_frozen.csv +contaminated_proof_7b_v1_0,HFv1 ARC,78.07,,hf_open_llm_v1_240829_frozen.csv +contaminated_proof_7b_v1_0,HFv1 GSM8K,69.14,,hf_open_llm_v1_240829_frozen.csv +contaminated_proof_7b_v1_0,HFv1 HellaSwag,90.22,,hf_open_llm_v1_240829_frozen.csv +contaminated_proof_7b_v1_0,HFv1 MMLU,78.92,,hf_open_llm_v1_240829_frozen.csv +contaminated_proof_7b_v1_0,HFv1 TruthfulQA,82.29,,hf_open_llm_v1_240829_frozen.csv +contaminated_proof_7b_v1_0,HFv1 Winogrande,88.16,,hf_open_llm_v1_240829_frozen.csv +contaminated_proof_7b_v1_0_safetensor,HF OpenLLM v1,81.14,,hf_open_llm_v1_240829_frozen.csv +contaminated_proof_7b_v1_0_safetensor,HFv1 ARC,78.07,,hf_open_llm_v1_240829_frozen.csv +contaminated_proof_7b_v1_0_safetensor,HFv1 GSM8K,69.14,,hf_open_llm_v1_240829_frozen.csv +contaminated_proof_7b_v1_0_safetensor,HFv1 HellaSwag,90.22,,hf_open_llm_v1_240829_frozen.csv +contaminated_proof_7b_v1_0_safetensor,HFv1 MMLU,78.92,,hf_open_llm_v1_240829_frozen.csv +contaminated_proof_7b_v1_0_safetensor,HFv1 TruthfulQA,82.29,,hf_open_llm_v1_240829_frozen.csv +contaminated_proof_7b_v1_0_safetensor,HFv1 Winogrande,88.16,,hf_open_llm_v1_240829_frozen.csv +contextual_kto_mistral_pairrm,HF OpenLLM v1,65.26,,hf_open_llm_v1_240829_frozen.csv +contextual_kto_mistral_pairrm,HFv1 ARC,64.76,,hf_open_llm_v1_240829_frozen.csv +contextual_kto_mistral_pairrm,HFv1 GSM8K,33.81,,hf_open_llm_v1_240829_frozen.csv +contextual_kto_mistral_pairrm,HFv1 HellaSwag,85.52,,hf_open_llm_v1_240829_frozen.csv +contextual_kto_mistral_pairrm,HFv1 MMLU,60.28,,hf_open_llm_v1_240829_frozen.csv +contextual_kto_mistral_pairrm,HFv1 TruthfulQA,71.67,,hf_open_llm_v1_240829_frozen.csv +contextual_kto_mistral_pairrm,HFv1 Winogrande,75.53,,hf_open_llm_v1_240829_frozen.csv +cosmo_1b,HF OpenLLM v1,36.59,,hf_open_llm_v1_240829_frozen.csv +cosmo_1b,HFv1 ARC,38.57,,hf_open_llm_v1_240829_frozen.csv +cosmo_1b,HFv1 GSM8K,5.53,,hf_open_llm_v1_240829_frozen.csv +cosmo_1b,HFv1 HellaSwag,55.13,,hf_open_llm_v1_240829_frozen.csv +cosmo_1b,HFv1 MMLU,26.69,,hf_open_llm_v1_240829_frozen.csv +cosmo_1b,HFv1 TruthfulQA,38.26,,hf_open_llm_v1_240829_frozen.csv +cosmo_1b,HFv1 Winogrande,55.49,,hf_open_llm_v1_240829_frozen.csv +coven_7b_128k_orpo_alpha,HF OpenLLM v1,71.06,,hf_open_llm_v1_240829_frozen.csv +coven_7b_128k_orpo_alpha,HFv1 ARC,67.41,,hf_open_llm_v1_240829_frozen.csv +coven_7b_128k_orpo_alpha,HFv1 GSM8K,68.39,,hf_open_llm_v1_240829_frozen.csv +coven_7b_128k_orpo_alpha,HFv1 HellaSwag,85.53,,hf_open_llm_v1_240829_frozen.csv +coven_7b_128k_orpo_alpha,HFv1 MMLU,65.76,,hf_open_llm_v1_240829_frozen.csv +coven_7b_128k_orpo_alpha,HFv1 TruthfulQA,57.73,,hf_open_llm_v1_240829_frozen.csv +coven_7b_128k_orpo_alpha,HFv1 Winogrande,81.53,,hf_open_llm_v1_240829_frozen.csv +coven_tiny_1_1b_32k_orpo_alpha,HF OpenLLM v1,40.71,,hf_open_llm_v1_240829_frozen.csv +coven_tiny_1_1b_32k_orpo_alpha,HFv1 ARC,37.2,,hf_open_llm_v1_240829_frozen.csv +coven_tiny_1_1b_32k_orpo_alpha,HFv1 GSM8K,14.03,,hf_open_llm_v1_240829_frozen.csv +coven_tiny_1_1b_32k_orpo_alpha,HFv1 HellaSwag,53.71,,hf_open_llm_v1_240829_frozen.csv +coven_tiny_1_1b_32k_orpo_alpha,HFv1 MMLU,38.53,,hf_open_llm_v1_240829_frozen.csv +coven_tiny_1_1b_32k_orpo_alpha,HFv1 TruthfulQA,42.2,,hf_open_llm_v1_240829_frozen.csv +coven_tiny_1_1b_32k_orpo_alpha,HFv1 Winogrande,58.56,,hf_open_llm_v1_240829_frozen.csv +cr_model_v1,HF OpenLLM v1,77.32,,hf_open_llm_v1_240829_frozen.csv +cr_model_v1,HFv1 ARC,70.65,,hf_open_llm_v1_240829_frozen.csv +cr_model_v1,HFv1 GSM8K,66.57,,hf_open_llm_v1_240829_frozen.csv +cr_model_v1,HFv1 HellaSwag,87.85,,hf_open_llm_v1_240829_frozen.csv +cr_model_v1,HFv1 MMLU,74.73,,hf_open_llm_v1_240829_frozen.csv +cr_model_v1,HFv1 TruthfulQA,80.47,,hf_open_llm_v1_240829_frozen.csv +cr_model_v1,HFv1 Winogrande,83.66,,hf_open_llm_v1_240829_frozen.csv +cria_llama2_7b_v1_3,HF OpenLLM v1,50.93,,hf_open_llm_v1_240829_frozen.csv +cria_llama2_7b_v1_3,HFv1 ARC,52.73,,hf_open_llm_v1_240829_frozen.csv +cria_llama2_7b_v1_3,HFv1 GSM8K,8.49,,hf_open_llm_v1_240829_frozen.csv +cria_llama2_7b_v1_3,HFv1 HellaSwag,78.58,,hf_open_llm_v1_240829_frozen.csv +cria_llama2_7b_v1_3,HFv1 MMLU,48.3,,hf_open_llm_v1_240829_frozen.csv +cria_llama2_7b_v1_3,HFv1 TruthfulQA,45.58,,hf_open_llm_v1_240829_frozen.csv +cria_llama2_7b_v1_3,HFv1 Winogrande,71.9,,hf_open_llm_v1_240829_frozen.csv +cria_llama2_7b_v1_3_peft,HF OpenLLM v1,49.72,,hf_open_llm_v1_240829_frozen.csv +cria_llama2_7b_v1_3_peft,HFv1 ARC,51.45,,hf_open_llm_v1_240829_frozen.csv +cria_llama2_7b_v1_3_peft,HFv1 GSM8K,6.75,,hf_open_llm_v1_240829_frozen.csv +cria_llama2_7b_v1_3_peft,HFv1 HellaSwag,77.35,,hf_open_llm_v1_240829_frozen.csv +cria_llama2_7b_v1_3_peft,HFv1 MMLU,46.47,,hf_open_llm_v1_240829_frozen.csv +cria_llama2_7b_v1_3_peft,HFv1 TruthfulQA,45.52,,hf_open_llm_v1_240829_frozen.csv +cria_llama2_7b_v1_3_peft,HFv1 Winogrande,70.8,,hf_open_llm_v1_240829_frozen.csv +croissantcool_v0_2,HF OpenLLM v1,34.45,,hf_open_llm_v1_240829_frozen.csv +croissantcool_v0_2,HFv1 ARC,31.83,,hf_open_llm_v1_240829_frozen.csv +croissantcool_v0_2,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +croissantcool_v0_2,HFv1 HellaSwag,54.58,,hf_open_llm_v1_240829_frozen.csv +croissantcool_v0_2,HFv1 MMLU,24.54,,hf_open_llm_v1_240829_frozen.csv +croissantcool_v0_2,HFv1 TruthfulQA,39.34,,hf_open_llm_v1_240829_frozen.csv +croissantcool_v0_2,HFv1 Winogrande,56.43,,hf_open_llm_v1_240829_frozen.csv +croissantllmbase,HF OpenLLM v1,34.41,,hf_open_llm_v1_240829_frozen.csv +croissantllmbase,HFv1 ARC,31.57,,hf_open_llm_v1_240829_frozen.csv +croissantllmbase,HFv1 GSM8K,0.83,,hf_open_llm_v1_240829_frozen.csv +croissantllmbase,HFv1 HellaSwag,54.18,,hf_open_llm_v1_240829_frozen.csv +croissantllmbase,HFv1 MMLU,25.72,,hf_open_llm_v1_240829_frozen.csv +croissantllmbase,HFv1 TruthfulQA,37.49,,hf_open_llm_v1_240829_frozen.csv +croissantllmbase,HFv1 Winogrande,57.46,,hf_open_llm_v1_240829_frozen.csv +cross_lingual_epoch2,HF OpenLLM v1,38.97,,hf_open_llm_v1_240829_frozen.csv +cross_lingual_epoch2,HFv1 ARC,39.25,,hf_open_llm_v1_240829_frozen.csv +cross_lingual_epoch2,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +cross_lingual_epoch2,HFv1 HellaSwag,47.92,,hf_open_llm_v1_240829_frozen.csv +cross_lingual_epoch2,HFv1 MMLU,36.66,,hf_open_llm_v1_240829_frozen.csv +cross_lingual_epoch2,HFv1 TruthfulQA,47.9,,hf_open_llm_v1_240829_frozen.csv +cross_lingual_epoch2,HFv1 Winogrande,62.12,,hf_open_llm_v1_240829_frozen.csv +crow_1b,HF OpenLLM v1,29.12,,hf_open_llm_v1_240829_frozen.csv +crow_1b,HFv1 ARC,25.51,,hf_open_llm_v1_240829_frozen.csv +crow_1b,HFv1 GSM8K,0.83,,hf_open_llm_v1_240829_frozen.csv +crow_1b,HFv1 HellaSwag,25.87,,hf_open_llm_v1_240829_frozen.csv +crow_1b,HFv1 MMLU,24.8,,hf_open_llm_v1_240829_frozen.csv +crow_1b,HFv1 TruthfulQA,48.28,,hf_open_llm_v1_240829_frozen.csv +crow_1b,HFv1 Winogrande,49.41,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b,HF OpenLLM v1,37.78,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b,HFv1 ARC,37.71,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b,HFv1 GSM8K,5.23,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b,HFv1 HellaSwag,58.93,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b,HFv1 MMLU,25.33,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b,HFv1 TruthfulQA,42.79,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b,HFv1 Winogrande,56.67,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_chat_v0_1,HF OpenLLM v1,35.57,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_chat_v0_1,HFv1 ARC,36.6,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_chat_v0_1,HFv1 GSM8K,1.44,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_chat_v0_1,HFv1 HellaSwag,54.65,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_chat_v0_1,HFv1 MMLU,26.85,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_chat_v0_1,HFv1 TruthfulQA,38.15,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_chat_v0_1,HFv1 Winogrande,55.72,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_orpo_bf16,HF OpenLLM v1,36.09,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_orpo_bf16,HFv1 ARC,33.62,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_orpo_bf16,HFv1 GSM8K,1.59,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_orpo_bf16,HFv1 HellaSwag,58.29,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_orpo_bf16,HFv1 MMLU,25.74,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_orpo_bf16,HFv1 TruthfulQA,39.92,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_orpo_bf16,HFv1 Winogrande,57.38,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_sft_bf16,HF OpenLLM v1,35.91,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_sft_bf16,HFv1 ARC,33.36,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_sft_bf16,HFv1 GSM8K,2.43,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_sft_bf16,HFv1 HellaSwag,55.83,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_sft_bf16,HFv1 MMLU,24.81,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_sft_bf16,HFv1 TruthfulQA,40.09,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_sft_bf16,HFv1 Winogrande,58.96,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_sft_dpo_bf16,HF OpenLLM v1,35.9,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_sft_dpo_bf16,HFv1 ARC,33.7,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_sft_dpo_bf16,HFv1 GSM8K,2.12,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_sft_dpo_bf16,HFv1 HellaSwag,55.97,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_sft_dpo_bf16,HFv1 MMLU,24.7,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_sft_dpo_bf16,HFv1 TruthfulQA,40.08,,hf_open_llm_v1_240829_frozen.csv +csg_wukong_1b_sft_dpo_bf16,HFv1 Winogrande,58.8,,hf_open_llm_v1_240829_frozen.csv +cultrix_moe_bf16,HF OpenLLM v1,72.6,,hf_open_llm_v1_240829_frozen.csv +cultrix_moe_bf16,HFv1 ARC,68.94,,hf_open_llm_v1_240829_frozen.csv +cultrix_moe_bf16,HFv1 GSM8K,69.98,,hf_open_llm_v1_240829_frozen.csv +cultrix_moe_bf16,HFv1 HellaSwag,86.96,,hf_open_llm_v1_240829_frozen.csv +cultrix_moe_bf16,HFv1 MMLU,65.2,,hf_open_llm_v1_240829_frozen.csv +cultrix_moe_bf16,HFv1 TruthfulQA,63.47,,hf_open_llm_v1_240829_frozen.csv +cultrix_moe_bf16,HFv1 Winogrande,81.06,,hf_open_llm_v1_240829_frozen.csv +cultrix_moe_model,HF OpenLLM v1,72.21,,hf_open_llm_v1_240829_frozen.csv +cultrix_moe_model,HFv1 ARC,70.05,,hf_open_llm_v1_240829_frozen.csv +cultrix_moe_model,HFv1 GSM8K,62.09,,hf_open_llm_v1_240829_frozen.csv +cultrix_moe_model,HFv1 HellaSwag,87.22,,hf_open_llm_v1_240829_frozen.csv +cultrix_moe_model,HFv1 MMLU,64.95,,hf_open_llm_v1_240829_frozen.csv +cultrix_moe_model,HFv1 TruthfulQA,68.04,,hf_open_llm_v1_240829_frozen.csv +cultrix_moe_model,HFv1 Winogrande,80.9,,hf_open_llm_v1_240829_frozen.csv +cutie,HF OpenLLM v1,29.87,,hf_open_llm_v1_240829_frozen.csv +cutie,HFv1 ARC,26.96,,hf_open_llm_v1_240829_frozen.csv +cutie,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +cutie,HFv1 HellaSwag,27.02,,hf_open_llm_v1_240829_frozen.csv +cutie,HFv1 MMLU,24.17,,hf_open_llm_v1_240829_frozen.csv +cutie,HFv1 TruthfulQA,48.42,,hf_open_llm_v1_240829_frozen.csv +cutie,HFv1 Winogrande,52.64,,hf_open_llm_v1_240829_frozen.csv +cypher_mini_1_8b,HF OpenLLM v1,43.05,,hf_open_llm_v1_240829_frozen.csv +cypher_mini_1_8b,HFv1 ARC,39.59,,hf_open_llm_v1_240829_frozen.csv +cypher_mini_1_8b,HFv1 GSM8K,14.48,,hf_open_llm_v1_240829_frozen.csv +cypher_mini_1_8b,HFv1 HellaSwag,67.45,,hf_open_llm_v1_240829_frozen.csv +cypher_mini_1_8b,HFv1 MMLU,31.14,,hf_open_llm_v1_240829_frozen.csv +cypher_mini_1_8b,HFv1 TruthfulQA,40.44,,hf_open_llm_v1_240829_frozen.csv +cypher_mini_1_8b,HFv1 Winogrande,65.19,,hf_open_llm_v1_240829_frozen.csv +cyrax_7b,HF OpenLLM v1,75.98,,hf_open_llm_v1_240829_frozen.csv +cyrax_7b,HFv1 ARC,72.95,,hf_open_llm_v1_240829_frozen.csv +cyrax_7b,HFv1 GSM8K,69.22,,hf_open_llm_v1_240829_frozen.csv +cyrax_7b,HFv1 HellaSwag,88.19,,hf_open_llm_v1_240829_frozen.csv +cyrax_7b,HFv1 MMLU,64.6,,hf_open_llm_v1_240829_frozen.csv +cyrax_7b,HFv1 TruthfulQA,77.01,,hf_open_llm_v1_240829_frozen.csv +cyrax_7b,HFv1 Winogrande,83.9,,hf_open_llm_v1_240829_frozen.csv +damysus_2_7b_chat,HF OpenLLM v1,60.49,,hf_open_llm_v1_240829_frozen.csv +damysus_2_7b_chat,HFv1 ARC,59.81,,hf_open_llm_v1_240829_frozen.csv +damysus_2_7b_chat,HFv1 GSM8K,50.64,,hf_open_llm_v1_240829_frozen.csv +damysus_2_7b_chat,HFv1 HellaSwag,74.52,,hf_open_llm_v1_240829_frozen.csv +damysus_2_7b_chat,HFv1 MMLU,56.34,,hf_open_llm_v1_240829_frozen.csv +damysus_2_7b_chat,HFv1 TruthfulQA,46.74,,hf_open_llm_v1_240829_frozen.csv +damysus_2_7b_chat,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv +damysus_coder_v0_1,HF OpenLLM v1,64.34,,hf_open_llm_v1_240829_frozen.csv +damysus_coder_v0_1,HFv1 ARC,60.92,,hf_open_llm_v1_240829_frozen.csv +damysus_coder_v0_1,HFv1 GSM8K,39.27,,hf_open_llm_v1_240829_frozen.csv +damysus_coder_v0_1,HFv1 HellaSwag,84.01,,hf_open_llm_v1_240829_frozen.csv +damysus_coder_v0_1,HFv1 MMLU,60.54,,hf_open_llm_v1_240829_frozen.csv +damysus_coder_v0_1,HFv1 TruthfulQA,64.2,,hf_open_llm_v1_240829_frozen.csv +damysus_coder_v0_1,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv +daredevil_8b_abliterated_dpomix,HF OpenLLM v1,72.32,,hf_open_llm_v1_240829_frozen.csv +daredevil_8b_abliterated_dpomix,HFv1 ARC,69.28,,hf_open_llm_v1_240829_frozen.csv +daredevil_8b_abliterated_dpomix,HFv1 GSM8K,71.8,,hf_open_llm_v1_240829_frozen.csv +daredevil_8b_abliterated_dpomix,HFv1 HellaSwag,85.05,,hf_open_llm_v1_240829_frozen.csv +daredevil_8b_abliterated_dpomix,HFv1 MMLU,69.1,,hf_open_llm_v1_240829_frozen.csv +daredevil_8b_abliterated_dpomix,HFv1 TruthfulQA,60.0,,hf_open_llm_v1_240829_frozen.csv +daredevil_8b_abliterated_dpomix,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv +datura_7b,HF OpenLLM v1,74.28,,hf_open_llm_v1_240829_frozen.csv +datura_7b,HFv1 ARC,72.1,,hf_open_llm_v1_240829_frozen.csv +datura_7b,HFv1 GSM8K,65.58,,hf_open_llm_v1_240829_frozen.csv +datura_7b,HFv1 HellaSwag,88.27,,hf_open_llm_v1_240829_frozen.csv +datura_7b,HFv1 MMLU,64.15,,hf_open_llm_v1_240829_frozen.csv +datura_7b,HFv1 TruthfulQA,71.03,,hf_open_llm_v1_240829_frozen.csv +datura_7b,HFv1 Winogrande,84.53,,hf_open_llm_v1_240829_frozen.csv +dbrx_base,HF OpenLLM v1,71.9,,hf_open_llm_v1_240829_frozen.csv +dbrx_base,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv +dbrx_base,HFv1 GSM8K,68.54,,hf_open_llm_v1_240829_frozen.csv +dbrx_base,HFv1 HellaSwag,89.0,,hf_open_llm_v1_240829_frozen.csv +dbrx_base,HFv1 MMLU,74.7,,hf_open_llm_v1_240829_frozen.csv +dbrx_base,HFv1 TruthfulQA,55.07,,hf_open_llm_v1_240829_frozen.csv +dbrx_base,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv +dbrx_instructruct,HF OpenLLM v1,74.47,,hf_open_llm_v1_240829_frozen.csv +dbrx_instructruct,HFv1 ARC,67.83,,hf_open_llm_v1_240829_frozen.csv +dbrx_instructruct,HFv1 GSM8K,67.32,,hf_open_llm_v1_240829_frozen.csv +dbrx_instructruct,HFv1 HellaSwag,88.85,,hf_open_llm_v1_240829_frozen.csv +dbrx_instructruct,HFv1 MMLU,73.72,,hf_open_llm_v1_240829_frozen.csv +dbrx_instructruct,HFv1 TruthfulQA,67.02,,hf_open_llm_v1_240829_frozen.csv +dbrx_instructruct,HFv1 Winogrande,82.08,,hf_open_llm_v1_240829_frozen.csv +deacon_13b,HF OpenLLM v1,53.63,,hf_open_llm_v1_240829_frozen.csv +deacon_13b,HFv1 ARC,57.85,,hf_open_llm_v1_240829_frozen.csv +deacon_13b,HFv1 GSM8K,10.39,,hf_open_llm_v1_240829_frozen.csv +deacon_13b,HFv1 HellaSwag,82.63,,hf_open_llm_v1_240829_frozen.csv +deacon_13b,HFv1 MMLU,55.25,,hf_open_llm_v1_240829_frozen.csv +deacon_13b,HFv1 TruthfulQA,39.33,,hf_open_llm_v1_240829_frozen.csv +deacon_13b,HFv1 Winogrande,76.32,,hf_open_llm_v1_240829_frozen.csv +deacon_1_8b,HF OpenLLM v1,36.03,,hf_open_llm_v1_240829_frozen.csv +deacon_1_8b,HFv1 ARC,33.7,,hf_open_llm_v1_240829_frozen.csv +deacon_1_8b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +deacon_1_8b,HFv1 HellaSwag,52.33,,hf_open_llm_v1_240829_frozen.csv +deacon_1_8b,HFv1 MMLU,33.97,,hf_open_llm_v1_240829_frozen.csv +deacon_1_8b,HFv1 TruthfulQA,39.05,,hf_open_llm_v1_240829_frozen.csv +deacon_1_8b,HFv1 Winogrande,57.14,,hf_open_llm_v1_240829_frozen.csv +deacon_1b,HF OpenLLM v1,35.21,,hf_open_llm_v1_240829_frozen.csv +deacon_1b,HFv1 ARC,32.42,,hf_open_llm_v1_240829_frozen.csv +deacon_1b,HFv1 GSM8K,0.68,,hf_open_llm_v1_240829_frozen.csv +deacon_1b,HFv1 HellaSwag,58.62,,hf_open_llm_v1_240829_frozen.csv +deacon_1b,HFv1 MMLU,24.89,,hf_open_llm_v1_240829_frozen.csv +deacon_1b,HFv1 TruthfulQA,35.05,,hf_open_llm_v1_240829_frozen.csv +deacon_1b,HFv1 Winogrande,59.59,,hf_open_llm_v1_240829_frozen.csv +deacon_20b,HF OpenLLM v1,61.28,,hf_open_llm_v1_240829_frozen.csv +deacon_20b,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv +deacon_20b,HFv1 GSM8K,29.19,,hf_open_llm_v1_240829_frozen.csv +deacon_20b,HFv1 HellaSwag,81.74,,hf_open_llm_v1_240829_frozen.csv +deacon_20b,HFv1 MMLU,60.7,,hf_open_llm_v1_240829_frozen.csv +deacon_20b,HFv1 TruthfulQA,58.49,,hf_open_llm_v1_240829_frozen.csv +deacon_20b,HFv1 Winogrande,76.8,,hf_open_llm_v1_240829_frozen.csv +deacon_34b_adapter,HF OpenLLM v1,71.16,,hf_open_llm_v1_240829_frozen.csv +deacon_34b_adapter,HFv1 ARC,64.76,,hf_open_llm_v1_240829_frozen.csv +deacon_34b_adapter,HFv1 GSM8K,61.18,,hf_open_llm_v1_240829_frozen.csv +deacon_34b_adapter,HFv1 HellaSwag,85.57,,hf_open_llm_v1_240829_frozen.csv +deacon_34b_adapter,HFv1 MMLU,76.28,,hf_open_llm_v1_240829_frozen.csv +deacon_34b_adapter,HFv1 TruthfulQA,56.24,,hf_open_llm_v1_240829_frozen.csv +deacon_34b_adapter,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv +deacon_34b_qlora_adapter,HF OpenLLM v1,71.39,,hf_open_llm_v1_240829_frozen.csv +deacon_34b_qlora_adapter,HFv1 ARC,64.85,,hf_open_llm_v1_240829_frozen.csv +deacon_34b_qlora_adapter,HFv1 GSM8K,62.24,,hf_open_llm_v1_240829_frozen.csv +deacon_34b_qlora_adapter,HFv1 HellaSwag,85.56,,hf_open_llm_v1_240829_frozen.csv +deacon_34b_qlora_adapter,HFv1 MMLU,76.38,,hf_open_llm_v1_240829_frozen.csv +deacon_34b_qlora_adapter,HFv1 TruthfulQA,56.21,,hf_open_llm_v1_240829_frozen.csv +deacon_34b_qlora_adapter,HFv1 Winogrande,83.11,,hf_open_llm_v1_240829_frozen.csv +deacon_3b,HF OpenLLM v1,39.05,,hf_open_llm_v1_240829_frozen.csv +deacon_3b,HFv1 ARC,39.68,,hf_open_llm_v1_240829_frozen.csv +deacon_3b,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv +deacon_3b,HFv1 HellaSwag,66.42,,hf_open_llm_v1_240829_frozen.csv +deacon_3b,HFv1 MMLU,27.13,,hf_open_llm_v1_240829_frozen.csv +deacon_3b,HFv1 TruthfulQA,36.07,,hf_open_llm_v1_240829_frozen.csv +deacon_3b,HFv1 Winogrande,64.64,,hf_open_llm_v1_240829_frozen.csv +decicoder_1b,HF OpenLLM v1,29.37,,hf_open_llm_v1_240829_frozen.csv +decicoder_1b,HFv1 ARC,21.16,,hf_open_llm_v1_240829_frozen.csv +decicoder_1b,HFv1 GSM8K,1.74,,hf_open_llm_v1_240829_frozen.csv +decicoder_1b,HFv1 HellaSwag,31.09,,hf_open_llm_v1_240829_frozen.csv +decicoder_1b,HFv1 MMLU,24.34,,hf_open_llm_v1_240829_frozen.csv +decicoder_1b,HFv1 TruthfulQA,47.05,,hf_open_llm_v1_240829_frozen.csv +decicoder_1b,HFv1 Winogrande,50.83,,hf_open_llm_v1_240829_frozen.csv +decilm_7b,HF OpenLLM v1,61.55,,hf_open_llm_v1_240829_frozen.csv +decilm_7b,HFv1 ARC,59.39,,hf_open_llm_v1_240829_frozen.csv +decilm_7b,HFv1 GSM8K,47.38,,hf_open_llm_v1_240829_frozen.csv +decilm_7b,HFv1 HellaSwag,82.51,,hf_open_llm_v1_240829_frozen.csv +decilm_7b,HFv1 MMLU,59.76,,hf_open_llm_v1_240829_frozen.csv +decilm_7b,HFv1 TruthfulQA,40.33,,hf_open_llm_v1_240829_frozen.csv +decilm_7b,HFv1 Winogrande,79.95,,hf_open_llm_v1_240829_frozen.csv +decilm_7b_instruct,HF OpenLLM v1,63.19,,hf_open_llm_v1_240829_frozen.csv +decilm_7b_instruct,HFv1 ARC,61.01,,hf_open_llm_v1_240829_frozen.csv +decilm_7b_instruct,HFv1 GSM8K,46.02,,hf_open_llm_v1_240829_frozen.csv +decilm_7b_instruct,HFv1 HellaSwag,82.37,,hf_open_llm_v1_240829_frozen.csv +decilm_7b_instruct,HFv1 MMLU,60.24,,hf_open_llm_v1_240829_frozen.csv +decilm_7b_instruct,HFv1 TruthfulQA,49.75,,hf_open_llm_v1_240829_frozen.csv +decilm_7b_instruct,HFv1 Winogrande,79.72,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_1_3b_instruct,HF OpenLLM v1,32.4,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_1_3b_instruct,HFv1 ARC,28.58,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_1_3b_instruct,HFv1 GSM8K,1.06,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_1_3b_instruct,HFv1 HellaSwag,39.87,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_1_3b_instruct,HFv1 MMLU,28.47,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_1_3b_instruct,HFv1 TruthfulQA,44.02,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_1_3b_instruct,HFv1 Winogrande,52.41,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_6_7b_base,HF OpenLLM v1,40.87,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_6_7b_base,HFv1 ARC,37.03,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_6_7b_base,HFv1 GSM8K,17.97,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_6_7b_base,HFv1 HellaSwag,53.46,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_6_7b_base,HFv1 MMLU,38.39,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_6_7b_base,HFv1 TruthfulQA,40.28,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_6_7b_base,HFv1 Winogrande,58.09,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_6_7b_instruct,HF OpenLLM v1,43.57,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_6_7b_instruct,HFv1 ARC,38.14,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_6_7b_instruct,HFv1 GSM8K,26.76,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_6_7b_instruct,HFv1 HellaSwag,55.09,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_6_7b_instruct,HFv1 MMLU,39.02,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_6_7b_instruct,HFv1 TruthfulQA,45.56,,hf_open_llm_v1_240829_frozen.csv +deepseek_coder_6_7b_instruct,HFv1 Winogrande,56.83,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_67b_base,HF OpenLLM v1,69.38,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_67b_base,HFv1 ARC,65.44,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_67b_base,HFv1 GSM8K,56.71,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_67b_base,HFv1 HellaSwag,87.1,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_67b_base,HFv1 MMLU,71.78,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_67b_base,HFv1 TruthfulQA,51.08,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_67b_base,HFv1 Winogrande,84.14,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_67b_chat,HF OpenLLM v1,71.79,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_67b_chat,HFv1 ARC,67.75,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_67b_chat,HFv1 GSM8K,63.68,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_67b_chat,HFv1 HellaSwag,86.82,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_67b_chat,HFv1 MMLU,72.42,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_67b_chat,HFv1 TruthfulQA,55.85,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_67b_chat,HFv1 Winogrande,84.21,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_7b_chat,HF OpenLLM v1,59.38,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_7b_chat,HFv1 ARC,55.8,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_7b_chat,HFv1 GSM8K,46.55,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_7b_chat,HFv1 HellaSwag,79.38,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_7b_chat,HFv1 MMLU,51.75,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_7b_chat,HFv1 TruthfulQA,47.98,,hf_open_llm_v1_240829_frozen.csv +deepseek_llm_7b_chat,HFv1 Winogrande,74.82,,hf_open_llm_v1_240829_frozen.csv +deepseek_moe_16b_base,HF OpenLLM v1,51.07,,hf_open_llm_v1_240829_frozen.csv +deepseek_moe_16b_base,HFv1 ARC,53.24,,hf_open_llm_v1_240829_frozen.csv +deepseek_moe_16b_base,HFv1 GSM8K,17.29,,hf_open_llm_v1_240829_frozen.csv +deepseek_moe_16b_base,HFv1 HellaSwag,79.77,,hf_open_llm_v1_240829_frozen.csv +deepseek_moe_16b_base,HFv1 MMLU,46.31,,hf_open_llm_v1_240829_frozen.csv +deepseek_moe_16b_base,HFv1 TruthfulQA,36.08,,hf_open_llm_v1_240829_frozen.csv +deepseek_moe_16b_base,HFv1 Winogrande,73.72,,hf_open_llm_v1_240829_frozen.csv +deita_1_8b,HF OpenLLM v1,42.96,,hf_open_llm_v1_240829_frozen.csv +deita_1_8b,HFv1 ARC,36.52,,hf_open_llm_v1_240829_frozen.csv +deita_1_8b,HFv1 GSM8K,15.62,,hf_open_llm_v1_240829_frozen.csv +deita_1_8b,HFv1 HellaSwag,60.63,,hf_open_llm_v1_240829_frozen.csv +deita_1_8b,HFv1 MMLU,45.62,,hf_open_llm_v1_240829_frozen.csv +deita_1_8b,HFv1 TruthfulQA,40.02,,hf_open_llm_v1_240829_frozen.csv +deita_1_8b,HFv1 Winogrande,59.35,,hf_open_llm_v1_240829_frozen.csv +deita_2b,HF OpenLLM v1,52.35,,hf_open_llm_v1_240829_frozen.csv +deita_2b,HFv1 ARC,44.71,,hf_open_llm_v1_240829_frozen.csv +deita_2b,HFv1 GSM8K,41.32,,hf_open_llm_v1_240829_frozen.csv +deita_2b,HFv1 HellaSwag,70.39,,hf_open_llm_v1_240829_frozen.csv +deita_2b,HFv1 MMLU,52.79,,hf_open_llm_v1_240829_frozen.csv +deita_2b,HFv1 TruthfulQA,39.61,,hf_open_llm_v1_240829_frozen.csv +deita_2b,HFv1 Winogrande,65.27,,hf_open_llm_v1_240829_frozen.csv +deita_32b,HF OpenLLM v1,72.16,,hf_open_llm_v1_240829_frozen.csv +deita_32b,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv +deita_32b,HFv1 GSM8K,72.33,,hf_open_llm_v1_240829_frozen.csv +deita_32b,HFv1 HellaSwag,84.67,,hf_open_llm_v1_240829_frozen.csv +deita_32b,HFv1 MMLU,73.95,,hf_open_llm_v1_240829_frozen.csv +deita_32b,HFv1 TruthfulQA,58.11,,hf_open_llm_v1_240829_frozen.csv +deita_32b,HFv1 Winogrande,80.82,,hf_open_llm_v1_240829_frozen.csv +deita_34b,HF OpenLLM v1,71.56,,hf_open_llm_v1_240829_frozen.csv +deita_34b,HFv1 ARC,64.08,,hf_open_llm_v1_240829_frozen.csv +deita_34b,HFv1 GSM8K,66.19,,hf_open_llm_v1_240829_frozen.csv +deita_34b,HFv1 HellaSwag,85.29,,hf_open_llm_v1_240829_frozen.csv +deita_34b,HFv1 MMLU,76.66,,hf_open_llm_v1_240829_frozen.csv +deita_34b,HFv1 TruthfulQA,54.35,,hf_open_llm_v1_240829_frozen.csv +deita_34b,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv +deita_4b,HF OpenLLM v1,56.43,,hf_open_llm_v1_240829_frozen.csv +deita_4b,HFv1 ARC,46.08,,hf_open_llm_v1_240829_frozen.csv +deita_4b,HFv1 GSM8K,48.9,,hf_open_llm_v1_240829_frozen.csv +deita_4b,HFv1 HellaSwag,71.81,,hf_open_llm_v1_240829_frozen.csv +deita_4b,HFv1 MMLU,55.46,,hf_open_llm_v1_240829_frozen.csv +deita_4b,HFv1 TruthfulQA,50.23,,hf_open_llm_v1_240829_frozen.csv +deita_4b,HFv1 Winogrande,66.14,,hf_open_llm_v1_240829_frozen.csv +deita_500m,HF OpenLLM v1,38.22,,hf_open_llm_v1_240829_frozen.csv +deita_500m,HFv1 ARC,29.27,,hf_open_llm_v1_240829_frozen.csv +deita_500m,HFv1 GSM8K,8.95,,hf_open_llm_v1_240829_frozen.csv +deita_500m,HFv1 HellaSwag,50.0,,hf_open_llm_v1_240829_frozen.csv +deita_500m,HFv1 MMLU,39.41,,hf_open_llm_v1_240829_frozen.csv +deita_500m,HFv1 TruthfulQA,43.94,,hf_open_llm_v1_240829_frozen.csv +deita_500m,HFv1 Winogrande,57.77,,hf_open_llm_v1_240829_frozen.csv +deita_qwen_1_8b,HF OpenLLM v1,42.96,,hf_open_llm_v1_240829_frozen.csv +deita_qwen_1_8b,HFv1 ARC,36.52,,hf_open_llm_v1_240829_frozen.csv +deita_qwen_1_8b,HFv1 GSM8K,15.62,,hf_open_llm_v1_240829_frozen.csv +deita_qwen_1_8b,HFv1 HellaSwag,60.63,,hf_open_llm_v1_240829_frozen.csv +deita_qwen_1_8b,HFv1 MMLU,45.62,,hf_open_llm_v1_240829_frozen.csv +deita_qwen_1_8b,HFv1 TruthfulQA,40.02,,hf_open_llm_v1_240829_frozen.csv +deita_qwen_1_8b,HFv1 Winogrande,59.35,,hf_open_llm_v1_240829_frozen.csv +delta_4b_base,HF OpenLLM v1,61.04,,hf_open_llm_v1_240829_frozen.csv +delta_4b_base,HFv1 ARC,58.62,,hf_open_llm_v1_240829_frozen.csv +delta_4b_base,HFv1 GSM8K,46.93,,hf_open_llm_v1_240829_frozen.csv +delta_4b_base,HFv1 HellaSwag,76.29,,hf_open_llm_v1_240829_frozen.csv +delta_4b_base,HFv1 MMLU,59.06,,hf_open_llm_v1_240829_frozen.csv +delta_4b_base,HFv1 TruthfulQA,51.74,,hf_open_llm_v1_240829_frozen.csv +delta_4b_base,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv +delta_4b_notso_base,HF OpenLLM v1,54.23,,hf_open_llm_v1_240829_frozen.csv +delta_4b_notso_base,HFv1 ARC,57.59,,hf_open_llm_v1_240829_frozen.csv +delta_4b_notso_base,HFv1 GSM8K,4.02,,hf_open_llm_v1_240829_frozen.csv +delta_4b_notso_base,HFv1 HellaSwag,76.1,,hf_open_llm_v1_240829_frozen.csv +delta_4b_notso_base,HFv1 MMLU,57.26,,hf_open_llm_v1_240829_frozen.csv +delta_4b_notso_base,HFv1 TruthfulQA,54.31,,hf_open_llm_v1_240829_frozen.csv +delta_4b_notso_base,HFv1 Winogrande,76.09,,hf_open_llm_v1_240829_frozen.csv +delta_4b_orange,HF OpenLLM v1,62.23,,hf_open_llm_v1_240829_frozen.csv +delta_4b_orange,HFv1 ARC,58.87,,hf_open_llm_v1_240829_frozen.csv +delta_4b_orange,HFv1 GSM8K,48.14,,hf_open_llm_v1_240829_frozen.csv +delta_4b_orange,HFv1 HellaSwag,76.59,,hf_open_llm_v1_240829_frozen.csv +delta_4b_orange,HFv1 MMLU,56.5,,hf_open_llm_v1_240829_frozen.csv +delta_4b_orange,HFv1 TruthfulQA,56.82,,hf_open_llm_v1_240829_frozen.csv +delta_4b_orange,HFv1 Winogrande,76.48,,hf_open_llm_v1_240829_frozen.csv +distilabeled_hermes_2_5_mistral_7b,HF OpenLLM v1,68.42,,hf_open_llm_v1_240829_frozen.csv +distilabeled_hermes_2_5_mistral_7b,HFv1 ARC,66.3,,hf_open_llm_v1_240829_frozen.csv +distilabeled_hermes_2_5_mistral_7b,HFv1 GSM8K,60.88,,hf_open_llm_v1_240829_frozen.csv +distilabeled_hermes_2_5_mistral_7b,HFv1 HellaSwag,85.15,,hf_open_llm_v1_240829_frozen.csv +distilabeled_hermes_2_5_mistral_7b,HFv1 MMLU,63.5,,hf_open_llm_v1_240829_frozen.csv +distilabeled_hermes_2_5_mistral_7b,HFv1 TruthfulQA,55.75,,hf_open_llm_v1_240829_frozen.csv +distilabeled_hermes_2_5_mistral_7b,HFv1 Winogrande,78.93,,hf_open_llm_v1_240829_frozen.csv +distilabeled_marcoro14_7b_slerp,HF OpenLLM v1,73.63,,hf_open_llm_v1_240829_frozen.csv +distilabeled_marcoro14_7b_slerp,HFv1 ARC,70.73,,hf_open_llm_v1_240829_frozen.csv +distilabeled_marcoro14_7b_slerp,HFv1 GSM8K,71.19,,hf_open_llm_v1_240829_frozen.csv +distilabeled_marcoro14_7b_slerp,HFv1 HellaSwag,87.47,,hf_open_llm_v1_240829_frozen.csv +distilabeled_marcoro14_7b_slerp,HFv1 MMLU,65.22,,hf_open_llm_v1_240829_frozen.csv +distilabeled_marcoro14_7b_slerp,HFv1 TruthfulQA,65.1,,hf_open_llm_v1_240829_frozen.csv +distilabeled_marcoro14_7b_slerp,HFv1 Winogrande,82.08,,hf_open_llm_v1_240829_frozen.csv +distilabeled_marcoro14_7b_slerp_full,HF OpenLLM v1,73.4,,hf_open_llm_v1_240829_frozen.csv +distilabeled_marcoro14_7b_slerp_full,HFv1 ARC,70.65,,hf_open_llm_v1_240829_frozen.csv +distilabeled_marcoro14_7b_slerp_full,HFv1 GSM8K,70.66,,hf_open_llm_v1_240829_frozen.csv +distilabeled_marcoro14_7b_slerp_full,HFv1 HellaSwag,87.55,,hf_open_llm_v1_240829_frozen.csv +distilabeled_marcoro14_7b_slerp_full,HFv1 MMLU,65.33,,hf_open_llm_v1_240829_frozen.csv +distilabeled_marcoro14_7b_slerp_full,HFv1 TruthfulQA,64.21,,hf_open_llm_v1_240829_frozen.csv +distilabeled_marcoro14_7b_slerp_full,HFv1 Winogrande,82.0,,hf_open_llm_v1_240829_frozen.csv +dociprollm_7b,HF OpenLLM v1,44.2,,hf_open_llm_v1_240829_frozen.csv +dociprollm_7b,HFv1 ARC,47.87,,hf_open_llm_v1_240829_frozen.csv +dociprollm_7b,HFv1 GSM8K,4.62,,hf_open_llm_v1_240829_frozen.csv +dociprollm_7b,HFv1 HellaSwag,78.11,,hf_open_llm_v1_240829_frozen.csv +dociprollm_7b,HFv1 MMLU,27.78,,hf_open_llm_v1_240829_frozen.csv +dociprollm_7b,HFv1 TruthfulQA,34.26,,hf_open_llm_v1_240829_frozen.csv +dociprollm_7b,HFv1 Winogrande,72.53,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b,HF OpenLLM v1,61.12,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b,HFv1 ARC,64.42,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b,HFv1 GSM8K,20.77,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b,HFv1 HellaSwag,84.92,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b,HFv1 MMLU,63.32,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b,HFv1 TruthfulQA,55.56,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b_snr_laser,HF OpenLLM v1,65.5,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b_snr_laser,HFv1 ARC,63.82,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b_snr_laser,HFv1 GSM8K,47.23,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b_snr_laser,HFv1 HellaSwag,84.78,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b_snr_laser,HFv1 MMLU,63.63,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b_snr_laser,HFv1 TruthfulQA,55.24,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b_snr_laser,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b_snr_math_laser,HF OpenLLM v1,65.03,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b_snr_math_laser,HFv1 ARC,63.31,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b_snr_math_laser,HFv1 GSM8K,47.23,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b_snr_math_laser,HFv1 HellaSwag,84.29,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b_snr_math_laser,HFv1 MMLU,63.02,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b_snr_math_laser,HFv1 TruthfulQA,54.75,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_1_mistral_7b_snr_math_laser,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_2_70b,HF OpenLLM v1,70.6,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_2_70b,HFv1 ARC,70.05,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_2_70b,HFv1 GSM8K,56.79,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_2_70b,HFv1 HellaSwag,85.97,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_2_70b,HFv1 MMLU,69.18,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_2_70b,HFv1 TruthfulQA,60.14,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_2_70b,HFv1 Winogrande,81.45,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_2_yi_34b_200k,HF OpenLLM v1,46.67,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_2_yi_34b_200k,HFv1 ARC,42.15,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_2_yi_34b_200k,HFv1 GSM8K,3.71,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_2_yi_34b_200k,HFv1 HellaSwag,68.18,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_2_yi_34b_200k,HFv1 MMLU,55.47,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_2_yi_34b_200k,HFv1 TruthfulQA,45.93,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_2_yi_34b_200k,HFv1 Winogrande,64.56,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_6_mistral_7b_dpo_5_93b,HF OpenLLM v1,40.62,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_6_mistral_7b_dpo_5_93b,HFv1 ARC,38.99,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_6_mistral_7b_dpo_5_93b,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_6_mistral_7b_dpo_5_93b,HFv1 HellaSwag,61.01,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_6_mistral_7b_dpo_5_93b,HFv1 MMLU,27.32,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_6_mistral_7b_dpo_5_93b,HFv1 TruthfulQA,53.51,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_6_mistral_7b_dpo_5_93b,HFv1 Winogrande,62.67,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_8_experiment26_7b,HF OpenLLM v1,68.6,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_8_experiment26_7b,HFv1 ARC,64.51,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_8_experiment26_7b,HFv1 GSM8K,63.61,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_8_experiment26_7b,HFv1 HellaSwag,83.79,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_8_experiment26_7b,HFv1 MMLU,63.24,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_8_experiment26_7b,HFv1 TruthfulQA,55.1,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_8_experiment26_7b,HFv1 Winogrande,81.61,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_phi_3_kensho_4_5b,HF OpenLLM v1,63.7,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_phi_3_kensho_4_5b,HFv1 ARC,58.53,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_phi_3_kensho_4_5b,HFv1 GSM8K,57.01,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_phi_3_kensho_4_5b,HFv1 HellaSwag,74.69,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_phi_3_kensho_4_5b,HFv1 MMLU,65.98,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_phi_3_kensho_4_5b,HFv1 TruthfulQA,52.25,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_phi_3_kensho_4_5b,HFv1 Winogrande,73.72,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_yi_1_5_34b,HF OpenLLM v1,75.05,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_yi_1_5_34b,HFv1 ARC,69.37,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_yi_1_5_34b,HFv1 GSM8K,73.01,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_yi_1_5_34b,HFv1 HellaSwag,85.53,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_yi_1_5_34b,HFv1 MMLU,77.52,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_yi_1_5_34b,HFv1 TruthfulQA,62.34,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_yi_1_5_34b,HFv1 Winogrande,82.56,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_yi_1_5_9b,HF OpenLLM v1,68.93,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_yi_1_5_9b,HFv1 ARC,65.7,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_yi_1_5_9b,HFv1 GSM8K,65.35,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_yi_1_5_9b,HFv1 HellaSwag,81.02,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_yi_1_5_9b,HFv1 MMLU,70.82,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_yi_1_5_9b,HFv1 TruthfulQA,53.76,,hf_open_llm_v1_240829_frozen.csv +dolphin_2_9_1_yi_1_5_9b,HFv1 Winogrande,77.51,,hf_open_llm_v1_240829_frozen.csv +dolphin_nebula_7b,HF OpenLLM v1,58.69,,hf_open_llm_v1_240829_frozen.csv +dolphin_nebula_7b,HFv1 ARC,55.2,,hf_open_llm_v1_240829_frozen.csv +dolphin_nebula_7b,HFv1 GSM8K,33.06,,hf_open_llm_v1_240829_frozen.csv +dolphin_nebula_7b,HFv1 HellaSwag,78.57,,hf_open_llm_v1_240829_frozen.csv +dolphin_nebula_7b,HFv1 MMLU,53.44,,hf_open_llm_v1_240829_frozen.csv +dolphin_nebula_7b,HFv1 TruthfulQA,57.97,,hf_open_llm_v1_240829_frozen.csv +dolphin_nebula_7b,HFv1 Winogrande,73.88,,hf_open_llm_v1_240829_frozen.csv +dopeyplats_1_1b_2t_v1,HF OpenLLM v1,35.28,,hf_open_llm_v1_240829_frozen.csv +dopeyplats_1_1b_2t_v1,HFv1 ARC,33.11,,hf_open_llm_v1_240829_frozen.csv +dopeyplats_1_1b_2t_v1,HFv1 GSM8K,1.67,,hf_open_llm_v1_240829_frozen.csv +dopeyplats_1_1b_2t_v1,HFv1 HellaSwag,54.31,,hf_open_llm_v1_240829_frozen.csv +dopeyplats_1_1b_2t_v1,HFv1 MMLU,24.55,,hf_open_llm_v1_240829_frozen.csv +dopeyplats_1_1b_2t_v1,HFv1 TruthfulQA,39.26,,hf_open_llm_v1_240829_frozen.csv +dopeyplats_1_1b_2t_v1,HFv1 Winogrande,58.8,,hf_open_llm_v1_240829_frozen.csv +dopeyshearedplats_1_3b_v1,HF OpenLLM v1,36.74,,hf_open_llm_v1_240829_frozen.csv +dopeyshearedplats_1_3b_v1,HFv1 ARC,34.39,,hf_open_llm_v1_240829_frozen.csv +dopeyshearedplats_1_3b_v1,HFv1 GSM8K,0.76,,hf_open_llm_v1_240829_frozen.csv +dopeyshearedplats_1_3b_v1,HFv1 HellaSwag,64.31,,hf_open_llm_v1_240829_frozen.csv +dopeyshearedplats_1_3b_v1,HFv1 MMLU,25.4,,hf_open_llm_v1_240829_frozen.csv +dopeyshearedplats_1_3b_v1,HFv1 TruthfulQA,38.21,,hf_open_llm_v1_240829_frozen.csv +dopeyshearedplats_1_3b_v1,HFv1 Winogrande,57.38,,hf_open_llm_v1_240829_frozen.csv +dough_instruct_base_001,HF OpenLLM v1,29.37,,hf_open_llm_v1_240829_frozen.csv +dough_instruct_base_001,HFv1 ARC,23.89,,hf_open_llm_v1_240829_frozen.csv +dough_instruct_base_001,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +dough_instruct_base_001,HFv1 HellaSwag,24.76,,hf_open_llm_v1_240829_frozen.csv +dough_instruct_base_001,HFv1 MMLU,23.13,,hf_open_llm_v1_240829_frozen.csv +dough_instruct_base_001,HFv1 TruthfulQA,53.4,,hf_open_llm_v1_240829_frozen.csv +dough_instruct_base_001,HFv1 Winogrande,51.07,,hf_open_llm_v1_240829_frozen.csv +dpo_binarized_neuraltrix_7b,HF OpenLLM v1,76.17,,hf_open_llm_v1_240829_frozen.csv +dpo_binarized_neuraltrix_7b,HFv1 ARC,72.35,,hf_open_llm_v1_240829_frozen.csv +dpo_binarized_neuraltrix_7b,HFv1 GSM8K,68.01,,hf_open_llm_v1_240829_frozen.csv +dpo_binarized_neuraltrix_7b,HFv1 HellaSwag,88.89,,hf_open_llm_v1_240829_frozen.csv +dpo_binarized_neuraltrix_7b,HFv1 MMLU,64.09,,hf_open_llm_v1_240829_frozen.csv +dpo_binarized_neuraltrix_7b,HFv1 TruthfulQA,79.07,,hf_open_llm_v1_240829_frozen.csv +dpo_binarized_neuraltrix_7b,HFv1 Winogrande,84.61,,hf_open_llm_v1_240829_frozen.csv +dpo_binarized_neutrixomnibe_7b,HF OpenLLM v1,76.31,,hf_open_llm_v1_240829_frozen.csv +dpo_binarized_neutrixomnibe_7b,HFv1 ARC,72.78,,hf_open_llm_v1_240829_frozen.csv +dpo_binarized_neutrixomnibe_7b,HFv1 GSM8K,69.45,,hf_open_llm_v1_240829_frozen.csv +dpo_binarized_neutrixomnibe_7b,HFv1 HellaSwag,89.05,,hf_open_llm_v1_240829_frozen.csv +dpo_binarized_neutrixomnibe_7b,HFv1 MMLU,64.6,,hf_open_llm_v1_240829_frozen.csv +dpo_binarized_neutrixomnibe_7b,HFv1 TruthfulQA,76.9,,hf_open_llm_v1_240829_frozen.csv +dpo_binarized_neutrixomnibe_7b,HFv1 Winogrande,85.08,,hf_open_llm_v1_240829_frozen.csv +dpo_miniguanaco_1_5t,HF OpenLLM v1,35.13,,hf_open_llm_v1_240829_frozen.csv +dpo_miniguanaco_1_5t,HFv1 ARC,30.63,,hf_open_llm_v1_240829_frozen.csv +dpo_miniguanaco_1_5t,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +dpo_miniguanaco_1_5t,HFv1 HellaSwag,54.05,,hf_open_llm_v1_240829_frozen.csv +dpo_miniguanaco_1_5t,HFv1 MMLU,24.79,,hf_open_llm_v1_240829_frozen.csv +dpo_miniguanaco_1_5t,HFv1 TruthfulQA,42.69,,hf_open_llm_v1_240829_frozen.csv +dpo_miniguanaco_1_5t,HFv1 Winogrande,58.64,,hf_open_llm_v1_240829_frozen.csv +dpo_phi2,HF OpenLLM v1,61.26,,hf_open_llm_v1_240829_frozen.csv +dpo_phi2,HFv1 ARC,61.69,,hf_open_llm_v1_240829_frozen.csv +dpo_phi2,HFv1 GSM8K,54.44,,hf_open_llm_v1_240829_frozen.csv +dpo_phi2,HFv1 HellaSwag,75.13,,hf_open_llm_v1_240829_frozen.csv +dpo_phi2,HFv1 MMLU,58.1,,hf_open_llm_v1_240829_frozen.csv +dpo_phi2,HFv1 TruthfulQA,43.99,,hf_open_llm_v1_240829_frozen.csv +dpo_phi2,HFv1 Winogrande,74.19,,hf_open_llm_v1_240829_frozen.csv +dpo_qlora_qwen1_5_0_5b_chat_xtuner,HF OpenLLM v1,32.84,,hf_open_llm_v1_240829_frozen.csv +dpo_qlora_qwen1_5_0_5b_chat_xtuner,HFv1 ARC,29.1,,hf_open_llm_v1_240829_frozen.csv +dpo_qlora_qwen1_5_0_5b_chat_xtuner,HFv1 GSM8K,2.12,,hf_open_llm_v1_240829_frozen.csv +dpo_qlora_qwen1_5_0_5b_chat_xtuner,HFv1 HellaSwag,41.45,,hf_open_llm_v1_240829_frozen.csv +dpo_qlora_qwen1_5_0_5b_chat_xtuner,HFv1 MMLU,31.04,,hf_open_llm_v1_240829_frozen.csv +dpo_qlora_qwen1_5_0_5b_chat_xtuner,HFv1 TruthfulQA,40.04,,hf_open_llm_v1_240829_frozen.csv +dpo_qlora_qwen1_5_0_5b_chat_xtuner,HFv1 Winogrande,53.28,,hf_open_llm_v1_240829_frozen.csv +dpo_qwen1_5_0_5b_chat,HF OpenLLM v1,33.47,,hf_open_llm_v1_240829_frozen.csv +dpo_qwen1_5_0_5b_chat,HFv1 ARC,29.61,,hf_open_llm_v1_240829_frozen.csv +dpo_qwen1_5_0_5b_chat,HFv1 GSM8K,2.81,,hf_open_llm_v1_240829_frozen.csv +dpo_qwen1_5_0_5b_chat,HFv1 HellaSwag,42.71,,hf_open_llm_v1_240829_frozen.csv +dpo_qwen1_5_0_5b_chat,HFv1 MMLU,30.64,,hf_open_llm_v1_240829_frozen.csv +dpo_qwen1_5_0_5b_chat,HFv1 TruthfulQA,41.23,,hf_open_llm_v1_240829_frozen.csv +dpo_qwen1_5_0_5b_chat,HFv1 Winogrande,53.83,,hf_open_llm_v1_240829_frozen.csv +dpo_qwen1_5_0_5b_chat_alignment_handbook,HF OpenLLM v1,35.68,,hf_open_llm_v1_240829_frozen.csv +dpo_qwen1_5_0_5b_chat_alignment_handbook,HFv1 ARC,31.83,,hf_open_llm_v1_240829_frozen.csv +dpo_qwen1_5_0_5b_chat_alignment_handbook,HFv1 GSM8K,6.97,,hf_open_llm_v1_240829_frozen.csv +dpo_qwen1_5_0_5b_chat_alignment_handbook,HFv1 HellaSwag,44.49,,hf_open_llm_v1_240829_frozen.csv +dpo_qwen1_5_0_5b_chat_alignment_handbook,HFv1 MMLU,33.46,,hf_open_llm_v1_240829_frozen.csv +dpo_qwen1_5_0_5b_chat_alignment_handbook,HFv1 TruthfulQA,42.05,,hf_open_llm_v1_240829_frozen.csv +dpo_qwen1_5_0_5b_chat_alignment_handbook,HFv1 Winogrande,55.25,,hf_open_llm_v1_240829_frozen.csv +dpo_test_hermes_open_llama3b,HF OpenLLM v1,39.42,,hf_open_llm_v1_240829_frozen.csv +dpo_test_hermes_open_llama3b,HFv1 ARC,39.25,,hf_open_llm_v1_240829_frozen.csv +dpo_test_hermes_open_llama3b,HFv1 GSM8K,1.36,,hf_open_llm_v1_240829_frozen.csv +dpo_test_hermes_open_llama3b,HFv1 HellaSwag,67.46,,hf_open_llm_v1_240829_frozen.csv +dpo_test_hermes_open_llama3b,HFv1 MMLU,24.21,,hf_open_llm_v1_240829_frozen.csv +dpo_test_hermes_open_llama3b,HFv1 TruthfulQA,39.81,,hf_open_llm_v1_240829_frozen.csv +dpo_test_hermes_open_llama3b,HFv1 Winogrande,64.4,,hf_open_llm_v1_240829_frozen.csv +dpopenhermes_7b,HF OpenLLM v1,67.58,,hf_open_llm_v1_240829_frozen.csv +dpopenhermes_7b,HFv1 ARC,65.7,,hf_open_llm_v1_240829_frozen.csv +dpopenhermes_7b,HFv1 GSM8K,54.36,,hf_open_llm_v1_240829_frozen.csv +dpopenhermes_7b,HFv1 HellaSwag,85.96,,hf_open_llm_v1_240829_frozen.csv +dpopenhermes_7b,HFv1 MMLU,63.89,,hf_open_llm_v1_240829_frozen.csv +dpopenhermes_7b,HFv1 TruthfulQA,56.95,,hf_open_llm_v1_240829_frozen.csv +dpopenhermes_7b,HFv1 Winogrande,78.61,,hf_open_llm_v1_240829_frozen.csv +dpopenhermes_7b_v2,HF OpenLLM v1,69.58,,hf_open_llm_v1_240829_frozen.csv +dpopenhermes_7b_v2,HFv1 ARC,66.64,,hf_open_llm_v1_240829_frozen.csv +dpopenhermes_7b_v2,HFv1 GSM8K,63.61,,hf_open_llm_v1_240829_frozen.csv +dpopenhermes_7b_v2,HFv1 HellaSwag,85.22,,hf_open_llm_v1_240829_frozen.csv +dpopenhermes_7b_v2,HFv1 MMLU,63.64,,hf_open_llm_v1_240829_frozen.csv +dpopenhermes_7b_v2,HFv1 TruthfulQA,59.22,,hf_open_llm_v1_240829_frozen.csv +dpopenhermes_7b_v2,HFv1 Winogrande,79.16,,hf_open_llm_v1_240829_frozen.csv +ds_diasum_md_mixtral,HF OpenLLM v1,68.42,,hf_open_llm_v1_240829_frozen.csv +ds_diasum_md_mixtral,HFv1 ARC,66.3,,hf_open_llm_v1_240829_frozen.csv +ds_diasum_md_mixtral,HFv1 GSM8K,53.22,,hf_open_llm_v1_240829_frozen.csv +ds_diasum_md_mixtral,HFv1 HellaSwag,85.45,,hf_open_llm_v1_240829_frozen.csv +ds_diasum_md_mixtral,HFv1 MMLU,69.51,,hf_open_llm_v1_240829_frozen.csv +ds_diasum_md_mixtral,HFv1 TruthfulQA,55.72,,hf_open_llm_v1_240829_frozen.csv +ds_diasum_md_mixtral,HFv1 Winogrande,80.35,,hf_open_llm_v1_240829_frozen.csv +duplicitous_mammal_13b,HF OpenLLM v1,56.57,,hf_open_llm_v1_240829_frozen.csv +duplicitous_mammal_13b,HFv1 ARC,61.69,,hf_open_llm_v1_240829_frozen.csv +duplicitous_mammal_13b,HFv1 GSM8K,9.1,,hf_open_llm_v1_240829_frozen.csv +duplicitous_mammal_13b,HFv1 HellaSwag,83.79,,hf_open_llm_v1_240829_frozen.csv +duplicitous_mammal_13b,HFv1 MMLU,57.5,,hf_open_llm_v1_240829_frozen.csv +duplicitous_mammal_13b,HFv1 TruthfulQA,52.27,,hf_open_llm_v1_240829_frozen.csv +duplicitous_mammal_13b,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv +duplicitous_slurpbeast_13b,HF OpenLLM v1,56.62,,hf_open_llm_v1_240829_frozen.csv +duplicitous_slurpbeast_13b,HFv1 ARC,62.12,,hf_open_llm_v1_240829_frozen.csv +duplicitous_slurpbeast_13b,HFv1 GSM8K,8.79,,hf_open_llm_v1_240829_frozen.csv +duplicitous_slurpbeast_13b,HFv1 HellaSwag,83.92,,hf_open_llm_v1_240829_frozen.csv +duplicitous_slurpbeast_13b,HFv1 MMLU,57.53,,hf_open_llm_v1_240829_frozen.csv +duplicitous_slurpbeast_13b,HFv1 TruthfulQA,52.33,,hf_open_llm_v1_240829_frozen.csv +duplicitous_slurpbeast_13b,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv +eastasia_4x7b_moe_experiment,HF OpenLLM v1,42.12,,hf_open_llm_v1_240829_frozen.csv +eastasia_4x7b_moe_experiment,HFv1 ARC,39.51,,hf_open_llm_v1_240829_frozen.csv +eastasia_4x7b_moe_experiment,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv +eastasia_4x7b_moe_experiment,HFv1 HellaSwag,48.92,,hf_open_llm_v1_240829_frozen.csv +eastasia_4x7b_moe_experiment,HFv1 MMLU,56.2,,hf_open_llm_v1_240829_frozen.csv +eastasia_4x7b_moe_experiment,HFv1 TruthfulQA,49.83,,hf_open_llm_v1_240829_frozen.csv +eastasia_4x7b_moe_experiment,HFv1 Winogrande,58.09,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_2_8b_v1_0,HF OpenLLM v1,55.9,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_2_8b_v1_0,HFv1 ARC,57.25,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_2_8b_v1_0,HFv1 GSM8K,36.39,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_2_8b_v1_0,HFv1 HellaSwag,72.15,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_2_8b_v1_0,HFv1 MMLU,51.62,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_2_8b_v1_0,HFv1 TruthfulQA,44.27,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_2_8b_v1_0,HFv1 Winogrande,73.72,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_instruct_10_8b_v1_0,HF OpenLLM v1,66.48,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_instruct_10_8b_v1_0,HFv1 ARC,64.85,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_instruct_10_8b_v1_0,HFv1 GSM8K,50.72,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_instruct_10_8b_v1_0,HFv1 HellaSwag,83.04,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_instruct_10_8b_v1_0,HFv1 MMLU,64.23,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_instruct_10_8b_v1_0,HFv1 TruthfulQA,54.09,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_instruct_10_8b_v1_0,HFv1 Winogrande,81.93,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_instruct_2_8b_v1_0,HF OpenLLM v1,58.71,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_instruct_2_8b_v1_0,HFv1 ARC,58.28,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_instruct_2_8b_v1_0,HFv1 GSM8K,45.11,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_instruct_2_8b_v1_0,HFv1 HellaSwag,72.42,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_instruct_2_8b_v1_0,HFv1 MMLU,53.35,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_instruct_2_8b_v1_0,HFv1 TruthfulQA,48.32,,hf_open_llm_v1_240829_frozen.csv +eeve_korean_instruct_2_8b_v1_0,HFv1 Winogrande,74.82,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_marcoro14_7b_full_slerp,HF OpenLLM v1,71.73,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_marcoro14_7b_full_slerp,HFv1 ARC,68.86,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_marcoro14_7b_full_slerp,HFv1 GSM8K,68.46,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_marcoro14_7b_full_slerp,HFv1 HellaSwag,85.98,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_marcoro14_7b_full_slerp,HFv1 MMLU,64.57,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_marcoro14_7b_full_slerp,HFv1 TruthfulQA,62.07,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_marcoro14_7b_full_slerp,HFv1 Winogrande,80.43,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_marcoro14_nddmpk_krishnahercules_7b_slerp,HF OpenLLM v1,73.08,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_marcoro14_nddmpk_krishnahercules_7b_slerp,HFv1 ARC,69.71,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_marcoro14_nddmpk_krishnahercules_7b_slerp,HFv1 GSM8K,70.66,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_marcoro14_nddmpk_krishnahercules_7b_slerp,HFv1 HellaSwag,87.04,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_marcoro14_nddmpk_krishnahercules_7b_slerp,HFv1 MMLU,65.32,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_marcoro14_nddmpk_krishnahercules_7b_slerp,HFv1 TruthfulQA,64.37,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_marcoro14_nddmpk_krishnahercules_7b_slerp,HFv1 Winogrande,81.37,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_moe_2x7b_test,HF OpenLLM v1,72.5,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_moe_2x7b_test,HFv1 ARC,69.71,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_moe_2x7b_test,HFv1 GSM8K,69.6,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_moe_2x7b_test,HFv1 HellaSwag,86.52,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_moe_2x7b_test,HFv1 MMLU,65.41,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_moe_2x7b_test,HFv1 TruthfulQA,62.29,,hf_open_llm_v1_240829_frozen.csv +einstein_4d_moe_2x7b_test,HFv1 Winogrande,81.45,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_7b,HF OpenLLM v1,66.62,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_7b,HFv1 ARC,64.68,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_7b,HFv1 GSM8K,57.62,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_7b,HFv1 HellaSwag,83.75,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_7b,HFv1 MMLU,62.31,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_7b,HFv1 TruthfulQA,55.15,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_7b,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_phi2,HF OpenLLM v1,60.77,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_phi2,HFv1 ARC,59.98,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_phi2,HFv1 GSM8K,53.98,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_phi2,HFv1 HellaSwag,74.07,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_phi2,HFv1 MMLU,56.89,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_phi2,HFv1 TruthfulQA,45.8,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_phi2,HFv1 Winogrande,73.88,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_qwen_1_5_32b,HF OpenLLM v1,68.54,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_qwen_1_5_32b,HFv1 ARC,62.37,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_qwen_1_5_32b,HFv1 GSM8K,51.71,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_qwen_1_5_32b,HFv1 HellaSwag,83.85,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_qwen_1_5_32b,HFv1 MMLU,74.04,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_qwen_1_5_32b,HFv1 TruthfulQA,58.86,,hf_open_llm_v1_240829_frozen.csv +einstein_v4_qwen_1_5_32b,HFv1 Winogrande,80.43,,hf_open_llm_v1_240829_frozen.csv +einstein_v5_v0_2_7b,HF OpenLLM v1,65.65,,hf_open_llm_v1_240829_frozen.csv +einstein_v5_v0_2_7b,HFv1 ARC,60.92,,hf_open_llm_v1_240829_frozen.csv +einstein_v5_v0_2_7b,HFv1 GSM8K,59.67,,hf_open_llm_v1_240829_frozen.csv +einstein_v5_v0_2_7b,HFv1 HellaSwag,80.99,,hf_open_llm_v1_240829_frozen.csv +einstein_v5_v0_2_7b,HFv1 MMLU,61.02,,hf_open_llm_v1_240829_frozen.csv +einstein_v5_v0_2_7b,HFv1 TruthfulQA,52.59,,hf_open_llm_v1_240829_frozen.csv +einstein_v5_v0_2_7b,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_1_llama3_8b_instruct_ties,HF OpenLLM v1,69.01,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_1_llama3_8b_instruct_ties,HFv1 ARC,63.23,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_1_llama3_8b_instruct_ties,HFv1 GSM8K,70.05,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_1_llama3_8b_instruct_ties,HFv1 HellaSwag,81.56,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_1_llama3_8b_instruct_ties,HFv1 MMLU,68.23,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_1_llama3_8b_instruct_ties,HFv1 TruthfulQA,52.44,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_1_llama3_8b_instruct_ties,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_1_phi2,HF OpenLLM v1,61.25,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_1_phi2,HFv1 ARC,60.15,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_1_phi2,HFv1 GSM8K,56.48,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_1_phi2,HFv1 HellaSwag,74.31,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_1_phi2,HFv1 MMLU,56.82,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_1_phi2,HFv1 TruthfulQA,46.24,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_1_phi2,HFv1 Winogrande,73.48,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_7b,HF OpenLLM v1,67.12,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_7b,HFv1 ARC,63.57,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_7b,HFv1 GSM8K,63.53,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_7b,HFv1 HellaSwag,82.76,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_7b,HFv1 MMLU,62.23,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_7b,HFv1 TruthfulQA,52.02,,hf_open_llm_v1_240829_frozen.csv +einstein_v6_7b,HFv1 Winogrande,78.61,,hf_open_llm_v1_240829_frozen.csv +elyza_japanese_llama_2_7b_instruct,HF OpenLLM v1,49.78,,hf_open_llm_v1_240829_frozen.csv +elyza_japanese_llama_2_7b_instruct,HFv1 ARC,53.16,,hf_open_llm_v1_240829_frozen.csv +elyza_japanese_llama_2_7b_instruct,HFv1 GSM8K,7.88,,hf_open_llm_v1_240829_frozen.csv +elyza_japanese_llama_2_7b_instruct,HFv1 HellaSwag,78.25,,hf_open_llm_v1_240829_frozen.csv +elyza_japanese_llama_2_7b_instruct,HFv1 MMLU,47.07,,hf_open_llm_v1_240829_frozen.csv +elyza_japanese_llama_2_7b_instruct,HFv1 TruthfulQA,39.08,,hf_open_llm_v1_240829_frozen.csv +elyza_japanese_llama_2_7b_instruct,HFv1 Winogrande,73.24,,hf_open_llm_v1_240829_frozen.csv +emertonbeagle_7b_dpo,HF OpenLLM v1,75.39,,hf_open_llm_v1_240829_frozen.csv +emertonbeagle_7b_dpo,HFv1 ARC,72.78,,hf_open_llm_v1_240829_frozen.csv +emertonbeagle_7b_dpo,HFv1 GSM8K,66.41,,hf_open_llm_v1_240829_frozen.csv +emertonbeagle_7b_dpo,HFv1 HellaSwag,89.12,,hf_open_llm_v1_240829_frozen.csv +emertonbeagle_7b_dpo,HFv1 MMLU,64.47,,hf_open_llm_v1_240829_frozen.csv +emertonbeagle_7b_dpo,HFv1 TruthfulQA,75.96,,hf_open_llm_v1_240829_frozen.csv +emertonbeagle_7b_dpo,HFv1 Winogrande,83.58,,hf_open_llm_v1_240829_frozen.csv +emertonmonarch_7b,HF OpenLLM v1,75.74,,hf_open_llm_v1_240829_frozen.csv +emertonmonarch_7b,HFv1 ARC,72.7,,hf_open_llm_v1_240829_frozen.csv +emertonmonarch_7b,HFv1 GSM8K,65.28,,hf_open_llm_v1_240829_frozen.csv +emertonmonarch_7b,HFv1 HellaSwag,89.16,,hf_open_llm_v1_240829_frozen.csv +emertonmonarch_7b,HFv1 MMLU,64.05,,hf_open_llm_v1_240829_frozen.csv +emertonmonarch_7b,HFv1 TruthfulQA,78.09,,hf_open_llm_v1_240829_frozen.csv +emertonmonarch_7b,HFv1 Winogrande,85.16,,hf_open_llm_v1_240829_frozen.csv +emertonomnibeagle_7b_dpo,HF OpenLLM v1,75.67,,hf_open_llm_v1_240829_frozen.csv +emertonomnibeagle_7b_dpo,HFv1 ARC,72.7,,hf_open_llm_v1_240829_frozen.csv +emertonomnibeagle_7b_dpo,HFv1 GSM8K,68.54,,hf_open_llm_v1_240829_frozen.csv +emertonomnibeagle_7b_dpo,HFv1 HellaSwag,88.44,,hf_open_llm_v1_240829_frozen.csv +emertonomnibeagle_7b_dpo,HFv1 MMLU,64.44,,hf_open_llm_v1_240829_frozen.csv +emertonomnibeagle_7b_dpo,HFv1 TruthfulQA,75.62,,hf_open_llm_v1_240829_frozen.csv +emertonomnibeagle_7b_dpo,HFv1 Winogrande,84.29,,hf_open_llm_v1_240829_frozen.csv +ensemble5_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,54.76,,hf_open_llm_v1_240829_frozen.csv +ensemble5_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,59.73,,hf_open_llm_v1_240829_frozen.csv +ensemble5_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,1.9,,hf_open_llm_v1_240829_frozen.csv +ensemble5_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,82.66,,hf_open_llm_v1_240829_frozen.csv +ensemble5_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,56.94,,hf_open_llm_v1_240829_frozen.csv +ensemble5_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,52.92,,hf_open_llm_v1_240829_frozen.csv +ensemble5_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,74.43,,hf_open_llm_v1_240829_frozen.csv +ensemblev5_nova_13b,HF OpenLLM v1,56.49,,hf_open_llm_v1_240829_frozen.csv +ensemblev5_nova_13b,HFv1 ARC,62.71,,hf_open_llm_v1_240829_frozen.csv +ensemblev5_nova_13b,HFv1 GSM8K,10.77,,hf_open_llm_v1_240829_frozen.csv +ensemblev5_nova_13b,HFv1 HellaSwag,82.55,,hf_open_llm_v1_240829_frozen.csv +ensemblev5_nova_13b,HFv1 MMLU,56.79,,hf_open_llm_v1_240829_frozen.csv +ensemblev5_nova_13b,HFv1 TruthfulQA,49.86,,hf_open_llm_v1_240829_frozen.csv +ensemblev5_nova_13b,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv +eris_floramix_dpo_7b,HF OpenLLM v1,74.87,,hf_open_llm_v1_240829_frozen.csv +eris_floramix_dpo_7b,HFv1 ARC,73.04,,hf_open_llm_v1_240829_frozen.csv +eris_floramix_dpo_7b,HFv1 GSM8K,67.55,,hf_open_llm_v1_240829_frozen.csv +eris_floramix_dpo_7b,HFv1 HellaSwag,88.28,,hf_open_llm_v1_240829_frozen.csv +eris_floramix_dpo_7b,HFv1 MMLU,64.71,,hf_open_llm_v1_240829_frozen.csv +eris_floramix_dpo_7b,HFv1 TruthfulQA,70.94,,hf_open_llm_v1_240829_frozen.csv +eris_floramix_dpo_7b,HFv1 Winogrande,84.69,,hf_open_llm_v1_240829_frozen.csv +eris_remix_dpo_7b,HF OpenLLM v1,74.71,,hf_open_llm_v1_240829_frozen.csv +eris_remix_dpo_7b,HFv1 ARC,72.44,,hf_open_llm_v1_240829_frozen.csv +eris_remix_dpo_7b,HFv1 GSM8K,68.84,,hf_open_llm_v1_240829_frozen.csv +eris_remix_dpo_7b,HFv1 HellaSwag,88.03,,hf_open_llm_v1_240829_frozen.csv +eris_remix_dpo_7b,HFv1 MMLU,65.29,,hf_open_llm_v1_240829_frozen.csv +eris_remix_dpo_7b,HFv1 TruthfulQA,68.92,,hf_open_llm_v1_240829_frozen.csv +eris_remix_dpo_7b,HFv1 Winogrande,84.77,,hf_open_llm_v1_240829_frozen.csv +eros_n_psyche_7b_model_stock,HF OpenLLM v1,71.76,,hf_open_llm_v1_240829_frozen.csv +eros_n_psyche_7b_model_stock,HFv1 ARC,69.2,,hf_open_llm_v1_240829_frozen.csv +eros_n_psyche_7b_model_stock,HFv1 GSM8K,66.49,,hf_open_llm_v1_240829_frozen.csv +eros_n_psyche_7b_model_stock,HFv1 HellaSwag,86.25,,hf_open_llm_v1_240829_frozen.csv +eros_n_psyche_7b_model_stock,HFv1 MMLU,65.15,,hf_open_llm_v1_240829_frozen.csv +eros_n_psyche_7b_model_stock,HFv1 TruthfulQA,62.9,,hf_open_llm_v1_240829_frozen.csv +eros_n_psyche_7b_model_stock,HFv1 Winogrande,80.58,,hf_open_llm_v1_240829_frozen.csv +eurus_70b_nca_fixed,HF OpenLLM v1,59.84,,hf_open_llm_v1_240829_frozen.csv +eurus_70b_nca_fixed,HFv1 ARC,55.63,,hf_open_llm_v1_240829_frozen.csv +eurus_70b_nca_fixed,HFv1 GSM8K,48.37,,hf_open_llm_v1_240829_frozen.csv +eurus_70b_nca_fixed,HFv1 HellaSwag,72.38,,hf_open_llm_v1_240829_frozen.csv +eurus_70b_nca_fixed,HFv1 MMLU,55.68,,hf_open_llm_v1_240829_frozen.csv +eurus_70b_nca_fixed,HFv1 TruthfulQA,54.42,,hf_open_llm_v1_240829_frozen.csv +eurus_70b_nca_fixed,HFv1 Winogrande,72.53,,hf_open_llm_v1_240829_frozen.csv +eurus_70b_sft_fixed,HF OpenLLM v1,58.32,,hf_open_llm_v1_240829_frozen.csv +eurus_70b_sft_fixed,HFv1 ARC,55.2,,hf_open_llm_v1_240829_frozen.csv +eurus_70b_sft_fixed,HFv1 GSM8K,44.35,,hf_open_llm_v1_240829_frozen.csv +eurus_70b_sft_fixed,HFv1 HellaSwag,73.33,,hf_open_llm_v1_240829_frozen.csv +eurus_70b_sft_fixed,HFv1 MMLU,55.37,,hf_open_llm_v1_240829_frozen.csv +eurus_70b_sft_fixed,HFv1 TruthfulQA,49.55,,hf_open_llm_v1_240829_frozen.csv +eurus_70b_sft_fixed,HFv1 Winogrande,72.14,,hf_open_llm_v1_240829_frozen.csv +evaloric_1_1b,HF OpenLLM v1,37.54,,hf_open_llm_v1_240829_frozen.csv +evaloric_1_1b,HFv1 ARC,35.07,,hf_open_llm_v1_240829_frozen.csv +evaloric_1_1b,HFv1 GSM8K,1.14,,hf_open_llm_v1_240829_frozen.csv +evaloric_1_1b,HFv1 HellaSwag,60.93,,hf_open_llm_v1_240829_frozen.csv +evaloric_1_1b,HFv1 MMLU,25.36,,hf_open_llm_v1_240829_frozen.csv +evaloric_1_1b,HFv1 TruthfulQA,37.78,,hf_open_llm_v1_240829_frozen.csv +evaloric_1_1b,HFv1 Winogrande,64.96,,hf_open_llm_v1_240829_frozen.csv +evaloric_1_1b_test,HF OpenLLM v1,39.43,,hf_open_llm_v1_240829_frozen.csv +evaloric_1_1b_test,HFv1 ARC,36.6,,hf_open_llm_v1_240829_frozen.csv +evaloric_1_1b_test,HFv1 GSM8K,5.0,,hf_open_llm_v1_240829_frozen.csv +evaloric_1_1b_test,HFv1 HellaSwag,60.97,,hf_open_llm_v1_240829_frozen.csv +evaloric_1_1b_test,HFv1 MMLU,26.12,,hf_open_llm_v1_240829_frozen.csv +evaloric_1_1b_test,HFv1 TruthfulQA,38.28,,hf_open_llm_v1_240829_frozen.csv +evaloric_1_1b_test,HFv1 Winogrande,69.61,,hf_open_llm_v1_240829_frozen.csv +evangelion_7b,HF OpenLLM v1,71.71,,hf_open_llm_v1_240829_frozen.csv +evangelion_7b,HFv1 ARC,68.94,,hf_open_llm_v1_240829_frozen.csv +evangelion_7b,HFv1 GSM8K,66.94,,hf_open_llm_v1_240829_frozen.csv +evangelion_7b,HFv1 HellaSwag,86.45,,hf_open_llm_v1_240829_frozen.csv +evangelion_7b,HFv1 MMLU,63.97,,hf_open_llm_v1_240829_frozen.csv +evangelion_7b,HFv1 TruthfulQA,64.01,,hf_open_llm_v1_240829_frozen.csv +evangelion_7b,HFv1 Winogrande,79.95,,hf_open_llm_v1_240829_frozen.csv +everynight_7b_slerp,HF OpenLLM v1,72.54,,hf_open_llm_v1_240829_frozen.csv +everynight_7b_slerp,HFv1 ARC,70.05,,hf_open_llm_v1_240829_frozen.csv +everynight_7b_slerp,HFv1 GSM8K,63.68,,hf_open_llm_v1_240829_frozen.csv +everynight_7b_slerp,HFv1 HellaSwag,87.7,,hf_open_llm_v1_240829_frozen.csv +everynight_7b_slerp,HFv1 MMLU,64.88,,hf_open_llm_v1_240829_frozen.csv +everynight_7b_slerp,HFv1 TruthfulQA,66.07,,hf_open_llm_v1_240829_frozen.csv +everynight_7b_slerp,HFv1 Winogrande,82.87,,hf_open_llm_v1_240829_frozen.csv +everyone_coder_33b_base,HF OpenLLM v1,49.48,,hf_open_llm_v1_240829_frozen.csv +everyone_coder_33b_base,HFv1 ARC,45.99,,hf_open_llm_v1_240829_frozen.csv +everyone_coder_33b_base,HFv1 GSM8K,39.8,,hf_open_llm_v1_240829_frozen.csv +everyone_coder_33b_base,HFv1 HellaSwag,61.71,,hf_open_llm_v1_240829_frozen.csv +everyone_coder_33b_base,HFv1 MMLU,44.05,,hf_open_llm_v1_240829_frozen.csv +everyone_coder_33b_base,HFv1 TruthfulQA,42.26,,hf_open_llm_v1_240829_frozen.csv +everyone_coder_33b_base,HFv1 Winogrande,63.06,,hf_open_llm_v1_240829_frozen.csv +everythinglm_13b_v3_peft,HF OpenLLM v1,54.24,,hf_open_llm_v1_240829_frozen.csv +everythinglm_13b_v3_peft,HFv1 ARC,58.36,,hf_open_llm_v1_240829_frozen.csv +everythinglm_13b_v3_peft,HFv1 GSM8K,5.53,,hf_open_llm_v1_240829_frozen.csv +everythinglm_13b_v3_peft,HFv1 HellaSwag,81.03,,hf_open_llm_v1_240829_frozen.csv +everythinglm_13b_v3_peft,HFv1 MMLU,54.7,,hf_open_llm_v1_240829_frozen.csv +everythinglm_13b_v3_peft,HFv1 TruthfulQA,52.98,,hf_open_llm_v1_240829_frozen.csv +everythinglm_13b_v3_peft,HFv1 Winogrande,72.85,,hf_open_llm_v1_240829_frozen.csv +ex_llm_e1,HF OpenLLM v1,43.11,,hf_open_llm_v1_240829_frozen.csv +ex_llm_e1,HFv1 ARC,39.93,,hf_open_llm_v1_240829_frozen.csv +ex_llm_e1,HFv1 GSM8K,4.32,,hf_open_llm_v1_240829_frozen.csv +ex_llm_e1,HFv1 HellaSwag,68.11,,hf_open_llm_v1_240829_frozen.csv +ex_llm_e1,HFv1 MMLU,39.44,,hf_open_llm_v1_240829_frozen.csv +ex_llm_e1,HFv1 TruthfulQA,42.01,,hf_open_llm_v1_240829_frozen.csv +ex_llm_e1,HFv1 Winogrande,64.88,,hf_open_llm_v1_240829_frozen.csv +excalibur_7b_dpo,HF OpenLLM v1,73.84,,hf_open_llm_v1_240829_frozen.csv +excalibur_7b_dpo,HFv1 ARC,70.9,,hf_open_llm_v1_240829_frozen.csv +excalibur_7b_dpo,HFv1 GSM8K,65.43,,hf_open_llm_v1_240829_frozen.csv +excalibur_7b_dpo,HFv1 HellaSwag,87.93,,hf_open_llm_v1_240829_frozen.csv +excalibur_7b_dpo,HFv1 MMLU,65.46,,hf_open_llm_v1_240829_frozen.csv +excalibur_7b_dpo,HFv1 TruthfulQA,70.82,,hf_open_llm_v1_240829_frozen.csv +excalibur_7b_dpo,HFv1 Winogrande,82.48,,hf_open_llm_v1_240829_frozen.csv +experiment26_spin_iter_0,HF OpenLLM v1,76.04,,hf_open_llm_v1_240829_frozen.csv +experiment26_spin_iter_0,HFv1 ARC,72.44,,hf_open_llm_v1_240829_frozen.csv +experiment26_spin_iter_0,HFv1 GSM8K,70.28,,hf_open_llm_v1_240829_frozen.csv +experiment26_spin_iter_0,HFv1 HellaSwag,88.74,,hf_open_llm_v1_240829_frozen.csv +experiment26_spin_iter_0,HFv1 MMLU,64.64,,hf_open_llm_v1_240829_frozen.csv +experiment26_spin_iter_0,HFv1 TruthfulQA,74.9,,hf_open_llm_v1_240829_frozen.csv +experiment26_spin_iter_0,HFv1 Winogrande,85.24,,hf_open_llm_v1_240829_frozen.csv +experiment_dpo_m7b2_1_merged,HF OpenLLM v1,59.52,,hf_open_llm_v1_240829_frozen.csv +experiment_dpo_m7b2_1_merged,HFv1 ARC,59.47,,hf_open_llm_v1_240829_frozen.csv +experiment_dpo_m7b2_1_merged,HFv1 GSM8K,34.72,,hf_open_llm_v1_240829_frozen.csv +experiment_dpo_m7b2_1_merged,HFv1 HellaSwag,82.42,,hf_open_llm_v1_240829_frozen.csv +experiment_dpo_m7b2_1_merged,HFv1 MMLU,62.21,,hf_open_llm_v1_240829_frozen.csv +experiment_dpo_m7b2_1_merged,HFv1 TruthfulQA,40.01,,hf_open_llm_v1_240829_frozen.csv +experiment_dpo_m7b2_1_merged,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv +experiment_dpo_m7b2_3_merged,HF OpenLLM v1,29.55,,hf_open_llm_v1_240829_frozen.csv +experiment_dpo_m7b2_3_merged,HFv1 ARC,29.52,,hf_open_llm_v1_240829_frozen.csv +experiment_dpo_m7b2_3_merged,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +experiment_dpo_m7b2_3_merged,HFv1 HellaSwag,25.9,,hf_open_llm_v1_240829_frozen.csv +experiment_dpo_m7b2_3_merged,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +experiment_dpo_m7b2_3_merged,HFv1 TruthfulQA,48.27,,hf_open_llm_v1_240829_frozen.csv +experiment_dpo_m7b2_3_merged,HFv1 Winogrande,50.51,,hf_open_llm_v1_240829_frozen.csv +experiment_orpo_m7b2_1_merged,HF OpenLLM v1,59.62,,hf_open_llm_v1_240829_frozen.csv +experiment_orpo_m7b2_1_merged,HFv1 ARC,59.39,,hf_open_llm_v1_240829_frozen.csv +experiment_orpo_m7b2_1_merged,HFv1 GSM8K,34.5,,hf_open_llm_v1_240829_frozen.csv +experiment_orpo_m7b2_1_merged,HFv1 HellaSwag,82.48,,hf_open_llm_v1_240829_frozen.csv +experiment_orpo_m7b2_1_merged,HFv1 MMLU,62.61,,hf_open_llm_v1_240829_frozen.csv +experiment_orpo_m7b2_1_merged,HFv1 TruthfulQA,40.38,,hf_open_llm_v1_240829_frozen.csv +experiment_orpo_m7b2_1_merged,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv +experiment_orpo_m7b2_2_merged,HF OpenLLM v1,59.54,,hf_open_llm_v1_240829_frozen.csv +experiment_orpo_m7b2_2_merged,HFv1 ARC,59.64,,hf_open_llm_v1_240829_frozen.csv +experiment_orpo_m7b2_2_merged,HFv1 GSM8K,34.42,,hf_open_llm_v1_240829_frozen.csv +experiment_orpo_m7b2_2_merged,HFv1 HellaSwag,82.44,,hf_open_llm_v1_240829_frozen.csv +experiment_orpo_m7b2_2_merged,HFv1 MMLU,62.25,,hf_open_llm_v1_240829_frozen.csv +experiment_orpo_m7b2_2_merged,HFv1 TruthfulQA,40.09,,hf_open_llm_v1_240829_frozen.csv +experiment_orpo_m7b2_2_merged,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_1_merged,HF OpenLLM v1,56.93,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_1_merged,HFv1 ARC,56.83,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_1_merged,HFv1 GSM8K,25.32,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_1_merged,HFv1 HellaSwag,79.75,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_1_merged,HFv1 MMLU,56.76,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_1_merged,HFv1 TruthfulQA,46.29,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_1_merged,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_2_merged,HF OpenLLM v1,59.59,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_2_merged,HFv1 ARC,59.3,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_2_merged,HFv1 GSM8K,34.57,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_2_merged,HFv1 HellaSwag,82.47,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_2_merged,HFv1 MMLU,62.42,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_2_merged,HFv1 TruthfulQA,40.25,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_2_merged,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_3_merged,HF OpenLLM v1,59.55,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_3_merged,HFv1 ARC,59.56,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_3_merged,HFv1 GSM8K,34.57,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_3_merged,HFv1 HellaSwag,82.39,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_3_merged,HFv1 MMLU,62.3,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_3_merged,HFv1 TruthfulQA,40.04,,hf_open_llm_v1_240829_frozen.csv +experiment_sft_m7b2_3_merged,HFv1 Winogrande,78.45,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_125m_qcqa_ub_6_best_for_kv_cache,HF OpenLLM v1,28.66,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_125m_qcqa_ub_6_best_for_kv_cache,HFv1 ARC,24.23,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_125m_qcqa_ub_6_best_for_kv_cache,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_125m_qcqa_ub_6_best_for_kv_cache,HFv1 HellaSwag,25.0,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_125m_qcqa_ub_6_best_for_kv_cache,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_125m_qcqa_ub_6_best_for_kv_cache,HFv1 TruthfulQA,48.41,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_125m_qcqa_ub_6_best_for_kv_cache,HFv1 Winogrande,51.22,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_125m_qcqa_ub_6_best_for_q_loss,HF OpenLLM v1,28.37,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_125m_qcqa_ub_6_best_for_q_loss,HFv1 ARC,23.29,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_125m_qcqa_ub_6_best_for_q_loss,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_125m_qcqa_ub_6_best_for_q_loss,HFv1 HellaSwag,25.57,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_125m_qcqa_ub_6_best_for_q_loss,HFv1 MMLU,23.15,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_125m_qcqa_ub_6_best_for_q_loss,HFv1 TruthfulQA,49.03,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_125m_qcqa_ub_6_best_for_q_loss,HFv1 Winogrande,49.17,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_gqa_ub_16_best_for_kv_cache,HF OpenLLM v1,28.84,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_gqa_ub_16_best_for_kv_cache,HFv1 ARC,23.04,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_gqa_ub_16_best_for_kv_cache,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_gqa_ub_16_best_for_kv_cache,HFv1 HellaSwag,25.94,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_gqa_ub_16_best_for_kv_cache,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_gqa_ub_16_best_for_kv_cache,HFv1 TruthfulQA,48.99,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_gqa_ub_16_best_for_kv_cache,HFv1 Winogrande,51.93,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_qcqa_ub_16_best_for_kv_cache,HF OpenLLM v1,28.58,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_qcqa_ub_16_best_for_kv_cache,HFv1 ARC,23.81,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_qcqa_ub_16_best_for_kv_cache,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_qcqa_ub_16_best_for_kv_cache,HFv1 HellaSwag,27.05,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_qcqa_ub_16_best_for_kv_cache,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_qcqa_ub_16_best_for_kv_cache,HFv1 TruthfulQA,46.69,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_qcqa_ub_16_best_for_kv_cache,HFv1 Winogrande,50.83,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_qcqa_ub_16_best_for_q_loss,HF OpenLLM v1,28.25,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_qcqa_ub_16_best_for_q_loss,HFv1 ARC,21.67,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_qcqa_ub_16_best_for_q_loss,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_qcqa_ub_16_best_for_q_loss,HFv1 HellaSwag,26.65,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_qcqa_ub_16_best_for_q_loss,HFv1 MMLU,23.15,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_qcqa_ub_16_best_for_q_loss,HFv1 TruthfulQA,46.81,,hf_open_llm_v1_240829_frozen.csv +facebook_opt_6_7b_qcqa_ub_16_best_for_q_loss,HFv1 Winogrande,51.22,,hf_open_llm_v1_240829_frozen.csv +falcon_11b,HF OpenLLM v1,64.28,,hf_open_llm_v1_240829_frozen.csv +falcon_11b,HFv1 ARC,59.73,,hf_open_llm_v1_240829_frozen.csv +falcon_11b,HFv1 GSM8K,53.83,,hf_open_llm_v1_240829_frozen.csv +falcon_11b,HFv1 HellaSwag,82.91,,hf_open_llm_v1_240829_frozen.csv +falcon_11b,HFv1 MMLU,58.37,,hf_open_llm_v1_240829_frozen.csv +falcon_11b,HFv1 TruthfulQA,52.56,,hf_open_llm_v1_240829_frozen.csv +falcon_11b,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv +falcon_180b,HF OpenLLM v1,67.85,,hf_open_llm_v1_240829_frozen.csv +falcon_180b,HFv1 ARC,69.45,,hf_open_llm_v1_240829_frozen.csv +falcon_180b,HFv1 GSM8K,45.94,,hf_open_llm_v1_240829_frozen.csv +falcon_180b,HFv1 HellaSwag,88.89,,hf_open_llm_v1_240829_frozen.csv +falcon_180b,HFv1 MMLU,70.5,,hf_open_llm_v1_240829_frozen.csv +falcon_180b,HFv1 TruthfulQA,45.47,,hf_open_llm_v1_240829_frozen.csv +falcon_180b,HFv1 Winogrande,86.9,,hf_open_llm_v1_240829_frozen.csv +falcon_1b_t_sft,HF OpenLLM v1,35.02,,hf_open_llm_v1_240829_frozen.csv +falcon_1b_t_sft,HFv1 ARC,32.94,,hf_open_llm_v1_240829_frozen.csv +falcon_1b_t_sft,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +falcon_1b_t_sft,HFv1 HellaSwag,57.24,,hf_open_llm_v1_240829_frozen.csv +falcon_1b_t_sft,HFv1 MMLU,25.26,,hf_open_llm_v1_240829_frozen.csv +falcon_1b_t_sft,HFv1 TruthfulQA,38.49,,hf_open_llm_v1_240829_frozen.csv +falcon_1b_t_sft,HFv1 Winogrande,55.88,,hf_open_llm_v1_240829_frozen.csv +falcon_40b,HF OpenLLM v1,58.07,,hf_open_llm_v1_240829_frozen.csv +falcon_40b,HFv1 ARC,61.86,,hf_open_llm_v1_240829_frozen.csv +falcon_40b,HFv1 GSM8K,21.46,,hf_open_llm_v1_240829_frozen.csv +falcon_40b,HFv1 HellaSwag,85.28,,hf_open_llm_v1_240829_frozen.csv +falcon_40b,HFv1 MMLU,56.89,,hf_open_llm_v1_240829_frozen.csv +falcon_40b,HFv1 TruthfulQA,41.65,,hf_open_llm_v1_240829_frozen.csv +falcon_40b,HFv1 Winogrande,81.29,,hf_open_llm_v1_240829_frozen.csv +falcon_7b,HF OpenLLM v1,44.17,,hf_open_llm_v1_240829_frozen.csv +falcon_7b,HFv1 ARC,47.87,,hf_open_llm_v1_240829_frozen.csv +falcon_7b,HFv1 GSM8K,4.62,,hf_open_llm_v1_240829_frozen.csv +falcon_7b,HFv1 HellaSwag,78.13,,hf_open_llm_v1_240829_frozen.csv +falcon_7b,HFv1 MMLU,27.79,,hf_open_llm_v1_240829_frozen.csv +falcon_7b,HFv1 TruthfulQA,34.26,,hf_open_llm_v1_240829_frozen.csv +falcon_7b,HFv1 Winogrande,72.38,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_3epoch_norobots,HF OpenLLM v1,43.65,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_3epoch_norobots,HFv1 ARC,47.61,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_3epoch_norobots,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_3epoch_norobots,HFv1 HellaSwag,77.24,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_3epoch_norobots,HFv1 MMLU,29.73,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_3epoch_norobots,HFv1 TruthfulQA,36.27,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_3epoch_norobots,HFv1 Winogrande,69.53,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_instruct,HF OpenLLM v1,43.26,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_instruct,HFv1 ARC,46.16,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_instruct,HFv1 GSM8K,4.7,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_instruct,HFv1 HellaSwag,70.85,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_instruct,HFv1 MMLU,25.84,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_instruct,HFv1 TruthfulQA,44.08,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_instruct,HFv1 Winogrande,67.96,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_norobots,HF OpenLLM v1,44.46,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_norobots,HFv1 ARC,47.87,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_norobots,HFv1 GSM8K,4.47,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_norobots,HFv1 HellaSwag,77.92,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_norobots,HFv1 MMLU,27.94,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_norobots,HFv1 TruthfulQA,36.81,,hf_open_llm_v1_240829_frozen.csv +falcon_7b_norobots,HFv1 Winogrande,71.74,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b,HF OpenLLM v1,37.07,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b,HFv1 ARC,35.07,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b,HFv1 HellaSwag,63.56,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b,HFv1 MMLU,25.28,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b,HFv1 TruthfulQA,35.96,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b,HFv1 Winogrande,62.04,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b_chat,HF OpenLLM v1,37.37,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b_chat,HFv1 ARC,35.58,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b_chat,HFv1 GSM8K,1.67,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b_chat,HFv1 HellaSwag,61.12,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b_chat,HFv1 MMLU,24.51,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b_chat,HFv1 TruthfulQA,39.62,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b_chat,HFv1 Winogrande,61.72,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b_instruct_openorca,HF OpenLLM v1,37.63,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b_instruct_openorca,HFv1 ARC,34.56,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b_instruct_openorca,HFv1 GSM8K,3.41,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b_instruct_openorca,HFv1 HellaSwag,60.93,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b_instruct_openorca,HFv1 MMLU,28.77,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b_instruct_openorca,HFv1 TruthfulQA,37.42,,hf_open_llm_v1_240829_frozen.csv +falcon_rw_1b_instruct_openorca,HFv1 Winogrande,60.69,,hf_open_llm_v1_240829_frozen.csv +faro_yi_34b,HF OpenLLM v1,71.88,,hf_open_llm_v1_240829_frozen.csv +faro_yi_34b,HFv1 ARC,66.55,,hf_open_llm_v1_240829_frozen.csv +faro_yi_34b,HFv1 GSM8K,65.28,,hf_open_llm_v1_240829_frozen.csv +faro_yi_34b,HFv1 HellaSwag,83.53,,hf_open_llm_v1_240829_frozen.csv +faro_yi_34b,HFv1 MMLU,76.6,,hf_open_llm_v1_240829_frozen.csv +faro_yi_34b,HFv1 TruthfulQA,55.64,,hf_open_llm_v1_240829_frozen.csv +faro_yi_34b,HFv1 Winogrande,83.66,,hf_open_llm_v1_240829_frozen.csv +faro_yi_34b_200k,HF OpenLLM v1,71.88,,hf_open_llm_v1_240829_frozen.csv +faro_yi_34b_200k,HFv1 ARC,66.55,,hf_open_llm_v1_240829_frozen.csv +faro_yi_34b_200k,HFv1 GSM8K,65.28,,hf_open_llm_v1_240829_frozen.csv +faro_yi_34b_200k,HFv1 HellaSwag,83.53,,hf_open_llm_v1_240829_frozen.csv +faro_yi_34b_200k,HFv1 MMLU,76.6,,hf_open_llm_v1_240829_frozen.csv +faro_yi_34b_200k,HFv1 TruthfulQA,55.64,,hf_open_llm_v1_240829_frozen.csv +faro_yi_34b_200k,HFv1 Winogrande,83.66,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b,HF OpenLLM v1,66.37,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b,HFv1 GSM8K,63.0,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b,HFv1 HellaSwag,76.95,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b,HFv1 MMLU,70.77,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b,HFv1 TruthfulQA,50.17,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b,HFv1 Winogrande,76.09,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b_200k,HF OpenLLM v1,66.37,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b_200k,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b_200k,HFv1 GSM8K,63.0,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b_200k,HFv1 HellaSwag,76.95,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b_200k,HFv1 MMLU,70.77,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b_200k,HFv1 TruthfulQA,50.17,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b_200k,HFv1 Winogrande,76.09,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b_dpo,HF OpenLLM v1,68.77,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b_dpo,HFv1 ARC,64.16,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b_dpo,HFv1 GSM8K,64.75,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b_dpo,HFv1 HellaSwag,78.92,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b_dpo,HFv1 MMLU,70.74,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b_dpo,HFv1 TruthfulQA,56.25,,hf_open_llm_v1_240829_frozen.csv +faro_yi_9b_dpo,HFv1 Winogrande,77.82,,hf_open_llm_v1_240829_frozen.csv +fasciculus_arcuatus_7b_slerp,HF OpenLLM v1,76.07,,hf_open_llm_v1_240829_frozen.csv +fasciculus_arcuatus_7b_slerp,HFv1 ARC,73.55,,hf_open_llm_v1_240829_frozen.csv +fasciculus_arcuatus_7b_slerp,HFv1 GSM8K,71.04,,hf_open_llm_v1_240829_frozen.csv +fasciculus_arcuatus_7b_slerp,HFv1 HellaSwag,88.95,,hf_open_llm_v1_240829_frozen.csv +fasciculus_arcuatus_7b_slerp,HFv1 MMLU,64.65,,hf_open_llm_v1_240829_frozen.csv +fasciculus_arcuatus_7b_slerp,HFv1 TruthfulQA,72.53,,hf_open_llm_v1_240829_frozen.csv +fasciculus_arcuatus_7b_slerp,HFv1 Winogrande,85.71,,hf_open_llm_v1_240829_frozen.csv +fbopt_350m_8bit,HF OpenLLM v1,30.21,,hf_open_llm_v1_240829_frozen.csv +fbopt_350m_8bit,HFv1 ARC,23.55,,hf_open_llm_v1_240829_frozen.csv +fbopt_350m_8bit,HFv1 GSM8K,1.29,,hf_open_llm_v1_240829_frozen.csv +fbopt_350m_8bit,HFv1 HellaSwag,36.6,,hf_open_llm_v1_240829_frozen.csv +fbopt_350m_8bit,HFv1 MMLU,26.22,,hf_open_llm_v1_240829_frozen.csv +fbopt_350m_8bit,HFv1 TruthfulQA,40.97,,hf_open_llm_v1_240829_frozen.csv +fbopt_350m_8bit,HFv1 Winogrande,52.64,,hf_open_llm_v1_240829_frozen.csv +felix_8b,HF OpenLLM v1,67.1,,hf_open_llm_v1_240829_frozen.csv +felix_8b,HFv1 ARC,65.02,,hf_open_llm_v1_240829_frozen.csv +felix_8b,HFv1 GSM8K,51.78,,hf_open_llm_v1_240829_frozen.csv +felix_8b,HFv1 HellaSwag,84.61,,hf_open_llm_v1_240829_frozen.csv +felix_8b,HFv1 MMLU,61.05,,hf_open_llm_v1_240829_frozen.csv +felix_8b,HFv1 TruthfulQA,64.23,,hf_open_llm_v1_240829_frozen.csv +felix_8b,HFv1 Winogrande,75.93,,hf_open_llm_v1_240829_frozen.csv +ferret_7b,HF OpenLLM v1,53.93,,hf_open_llm_v1_240829_frozen.csv +ferret_7b,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv +ferret_7b,HFv1 GSM8K,2.05,,hf_open_llm_v1_240829_frozen.csv +ferret_7b,HFv1 HellaSwag,81.33,,hf_open_llm_v1_240829_frozen.csv +ferret_7b,HFv1 MMLU,60.27,,hf_open_llm_v1_240829_frozen.csv +ferret_7b,HFv1 TruthfulQA,40.01,,hf_open_llm_v1_240829_frozen.csv +ferret_7b,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv +fettuccinelake_dpo_7b_slerp,HF OpenLLM v1,69.09,,hf_open_llm_v1_240829_frozen.csv +fettuccinelake_dpo_7b_slerp,HFv1 ARC,67.92,,hf_open_llm_v1_240829_frozen.csv +fettuccinelake_dpo_7b_slerp,HFv1 GSM8K,47.76,,hf_open_llm_v1_240829_frozen.csv +fettuccinelake_dpo_7b_slerp,HFv1 HellaSwag,86.37,,hf_open_llm_v1_240829_frozen.csv +fettuccinelake_dpo_7b_slerp,HFv1 MMLU,63.24,,hf_open_llm_v1_240829_frozen.csv +fettuccinelake_dpo_7b_slerp,HFv1 TruthfulQA,68.64,,hf_open_llm_v1_240829_frozen.csv +fettuccinelake_dpo_7b_slerp,HFv1 Winogrande,80.58,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v3,HF OpenLLM v1,34.58,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v3,HFv1 ARC,30.97,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v3,HFv1 GSM8K,1.29,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v3,HFv1 HellaSwag,48.83,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v3,HFv1 MMLU,26.36,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v3,HFv1 TruthfulQA,40.58,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v3,HFv1 Winogrande,59.43,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v3_1,HF OpenLLM v1,34.11,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v3_1,HFv1 ARC,29.95,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v3_1,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v3_1,HFv1 HellaSwag,47.28,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v3_1,HFv1 MMLU,25.41,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v3_1,HFv1 TruthfulQA,43.03,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v3_1,HFv1 Winogrande,58.48,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v4,HF OpenLLM v1,34.18,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v4,HFv1 ARC,29.69,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v4,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v4,HFv1 HellaSwag,47.37,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v4,HFv1 MMLU,25.09,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v4,HFv1 TruthfulQA,43.65,,hf_open_llm_v1_240829_frozen.csv +fialka_13b_v4,HFv1 Winogrande,58.88,,hf_open_llm_v1_240829_frozen.csv +fialka_7b_v3,HF OpenLLM v1,46.4,,hf_open_llm_v1_240829_frozen.csv +fialka_7b_v3,HFv1 ARC,48.55,,hf_open_llm_v1_240829_frozen.csv +fialka_7b_v3,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv +fialka_7b_v3,HFv1 HellaSwag,71.05,,hf_open_llm_v1_240829_frozen.csv +fialka_7b_v3,HFv1 MMLU,43.06,,hf_open_llm_v1_240829_frozen.csv +fialka_7b_v3,HFv1 TruthfulQA,44.79,,hf_open_llm_v1_240829_frozen.csv +fialka_7b_v3,HFv1 Winogrande,69.46,,hf_open_llm_v1_240829_frozen.csv +fietje_2b,HF OpenLLM v1,51.59,,hf_open_llm_v1_240829_frozen.csv +fietje_2b,HFv1 ARC,53.5,,hf_open_llm_v1_240829_frozen.csv +fietje_2b,HFv1 GSM8K,27.9,,hf_open_llm_v1_240829_frozen.csv +fietje_2b,HFv1 HellaSwag,67.11,,hf_open_llm_v1_240829_frozen.csv +fietje_2b,HFv1 MMLU,49.3,,hf_open_llm_v1_240829_frozen.csv +fietje_2b,HFv1 TruthfulQA,39.93,,hf_open_llm_v1_240829_frozen.csv +fietje_2b,HFv1 Winogrande,71.82,,hf_open_llm_v1_240829_frozen.csv +fietje_2b_chat,HF OpenLLM v1,48.75,,hf_open_llm_v1_240829_frozen.csv +fietje_2b_chat,HFv1 ARC,54.01,,hf_open_llm_v1_240829_frozen.csv +fietje_2b_chat,HFv1 GSM8K,6.14,,hf_open_llm_v1_240829_frozen.csv +fietje_2b_chat,HFv1 HellaSwag,68.92,,hf_open_llm_v1_240829_frozen.csv +fietje_2b_chat,HFv1 MMLU,49.92,,hf_open_llm_v1_240829_frozen.csv +fietje_2b_chat,HFv1 TruthfulQA,41.94,,hf_open_llm_v1_240829_frozen.csv +fietje_2b_chat,HFv1 Winogrande,71.59,,hf_open_llm_v1_240829_frozen.csv +fietje_2b_instruct,HF OpenLLM v1,50.3,,hf_open_llm_v1_240829_frozen.csv +fietje_2b_instruct,HFv1 ARC,53.41,,hf_open_llm_v1_240829_frozen.csv +fietje_2b_instruct,HFv1 GSM8K,14.71,,hf_open_llm_v1_240829_frozen.csv +fietje_2b_instruct,HFv1 HellaSwag,68.08,,hf_open_llm_v1_240829_frozen.csv +fietje_2b_instruct,HFv1 MMLU,49.74,,hf_open_llm_v1_240829_frozen.csv +fietje_2b_instruct,HFv1 TruthfulQA,43.47,,hf_open_llm_v1_240829_frozen.csv +fietje_2b_instruct,HFv1 Winogrande,72.38,,hf_open_llm_v1_240829_frozen.csv +flan_llama_7b_2_llama2_7b_flash_868_full_model,HF OpenLLM v1,49.64,,hf_open_llm_v1_240829_frozen.csv +flan_llama_7b_2_llama2_7b_flash_868_full_model,HFv1 ARC,52.47,,hf_open_llm_v1_240829_frozen.csv +flan_llama_7b_2_llama2_7b_flash_868_full_model,HFv1 GSM8K,6.82,,hf_open_llm_v1_240829_frozen.csv +flan_llama_7b_2_llama2_7b_flash_868_full_model,HFv1 HellaSwag,79.08,,hf_open_llm_v1_240829_frozen.csv +flan_llama_7b_2_llama2_7b_flash_868_full_model,HFv1 MMLU,47.58,,hf_open_llm_v1_240829_frozen.csv +flan_llama_7b_2_llama2_7b_flash_868_full_model,HFv1 TruthfulQA,37.14,,hf_open_llm_v1_240829_frozen.csv +flan_llama_7b_2_llama2_7b_flash_868_full_model,HFv1 Winogrande,74.74,,hf_open_llm_v1_240829_frozen.csv +flor_1_3b_xat,HF OpenLLM v1,32.27,,hf_open_llm_v1_240829_frozen.csv +flor_1_3b_xat,HFv1 ARC,26.79,,hf_open_llm_v1_240829_frozen.csv +flor_1_3b_xat,HFv1 GSM8K,0.76,,hf_open_llm_v1_240829_frozen.csv +flor_1_3b_xat,HFv1 HellaSwag,41.63,,hf_open_llm_v1_240829_frozen.csv +flor_1_3b_xat,HFv1 MMLU,26.65,,hf_open_llm_v1_240829_frozen.csv +flor_1_3b_xat,HFv1 TruthfulQA,44.38,,hf_open_llm_v1_240829_frozen.csv +flor_1_3b_xat,HFv1 Winogrande,53.43,,hf_open_llm_v1_240829_frozen.csv +flora_dpo_7b,HF OpenLLM v1,74.26,,hf_open_llm_v1_240829_frozen.csv +flora_dpo_7b,HFv1 ARC,71.76,,hf_open_llm_v1_240829_frozen.csv +flora_dpo_7b,HFv1 GSM8K,65.81,,hf_open_llm_v1_240829_frozen.csv +flora_dpo_7b,HFv1 HellaSwag,88.28,,hf_open_llm_v1_240829_frozen.csv +flora_dpo_7b,HFv1 MMLU,64.13,,hf_open_llm_v1_240829_frozen.csv +flora_dpo_7b,HFv1 TruthfulQA,71.08,,hf_open_llm_v1_240829_frozen.csv +flora_dpo_7b,HFv1 Winogrande,84.53,,hf_open_llm_v1_240829_frozen.csv +flyingllama_v2,HF OpenLLM v1,30.19,,hf_open_llm_v1_240829_frozen.csv +flyingllama_v2,HFv1 ARC,24.74,,hf_open_llm_v1_240829_frozen.csv +flyingllama_v2,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +flyingllama_v2,HFv1 HellaSwag,38.44,,hf_open_llm_v1_240829_frozen.csv +flyingllama_v2,HFv1 MMLU,26.37,,hf_open_llm_v1_240829_frozen.csv +flyingllama_v2,HFv1 TruthfulQA,41.3,,hf_open_llm_v1_240829_frozen.csv +flyingllama_v2,HFv1 Winogrande,50.28,,hf_open_llm_v1_240829_frozen.csv +franken_solar_18b_v1_0,HF OpenLLM v1,67.03,,hf_open_llm_v1_240829_frozen.csv +franken_solar_18b_v1_0,HFv1 ARC,65.53,,hf_open_llm_v1_240829_frozen.csv +franken_solar_18b_v1_0,HFv1 GSM8K,45.79,,hf_open_llm_v1_240829_frozen.csv +franken_solar_18b_v1_0,HFv1 HellaSwag,86.45,,hf_open_llm_v1_240829_frozen.csv +franken_solar_18b_v1_0,HFv1 MMLU,63.72,,hf_open_llm_v1_240829_frozen.csv +franken_solar_18b_v1_0,HFv1 TruthfulQA,62.14,,hf_open_llm_v1_240829_frozen.csv +franken_solar_18b_v1_0,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv +frankenmonarch_7b,HF OpenLLM v1,71.67,,hf_open_llm_v1_240829_frozen.csv +frankenmonarch_7b,HFv1 ARC,71.59,,hf_open_llm_v1_240829_frozen.csv +frankenmonarch_7b,HFv1 GSM8K,48.67,,hf_open_llm_v1_240829_frozen.csv +frankenmonarch_7b,HFv1 HellaSwag,88.59,,hf_open_llm_v1_240829_frozen.csv +frankenmonarch_7b,HFv1 MMLU,63.93,,hf_open_llm_v1_240829_frozen.csv +frankenmonarch_7b,HFv1 TruthfulQA,73.69,,hf_open_llm_v1_240829_frozen.csv +frankenmonarch_7b,HFv1 Winogrande,83.58,,hf_open_llm_v1_240829_frozen.csv +free_llama3_dpo_v0_2,HF OpenLLM v1,62.69,,hf_open_llm_v1_240829_frozen.csv +free_llama3_dpo_v0_2,HFv1 ARC,59.9,,hf_open_llm_v1_240829_frozen.csv +free_llama3_dpo_v0_2,HFv1 GSM8K,44.66,,hf_open_llm_v1_240829_frozen.csv +free_llama3_dpo_v0_2,HFv1 HellaSwag,81.88,,hf_open_llm_v1_240829_frozen.csv +free_llama3_dpo_v0_2,HFv1 MMLU,66.59,,hf_open_llm_v1_240829_frozen.csv +free_llama3_dpo_v0_2,HFv1 TruthfulQA,45.83,,hf_open_llm_v1_240829_frozen.csv +free_llama3_dpo_v0_2,HFv1 Winogrande,77.27,,hf_open_llm_v1_240829_frozen.csv +freeze_kosolar_10_7b_v0_2_1_4_dedup,HF OpenLLM v1,60.06,,hf_open_llm_v1_240829_frozen.csv +freeze_kosolar_10_7b_v0_2_1_4_dedup,HFv1 ARC,58.45,,hf_open_llm_v1_240829_frozen.csv +freeze_kosolar_10_7b_v0_2_1_4_dedup,HFv1 GSM8K,32.22,,hf_open_llm_v1_240829_frozen.csv +freeze_kosolar_10_7b_v0_2_1_4_dedup,HFv1 HellaSwag,81.26,,hf_open_llm_v1_240829_frozen.csv +freeze_kosolar_10_7b_v0_2_1_4_dedup,HFv1 MMLU,64.83,,hf_open_llm_v1_240829_frozen.csv +freeze_kosolar_10_7b_v0_2_1_4_dedup,HFv1 TruthfulQA,44.5,,hf_open_llm_v1_240829_frozen.csv +freeze_kosolar_10_7b_v0_2_1_4_dedup,HFv1 Winogrande,79.08,,hf_open_llm_v1_240829_frozen.csv +fsfairx_zephyr_chat_v0_1,HF OpenLLM v1,61.2,,hf_open_llm_v1_240829_frozen.csv +fsfairx_zephyr_chat_v0_1,HFv1 ARC,63.31,,hf_open_llm_v1_240829_frozen.csv +fsfairx_zephyr_chat_v0_1,HFv1 GSM8K,27.22,,hf_open_llm_v1_240829_frozen.csv +fsfairx_zephyr_chat_v0_1,HFv1 HellaSwag,84.42,,hf_open_llm_v1_240829_frozen.csv +fsfairx_zephyr_chat_v0_1,HFv1 MMLU,61.21,,hf_open_llm_v1_240829_frozen.csv +fsfairx_zephyr_chat_v0_1,HFv1 TruthfulQA,53.56,,hf_open_llm_v1_240829_frozen.csv +fsfairx_zephyr_chat_v0_1,HFv1 Winogrande,77.51,,hf_open_llm_v1_240829_frozen.csv +fusellm_7b,HF OpenLLM v1,51.07,,hf_open_llm_v1_240829_frozen.csv +fusellm_7b,HFv1 ARC,53.24,,hf_open_llm_v1_240829_frozen.csv +fusellm_7b,HFv1 GSM8K,14.33,,hf_open_llm_v1_240829_frozen.csv +fusellm_7b,HFv1 HellaSwag,78.72,,hf_open_llm_v1_240829_frozen.csv +fusellm_7b,HFv1 MMLU,47.93,,hf_open_llm_v1_240829_frozen.csv +fusellm_7b,HFv1 TruthfulQA,38.17,,hf_open_llm_v1_240829_frozen.csv +fusellm_7b,HFv1 Winogrande,74.03,,hf_open_llm_v1_240829_frozen.csv +fusionnet_7bx2_moe_ko_dpo_adapter_attach,HF OpenLLM v1,76.09,,hf_open_llm_v1_240829_frozen.csv +fusionnet_7bx2_moe_ko_dpo_adapter_attach,HFv1 ARC,73.89,,hf_open_llm_v1_240829_frozen.csv +fusionnet_7bx2_moe_ko_dpo_adapter_attach,HFv1 GSM8K,69.83,,hf_open_llm_v1_240829_frozen.csv +fusionnet_7bx2_moe_ko_dpo_adapter_attach,HFv1 HellaSwag,88.94,,hf_open_llm_v1_240829_frozen.csv +fusionnet_7bx2_moe_ko_dpo_adapter_attach,HFv1 MMLU,65.03,,hf_open_llm_v1_240829_frozen.csv +fusionnet_7bx2_moe_ko_dpo_adapter_attach,HFv1 TruthfulQA,71.24,,hf_open_llm_v1_240829_frozen.csv +fusionnet_7bx2_moe_ko_dpo_adapter_attach,HFv1 Winogrande,87.61,,hf_open_llm_v1_240829_frozen.csv +gaja_v1_00,HF OpenLLM v1,47.69,,hf_open_llm_v1_240829_frozen.csv +gaja_v1_00,HFv1 ARC,52.82,,hf_open_llm_v1_240829_frozen.csv +gaja_v1_00,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv +gaja_v1_00,HFv1 HellaSwag,76.31,,hf_open_llm_v1_240829_frozen.csv +gaja_v1_00,HFv1 MMLU,40.83,,hf_open_llm_v1_240829_frozen.csv +gaja_v1_00,HFv1 TruthfulQA,44.64,,hf_open_llm_v1_240829_frozen.csv +gaja_v1_00,HFv1 Winogrande,70.64,,hf_open_llm_v1_240829_frozen.csv +gaja_v2_00_dpo,HF OpenLLM v1,46.91,,hf_open_llm_v1_240829_frozen.csv +gaja_v2_00_dpo,HFv1 ARC,51.71,,hf_open_llm_v1_240829_frozen.csv +gaja_v2_00_dpo,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +gaja_v2_00_dpo,HFv1 HellaSwag,75.87,,hf_open_llm_v1_240829_frozen.csv +gaja_v2_00_dpo,HFv1 MMLU,40.79,,hf_open_llm_v1_240829_frozen.csv +gaja_v2_00_dpo,HFv1 TruthfulQA,41.29,,hf_open_llm_v1_240829_frozen.csv +gaja_v2_00_dpo,HFv1 Winogrande,71.59,,hf_open_llm_v1_240829_frozen.csv +galpaca_30b_miniorca,HF OpenLLM v1,42.23,,hf_open_llm_v1_240829_frozen.csv +galpaca_30b_miniorca,HFv1 ARC,48.89,,hf_open_llm_v1_240829_frozen.csv +galpaca_30b_miniorca,HFv1 GSM8K,1.82,,hf_open_llm_v1_240829_frozen.csv +galpaca_30b_miniorca,HFv1 HellaSwag,57.8,,hf_open_llm_v1_240829_frozen.csv +galpaca_30b_miniorca,HFv1 MMLU,43.72,,hf_open_llm_v1_240829_frozen.csv +galpaca_30b_miniorca,HFv1 TruthfulQA,41.1,,hf_open_llm_v1_240829_frozen.csv +galpaca_30b_miniorca,HFv1 Winogrande,60.06,,hf_open_llm_v1_240829_frozen.csv +garrulus,HF OpenLLM v1,75.16,,hf_open_llm_v1_240829_frozen.csv +garrulus,HFv1 ARC,73.29,,hf_open_llm_v1_240829_frozen.csv +garrulus,HFv1 GSM8K,64.52,,hf_open_llm_v1_240829_frozen.csv +garrulus,HFv1 HellaSwag,88.87,,hf_open_llm_v1_240829_frozen.csv +garrulus,HFv1 MMLU,64.57,,hf_open_llm_v1_240829_frozen.csv +garrulus,HFv1 TruthfulQA,68.23,,hf_open_llm_v1_240829_frozen.csv +garrulus,HFv1 Winogrande,91.48,,hf_open_llm_v1_240829_frozen.csv +geitje_7b_chat_v2,HF OpenLLM v1,50.79,,hf_open_llm_v1_240829_frozen.csv +geitje_7b_chat_v2,HFv1 ARC,50.34,,hf_open_llm_v1_240829_frozen.csv +geitje_7b_chat_v2,HFv1 GSM8K,16.22,,hf_open_llm_v1_240829_frozen.csv +geitje_7b_chat_v2,HFv1 HellaSwag,74.13,,hf_open_llm_v1_240829_frozen.csv +geitje_7b_chat_v2,HFv1 MMLU,49.0,,hf_open_llm_v1_240829_frozen.csv +geitje_7b_chat_v2,HFv1 TruthfulQA,43.55,,hf_open_llm_v1_240829_frozen.csv +geitje_7b_chat_v2,HFv1 Winogrande,71.51,,hf_open_llm_v1_240829_frozen.csv +geitje_7b_ultra,HF OpenLLM v1,52.61,,hf_open_llm_v1_240829_frozen.csv +geitje_7b_ultra,HFv1 ARC,45.48,,hf_open_llm_v1_240829_frozen.csv +geitje_7b_ultra,HFv1 GSM8K,19.41,,hf_open_llm_v1_240829_frozen.csv +geitje_7b_ultra,HFv1 HellaSwag,75.5,,hf_open_llm_v1_240829_frozen.csv +geitje_7b_ultra,HFv1 MMLU,50.16,,hf_open_llm_v1_240829_frozen.csv +geitje_7b_ultra,HFv1 TruthfulQA,53.36,,hf_open_llm_v1_240829_frozen.csv +geitje_7b_ultra,HFv1 Winogrande,71.74,,hf_open_llm_v1_240829_frozen.csv +gem_14b_instruct,HF OpenLLM v1,60.9,,hf_open_llm_v1_240829_frozen.csv +gem_14b_instruct,HFv1 ARC,54.61,,hf_open_llm_v1_240829_frozen.csv +gem_14b_instruct,HFv1 GSM8K,46.93,,hf_open_llm_v1_240829_frozen.csv +gem_14b_instruct,HFv1 HellaSwag,77.69,,hf_open_llm_v1_240829_frozen.csv +gem_14b_instruct,HFv1 MMLU,66.54,,hf_open_llm_v1_240829_frozen.csv +gem_14b_instruct,HFv1 TruthfulQA,45.38,,hf_open_llm_v1_240829_frozen.csv +gem_14b_instruct,HFv1 Winogrande,74.27,,hf_open_llm_v1_240829_frozen.csv +gemma_1_1_7b_it,HF OpenLLM v1,60.09,,hf_open_llm_v1_240829_frozen.csv +gemma_1_1_7b_it,HFv1 ARC,60.07,,hf_open_llm_v1_240829_frozen.csv +gemma_1_1_7b_it,HFv1 GSM8K,42.99,,hf_open_llm_v1_240829_frozen.csv +gemma_1_1_7b_it,HFv1 HellaSwag,76.21,,hf_open_llm_v1_240829_frozen.csv +gemma_1_1_7b_it,HFv1 MMLU,60.92,,hf_open_llm_v1_240829_frozen.csv +gemma_1_1_7b_it,HFv1 TruthfulQA,50.74,,hf_open_llm_v1_240829_frozen.csv +gemma_1_1_7b_it,HFv1 Winogrande,69.93,,hf_open_llm_v1_240829_frozen.csv +gemma_2b,HF OpenLLM v1,46.51,,hf_open_llm_v1_240829_frozen.csv +gemma_2b,HFv1 ARC,48.46,,hf_open_llm_v1_240829_frozen.csv +gemma_2b,HFv1 GSM8K,17.36,,hf_open_llm_v1_240829_frozen.csv +gemma_2b,HFv1 HellaSwag,71.77,,hf_open_llm_v1_240829_frozen.csv +gemma_2b,HFv1 MMLU,41.77,,hf_open_llm_v1_240829_frozen.csv +gemma_2b,HFv1 TruthfulQA,33.13,,hf_open_llm_v1_240829_frozen.csv +gemma_2b,HFv1 Winogrande,66.77,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_nlai_p1,HF OpenLLM v1,42.78,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_nlai_p1,HFv1 ARC,43.94,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_nlai_p1,HFv1 GSM8K,5.46,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_nlai_p1,HFv1 HellaSwag,62.71,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_nlai_p1,HFv1 MMLU,37.68,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_nlai_p1,HFv1 TruthfulQA,45.85,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_nlai_p1,HFv1 Winogrande,61.01,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_nlai_v0,HF OpenLLM v1,42.8,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_nlai_v0,HFv1 ARC,44.03,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_nlai_v0,HFv1 GSM8K,5.38,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_nlai_v0,HFv1 HellaSwag,62.67,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_nlai_v0,HFv1 MMLU,37.58,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_nlai_v0,HFv1 TruthfulQA,45.8,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_nlai_v0,HFv1 Winogrande,61.33,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test,HF OpenLLM v1,42.79,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test,HFv1 ARC,44.03,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test,HFv1 GSM8K,5.31,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test,HFv1 HellaSwag,62.82,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test,HFv1 MMLU,37.67,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test,HFv1 TruthfulQA,45.77,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test,HFv1 Winogrande,61.17,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test1,HF OpenLLM v1,42.79,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test1,HFv1 ARC,44.03,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test1,HFv1 GSM8K,5.31,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test1,HFv1 HellaSwag,62.82,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test1,HFv1 MMLU,37.67,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test1,HFv1 TruthfulQA,45.77,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test1,HFv1 Winogrande,61.17,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test_openherms_step500,HF OpenLLM v1,42.79,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test_openherms_step500,HFv1 ARC,44.03,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test_openherms_step500,HFv1 GSM8K,5.31,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test_openherms_step500,HFv1 HellaSwag,62.82,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test_openherms_step500,HFv1 MMLU,37.67,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test_openherms_step500,HFv1 TruthfulQA,45.77,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_it_sp_test_openherms_step500,HFv1 Winogrande,61.17,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_nlaf_v0,HF OpenLLM v1,42.83,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_nlaf_v0,HFv1 ARC,43.77,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_nlaf_v0,HFv1 GSM8K,5.91,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_nlaf_v0,HFv1 HellaSwag,62.73,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_nlaf_v0,HFv1 MMLU,37.72,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_nlaf_v0,HFv1 TruthfulQA,45.85,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_nlaf_v0,HFv1 Winogrande,61.01,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_openhermes,HF OpenLLM v1,42.78,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_openhermes,HFv1 ARC,43.94,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_openhermes,HFv1 GSM8K,5.61,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_openhermes,HFv1 HellaSwag,62.74,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_openhermes,HFv1 MMLU,37.62,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_openhermes,HFv1 TruthfulQA,45.83,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_openhermes,HFv1 Winogrande,60.93,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_orpo,HF OpenLLM v1,47.35,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_orpo,HFv1 ARC,49.15,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_orpo,HFv1 GSM8K,13.87,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_orpo,HFv1 HellaSwag,73.72,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_orpo,HFv1 MMLU,38.52,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_orpo,HFv1 TruthfulQA,44.53,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_orpo,HFv1 Winogrande,64.33,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_sft_telugu,HF OpenLLM v1,41.25,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_sft_telugu,HFv1 ARC,41.38,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_sft_telugu,HFv1 GSM8K,4.02,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_sft_telugu,HFv1 HellaSwag,63.2,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_sft_telugu,HFv1 MMLU,31.94,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_sft_telugu,HFv1 TruthfulQA,46.95,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_sft_telugu,HFv1 Winogrande,59.98,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_tamil,HF OpenLLM v1,45.13,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_tamil,HFv1 ARC,47.44,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_tamil,HFv1 GSM8K,12.89,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_tamil,HFv1 HellaSwag,71.3,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_tamil,HFv1 MMLU,38.21,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_tamil,HFv1 TruthfulQA,34.93,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_tamil,HFv1 Winogrande,65.98,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_zephyr_dpo,HF OpenLLM v1,49.38,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_zephyr_dpo,HFv1 ARC,51.96,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_zephyr_dpo,HFv1 GSM8K,18.57,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_zephyr_dpo,HFv1 HellaSwag,73.33,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_zephyr_dpo,HFv1 MMLU,43.31,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_zephyr_dpo,HFv1 TruthfulQA,42.62,,hf_open_llm_v1_240829_frozen.csv +gemma_2b_zephyr_dpo,HFv1 Winogrande,66.46,,hf_open_llm_v1_240829_frozen.csv +gemma_7b,HF OpenLLM v1,63.75,,hf_open_llm_v1_240829_frozen.csv +gemma_7b,HFv1 ARC,61.09,,hf_open_llm_v1_240829_frozen.csv +gemma_7b,HFv1 GSM8K,50.87,,hf_open_llm_v1_240829_frozen.csv +gemma_7b,HFv1 HellaSwag,82.2,,hf_open_llm_v1_240829_frozen.csv +gemma_7b,HFv1 MMLU,64.56,,hf_open_llm_v1_240829_frozen.csv +gemma_7b,HFv1 TruthfulQA,44.79,,hf_open_llm_v1_240829_frozen.csv +gemma_7b,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv +gemma_7b_open_platypus_commercial,HF OpenLLM v1,62.71,,hf_open_llm_v1_240829_frozen.csv +gemma_7b_open_platypus_commercial,HFv1 ARC,62.8,,hf_open_llm_v1_240829_frozen.csv +gemma_7b_open_platypus_commercial,HFv1 GSM8K,40.33,,hf_open_llm_v1_240829_frozen.csv +gemma_7b_open_platypus_commercial,HFv1 HellaSwag,81.65,,hf_open_llm_v1_240829_frozen.csv +gemma_7b_open_platypus_commercial,HFv1 MMLU,58.94,,hf_open_llm_v1_240829_frozen.csv +gemma_7b_open_platypus_commercial,HFv1 TruthfulQA,53.54,,hf_open_llm_v1_240829_frozen.csv +gemma_7b_open_platypus_commercial,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv +gemma_7b_openhermes,HF OpenLLM v1,53.67,,hf_open_llm_v1_240829_frozen.csv +gemma_7b_openhermes,HFv1 ARC,51.28,,hf_open_llm_v1_240829_frozen.csv +gemma_7b_openhermes,HFv1 GSM8K,29.87,,hf_open_llm_v1_240829_frozen.csv +gemma_7b_openhermes,HFv1 HellaSwag,71.93,,hf_open_llm_v1_240829_frozen.csv +gemma_7b_openhermes,HFv1 MMLU,53.56,,hf_open_llm_v1_240829_frozen.csv +gemma_7b_openhermes,HFv1 TruthfulQA,47.18,,hf_open_llm_v1_240829_frozen.csv +gemma_7b_openhermes,HFv1 Winogrande,68.19,,hf_open_llm_v1_240829_frozen.csv +gemma_ko_1_1_2b_it,HF OpenLLM v1,30.92,,hf_open_llm_v1_240829_frozen.csv +gemma_ko_1_1_2b_it,HFv1 ARC,26.71,,hf_open_llm_v1_240829_frozen.csv +gemma_ko_1_1_2b_it,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +gemma_ko_1_1_2b_it,HFv1 HellaSwag,36.2,,hf_open_llm_v1_240829_frozen.csv +gemma_ko_1_1_2b_it,HFv1 MMLU,26.79,,hf_open_llm_v1_240829_frozen.csv +gemma_ko_1_1_2b_it,HFv1 TruthfulQA,40.41,,hf_open_llm_v1_240829_frozen.csv +gemma_ko_1_1_2b_it,HFv1 Winogrande,55.49,,hf_open_llm_v1_240829_frozen.csv +gemma_nlaf_v1,HF OpenLLM v1,42.76,,hf_open_llm_v1_240829_frozen.csv +gemma_nlaf_v1,HFv1 ARC,43.86,,hf_open_llm_v1_240829_frozen.csv +gemma_nlaf_v1,HFv1 GSM8K,5.38,,hf_open_llm_v1_240829_frozen.csv +gemma_nlaf_v1,HFv1 HellaSwag,62.7,,hf_open_llm_v1_240829_frozen.csv +gemma_nlaf_v1,HFv1 MMLU,37.66,,hf_open_llm_v1_240829_frozen.csv +gemma_nlaf_v1,HFv1 TruthfulQA,45.86,,hf_open_llm_v1_240829_frozen.csv +gemma_nlaf_v1,HFv1 Winogrande,61.09,,hf_open_llm_v1_240829_frozen.csv +genai_nova_13b,HF OpenLLM v1,56.98,,hf_open_llm_v1_240829_frozen.csv +genai_nova_13b,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv +genai_nova_13b,HFv1 GSM8K,7.73,,hf_open_llm_v1_240829_frozen.csv +genai_nova_13b,HFv1 HellaSwag,83.27,,hf_open_llm_v1_240829_frozen.csv +genai_nova_13b,HFv1 MMLU,59.47,,hf_open_llm_v1_240829_frozen.csv +genai_nova_13b,HFv1 TruthfulQA,51.79,,hf_open_llm_v1_240829_frozen.csv +genai_nova_13b,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv +genz_70b,HF OpenLLM v1,68.35,,hf_open_llm_v1_240829_frozen.csv +genz_70b,HFv1 ARC,71.42,,hf_open_llm_v1_240829_frozen.csv +genz_70b,HFv1 GSM8K,33.74,,hf_open_llm_v1_240829_frozen.csv +genz_70b,HFv1 HellaSwag,87.99,,hf_open_llm_v1_240829_frozen.csv +genz_70b,HFv1 MMLU,70.78,,hf_open_llm_v1_240829_frozen.csv +genz_70b,HFv1 TruthfulQA,62.66,,hf_open_llm_v1_240829_frozen.csv +genz_70b,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv +ghost_7b_alpha,HF OpenLLM v1,57.65,,hf_open_llm_v1_240829_frozen.csv +ghost_7b_alpha,HFv1 ARC,54.86,,hf_open_llm_v1_240829_frozen.csv +ghost_7b_alpha,HFv1 GSM8K,47.69,,hf_open_llm_v1_240829_frozen.csv +ghost_7b_alpha,HFv1 HellaSwag,76.1,,hf_open_llm_v1_240829_frozen.csv +ghost_7b_alpha,HFv1 MMLU,50.71,,hf_open_llm_v1_240829_frozen.csv +ghost_7b_alpha,HFv1 TruthfulQA,44.63,,hf_open_llm_v1_240829_frozen.csv +ghost_7b_alpha,HFv1 Winogrande,71.9,,hf_open_llm_v1_240829_frozen.csv +ghost_7b_v0_9_0,HF OpenLLM v1,56.89,,hf_open_llm_v1_240829_frozen.csv +ghost_7b_v0_9_0,HFv1 ARC,53.07,,hf_open_llm_v1_240829_frozen.csv +ghost_7b_v0_9_0,HFv1 GSM8K,33.74,,hf_open_llm_v1_240829_frozen.csv +ghost_7b_v0_9_0,HFv1 HellaSwag,77.93,,hf_open_llm_v1_240829_frozen.csv +ghost_7b_v0_9_0,HFv1 MMLU,55.09,,hf_open_llm_v1_240829_frozen.csv +ghost_7b_v0_9_0,HFv1 TruthfulQA,47.79,,hf_open_llm_v1_240829_frozen.csv +ghost_7b_v0_9_0,HFv1 Winogrande,73.72,,hf_open_llm_v1_240829_frozen.csv +giftedconvo13bloranoecons,HF OpenLLM v1,53.35,,hf_open_llm_v1_240829_frozen.csv +giftedconvo13bloranoecons,HFv1 ARC,59.39,,hf_open_llm_v1_240829_frozen.csv +giftedconvo13bloranoecons,HFv1 GSM8K,7.81,,hf_open_llm_v1_240829_frozen.csv +giftedconvo13bloranoecons,HFv1 HellaSwag,83.19,,hf_open_llm_v1_240829_frozen.csv +giftedconvo13bloranoecons,HFv1 MMLU,55.15,,hf_open_llm_v1_240829_frozen.csv +giftedconvo13bloranoecons,HFv1 TruthfulQA,40.56,,hf_open_llm_v1_240829_frozen.csv +giftedconvo13bloranoecons,HFv1 Winogrande,74.03,,hf_open_llm_v1_240829_frozen.csv +giftedconvo13bloranoeconse4,HF OpenLLM v1,53.74,,hf_open_llm_v1_240829_frozen.csv +giftedconvo13bloranoeconse4,HFv1 ARC,59.9,,hf_open_llm_v1_240829_frozen.csv +giftedconvo13bloranoeconse4,HFv1 GSM8K,7.81,,hf_open_llm_v1_240829_frozen.csv +giftedconvo13bloranoeconse4,HFv1 HellaSwag,84.11,,hf_open_llm_v1_240829_frozen.csv +giftedconvo13bloranoeconse4,HFv1 MMLU,54.67,,hf_open_llm_v1_240829_frozen.csv +giftedconvo13bloranoeconse4,HFv1 TruthfulQA,41.94,,hf_open_llm_v1_240829_frozen.csv +giftedconvo13bloranoeconse4,HFv1 Winogrande,74.03,,hf_open_llm_v1_240829_frozen.csv +giraffe_13b_32k_v3,HF OpenLLM v1,57.24,,hf_open_llm_v1_240829_frozen.csv +giraffe_13b_32k_v3,HFv1 ARC,59.04,,hf_open_llm_v1_240829_frozen.csv +giraffe_13b_32k_v3,HFv1 GSM8K,26.16,,hf_open_llm_v1_240829_frozen.csv +giraffe_13b_32k_v3,HFv1 HellaSwag,79.59,,hf_open_llm_v1_240829_frozen.csv +giraffe_13b_32k_v3,HFv1 MMLU,55.01,,hf_open_llm_v1_240829_frozen.csv +giraffe_13b_32k_v3,HFv1 TruthfulQA,46.68,,hf_open_llm_v1_240829_frozen.csv +giraffe_13b_32k_v3,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv +giraffe_beta_13b_32k,HF OpenLLM v1,54.69,,hf_open_llm_v1_240829_frozen.csv +giraffe_beta_13b_32k,HFv1 ARC,55.63,,hf_open_llm_v1_240829_frozen.csv +giraffe_beta_13b_32k,HFv1 GSM8K,21.3,,hf_open_llm_v1_240829_frozen.csv +giraffe_beta_13b_32k,HFv1 HellaSwag,80.42,,hf_open_llm_v1_240829_frozen.csv +giraffe_beta_13b_32k,HFv1 MMLU,53.61,,hf_open_llm_v1_240829_frozen.csv +giraffe_beta_13b_32k,HFv1 TruthfulQA,42.58,,hf_open_llm_v1_240829_frozen.csv +giraffe_beta_13b_32k,HFv1 Winogrande,74.59,,hf_open_llm_v1_240829_frozen.csv +go_bruins_v2_1,HF OpenLLM v1,74.5,,hf_open_llm_v1_240829_frozen.csv +go_bruins_v2_1,HFv1 ARC,71.93,,hf_open_llm_v1_240829_frozen.csv +go_bruins_v2_1,HFv1 GSM8K,70.43,,hf_open_llm_v1_240829_frozen.csv +go_bruins_v2_1,HFv1 HellaSwag,88.33,,hf_open_llm_v1_240829_frozen.csv +go_bruins_v2_1,HFv1 MMLU,65.0,,hf_open_llm_v1_240829_frozen.csv +go_bruins_v2_1,HFv1 TruthfulQA,69.16,,hf_open_llm_v1_240829_frozen.csv +go_bruins_v2_1,HFv1 Winogrande,82.16,,hf_open_llm_v1_240829_frozen.csv +godzilla2_70b,HF OpenLLM v1,69.46,,hf_open_llm_v1_240829_frozen.csv +godzilla2_70b,HFv1 ARC,71.42,,hf_open_llm_v1_240829_frozen.csv +godzilla2_70b,HFv1 GSM8K,43.21,,hf_open_llm_v1_240829_frozen.csv +godzilla2_70b,HFv1 HellaSwag,87.53,,hf_open_llm_v1_240829_frozen.csv +godzilla2_70b,HFv1 MMLU,69.88,,hf_open_llm_v1_240829_frozen.csv +godzilla2_70b,HFv1 TruthfulQA,61.54,,hf_open_llm_v1_240829_frozen.csv +godzilla2_70b,HFv1 Winogrande,83.19,,hf_open_llm_v1_240829_frozen.csv +goldenmaiden_7b_model_stock,HF OpenLLM v1,75.57,,hf_open_llm_v1_240829_frozen.csv +goldenmaiden_7b_model_stock,HFv1 ARC,73.21,,hf_open_llm_v1_240829_frozen.csv +goldenmaiden_7b_model_stock,HFv1 GSM8K,68.84,,hf_open_llm_v1_240829_frozen.csv +goldenmaiden_7b_model_stock,HFv1 HellaSwag,88.71,,hf_open_llm_v1_240829_frozen.csv +goldenmaiden_7b_model_stock,HFv1 MMLU,64.96,,hf_open_llm_v1_240829_frozen.csv +goldenmaiden_7b_model_stock,HFv1 TruthfulQA,72.56,,hf_open_llm_v1_240829_frozen.csv +goldenmaiden_7b_model_stock,HFv1 Winogrande,85.16,,hf_open_llm_v1_240829_frozen.csv +gollie_7b,HF OpenLLM v1,37.48,,hf_open_llm_v1_240829_frozen.csv +gollie_7b,HFv1 ARC,36.09,,hf_open_llm_v1_240829_frozen.csv +gollie_7b,HFv1 GSM8K,3.26,,hf_open_llm_v1_240829_frozen.csv +gollie_7b,HFv1 HellaSwag,57.93,,hf_open_llm_v1_240829_frozen.csv +gollie_7b,HFv1 MMLU,29.38,,hf_open_llm_v1_240829_frozen.csv +gollie_7b,HFv1 TruthfulQA,39.27,,hf_open_llm_v1_240829_frozen.csv +gollie_7b,HFv1 Winogrande,58.96,,hf_open_llm_v1_240829_frozen.csv +gonzo_chat_7b,HF OpenLLM v1,66.63,,hf_open_llm_v1_240829_frozen.csv +gonzo_chat_7b,HFv1 ARC,65.02,,hf_open_llm_v1_240829_frozen.csv +gonzo_chat_7b,HFv1 GSM8K,47.61,,hf_open_llm_v1_240829_frozen.csv +gonzo_chat_7b,HFv1 HellaSwag,85.4,,hf_open_llm_v1_240829_frozen.csv +gonzo_chat_7b,HFv1 MMLU,63.75,,hf_open_llm_v1_240829_frozen.csv +gonzo_chat_7b,HFv1 TruthfulQA,60.23,,hf_open_llm_v1_240829_frozen.csv +gonzo_chat_7b,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv +gowizardlm,HF OpenLLM v1,47.64,,hf_open_llm_v1_240829_frozen.csv +gowizardlm,HFv1 ARC,49.74,,hf_open_llm_v1_240829_frozen.csv +gowizardlm,HFv1 GSM8K,3.94,,hf_open_llm_v1_240829_frozen.csv +gowizardlm,HFv1 HellaSwag,71.9,,hf_open_llm_v1_240829_frozen.csv +gowizardlm,HFv1 MMLU,42.96,,hf_open_llm_v1_240829_frozen.csv +gowizardlm,HFv1 TruthfulQA,47.66,,hf_open_llm_v1_240829_frozen.csv +gowizardlm,HFv1 Winogrande,69.61,,hf_open_llm_v1_240829_frozen.csv +gpt2,HF OpenLLM v1,28.53,,hf_open_llm_v1_240829_frozen.csv +gpt2,HFv1 ARC,22.01,,hf_open_llm_v1_240829_frozen.csv +gpt2,HFv1 GSM8K,0.68,,hf_open_llm_v1_240829_frozen.csv +gpt2,HFv1 HellaSwag,31.58,,hf_open_llm_v1_240829_frozen.csv +gpt2,HFv1 MMLU,25.83,,hf_open_llm_v1_240829_frozen.csv +gpt2,HFv1 TruthfulQA,41.15,,hf_open_llm_v1_240829_frozen.csv +gpt2,HFv1 Winogrande,50.43,,hf_open_llm_v1_240829_frozen.csv +gpt2_camel_physics_platypus,HF OpenLLM v1,28.41,,hf_open_llm_v1_240829_frozen.csv +gpt2_camel_physics_platypus,HFv1 ARC,23.04,,hf_open_llm_v1_240829_frozen.csv +gpt2_camel_physics_platypus,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +gpt2_camel_physics_platypus,HFv1 HellaSwag,31.32,,hf_open_llm_v1_240829_frozen.csv +gpt2_camel_physics_platypus,HFv1 MMLU,26.91,,hf_open_llm_v1_240829_frozen.csv +gpt2_camel_physics_platypus,HFv1 TruthfulQA,39.56,,hf_open_llm_v1_240829_frozen.csv +gpt2_camel_physics_platypus,HFv1 Winogrande,49.64,,hf_open_llm_v1_240829_frozen.csv +gpt2_chatbot,HF OpenLLM v1,33.91,,hf_open_llm_v1_240829_frozen.csv +gpt2_chatbot,HFv1 ARC,29.69,,hf_open_llm_v1_240829_frozen.csv +gpt2_chatbot,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +gpt2_chatbot,HFv1 HellaSwag,50.27,,hf_open_llm_v1_240829_frozen.csv +gpt2_chatbot,HFv1 MMLU,26.42,,hf_open_llm_v1_240829_frozen.csv +gpt2_chatbot,HFv1 TruthfulQA,40.38,,hf_open_llm_v1_240829_frozen.csv +gpt2_chatbot,HFv1 Winogrande,56.67,,hf_open_llm_v1_240829_frozen.csv +gpt2_dolly,HF OpenLLM v1,29.21,,hf_open_llm_v1_240829_frozen.csv +gpt2_dolly,HFv1 ARC,22.7,,hf_open_llm_v1_240829_frozen.csv +gpt2_dolly,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv +gpt2_dolly,HFv1 HellaSwag,30.77,,hf_open_llm_v1_240829_frozen.csv +gpt2_dolly,HFv1 MMLU,25.81,,hf_open_llm_v1_240829_frozen.csv +gpt2_dolly,HFv1 TruthfulQA,44.97,,hf_open_llm_v1_240829_frozen.csv +gpt2_dolly,HFv1 Winogrande,51.46,,hf_open_llm_v1_240829_frozen.csv +gpt2_final,HF OpenLLM v1,28.7,,hf_open_llm_v1_240829_frozen.csv +gpt2_final,HFv1 ARC,21.59,,hf_open_llm_v1_240829_frozen.csv +gpt2_final,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +gpt2_final,HFv1 HellaSwag,26.02,,hf_open_llm_v1_240829_frozen.csv +gpt2_final,HFv1 MMLU,24.79,,hf_open_llm_v1_240829_frozen.csv +gpt2_final,HFv1 TruthfulQA,49.87,,hf_open_llm_v1_240829_frozen.csv +gpt2_final,HFv1 Winogrande,49.96,,hf_open_llm_v1_240829_frozen.csv +gpt2_guanaco_dolly_platypus,HF OpenLLM v1,28.52,,hf_open_llm_v1_240829_frozen.csv +gpt2_guanaco_dolly_platypus,HFv1 ARC,23.55,,hf_open_llm_v1_240829_frozen.csv +gpt2_guanaco_dolly_platypus,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +gpt2_guanaco_dolly_platypus,HFv1 HellaSwag,31.03,,hf_open_llm_v1_240829_frozen.csv +gpt2_guanaco_dolly_platypus,HFv1 MMLU,26.4,,hf_open_llm_v1_240829_frozen.csv +gpt2_guanaco_dolly_platypus,HFv1 TruthfulQA,40.02,,hf_open_llm_v1_240829_frozen.csv +gpt2_guanaco_dolly_platypus,HFv1 Winogrande,50.12,,hf_open_llm_v1_240829_frozen.csv +gpt2_large,HF OpenLLM v1,32.07,,hf_open_llm_v1_240829_frozen.csv +gpt2_large,HFv1 ARC,25.77,,hf_open_llm_v1_240829_frozen.csv +gpt2_large,HFv1 GSM8K,0.83,,hf_open_llm_v1_240829_frozen.csv +gpt2_large,HFv1 HellaSwag,45.62,,hf_open_llm_v1_240829_frozen.csv +gpt2_large,HFv1 MMLU,26.07,,hf_open_llm_v1_240829_frozen.csv +gpt2_large,HFv1 TruthfulQA,38.72,,hf_open_llm_v1_240829_frozen.csv +gpt2_large,HFv1 Winogrande,55.41,,hf_open_llm_v1_240829_frozen.csv +gpt2_large_conversational,HF OpenLLM v1,32.33,,hf_open_llm_v1_240829_frozen.csv +gpt2_large_conversational,HFv1 ARC,26.96,,hf_open_llm_v1_240829_frozen.csv +gpt2_large_conversational,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv +gpt2_large_conversational,HFv1 HellaSwag,44.98,,hf_open_llm_v1_240829_frozen.csv +gpt2_large_conversational,HFv1 MMLU,26.33,,hf_open_llm_v1_240829_frozen.csv +gpt2_large_conversational,HFv1 TruthfulQA,39.6,,hf_open_llm_v1_240829_frozen.csv +gpt2_large_conversational,HFv1 Winogrande,56.04,,hf_open_llm_v1_240829_frozen.csv +gpt2_open_platypus,HF OpenLLM v1,28.58,,hf_open_llm_v1_240829_frozen.csv +gpt2_open_platypus,HFv1 ARC,22.18,,hf_open_llm_v1_240829_frozen.csv +gpt2_open_platypus,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv +gpt2_open_platypus,HFv1 HellaSwag,31.29,,hf_open_llm_v1_240829_frozen.csv +gpt2_open_platypus,HFv1 MMLU,26.19,,hf_open_llm_v1_240829_frozen.csv +gpt2_open_platypus,HFv1 TruthfulQA,40.35,,hf_open_llm_v1_240829_frozen.csv +gpt2_open_platypus,HFv1 Winogrande,51.3,,hf_open_llm_v1_240829_frozen.csv +gpt2_platypus_camel_physics,HF OpenLLM v1,28.41,,hf_open_llm_v1_240829_frozen.csv +gpt2_platypus_camel_physics,HFv1 ARC,23.04,,hf_open_llm_v1_240829_frozen.csv +gpt2_platypus_camel_physics,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +gpt2_platypus_camel_physics,HFv1 HellaSwag,31.32,,hf_open_llm_v1_240829_frozen.csv +gpt2_platypus_camel_physics,HFv1 MMLU,26.91,,hf_open_llm_v1_240829_frozen.csv +gpt2_platypus_camel_physics,HFv1 TruthfulQA,39.56,,hf_open_llm_v1_240829_frozen.csv +gpt2_platypus_camel_physics,HFv1 Winogrande,51.54,,hf_open_llm_v1_240829_frozen.csv +gpt2_platypus_dolly_guanaco,HF OpenLLM v1,28.51,,hf_open_llm_v1_240829_frozen.csv +gpt2_platypus_dolly_guanaco,HFv1 ARC,23.21,,hf_open_llm_v1_240829_frozen.csv +gpt2_platypus_dolly_guanaco,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +gpt2_platypus_dolly_guanaco,HFv1 HellaSwag,31.04,,hf_open_llm_v1_240829_frozen.csv +gpt2_platypus_dolly_guanaco,HFv1 MMLU,26.16,,hf_open_llm_v1_240829_frozen.csv +gpt2_platypus_dolly_guanaco,HFv1 TruthfulQA,40.31,,hf_open_llm_v1_240829_frozen.csv +gpt2_platypus_dolly_guanaco,HFv1 Winogrande,50.36,,hf_open_llm_v1_240829_frozen.csv +gpt2_test,HF OpenLLM v1,28.4,,hf_open_llm_v1_240829_frozen.csv +gpt2_test,HFv1 ARC,21.84,,hf_open_llm_v1_240829_frozen.csv +gpt2_test,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +gpt2_test,HFv1 HellaSwag,31.6,,hf_open_llm_v1_240829_frozen.csv +gpt2_test,HFv1 MMLU,25.86,,hf_open_llm_v1_240829_frozen.csv +gpt2_test,HFv1 TruthfulQA,40.67,,hf_open_llm_v1_240829_frozen.csv +gpt2_test,HFv1 Winogrande,50.12,,hf_open_llm_v1_240829_frozen.csv +gpt2_xl,HF OpenLLM v1,34.38,,hf_open_llm_v1_240829_frozen.csv +gpt2_xl,HFv1 ARC,30.29,,hf_open_llm_v1_240829_frozen.csv +gpt2_xl,HFv1 GSM8K,1.29,,hf_open_llm_v1_240829_frozen.csv +gpt2_xl,HFv1 HellaSwag,51.36,,hf_open_llm_v1_240829_frozen.csv +gpt2_xl,HFv1 MMLU,26.54,,hf_open_llm_v1_240829_frozen.csv +gpt2_xl,HFv1 TruthfulQA,38.54,,hf_open_llm_v1_240829_frozen.csv +gpt2_xl,HFv1 Winogrande,58.25,,hf_open_llm_v1_240829_frozen.csv +gpt2_xl_lima,HF OpenLLM v1,34.12,,hf_open_llm_v1_240829_frozen.csv +gpt2_xl_lima,HFv1 ARC,31.14,,hf_open_llm_v1_240829_frozen.csv +gpt2_xl_lima,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv +gpt2_xl_lima,HFv1 HellaSwag,51.28,,hf_open_llm_v1_240829_frozen.csv +gpt2_xl_lima,HFv1 MMLU,25.43,,hf_open_llm_v1_240829_frozen.csv +gpt2_xl_lima,HFv1 TruthfulQA,38.74,,hf_open_llm_v1_240829_frozen.csv +gpt2_xl_lima,HFv1 Winogrande,57.22,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_13b,HF OpenLLM v1,32.95,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_13b,HFv1 ARC,24.66,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_13b,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_13b,HFv1 HellaSwag,46.76,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_13b,HFv1 MMLU,23.49,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_13b,HFv1 TruthfulQA,44.47,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_13b,HFv1 Winogrande,58.01,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_large,HF OpenLLM v1,29.11,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_large,HFv1 ARC,21.76,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_large,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_large,HFv1 HellaSwag,32.88,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_large,HFv1 MMLU,24.11,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_large,HFv1 TruthfulQA,44.35,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_large,HFv1 Winogrande,51.54,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_small,HF OpenLLM v1,27.95,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_small,HFv1 ARC,20.48,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_small,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_small,HFv1 HellaSwag,28.09,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_small,HFv1 MMLU,24.47,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_small,HFv1 TruthfulQA,46.47,,hf_open_llm_v1_240829_frozen.csv +gpt3_finnish_small,HFv1 Winogrande,48.22,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_115k_steps,HF OpenLLM v1,29.21,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_115k_steps,HFv1 ARC,22.7,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_115k_steps,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_115k_steps,HFv1 HellaSwag,27.79,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_115k_steps,HFv1 MMLU,25.52,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_115k_steps,HFv1 TruthfulQA,45.67,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_115k_steps,HFv1 Winogrande,53.51,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_20k_steps,HF OpenLLM v1,28.65,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_20k_steps,HFv1 ARC,22.78,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_20k_steps,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_20k_steps,HFv1 HellaSwag,25.81,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_20k_steps,HFv1 MMLU,23.84,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_20k_steps,HFv1 TruthfulQA,50.99,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_20k_steps,HFv1 Winogrande,48.46,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_32k_steps,HF OpenLLM v1,28.59,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_32k_steps,HFv1 ARC,22.53,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_32k_steps,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_32k_steps,HFv1 HellaSwag,26.39,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_32k_steps,HFv1 MMLU,23.73,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_32k_steps,HFv1 TruthfulQA,49.25,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_32k_steps,HFv1 Winogrande,49.64,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_40k_steps,HF OpenLLM v1,28.55,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_40k_steps,HFv1 ARC,22.87,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_40k_steps,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_40k_steps,HFv1 HellaSwag,26.55,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_40k_steps,HFv1 MMLU,24.15,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_40k_steps,HFv1 TruthfulQA,47.84,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_40k_steps,HFv1 Winogrande,49.88,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_43k_steps,HF OpenLLM v1,28.79,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_43k_steps,HFv1 ARC,22.78,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_43k_steps,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_43k_steps,HFv1 HellaSwag,26.66,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_43k_steps,HFv1 MMLU,24.05,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_43k_steps,HFv1 TruthfulQA,48.32,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_43k_steps,HFv1 Winogrande,50.91,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_51k_steps,HF OpenLLM v1,28.65,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_51k_steps,HFv1 ARC,22.78,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_51k_steps,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_51k_steps,HFv1 HellaSwag,25.81,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_51k_steps,HFv1 MMLU,23.84,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_51k_steps,HFv1 TruthfulQA,50.99,,hf_open_llm_v1_240829_frozen.csv +gpt_2_large_51k_steps,HFv1 Winogrande,48.46,,hf_open_llm_v1_240829_frozen.csv +gpt_2_slimorcadeduped_airoboros_3_1_metamathqa_sft_124m,HF OpenLLM v1,28.3,,hf_open_llm_v1_240829_frozen.csv +gpt_2_slimorcadeduped_airoboros_3_1_metamathqa_sft_124m,HFv1 ARC,24.57,,hf_open_llm_v1_240829_frozen.csv +gpt_2_slimorcadeduped_airoboros_3_1_metamathqa_sft_124m,HFv1 GSM8K,2.12,,hf_open_llm_v1_240829_frozen.csv +gpt_2_slimorcadeduped_airoboros_3_1_metamathqa_sft_124m,HFv1 HellaSwag,29.43,,hf_open_llm_v1_240829_frozen.csv +gpt_2_slimorcadeduped_airoboros_3_1_metamathqa_sft_124m,HFv1 MMLU,25.82,,hf_open_llm_v1_240829_frozen.csv +gpt_2_slimorcadeduped_airoboros_3_1_metamathqa_sft_124m,HFv1 TruthfulQA,38.84,,hf_open_llm_v1_240829_frozen.csv +gpt_2_slimorcadeduped_airoboros_3_1_metamathqa_sft_124m,HFv1 Winogrande,49.01,,hf_open_llm_v1_240829_frozen.csv +gpt_2_xl_camel_ai_physics,HF OpenLLM v1,33.96,,hf_open_llm_v1_240829_frozen.csv +gpt_2_xl_camel_ai_physics,HFv1 ARC,29.52,,hf_open_llm_v1_240829_frozen.csv +gpt_2_xl_camel_ai_physics,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv +gpt_2_xl_camel_ai_physics,HFv1 HellaSwag,50.62,,hf_open_llm_v1_240829_frozen.csv +gpt_2_xl_camel_ai_physics,HFv1 MMLU,26.79,,hf_open_llm_v1_240829_frozen.csv +gpt_2_xl_camel_ai_physics,HFv1 TruthfulQA,39.12,,hf_open_llm_v1_240829_frozen.csv +gpt_2_xl_camel_ai_physics,HFv1 Winogrande,57.54,,hf_open_llm_v1_240829_frozen.csv +gpt_bigcode_santacoder,HF OpenLLM v1,28.49,,hf_open_llm_v1_240829_frozen.csv +gpt_bigcode_santacoder,HFv1 ARC,21.16,,hf_open_llm_v1_240829_frozen.csv +gpt_bigcode_santacoder,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv +gpt_bigcode_santacoder,HFv1 HellaSwag,30.84,,hf_open_llm_v1_240829_frozen.csv +gpt_bigcode_santacoder,HFv1 MMLU,24.97,,hf_open_llm_v1_240829_frozen.csv +gpt_bigcode_santacoder,HFv1 TruthfulQA,45.64,,hf_open_llm_v1_240829_frozen.csv +gpt_bigcode_santacoder,HFv1 Winogrande,47.83,,hf_open_llm_v1_240829_frozen.csv +gpt_j_6b,HF OpenLLM v1,40.1,,hf_open_llm_v1_240829_frozen.csv +gpt_j_6b,HFv1 ARC,41.38,,hf_open_llm_v1_240829_frozen.csv +gpt_j_6b,HFv1 GSM8K,2.96,,hf_open_llm_v1_240829_frozen.csv +gpt_j_6b,HFv1 HellaSwag,67.54,,hf_open_llm_v1_240829_frozen.csv +gpt_j_6b,HFv1 MMLU,26.78,,hf_open_llm_v1_240829_frozen.csv +gpt_j_6b,HFv1 TruthfulQA,35.96,,hf_open_llm_v1_240829_frozen.csv +gpt_j_6b,HFv1 Winogrande,65.98,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_125m,HF OpenLLM v1,29.47,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_125m,HFv1 ARC,22.95,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_125m,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_125m,HFv1 HellaSwag,30.26,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_125m,HFv1 MMLU,25.97,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_125m,HFv1 TruthfulQA,45.58,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_125m,HFv1 Winogrande,51.78,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_1_3b,HF OpenLLM v1,33.58,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_1_3b,HFv1 ARC,31.23,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_1_3b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_1_3b,HFv1 HellaSwag,48.47,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_1_3b,HFv1 MMLU,24.82,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_1_3b,HFv1 TruthfulQA,39.63,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_1_3b,HFv1 Winogrande,56.91,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_2_7b,HF OpenLLM v1,36.2,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_2_7b,HFv1 ARC,33.36,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_2_7b,HFv1 GSM8K,1.29,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_2_7b,HFv1 HellaSwag,56.24,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_2_7b,HFv1 MMLU,26.45,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_2_7b,HFv1 TruthfulQA,39.78,,hf_open_llm_v1_240829_frozen.csv +gpt_neo_2_7b,HFv1 Winogrande,60.06,,hf_open_llm_v1_240829_frozen.csv +gpt_neox_20b,HF OpenLLM v1,41.69,,hf_open_llm_v1_240829_frozen.csv +gpt_neox_20b,HFv1 ARC,45.73,,hf_open_llm_v1_240829_frozen.csv +gpt_neox_20b,HFv1 GSM8K,5.46,,hf_open_llm_v1_240829_frozen.csv +gpt_neox_20b,HFv1 HellaSwag,73.45,,hf_open_llm_v1_240829_frozen.csv +gpt_neox_20b,HFv1 MMLU,25.0,,hf_open_llm_v1_240829_frozen.csv +gpt_neox_20b,HFv1 TruthfulQA,31.61,,hf_open_llm_v1_240829_frozen.csv +gpt_neox_20b,HFv1 Winogrande,68.9,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_126m,HF OpenLLM v1,28.49,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_126m,HFv1 ARC,22.18,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_126m,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_126m,HFv1 HellaSwag,29.56,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_126m,HFv1 MMLU,24.53,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_126m,HFv1 TruthfulQA,44.07,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_126m,HFv1 Winogrande,50.67,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_126m_instruct,HF OpenLLM v1,28.2,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_126m_instruct,HFv1 ARC,23.38,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_126m_instruct,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_126m_instruct,HFv1 HellaSwag,29.88,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_126m_instruct,HFv1 MMLU,23.78,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_126m_instruct,HFv1 TruthfulQA,42.65,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_126m_instruct,HFv1 Winogrande,48.54,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_1_3b,HF OpenLLM v1,34.31,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_1_3b,HFv1 ARC,30.38,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_1_3b,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_1_3b,HFv1 HellaSwag,50.4,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_1_3b,HFv1 MMLU,26.14,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_1_3b,HFv1 TruthfulQA,39.97,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_1_3b,HFv1 Winogrande,58.88,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_1_3b_instruct,HF OpenLLM v1,34.54,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_1_3b_instruct,HFv1 ARC,30.97,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_1_3b_instruct,HFv1 GSM8K,1.59,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_1_3b_instruct,HFv1 HellaSwag,51.42,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_1_3b_instruct,HFv1 MMLU,26.17,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_1_3b_instruct,HFv1 TruthfulQA,40.31,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_1_3b_instruct,HFv1 Winogrande,56.75,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_20b,HF OpenLLM v1,40.71,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_20b,HFv1 ARC,41.81,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_20b,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_20b,HFv1 HellaSwag,68.75,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_20b,HFv1 MMLU,28.47,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_20b,HFv1 TruthfulQA,37.1,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_20b,HFv1 Winogrande,67.17,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_20b_instruct,HF OpenLLM v1,43.7,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_20b_instruct,HFv1 ARC,43.17,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_20b_instruct,HFv1 GSM8K,8.79,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_20b_instruct,HFv1 HellaSwag,71.09,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_20b_instruct,HFv1 MMLU,31.32,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_20b_instruct,HFv1 TruthfulQA,41.02,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_20b_instruct,HFv1 Winogrande,66.77,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_356m,HF OpenLLM v1,30.41,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_356m,HFv1 ARC,23.63,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_356m,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_356m,HFv1 HellaSwag,37.05,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_356m,HFv1 MMLU,25.93,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_356m,HFv1 TruthfulQA,42.55,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_356m,HFv1 Winogrande,53.04,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_356m_instruct,HF OpenLLM v1,30.93,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_356m_instruct,HFv1 ARC,26.96,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_356m_instruct,HFv1 GSM8K,1.74,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_356m_instruct,HFv1 HellaSwag,38.01,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_356m_instruct,HFv1 MMLU,25.53,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_356m_instruct,HFv1 TruthfulQA,40.74,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_356m_instruct,HFv1 Winogrande,52.57,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_40b,HF OpenLLM v1,43.42,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_40b,HFv1 ARC,43.0,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_40b,HFv1 GSM8K,4.7,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_40b,HFv1 HellaSwag,72.37,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_40b,HFv1 MMLU,34.97,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_40b,HFv1 TruthfulQA,37.52,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_40b,HFv1 Winogrande,67.96,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b,HF OpenLLM v1,37.23,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b,HFv1 ARC,36.35,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b,HFv1 HellaSwag,60.75,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b,HFv1 MMLU,26.0,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b,HFv1 TruthfulQA,39.04,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b,HFv1 Winogrande,60.69,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b_v2,HF OpenLLM v1,39.49,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b_v2,HFv1 ARC,39.42,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b_v2,HFv1 GSM8K,1.21,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b_v2,HFv1 HellaSwag,66.39,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b_v2,HFv1 MMLU,30.09,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b_v2,HFv1 TruthfulQA,35.6,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b_v2,HFv1 Winogrande,64.25,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b_v2_instruct,HF OpenLLM v1,41.72,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b_v2_instruct,HFv1 ARC,40.78,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b_v2_instruct,HFv1 GSM8K,6.37,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b_v2_instruct,HFv1 HellaSwag,67.77,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b_v2_instruct,HFv1 MMLU,31.57,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b_v2_instruct,HFv1 TruthfulQA,40.32,,hf_open_llm_v1_240829_frozen.csv +gpt_sw3_6_7b_v2_instruct,HFv1 Winogrande,63.54,,hf_open_llm_v1_240829_frozen.csv +gptneo350m_instruct_sft,HF OpenLLM v1,31.0,,hf_open_llm_v1_240829_frozen.csv +gptneo350m_instruct_sft,HFv1 ARC,25.94,,hf_open_llm_v1_240829_frozen.csv +gptneo350m_instruct_sft,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +gptneo350m_instruct_sft,HFv1 HellaSwag,38.55,,hf_open_llm_v1_240829_frozen.csv +gptneo350m_instruct_sft,HFv1 MMLU,25.76,,hf_open_llm_v1_240829_frozen.csv +gptneo350m_instruct_sft,HFv1 TruthfulQA,45.25,,hf_open_llm_v1_240829_frozen.csv +gptneo350m_instruct_sft,HFv1 Winogrande,50.2,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v1olet,HF OpenLLM v1,73.68,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v1olet,HFv1 ARC,72.61,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v1olet,HFv1 GSM8K,66.87,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v1olet,HFv1 HellaSwag,87.7,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v1olet,HFv1 MMLU,63.51,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v1olet,HFv1 TruthfulQA,69.07,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v1olet,HFv1 Winogrande,82.32,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v2leo,HF OpenLLM v1,73.29,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v2leo,HFv1 ARC,69.8,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v2leo,HFv1 GSM8K,67.1,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v2leo,HFv1 HellaSwag,88.02,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v2leo,HFv1 MMLU,65.0,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v2leo,HFv1 TruthfulQA,67.83,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v2leo,HFv1 Winogrande,82.0,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v4leo,HF OpenLLM v1,74.18,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v4leo,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v4leo,HFv1 GSM8K,68.61,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v4leo,HFv1 HellaSwag,88.24,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v4leo,HFv1 MMLU,65.01,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v4leo,HFv1 TruthfulQA,69.65,,hf_open_llm_v1_240829_frozen.csv +greennodelm_7b_v4leo,HFv1 Winogrande,82.32,,hf_open_llm_v1_240829_frozen.csv +griffin_c3t_8l_v0_02_fineweb,HF OpenLLM v1,29.2,,hf_open_llm_v1_240829_frozen.csv +griffin_c3t_8l_v0_02_fineweb,HFv1 ARC,23.29,,hf_open_llm_v1_240829_frozen.csv +griffin_c3t_8l_v0_02_fineweb,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +griffin_c3t_8l_v0_02_fineweb,HFv1 HellaSwag,25.72,,hf_open_llm_v1_240829_frozen.csv +griffin_c3t_8l_v0_02_fineweb,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +griffin_c3t_8l_v0_02_fineweb,HFv1 TruthfulQA,52.11,,hf_open_llm_v1_240829_frozen.csv +griffin_c3t_8l_v0_02_fineweb,HFv1 Winogrande,50.99,,hf_open_llm_v1_240829_frozen.csv +griffin_llama3t_8l_v0_02_fineweb,HF OpenLLM v1,28.49,,hf_open_llm_v1_240829_frozen.csv +griffin_llama3t_8l_v0_02_fineweb,HFv1 ARC,23.46,,hf_open_llm_v1_240829_frozen.csv +griffin_llama3t_8l_v0_02_fineweb,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +griffin_llama3t_8l_v0_02_fineweb,HFv1 HellaSwag,25.48,,hf_open_llm_v1_240829_frozen.csv +griffin_llama3t_8l_v0_02_fineweb,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +griffin_llama3t_8l_v0_02_fineweb,HFv1 TruthfulQA,50.32,,hf_open_llm_v1_240829_frozen.csv +griffin_llama3t_8l_v0_02_fineweb,HFv1 Winogrande,48.54,,hf_open_llm_v1_240829_frozen.csv +griffon_7b_model_stock,HF OpenLLM v1,74.83,,hf_open_llm_v1_240829_frozen.csv +griffon_7b_model_stock,HFv1 ARC,72.95,,hf_open_llm_v1_240829_frozen.csv +griffon_7b_model_stock,HFv1 GSM8K,67.93,,hf_open_llm_v1_240829_frozen.csv +griffon_7b_model_stock,HFv1 HellaSwag,88.29,,hf_open_llm_v1_240829_frozen.csv +griffon_7b_model_stock,HFv1 MMLU,64.65,,hf_open_llm_v1_240829_frozen.csv +griffon_7b_model_stock,HFv1 TruthfulQA,71.48,,hf_open_llm_v1_240829_frozen.csv +griffon_7b_model_stock,HFv1 Winogrande,83.66,,hf_open_llm_v1_240829_frozen.csv +grindin,HF OpenLLM v1,72.18,,hf_open_llm_v1_240829_frozen.csv +grindin,HFv1 ARC,69.88,,hf_open_llm_v1_240829_frozen.csv +grindin,HFv1 GSM8K,70.96,,hf_open_llm_v1_240829_frozen.csv +grindin,HFv1 HellaSwag,87.02,,hf_open_llm_v1_240829_frozen.csv +grindin,HFv1 MMLU,64.98,,hf_open_llm_v1_240829_frozen.csv +grindin,HFv1 TruthfulQA,59.34,,hf_open_llm_v1_240829_frozen.csv +grindin,HFv1 Winogrande,80.9,,hf_open_llm_v1_240829_frozen.csv +gzdx,HF OpenLLM v1,37.97,,hf_open_llm_v1_240829_frozen.csv +gzdx,HFv1 ARC,35.75,,hf_open_llm_v1_240829_frozen.csv +gzdx,HFv1 GSM8K,10.77,,hf_open_llm_v1_240829_frozen.csv +gzdx,HFv1 HellaSwag,55.57,,hf_open_llm_v1_240829_frozen.csv +gzdx,HFv1 MMLU,25.19,,hf_open_llm_v1_240829_frozen.csv +gzdx,HFv1 TruthfulQA,42.03,,hf_open_llm_v1_240829_frozen.csv +gzdx,HFv1 Winogrande,58.48,,hf_open_llm_v1_240829_frozen.csv +gzdx_1_1b,HF OpenLLM v1,39.35,,hf_open_llm_v1_240829_frozen.csv +gzdx_1_1b,HFv1 ARC,37.03,,hf_open_llm_v1_240829_frozen.csv +gzdx_1_1b,HFv1 GSM8K,9.48,,hf_open_llm_v1_240829_frozen.csv +gzdx_1_1b,HFv1 HellaSwag,54.67,,hf_open_llm_v1_240829_frozen.csv +gzdx_1_1b,HFv1 MMLU,35.5,,hf_open_llm_v1_240829_frozen.csv +gzdx_1_1b,HFv1 TruthfulQA,40.47,,hf_open_llm_v1_240829_frozen.csv +gzdx_1_1b,HFv1 Winogrande,58.96,,hf_open_llm_v1_240829_frozen.csv +h2o_danube2_1_8b_chat,HF OpenLLM v1,49.26,,hf_open_llm_v1_240829_frozen.csv +h2o_danube2_1_8b_chat,HFv1 ARC,43.69,,hf_open_llm_v1_240829_frozen.csv +h2o_danube2_1_8b_chat,HFv1 GSM8K,30.55,,hf_open_llm_v1_240829_frozen.csv +h2o_danube2_1_8b_chat,HFv1 HellaSwag,73.95,,hf_open_llm_v1_240829_frozen.csv +h2o_danube2_1_8b_chat,HFv1 MMLU,38.02,,hf_open_llm_v1_240829_frozen.csv +h2o_danube2_1_8b_chat,HFv1 TruthfulQA,40.54,,hf_open_llm_v1_240829_frozen.csv +h2o_danube2_1_8b_chat,HFv1 Winogrande,68.9,,hf_open_llm_v1_240829_frozen.csv +h2o_danube_1_8b_base,HF OpenLLM v1,39.12,,hf_open_llm_v1_240829_frozen.csv +h2o_danube_1_8b_base,HFv1 ARC,39.42,,hf_open_llm_v1_240829_frozen.csv +h2o_danube_1_8b_base,HFv1 GSM8K,1.44,,hf_open_llm_v1_240829_frozen.csv +h2o_danube_1_8b_base,HFv1 HellaSwag,69.58,,hf_open_llm_v1_240829_frozen.csv +h2o_danube_1_8b_base,HFv1 MMLU,25.94,,hf_open_llm_v1_240829_frozen.csv +h2o_danube_1_8b_base,HFv1 TruthfulQA,33.86,,hf_open_llm_v1_240829_frozen.csv +h2o_danube_1_8b_base,HFv1 Winogrande,64.48,,hf_open_llm_v1_240829_frozen.csv +h2o_danube_1_8b_chat,HF OpenLLM v1,44.49,,hf_open_llm_v1_240829_frozen.csv +h2o_danube_1_8b_chat,HFv1 ARC,41.13,,hf_open_llm_v1_240829_frozen.csv +h2o_danube_1_8b_chat,HFv1 GSM8K,17.36,,hf_open_llm_v1_240829_frozen.csv +h2o_danube_1_8b_chat,HFv1 HellaSwag,68.06,,hf_open_llm_v1_240829_frozen.csv +h2o_danube_1_8b_chat,HFv1 MMLU,33.41,,hf_open_llm_v1_240829_frozen.csv +h2o_danube_1_8b_chat,HFv1 TruthfulQA,41.64,,hf_open_llm_v1_240829_frozen.csv +h2o_danube_1_8b_chat,HFv1 Winogrande,65.35,,hf_open_llm_v1_240829_frozen.csv +h4rmoniousanthea,HF OpenLLM v1,59.76,,hf_open_llm_v1_240829_frozen.csv +h4rmoniousanthea,HFv1 ARC,65.87,,hf_open_llm_v1_240829_frozen.csv +h4rmoniousanthea,HFv1 GSM8K,12.96,,hf_open_llm_v1_240829_frozen.csv +h4rmoniousanthea,HFv1 HellaSwag,84.09,,hf_open_llm_v1_240829_frozen.csv +h4rmoniousanthea,HFv1 MMLU,63.67,,hf_open_llm_v1_240829_frozen.csv +h4rmoniousanthea,HFv1 TruthfulQA,55.08,,hf_open_llm_v1_240829_frozen.csv +h4rmoniousanthea,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv +halu_oas_8b_llama3,HF OpenLLM v1,69.51,,hf_open_llm_v1_240829_frozen.csv +halu_oas_8b_llama3,HFv1 ARC,64.08,,hf_open_llm_v1_240829_frozen.csv +halu_oas_8b_llama3,HFv1 GSM8K,68.61,,hf_open_llm_v1_240829_frozen.csv +halu_oas_8b_llama3,HFv1 HellaSwag,83.35,,hf_open_llm_v1_240829_frozen.csv +halu_oas_8b_llama3,HFv1 MMLU,67.8,,hf_open_llm_v1_240829_frozen.csv +halu_oas_8b_llama3,HFv1 TruthfulQA,53.45,,hf_open_llm_v1_240829_frozen.csv +halu_oas_8b_llama3,HFv1 Winogrande,79.79,,hf_open_llm_v1_240829_frozen.csv +han_llm_7b_v2,HF OpenLLM v1,59.06,,hf_open_llm_v1_240829_frozen.csv +han_llm_7b_v2,HFv1 ARC,58.79,,hf_open_llm_v1_240829_frozen.csv +han_llm_7b_v2,HFv1 GSM8K,33.51,,hf_open_llm_v1_240829_frozen.csv +han_llm_7b_v2,HFv1 HellaSwag,81.75,,hf_open_llm_v1_240829_frozen.csv +han_llm_7b_v2,HFv1 MMLU,59.93,,hf_open_llm_v1_240829_frozen.csv +han_llm_7b_v2,HFv1 TruthfulQA,42.38,,hf_open_llm_v1_240829_frozen.csv +han_llm_7b_v2,HFv1 Winogrande,77.98,,hf_open_llm_v1_240829_frozen.csv +harpy_7b_model_stock,HF OpenLLM v1,75.51,,hf_open_llm_v1_240829_frozen.csv +harpy_7b_model_stock,HFv1 ARC,73.21,,hf_open_llm_v1_240829_frozen.csv +harpy_7b_model_stock,HFv1 GSM8K,69.45,,hf_open_llm_v1_240829_frozen.csv +harpy_7b_model_stock,HFv1 HellaSwag,88.72,,hf_open_llm_v1_240829_frozen.csv +harpy_7b_model_stock,HFv1 MMLU,65.07,,hf_open_llm_v1_240829_frozen.csv +harpy_7b_model_stock,HFv1 TruthfulQA,71.35,,hf_open_llm_v1_240829_frozen.csv +harpy_7b_model_stock,HFv1 Winogrande,85.24,,hf_open_llm_v1_240829_frozen.csv +healix_1_1b_v1_chat_ddpo,HF OpenLLM v1,33.0,,hf_open_llm_v1_240829_frozen.csv +healix_1_1b_v1_chat_ddpo,HFv1 ARC,30.55,,hf_open_llm_v1_240829_frozen.csv +healix_1_1b_v1_chat_ddpo,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +healix_1_1b_v1_chat_ddpo,HFv1 HellaSwag,44.78,,hf_open_llm_v1_240829_frozen.csv +healix_1_1b_v1_chat_ddpo,HFv1 MMLU,24.64,,hf_open_llm_v1_240829_frozen.csv +healix_1_1b_v1_chat_ddpo,HFv1 TruthfulQA,41.55,,hf_open_llm_v1_240829_frozen.csv +healix_1_1b_v1_chat_ddpo,HFv1 Winogrande,56.51,,hf_open_llm_v1_240829_frozen.csv +healix_3b,HF OpenLLM v1,38.93,,hf_open_llm_v1_240829_frozen.csv +healix_3b,HFv1 ARC,37.71,,hf_open_llm_v1_240829_frozen.csv +healix_3b,HFv1 GSM8K,0.76,,hf_open_llm_v1_240829_frozen.csv +healix_3b,HFv1 HellaSwag,65.94,,hf_open_llm_v1_240829_frozen.csv +healix_3b,HFv1 MMLU,26.02,,hf_open_llm_v1_240829_frozen.csv +healix_3b,HFv1 TruthfulQA,37.4,,hf_open_llm_v1_240829_frozen.csv +healix_3b,HFv1 Winogrande,65.75,,hf_open_llm_v1_240829_frozen.csv +helpingai_110m,HF OpenLLM v1,29.05,,hf_open_llm_v1_240829_frozen.csv +helpingai_110m,HFv1 ARC,22.78,,hf_open_llm_v1_240829_frozen.csv +helpingai_110m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +helpingai_110m,HFv1 HellaSwag,28.02,,hf_open_llm_v1_240829_frozen.csv +helpingai_110m,HFv1 MMLU,23.66,,hf_open_llm_v1_240829_frozen.csv +helpingai_110m,HFv1 TruthfulQA,48.25,,hf_open_llm_v1_240829_frozen.csv +helpingai_110m,HFv1 Winogrande,51.62,,hf_open_llm_v1_240829_frozen.csv +helpingai_3b,HF OpenLLM v1,55.59,,hf_open_llm_v1_240829_frozen.csv +helpingai_3b,HFv1 ARC,50.6,,hf_open_llm_v1_240829_frozen.csv +helpingai_3b,HFv1 GSM8K,36.09,,hf_open_llm_v1_240829_frozen.csv +helpingai_3b,HFv1 HellaSwag,76.64,,hf_open_llm_v1_240829_frozen.csv +helpingai_3b,HFv1 MMLU,46.82,,hf_open_llm_v1_240829_frozen.csv +helpingai_3b,HFv1 TruthfulQA,55.62,,hf_open_llm_v1_240829_frozen.csv +helpingai_3b,HFv1 Winogrande,67.8,,hf_open_llm_v1_240829_frozen.csv +helpingai_9b,HF OpenLLM v1,63.33,,hf_open_llm_v1_240829_frozen.csv +helpingai_9b,HFv1 ARC,58.87,,hf_open_llm_v1_240829_frozen.csv +helpingai_9b,HFv1 GSM8K,53.75,,hf_open_llm_v1_240829_frozen.csv +helpingai_9b,HFv1 HellaSwag,79.16,,hf_open_llm_v1_240829_frozen.csv +helpingai_9b,HFv1 MMLU,65.01,,hf_open_llm_v1_240829_frozen.csv +helpingai_9b,HFv1 TruthfulQA,48.52,,hf_open_llm_v1_240829_frozen.csv +helpingai_9b,HFv1 Winogrande,74.66,,hf_open_llm_v1_240829_frozen.csv +hercules_1_0_mistral_7b,HF OpenLLM v1,58.95,,hf_open_llm_v1_240829_frozen.csv +hercules_1_0_mistral_7b,HFv1 ARC,57.08,,hf_open_llm_v1_240829_frozen.csv +hercules_1_0_mistral_7b,HFv1 GSM8K,29.87,,hf_open_llm_v1_240829_frozen.csv +hercules_1_0_mistral_7b,HFv1 HellaSwag,81.13,,hf_open_llm_v1_240829_frozen.csv +hercules_1_0_mistral_7b,HFv1 MMLU,58.98,,hf_open_llm_v1_240829_frozen.csv +hercules_1_0_mistral_7b,HFv1 TruthfulQA,49.47,,hf_open_llm_v1_240829_frozen.csv +hercules_1_0_mistral_7b,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv +hercules_2_0_mistral_7b,HF OpenLLM v1,62.69,,hf_open_llm_v1_240829_frozen.csv +hercules_2_0_mistral_7b,HFv1 ARC,61.09,,hf_open_llm_v1_240829_frozen.csv +hercules_2_0_mistral_7b,HFv1 GSM8K,44.43,,hf_open_llm_v1_240829_frozen.csv +hercules_2_0_mistral_7b,HFv1 HellaSwag,83.69,,hf_open_llm_v1_240829_frozen.csv +hercules_2_0_mistral_7b,HFv1 MMLU,63.47,,hf_open_llm_v1_240829_frozen.csv +hercules_2_0_mistral_7b,HFv1 TruthfulQA,43.97,,hf_open_llm_v1_240829_frozen.csv +hercules_2_0_mistral_7b,HFv1 Winogrande,79.48,,hf_open_llm_v1_240829_frozen.csv +hercules_2_5_mistral_7b,HF OpenLLM v1,63.59,,hf_open_llm_v1_240829_frozen.csv +hercules_2_5_mistral_7b,HFv1 ARC,62.03,,hf_open_llm_v1_240829_frozen.csv +hercules_2_5_mistral_7b,HFv1 GSM8K,49.05,,hf_open_llm_v1_240829_frozen.csv +hercules_2_5_mistral_7b,HFv1 HellaSwag,83.79,,hf_open_llm_v1_240829_frozen.csv +hercules_2_5_mistral_7b,HFv1 MMLU,63.49,,hf_open_llm_v1_240829_frozen.csv +hercules_2_5_mistral_7b,HFv1 TruthfulQA,43.44,,hf_open_llm_v1_240829_frozen.csv +hercules_2_5_mistral_7b,HFv1 Winogrande,79.72,,hf_open_llm_v1_240829_frozen.csv +hercules_3_0_mistral_7b,HF OpenLLM v1,62.36,,hf_open_llm_v1_240829_frozen.csv +hercules_3_0_mistral_7b,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv +hercules_3_0_mistral_7b,HFv1 GSM8K,42.91,,hf_open_llm_v1_240829_frozen.csv +hercules_3_0_mistral_7b,HFv1 HellaSwag,83.43,,hf_open_llm_v1_240829_frozen.csv +hercules_3_0_mistral_7b,HFv1 MMLU,63.68,,hf_open_llm_v1_240829_frozen.csv +hercules_3_0_mistral_7b,HFv1 TruthfulQA,43.42,,hf_open_llm_v1_240829_frozen.csv +hercules_3_0_mistral_7b,HFv1 Winogrande,79.48,,hf_open_llm_v1_240829_frozen.csv +hercules_3_1_mistral_7b,HF OpenLLM v1,62.09,,hf_open_llm_v1_240829_frozen.csv +hercules_3_1_mistral_7b,HFv1 ARC,61.18,,hf_open_llm_v1_240829_frozen.csv +hercules_3_1_mistral_7b,HFv1 GSM8K,42.3,,hf_open_llm_v1_240829_frozen.csv +hercules_3_1_mistral_7b,HFv1 HellaSwag,83.55,,hf_open_llm_v1_240829_frozen.csv +hercules_3_1_mistral_7b,HFv1 MMLU,63.65,,hf_open_llm_v1_240829_frozen.csv +hercules_3_1_mistral_7b,HFv1 TruthfulQA,42.83,,hf_open_llm_v1_240829_frozen.csv +hercules_3_1_mistral_7b,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv +hercules_4_0_mistral_v0_2_7b,HF OpenLLM v1,61.53,,hf_open_llm_v1_240829_frozen.csv +hercules_4_0_mistral_v0_2_7b,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv +hercules_4_0_mistral_v0_2_7b,HFv1 GSM8K,45.41,,hf_open_llm_v1_240829_frozen.csv +hercules_4_0_mistral_v0_2_7b,HFv1 HellaSwag,82.6,,hf_open_llm_v1_240829_frozen.csv +hercules_4_0_mistral_v0_2_7b,HFv1 MMLU,62.66,,hf_open_llm_v1_240829_frozen.csv +hercules_4_0_mistral_v0_2_7b,HFv1 TruthfulQA,40.99,,hf_open_llm_v1_240829_frozen.csv +hercules_4_0_mistral_v0_2_7b,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv +hercules_4_0_yi_34b,HF OpenLLM v1,70.85,,hf_open_llm_v1_240829_frozen.csv +hercules_4_0_yi_34b,HFv1 ARC,64.51,,hf_open_llm_v1_240829_frozen.csv +hercules_4_0_yi_34b,HFv1 GSM8K,63.0,,hf_open_llm_v1_240829_frozen.csv +hercules_4_0_yi_34b,HFv1 HellaSwag,85.22,,hf_open_llm_v1_240829_frozen.csv +hercules_4_0_yi_34b,HFv1 MMLU,75.2,,hf_open_llm_v1_240829_frozen.csv +hercules_4_0_yi_34b,HFv1 TruthfulQA,53.05,,hf_open_llm_v1_240829_frozen.csv +hercules_4_0_yi_34b,HFv1 Winogrande,84.14,,hf_open_llm_v1_240829_frozen.csv +hercules_mini_1_8b,HF OpenLLM v1,45.57,,hf_open_llm_v1_240829_frozen.csv +hercules_mini_1_8b,HFv1 ARC,37.03,,hf_open_llm_v1_240829_frozen.csv +hercules_mini_1_8b,HFv1 GSM8K,30.55,,hf_open_llm_v1_240829_frozen.csv +hercules_mini_1_8b,HFv1 HellaSwag,59.53,,hf_open_llm_v1_240829_frozen.csv +hercules_mini_1_8b,HFv1 MMLU,44.77,,hf_open_llm_v1_240829_frozen.csv +hercules_mini_1_8b,HFv1 TruthfulQA,39.24,,hf_open_llm_v1_240829_frozen.csv +hercules_mini_1_8b,HFv1 Winogrande,62.27,,hf_open_llm_v1_240829_frozen.csv +hercules_qwen1_5_14b,HF OpenLLM v1,66.24,,hf_open_llm_v1_240829_frozen.csv +hercules_qwen1_5_14b,HFv1 ARC,56.23,,hf_open_llm_v1_240829_frozen.csv +hercules_qwen1_5_14b,HFv1 GSM8K,65.96,,hf_open_llm_v1_240829_frozen.csv +hercules_qwen1_5_14b,HFv1 HellaSwag,80.6,,hf_open_llm_v1_240829_frozen.csv +hercules_qwen1_5_14b,HFv1 MMLU,68.73,,hf_open_llm_v1_240829_frozen.csv +hercules_qwen1_5_14b,HFv1 TruthfulQA,52.03,,hf_open_llm_v1_240829_frozen.csv +hercules_qwen1_5_14b,HFv1 Winogrande,73.88,,hf_open_llm_v1_240829_frozen.csv +hermes_2_pro_mistral_7b,HF OpenLLM v1,67.35,,hf_open_llm_v1_240829_frozen.csv +hermes_2_pro_mistral_7b,HFv1 ARC,64.16,,hf_open_llm_v1_240829_frozen.csv +hermes_2_pro_mistral_7b,HFv1 GSM8K,60.42,,hf_open_llm_v1_240829_frozen.csv +hermes_2_pro_mistral_7b,HFv1 HellaSwag,82.73,,hf_open_llm_v1_240829_frozen.csv +hermes_2_pro_mistral_7b,HFv1 MMLU,62.21,,hf_open_llm_v1_240829_frozen.csv +hermes_2_pro_mistral_7b,HFv1 TruthfulQA,58.99,,hf_open_llm_v1_240829_frozen.csv +hermes_2_pro_mistral_7b,HFv1 Winogrande,75.61,,hf_open_llm_v1_240829_frozen.csv +hermesstar_orcawind_synth_11b,HF OpenLLM v1,66.59,,hf_open_llm_v1_240829_frozen.csv +hermesstar_orcawind_synth_11b,HFv1 ARC,65.27,,hf_open_llm_v1_240829_frozen.csv +hermesstar_orcawind_synth_11b,HFv1 GSM8K,56.63,,hf_open_llm_v1_240829_frozen.csv +hermesstar_orcawind_synth_11b,HFv1 HellaSwag,83.69,,hf_open_llm_v1_240829_frozen.csv +hermesstar_orcawind_synth_11b,HFv1 MMLU,65.31,,hf_open_llm_v1_240829_frozen.csv +hermesstar_orcawind_synth_11b,HFv1 TruthfulQA,48.55,,hf_open_llm_v1_240829_frozen.csv +hermesstar_orcawind_synth_11b,HFv1 Winogrande,80.11,,hf_open_llm_v1_240829_frozen.csv +hf_checkpoint2_01052024,HF OpenLLM v1,30.4,,hf_open_llm_v1_240829_frozen.csv +hf_checkpoint2_01052024,HFv1 ARC,24.91,,hf_open_llm_v1_240829_frozen.csv +hf_checkpoint2_01052024,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv +hf_checkpoint2_01052024,HFv1 HellaSwag,32.23,,hf_open_llm_v1_240829_frozen.csv +hf_checkpoint2_01052024,HFv1 MMLU,27.01,,hf_open_llm_v1_240829_frozen.csv +hf_checkpoint2_01052024,HFv1 TruthfulQA,45.77,,hf_open_llm_v1_240829_frozen.csv +hf_checkpoint2_01052024,HFv1 Winogrande,51.93,,hf_open_llm_v1_240829_frozen.csv +hope_for,HF OpenLLM v1,51.3,,hf_open_llm_v1_240829_frozen.csv +hope_for,HFv1 ARC,51.28,,hf_open_llm_v1_240829_frozen.csv +hope_for,HFv1 GSM8K,16.91,,hf_open_llm_v1_240829_frozen.csv +hope_for,HFv1 HellaSwag,74.74,,hf_open_llm_v1_240829_frozen.csv +hope_for,HFv1 MMLU,51.56,,hf_open_llm_v1_240829_frozen.csv +hope_for,HFv1 TruthfulQA,40.73,,hf_open_llm_v1_240829_frozen.csv +hope_for,HFv1 Winogrande,72.61,,hf_open_llm_v1_240829_frozen.csv +hope_for_7b_1_0v,HF OpenLLM v1,51.16,,hf_open_llm_v1_240829_frozen.csv +hope_for_7b_1_0v,HFv1 ARC,50.43,,hf_open_llm_v1_240829_frozen.csv +hope_for_7b_1_0v,HFv1 GSM8K,16.53,,hf_open_llm_v1_240829_frozen.csv +hope_for_7b_1_0v,HFv1 HellaSwag,76.44,,hf_open_llm_v1_240829_frozen.csv +hope_for_7b_1_0v,HFv1 MMLU,49.68,,hf_open_llm_v1_240829_frozen.csv +hope_for_7b_1_0v,HFv1 TruthfulQA,38.66,,hf_open_llm_v1_240829_frozen.csv +hope_for_7b_1_0v,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv +hope_for_7b_1_1v,HF OpenLLM v1,50.19,,hf_open_llm_v1_240829_frozen.csv +hope_for_7b_1_1v,HFv1 ARC,49.49,,hf_open_llm_v1_240829_frozen.csv +hope_for_7b_1_1v,HFv1 GSM8K,14.18,,hf_open_llm_v1_240829_frozen.csv +hope_for_7b_1_1v,HFv1 HellaSwag,75.08,,hf_open_llm_v1_240829_frozen.csv +hope_for_7b_1_1v,HFv1 MMLU,48.49,,hf_open_llm_v1_240829_frozen.csv +hope_for_7b_1_1v,HFv1 TruthfulQA,40.26,,hf_open_llm_v1_240829_frozen.csv +hope_for_7b_1_1v,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv +huginn_13b_v4,HF OpenLLM v1,54.04,,hf_open_llm_v1_240829_frozen.csv +huginn_13b_v4,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv +huginn_13b_v4,HFv1 GSM8K,4.62,,hf_open_llm_v1_240829_frozen.csv +huginn_13b_v4,HFv1 HellaSwag,82.34,,hf_open_llm_v1_240829_frozen.csv +huginn_13b_v4,HFv1 MMLU,52.32,,hf_open_llm_v1_240829_frozen.csv +huginn_13b_v4,HFv1 TruthfulQA,50.62,,hf_open_llm_v1_240829_frozen.csv +huginn_13b_v4,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv +huginn_13b_v4_5,HF OpenLLM v1,54.04,,hf_open_llm_v1_240829_frozen.csv +huginn_13b_v4_5,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv +huginn_13b_v4_5,HFv1 GSM8K,4.62,,hf_open_llm_v1_240829_frozen.csv +huginn_13b_v4_5,HFv1 HellaSwag,82.34,,hf_open_llm_v1_240829_frozen.csv +huginn_13b_v4_5,HFv1 MMLU,52.32,,hf_open_llm_v1_240829_frozen.csv +huginn_13b_v4_5,HFv1 TruthfulQA,50.62,,hf_open_llm_v1_240829_frozen.csv +huginn_13b_v4_5,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv +huginn_19b_prototype,HF OpenLLM v1,52.99,,hf_open_llm_v1_240829_frozen.csv +huginn_19b_prototype,HFv1 ARC,59.22,,hf_open_llm_v1_240829_frozen.csv +huginn_19b_prototype,HFv1 GSM8K,4.4,,hf_open_llm_v1_240829_frozen.csv +huginn_19b_prototype,HFv1 HellaSwag,81.03,,hf_open_llm_v1_240829_frozen.csv +huginn_19b_prototype,HFv1 MMLU,55.73,,hf_open_llm_v1_240829_frozen.csv +huginn_19b_prototype,HFv1 TruthfulQA,41.15,,hf_open_llm_v1_240829_frozen.csv +huginn_19b_prototype,HFv1 Winogrande,76.4,,hf_open_llm_v1_240829_frozen.csv +huginn_22b_prototype,HF OpenLLM v1,52.36,,hf_open_llm_v1_240829_frozen.csv +huginn_22b_prototype,HFv1 ARC,57.68,,hf_open_llm_v1_240829_frozen.csv +huginn_22b_prototype,HFv1 GSM8K,2.27,,hf_open_llm_v1_240829_frozen.csv +huginn_22b_prototype,HFv1 HellaSwag,80.69,,hf_open_llm_v1_240829_frozen.csv +huginn_22b_prototype,HFv1 MMLU,49.81,,hf_open_llm_v1_240829_frozen.csv +huginn_22b_prototype,HFv1 TruthfulQA,52.11,,hf_open_llm_v1_240829_frozen.csv +huginn_22b_prototype,HFv1 Winogrande,71.59,,hf_open_llm_v1_240829_frozen.csv +huginn_v3_13b,HF OpenLLM v1,54.04,,hf_open_llm_v1_240829_frozen.csv +huginn_v3_13b,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv +huginn_v3_13b,HFv1 GSM8K,4.62,,hf_open_llm_v1_240829_frozen.csv +huginn_v3_13b,HFv1 HellaSwag,82.34,,hf_open_llm_v1_240829_frozen.csv +huginn_v3_13b,HFv1 MMLU,52.32,,hf_open_llm_v1_240829_frozen.csv +huginn_v3_13b,HFv1 TruthfulQA,50.62,,hf_open_llm_v1_240829_frozen.csv +huginn_v3_13b,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv +huginnv1_2,HF OpenLLM v1,55.98,,hf_open_llm_v1_240829_frozen.csv +huginnv1_2,HFv1 ARC,62.37,,hf_open_llm_v1_240829_frozen.csv +huginnv1_2,HFv1 GSM8K,9.17,,hf_open_llm_v1_240829_frozen.csv +huginnv1_2,HFv1 HellaSwag,84.28,,hf_open_llm_v1_240829_frozen.csv +huginnv1_2,HFv1 MMLU,57.02,,hf_open_llm_v1_240829_frozen.csv +huginnv1_2,HFv1 TruthfulQA,47.81,,hf_open_llm_v1_240829_frozen.csv +huginnv1_2,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv +hyperion_1_5_mistral_7b,HF OpenLLM v1,61.43,,hf_open_llm_v1_240829_frozen.csv +hyperion_1_5_mistral_7b,HFv1 ARC,60.49,,hf_open_llm_v1_240829_frozen.csv +hyperion_1_5_mistral_7b,HFv1 GSM8K,40.49,,hf_open_llm_v1_240829_frozen.csv +hyperion_1_5_mistral_7b,HFv1 HellaSwag,83.64,,hf_open_llm_v1_240829_frozen.csv +hyperion_1_5_mistral_7b,HFv1 MMLU,63.57,,hf_open_llm_v1_240829_frozen.csv +hyperion_1_5_mistral_7b,HFv1 TruthfulQA,41.78,,hf_open_llm_v1_240829_frozen.csv +hyperion_1_5_mistral_7b,HFv1 Winogrande,78.61,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_0_mistral_7b,HF OpenLLM v1,61.88,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_0_mistral_7b,HFv1 ARC,61.09,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_0_mistral_7b,HFv1 GSM8K,41.77,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_0_mistral_7b,HFv1 HellaSwag,83.5,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_0_mistral_7b,HFv1 MMLU,63.68,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_0_mistral_7b,HFv1 TruthfulQA,41.97,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_0_mistral_7b,HFv1 Winogrande,79.24,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_0_yi_34b,HF OpenLLM v1,71.09,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_0_yi_34b,HFv1 ARC,64.33,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_0_yi_34b,HFv1 GSM8K,62.02,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_0_yi_34b,HFv1 HellaSwag,85.66,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_0_yi_34b,HFv1 MMLU,76.09,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_0_yi_34b,HFv1 TruthfulQA,55.3,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_0_yi_34b,HFv1 Winogrande,83.11,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_1_mistral_7b,HF OpenLLM v1,61.9,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_1_mistral_7b,HFv1 ARC,59.9,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_1_mistral_7b,HFv1 GSM8K,40.18,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_1_mistral_7b,HFv1 HellaSwag,83.3,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_1_mistral_7b,HFv1 MMLU,61.46,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_1_mistral_7b,HFv1 TruthfulQA,47.58,,hf_open_llm_v1_240829_frozen.csv +hyperion_2_1_mistral_7b,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mistral_7b_alpha,HF OpenLLM v1,61.52,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mistral_7b_alpha,HFv1 ARC,59.98,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mistral_7b_alpha,HFv1 GSM8K,41.55,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mistral_7b_alpha,HFv1 HellaSwag,83.48,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mistral_7b_alpha,HFv1 MMLU,62.5,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mistral_7b_alpha,HFv1 TruthfulQA,42.82,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mistral_7b_alpha,HFv1 Winogrande,78.77,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mistral_7b_dpo,HF OpenLLM v1,63.03,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mistral_7b_dpo,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mistral_7b_dpo,HFv1 GSM8K,45.34,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mistral_7b_dpo,HFv1 HellaSwag,83.95,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mistral_7b_dpo,HFv1 MMLU,62.71,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mistral_7b_dpo,HFv1 TruthfulQA,46.17,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mistral_7b_dpo,HFv1 Winogrande,79.32,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mixtral_3x7b,HF OpenLLM v1,61.84,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mixtral_3x7b,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mixtral_3x7b,HFv1 GSM8K,41.39,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mixtral_3x7b,HFv1 HellaSwag,83.28,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mixtral_3x7b,HFv1 MMLU,63.22,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mixtral_3x7b,HFv1 TruthfulQA,43.46,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_mixtral_3x7b,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_yi_34b,HF OpenLLM v1,71.18,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_yi_34b,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_yi_34b,HFv1 GSM8K,61.03,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_yi_34b,HFv1 HellaSwag,85.61,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_yi_34b,HFv1 MMLU,75.98,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_yi_34b,HFv1 TruthfulQA,56.38,,hf_open_llm_v1_240829_frozen.csv +hyperion_3_0_yi_34b,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv +hyperion_medium_preview,HF OpenLLM v1,61.67,,hf_open_llm_v1_240829_frozen.csv +hyperion_medium_preview,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv +hyperion_medium_preview,HFv1 GSM8K,40.49,,hf_open_llm_v1_240829_frozen.csv +hyperion_medium_preview,HFv1 HellaSwag,83.67,,hf_open_llm_v1_240829_frozen.csv +hyperion_medium_preview,HFv1 MMLU,63.73,,hf_open_llm_v1_240829_frozen.csv +hyperion_medium_preview,HFv1 TruthfulQA,42.93,,hf_open_llm_v1_240829_frozen.csv +hyperion_medium_preview,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv +iambe_20b_dare_v2,HF OpenLLM v1,61.99,,hf_open_llm_v1_240829_frozen.csv +iambe_20b_dare_v2,HFv1 ARC,62.8,,hf_open_llm_v1_240829_frozen.csv +iambe_20b_dare_v2,HFv1 GSM8K,33.28,,hf_open_llm_v1_240829_frozen.csv +iambe_20b_dare_v2,HFv1 HellaSwag,84.53,,hf_open_llm_v1_240829_frozen.csv +iambe_20b_dare_v2,HFv1 MMLU,60.45,,hf_open_llm_v1_240829_frozen.csv +iambe_20b_dare_v2,HFv1 TruthfulQA,53.85,,hf_open_llm_v1_240829_frozen.csv +iambe_20b_dare_v2,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv +iamsotired_7b_slerp,HF OpenLLM v1,72.37,,hf_open_llm_v1_240829_frozen.csv +iamsotired_7b_slerp,HFv1 ARC,69.88,,hf_open_llm_v1_240829_frozen.csv +iamsotired_7b_slerp,HFv1 GSM8K,66.19,,hf_open_llm_v1_240829_frozen.csv +iamsotired_7b_slerp,HFv1 HellaSwag,87.15,,hf_open_llm_v1_240829_frozen.csv +iamsotired_7b_slerp,HFv1 MMLU,64.85,,hf_open_llm_v1_240829_frozen.csv +iamsotired_7b_slerp,HFv1 TruthfulQA,63.75,,hf_open_llm_v1_240829_frozen.csv +iamsotired_7b_slerp,HFv1 Winogrande,82.4,,hf_open_llm_v1_240829_frozen.csv +ice_grt,HF OpenLLM v1,61.39,,hf_open_llm_v1_240829_frozen.csv +ice_grt,HFv1 ARC,62.88,,hf_open_llm_v1_240829_frozen.csv +ice_grt,HFv1 GSM8K,31.69,,hf_open_llm_v1_240829_frozen.csv +ice_grt,HFv1 HellaSwag,86.14,,hf_open_llm_v1_240829_frozen.csv +ice_grt,HFv1 MMLU,57.34,,hf_open_llm_v1_240829_frozen.csv +ice_grt,HFv1 TruthfulQA,53.17,,hf_open_llm_v1_240829_frozen.csv +ice_grt,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv +idus,HF OpenLLM v1,29.51,,hf_open_llm_v1_240829_frozen.csv +idus,HFv1 ARC,27.73,,hf_open_llm_v1_240829_frozen.csv +idus,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +idus,HFv1 HellaSwag,26.65,,hf_open_llm_v1_240829_frozen.csv +idus,HFv1 MMLU,24.91,,hf_open_llm_v1_240829_frozen.csv +idus,HFv1 TruthfulQA,48.58,,hf_open_llm_v1_240829_frozen.csv +idus,HFv1 Winogrande,49.17,,hf_open_llm_v1_240829_frozen.csv +idus_8layers,HF OpenLLM v1,58.38,,hf_open_llm_v1_240829_frozen.csv +idus_8layers,HFv1 ARC,59.3,,hf_open_llm_v1_240829_frozen.csv +idus_8layers,HFv1 GSM8K,29.57,,hf_open_llm_v1_240829_frozen.csv +idus_8layers,HFv1 HellaSwag,81.34,,hf_open_llm_v1_240829_frozen.csv +idus_8layers,HFv1 MMLU,63.22,,hf_open_llm_v1_240829_frozen.csv +idus_8layers,HFv1 TruthfulQA,40.62,,hf_open_llm_v1_240829_frozen.csv +idus_8layers,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv +ignis_7b_dpo,HF OpenLLM v1,64.77,,hf_open_llm_v1_240829_frozen.csv +ignis_7b_dpo,HFv1 ARC,66.3,,hf_open_llm_v1_240829_frozen.csv +ignis_7b_dpo,HFv1 GSM8K,33.06,,hf_open_llm_v1_240829_frozen.csv +ignis_7b_dpo,HFv1 HellaSwag,84.85,,hf_open_llm_v1_240829_frozen.csv +ignis_7b_dpo,HFv1 MMLU,58.99,,hf_open_llm_v1_240829_frozen.csv +ignis_7b_dpo,HFv1 TruthfulQA,65.46,,hf_open_llm_v1_240829_frozen.csv +ignis_7b_dpo,HFv1 Winogrande,79.95,,hf_open_llm_v1_240829_frozen.csv +ignis_7b_dpo_laser,HF OpenLLM v1,64.41,,hf_open_llm_v1_240829_frozen.csv +ignis_7b_dpo_laser,HFv1 ARC,65.19,,hf_open_llm_v1_240829_frozen.csv +ignis_7b_dpo_laser,HFv1 GSM8K,31.46,,hf_open_llm_v1_240829_frozen.csv +ignis_7b_dpo_laser,HFv1 HellaSwag,84.57,,hf_open_llm_v1_240829_frozen.csv +ignis_7b_dpo_laser,HFv1 MMLU,58.56,,hf_open_llm_v1_240829_frozen.csv +ignis_7b_dpo_laser,HFv1 TruthfulQA,66.24,,hf_open_llm_v1_240829_frozen.csv +ignis_7b_dpo_laser,HFv1 Winogrande,80.43,,hf_open_llm_v1_240829_frozen.csv +inex12_7b,HF OpenLLM v1,76.66,,hf_open_llm_v1_240829_frozen.csv +inex12_7b,HFv1 ARC,72.95,,hf_open_llm_v1_240829_frozen.csv +inex12_7b,HFv1 GSM8K,70.2,,hf_open_llm_v1_240829_frozen.csv +inex12_7b,HFv1 HellaSwag,89.14,,hf_open_llm_v1_240829_frozen.csv +inex12_7b,HFv1 MMLU,64.4,,hf_open_llm_v1_240829_frozen.csv +inex12_7b,HFv1 TruthfulQA,78.04,,hf_open_llm_v1_240829_frozen.csv +inex12_7b,HFv1 Winogrande,85.24,,hf_open_llm_v1_240829_frozen.csv +inex8_7b,HF OpenLLM v1,76.44,,hf_open_llm_v1_240829_frozen.csv +inex8_7b,HFv1 ARC,73.29,,hf_open_llm_v1_240829_frozen.csv +inex8_7b,HFv1 GSM8K,68.99,,hf_open_llm_v1_240829_frozen.csv +inex8_7b,HFv1 HellaSwag,89.19,,hf_open_llm_v1_240829_frozen.csv +inex8_7b,HFv1 MMLU,64.47,,hf_open_llm_v1_240829_frozen.csv +inex8_7b,HFv1 TruthfulQA,77.83,,hf_open_llm_v1_240829_frozen.csv +inex8_7b,HFv1 Winogrande,84.85,,hf_open_llm_v1_240829_frozen.csv +init_model,HF OpenLLM v1,29.6,,hf_open_llm_v1_240829_frozen.csv +init_model,HFv1 ARC,28.5,,hf_open_llm_v1_240829_frozen.csv +init_model,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +init_model,HFv1 HellaSwag,25.4,,hf_open_llm_v1_240829_frozen.csv +init_model,HFv1 MMLU,25.65,,hf_open_llm_v1_240829_frozen.csv +init_model,HFv1 TruthfulQA,48.48,,hf_open_llm_v1_240829_frozen.csv +init_model,HFv1 Winogrande,49.57,,hf_open_llm_v1_240829_frozen.csv +instructpalmyra_20b,HF OpenLLM v1,42.91,,hf_open_llm_v1_240829_frozen.csv +instructpalmyra_20b,HFv1 ARC,47.1,,hf_open_llm_v1_240829_frozen.csv +instructpalmyra_20b,HFv1 GSM8K,2.58,,hf_open_llm_v1_240829_frozen.csv +instructpalmyra_20b,HFv1 HellaSwag,73.0,,hf_open_llm_v1_240829_frozen.csv +instructpalmyra_20b,HFv1 MMLU,28.26,,hf_open_llm_v1_240829_frozen.csv +instructpalmyra_20b,HFv1 TruthfulQA,41.81,,hf_open_llm_v1_240829_frozen.csv +instructpalmyra_20b,HFv1 Winogrande,64.72,,hf_open_llm_v1_240829_frozen.csv +internlm2_20b,HF OpenLLM v1,69.75,,hf_open_llm_v1_240829_frozen.csv +internlm2_20b,HFv1 ARC,62.97,,hf_open_llm_v1_240829_frozen.csv +internlm2_20b,HFv1 GSM8K,67.93,,hf_open_llm_v1_240829_frozen.csv +internlm2_20b,HFv1 HellaSwag,83.21,,hf_open_llm_v1_240829_frozen.csv +internlm2_20b,HFv1 MMLU,67.58,,hf_open_llm_v1_240829_frozen.csv +internlm2_20b,HFv1 TruthfulQA,51.27,,hf_open_llm_v1_240829_frozen.csv +internlm2_20b,HFv1 Winogrande,85.56,,hf_open_llm_v1_240829_frozen.csv +internlm2_20b_llama,HF OpenLLM v1,70.66,,hf_open_llm_v1_240829_frozen.csv +internlm2_20b_llama,HFv1 ARC,64.68,,hf_open_llm_v1_240829_frozen.csv +internlm2_20b_llama,HFv1 GSM8K,70.66,,hf_open_llm_v1_240829_frozen.csv +internlm2_20b_llama,HFv1 HellaSwag,83.16,,hf_open_llm_v1_240829_frozen.csv +internlm2_20b_llama,HFv1 MMLU,67.27,,hf_open_llm_v1_240829_frozen.csv +internlm2_20b_llama,HFv1 TruthfulQA,54.17,,hf_open_llm_v1_240829_frozen.csv +internlm2_20b_llama,HFv1 Winogrande,84.29,,hf_open_llm_v1_240829_frozen.csv +internlm2_base_20b_llama,HF OpenLLM v1,62.69,,hf_open_llm_v1_240829_frozen.csv +internlm2_base_20b_llama,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv +internlm2_base_20b_llama,HFv1 GSM8K,44.88,,hf_open_llm_v1_240829_frozen.csv +internlm2_base_20b_llama,HFv1 HellaSwag,82.15,,hf_open_llm_v1_240829_frozen.csv +internlm2_base_20b_llama,HFv1 MMLU,63.97,,hf_open_llm_v1_240829_frozen.csv +internlm2_base_20b_llama,HFv1 TruthfulQA,44.11,,hf_open_llm_v1_240829_frozen.csv +internlm2_base_20b_llama,HFv1 Winogrande,78.22,,hf_open_llm_v1_240829_frozen.csv +internlm2_base_7b_llama,HF OpenLLM v1,53.62,,hf_open_llm_v1_240829_frozen.csv +internlm2_base_7b_llama,HFv1 ARC,54.35,,hf_open_llm_v1_240829_frozen.csv +internlm2_base_7b_llama,HFv1 GSM8K,19.18,,hf_open_llm_v1_240829_frozen.csv +internlm2_base_7b_llama,HFv1 HellaSwag,79.47,,hf_open_llm_v1_240829_frozen.csv +internlm2_base_7b_llama,HFv1 MMLU,54.05,,hf_open_llm_v1_240829_frozen.csv +internlm2_base_7b_llama,HFv1 TruthfulQA,43.23,,hf_open_llm_v1_240829_frozen.csv +internlm2_base_7b_llama,HFv1 Winogrande,71.43,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_20b_llama,HF OpenLLM v1,62.56,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_20b_llama,HFv1 ARC,63.65,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_20b_llama,HFv1 GSM8K,33.97,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_20b_llama,HFv1 HellaSwag,82.58,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_20b_llama,HFv1 MMLU,66.89,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_20b_llama,HFv1 TruthfulQA,48.74,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_20b_llama,HFv1 Winogrande,79.56,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_20b_llama_old,HF OpenLLM v1,62.53,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_20b_llama_old,HFv1 ARC,63.65,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_20b_llama_old,HFv1 GSM8K,34.04,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_20b_llama_old,HFv1 HellaSwag,82.57,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_20b_llama_old,HFv1 MMLU,66.85,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_20b_llama_old,HFv1 TruthfulQA,48.75,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_20b_llama_old,HFv1 Winogrande,79.32,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_7b_sft_llama,HF OpenLLM v1,64.34,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_7b_sft_llama,HFv1 ARC,57.17,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_7b_sft_llama,HFv1 GSM8K,55.95,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_7b_sft_llama,HFv1 HellaSwag,80.16,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_7b_sft_llama,HFv1 MMLU,63.92,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_7b_sft_llama,HFv1 TruthfulQA,50.95,,hf_open_llm_v1_240829_frozen.csv +internlm2_chat_7b_sft_llama,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv +internlm_20b,HF OpenLLM v1,59.55,,hf_open_llm_v1_240829_frozen.csv +internlm_20b,HFv1 ARC,60.49,,hf_open_llm_v1_240829_frozen.csv +internlm_20b,HFv1 GSM8K,23.5,,hf_open_llm_v1_240829_frozen.csv +internlm_20b,HFv1 HellaSwag,82.13,,hf_open_llm_v1_240829_frozen.csv +internlm_20b,HFv1 MMLU,61.85,,hf_open_llm_v1_240829_frozen.csv +internlm_20b,HFv1 TruthfulQA,52.61,,hf_open_llm_v1_240829_frozen.csv +internlm_20b,HFv1 Winogrande,76.72,,hf_open_llm_v1_240829_frozen.csv +internlm_20b_llama,HF OpenLLM v1,65.09,,hf_open_llm_v1_240829_frozen.csv +internlm_20b_llama,HFv1 ARC,61.35,,hf_open_llm_v1_240829_frozen.csv +internlm_20b_llama,HFv1 GSM8K,51.1,,hf_open_llm_v1_240829_frozen.csv +internlm_20b_llama,HFv1 HellaSwag,82.08,,hf_open_llm_v1_240829_frozen.csv +internlm_20b_llama,HFv1 MMLU,61.59,,hf_open_llm_v1_240829_frozen.csv +internlm_20b_llama,HFv1 TruthfulQA,57.71,,hf_open_llm_v1_240829_frozen.csv +internlm_20b_llama,HFv1 Winogrande,76.72,,hf_open_llm_v1_240829_frozen.csv +internlm_20b_llamafied,HF OpenLLM v1,29.08,,hf_open_llm_v1_240829_frozen.csv +internlm_20b_llamafied,HFv1 ARC,26.79,,hf_open_llm_v1_240829_frozen.csv +internlm_20b_llamafied,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +internlm_20b_llamafied,HFv1 HellaSwag,26.4,,hf_open_llm_v1_240829_frozen.csv +internlm_20b_llamafied,HFv1 MMLU,25.4,,hf_open_llm_v1_240829_frozen.csv +internlm_20b_llamafied,HFv1 TruthfulQA,48.06,,hf_open_llm_v1_240829_frozen.csv +internlm_20b_llamafied,HFv1 Winogrande,47.83,,hf_open_llm_v1_240829_frozen.csv +ipo_test,HF OpenLLM v1,71.29,,hf_open_llm_v1_240829_frozen.csv +ipo_test,HFv1 ARC,67.92,,hf_open_llm_v1_240829_frozen.csv +ipo_test,HFv1 GSM8K,72.02,,hf_open_llm_v1_240829_frozen.csv +ipo_test,HFv1 HellaSwag,85.99,,hf_open_llm_v1_240829_frozen.csv +ipo_test,HFv1 MMLU,65.05,,hf_open_llm_v1_240829_frozen.csv +ipo_test,HFv1 TruthfulQA,55.87,,hf_open_llm_v1_240829_frozen.csv +ipo_test,HFv1 Winogrande,80.9,,hf_open_llm_v1_240829_frozen.csv +iwillchangethenamelater,HF OpenLLM v1,74.56,,hf_open_llm_v1_240829_frozen.csv +iwillchangethenamelater,HFv1 ARC,72.01,,hf_open_llm_v1_240829_frozen.csv +iwillchangethenamelater,HFv1 GSM8K,68.54,,hf_open_llm_v1_240829_frozen.csv +iwillchangethenamelater,HFv1 HellaSwag,88.23,,hf_open_llm_v1_240829_frozen.csv +iwillchangethenamelater,HFv1 MMLU,64.97,,hf_open_llm_v1_240829_frozen.csv +iwillchangethenamelater,HFv1 TruthfulQA,69.41,,hf_open_llm_v1_240829_frozen.csv +iwillchangethenamelater,HFv1 Winogrande,84.21,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta10_7b_slerp,HF OpenLLM v1,67.99,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta10_7b_slerp,HFv1 ARC,63.48,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta10_7b_slerp,HFv1 GSM8K,61.03,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta10_7b_slerp,HFv1 HellaSwag,83.79,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta10_7b_slerp,HFv1 MMLU,63.16,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta10_7b_slerp,HFv1 TruthfulQA,56.88,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta10_7b_slerp,HFv1 Winogrande,80.11,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta11_7b_slerp,HF OpenLLM v1,68.64,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta11_7b_slerp,HFv1 ARC,64.68,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta11_7b_slerp,HFv1 GSM8K,63.99,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta11_7b_slerp,HFv1 HellaSwag,85.01,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta11_7b_slerp,HFv1 MMLU,63.77,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta11_7b_slerp,HFv1 TruthfulQA,55.77,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta11_7b_slerp,HFv1 Winogrande,78.85,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta12_7b_slerp,HF OpenLLM v1,68.22,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta12_7b_slerp,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta12_7b_slerp,HFv1 GSM8K,59.67,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta12_7b_slerp,HFv1 HellaSwag,83.98,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta12_7b_slerp,HFv1 MMLU,63.28,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta12_7b_slerp,HFv1 TruthfulQA,58.16,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta12_7b_slerp,HFv1 Winogrande,79.64,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta8_slerp,HF OpenLLM v1,61.56,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta8_slerp,HFv1 ARC,60.41,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta8_slerp,HFv1 GSM8K,36.09,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta8_slerp,HFv1 HellaSwag,83.66,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta8_slerp,HFv1 MMLU,62.35,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta8_slerp,HFv1 TruthfulQA,48.69,,hf_open_llm_v1_240829_frozen.csv +j_o_s_i_e_3_beta8_slerp,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv +jallabi_34b,HF OpenLLM v1,70.73,,hf_open_llm_v1_240829_frozen.csv +jallabi_34b,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv +jallabi_34b,HFv1 GSM8K,65.2,,hf_open_llm_v1_240829_frozen.csv +jallabi_34b,HFv1 HellaSwag,83.81,,hf_open_llm_v1_240829_frozen.csv +jallabi_34b,HFv1 MMLU,76.4,,hf_open_llm_v1_240829_frozen.csv +jallabi_34b,HFv1 TruthfulQA,51.46,,hf_open_llm_v1_240829_frozen.csv +jallabi_34b,HFv1 Winogrande,81.45,,hf_open_llm_v1_240829_frozen.csv +japanese_gpt_neox_3_6b,HF OpenLLM v1,29.28,,hf_open_llm_v1_240829_frozen.csv +japanese_gpt_neox_3_6b,HFv1 ARC,25.0,,hf_open_llm_v1_240829_frozen.csv +japanese_gpt_neox_3_6b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +japanese_gpt_neox_3_6b,HFv1 HellaSwag,25.46,,hf_open_llm_v1_240829_frozen.csv +japanese_gpt_neox_3_6b,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +japanese_gpt_neox_3_6b,HFv1 TruthfulQA,51.45,,hf_open_llm_v1_240829_frozen.csv +japanese_gpt_neox_3_6b,HFv1 Winogrande,50.67,,hf_open_llm_v1_240829_frozen.csv +japanese_stablelm_instruct_gamma_7b,HF OpenLLM v1,52.82,,hf_open_llm_v1_240829_frozen.csv +japanese_stablelm_instruct_gamma_7b,HFv1 ARC,50.68,,hf_open_llm_v1_240829_frozen.csv +japanese_stablelm_instruct_gamma_7b,HFv1 GSM8K,19.26,,hf_open_llm_v1_240829_frozen.csv +japanese_stablelm_instruct_gamma_7b,HFv1 HellaSwag,78.68,,hf_open_llm_v1_240829_frozen.csv +japanese_stablelm_instruct_gamma_7b,HFv1 MMLU,54.82,,hf_open_llm_v1_240829_frozen.csv +japanese_stablelm_instruct_gamma_7b,HFv1 TruthfulQA,39.77,,hf_open_llm_v1_240829_frozen.csv +japanese_stablelm_instruct_gamma_7b,HFv1 Winogrande,73.72,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo,HF OpenLLM v1,72.91,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo,HFv1 ARC,70.82,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo,HFv1 GSM8K,70.36,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo,HFv1 HellaSwag,87.02,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo,HFv1 MMLU,64.67,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo,HFv1 TruthfulQA,64.41,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo,HFv1 Winogrande,80.19,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v2,HF OpenLLM v1,72.53,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v2,HFv1 ARC,69.28,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v2,HFv1 GSM8K,71.8,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v2,HFv1 HellaSwag,86.8,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v2,HFv1 MMLU,64.92,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v2,HFv1 TruthfulQA,61.64,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v2,HFv1 Winogrande,80.74,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v3_3,HF OpenLLM v1,76.12,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v3_3,HFv1 ARC,72.27,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v3_3,HFv1 GSM8K,67.85,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v3_3,HFv1 HellaSwag,88.89,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v3_3,HFv1 MMLU,64.34,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v3_3,HFv1 TruthfulQA,79.0,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v3_3,HFv1 Winogrande,84.37,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v4_1,HF OpenLLM v1,75.95,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v4_1,HFv1 ARC,72.95,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v4_1,HFv1 GSM8K,68.31,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v4_1,HFv1 HellaSwag,89.07,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v4_1,HFv1 MMLU,64.75,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v4_1,HFv1 TruthfulQA,75.92,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v4_1,HFv1 Winogrande,84.69,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v4_3,HF OpenLLM v1,76.35,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v4_3,HFv1 ARC,72.61,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v4_3,HFv1 GSM8K,69.07,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v4_3,HFv1 HellaSwag,89.09,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v4_3,HFv1 MMLU,64.29,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v4_3,HFv1 TruthfulQA,78.27,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_dpo_v4_3,HFv1 Winogrande,84.77,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_neuraldpo,HF OpenLLM v1,71.36,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_neuraldpo,HFv1 ARC,73.46,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_neuraldpo,HFv1 GSM8K,58.0,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_neuraldpo,HFv1 HellaSwag,88.16,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_neuraldpo,HFv1 MMLU,63.15,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_neuraldpo,HFv1 TruthfulQA,59.92,,hf_open_llm_v1_240829_frozen.csv +jaskier_7b_neuraldpo,HFv1 Winogrande,85.48,,hf_open_llm_v1_240829_frozen.csv +josie_beta_4_7b_slerp,HF OpenLLM v1,68.06,,hf_open_llm_v1_240829_frozen.csv +josie_beta_4_7b_slerp,HFv1 ARC,63.57,,hf_open_llm_v1_240829_frozen.csv +josie_beta_4_7b_slerp,HFv1 GSM8K,61.71,,hf_open_llm_v1_240829_frozen.csv +josie_beta_4_7b_slerp,HFv1 HellaSwag,84.1,,hf_open_llm_v1_240829_frozen.csv +josie_beta_4_7b_slerp,HFv1 MMLU,63.73,,hf_open_llm_v1_240829_frozen.csv +josie_beta_4_7b_slerp,HFv1 TruthfulQA,55.93,,hf_open_llm_v1_240829_frozen.csv +josie_beta_4_7b_slerp,HFv1 Winogrande,79.32,,hf_open_llm_v1_240829_frozen.csv +justtosuffer_7b_slerp,HF OpenLLM v1,70.48,,hf_open_llm_v1_240829_frozen.csv +justtosuffer_7b_slerp,HFv1 ARC,68.94,,hf_open_llm_v1_240829_frozen.csv +justtosuffer_7b_slerp,HFv1 GSM8K,59.74,,hf_open_llm_v1_240829_frozen.csv +justtosuffer_7b_slerp,HFv1 HellaSwag,86.79,,hf_open_llm_v1_240829_frozen.csv +justtosuffer_7b_slerp,HFv1 MMLU,64.66,,hf_open_llm_v1_240829_frozen.csv +justtosuffer_7b_slerp,HFv1 TruthfulQA,62.69,,hf_open_llm_v1_240829_frozen.csv +justtosuffer_7b_slerp,HFv1 Winogrande,80.03,,hf_open_llm_v1_240829_frozen.csv +juud_mistral_7b,HF OpenLLM v1,61.72,,hf_open_llm_v1_240829_frozen.csv +juud_mistral_7b,HFv1 ARC,66.72,,hf_open_llm_v1_240829_frozen.csv +juud_mistral_7b,HFv1 GSM8K,23.12,,hf_open_llm_v1_240829_frozen.csv +juud_mistral_7b,HFv1 HellaSwag,85.0,,hf_open_llm_v1_240829_frozen.csv +juud_mistral_7b,HFv1 MMLU,63.38,,hf_open_llm_v1_240829_frozen.csv +juud_mistral_7b,HFv1 TruthfulQA,54.12,,hf_open_llm_v1_240829_frozen.csv +juud_mistral_7b,HFv1 Winogrande,77.98,,hf_open_llm_v1_240829_frozen.csv +juud_mistral_7b_dpo,HF OpenLLM v1,60.89,,hf_open_llm_v1_240829_frozen.csv +juud_mistral_7b_dpo,HFv1 ARC,66.81,,hf_open_llm_v1_240829_frozen.csv +juud_mistral_7b_dpo,HFv1 GSM8K,18.8,,hf_open_llm_v1_240829_frozen.csv +juud_mistral_7b_dpo,HFv1 HellaSwag,84.89,,hf_open_llm_v1_240829_frozen.csv +juud_mistral_7b_dpo,HFv1 MMLU,63.03,,hf_open_llm_v1_240829_frozen.csv +juud_mistral_7b_dpo,HFv1 TruthfulQA,53.51,,hf_open_llm_v1_240829_frozen.csv +juud_mistral_7b_dpo,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv +k2,HF OpenLLM v1,64.54,,hf_open_llm_v1_240829_frozen.csv +k2,HFv1 ARC,63.99,,hf_open_llm_v1_240829_frozen.csv +k2,HFv1 GSM8K,48.67,,hf_open_llm_v1_240829_frozen.csv +k2,HFv1 HellaSwag,85.71,,hf_open_llm_v1_240829_frozen.csv +k2,HFv1 MMLU,67.99,,hf_open_llm_v1_240829_frozen.csv +k2,HFv1 TruthfulQA,40.77,,hf_open_llm_v1_240829_frozen.csv +k2,HFv1 Winogrande,80.11,,hf_open_llm_v1_240829_frozen.csv +karakuri_lm_70b_chat_v0_1,HF OpenLLM v1,62.36,,hf_open_llm_v1_240829_frozen.csv +karakuri_lm_70b_chat_v0_1,HFv1 ARC,61.52,,hf_open_llm_v1_240829_frozen.csv +karakuri_lm_70b_chat_v0_1,HFv1 GSM8K,40.41,,hf_open_llm_v1_240829_frozen.csv +karakuri_lm_70b_chat_v0_1,HFv1 HellaSwag,83.13,,hf_open_llm_v1_240829_frozen.csv +karakuri_lm_70b_chat_v0_1,HFv1 MMLU,59.35,,hf_open_llm_v1_240829_frozen.csv +karakuri_lm_70b_chat_v0_1,HFv1 TruthfulQA,51.39,,hf_open_llm_v1_240829_frozen.csv +karakuri_lm_70b_chat_v0_1,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv +karen_theeditor_v2_strict_mistral_7b,HF OpenLLM v1,59.13,,hf_open_llm_v1_240829_frozen.csv +karen_theeditor_v2_strict_mistral_7b,HFv1 ARC,59.56,,hf_open_llm_v1_240829_frozen.csv +karen_theeditor_v2_strict_mistral_7b,HFv1 GSM8K,30.17,,hf_open_llm_v1_240829_frozen.csv +karen_theeditor_v2_strict_mistral_7b,HFv1 HellaSwag,81.79,,hf_open_llm_v1_240829_frozen.csv +karen_theeditor_v2_strict_mistral_7b,HFv1 MMLU,59.56,,hf_open_llm_v1_240829_frozen.csv +karen_theeditor_v2_strict_mistral_7b,HFv1 TruthfulQA,49.36,,hf_open_llm_v1_240829_frozen.csv +karen_theeditor_v2_strict_mistral_7b,HFv1 Winogrande,74.35,,hf_open_llm_v1_240829_frozen.csv +kellemar_dpo_orca_distilled_7b_slerp,HF OpenLLM v1,73.71,,hf_open_llm_v1_240829_frozen.csv +kellemar_dpo_orca_distilled_7b_slerp,HFv1 ARC,70.48,,hf_open_llm_v1_240829_frozen.csv +kellemar_dpo_orca_distilled_7b_slerp,HFv1 GSM8K,72.02,,hf_open_llm_v1_240829_frozen.csv +kellemar_dpo_orca_distilled_7b_slerp,HFv1 HellaSwag,87.56,,hf_open_llm_v1_240829_frozen.csv +kellemar_dpo_orca_distilled_7b_slerp,HFv1 MMLU,65.33,,hf_open_llm_v1_240829_frozen.csv +kellemar_dpo_orca_distilled_7b_slerp,HFv1 TruthfulQA,64.97,,hf_open_llm_v1_240829_frozen.csv +kellemar_dpo_orca_distilled_7b_slerp,HFv1 Winogrande,81.93,,hf_open_llm_v1_240829_frozen.csv +kellemar_krishnahercules_0_1_7b_slerp,HF OpenLLM v1,73.33,,hf_open_llm_v1_240829_frozen.csv +kellemar_krishnahercules_0_1_7b_slerp,HFv1 ARC,70.22,,hf_open_llm_v1_240829_frozen.csv +kellemar_krishnahercules_0_1_7b_slerp,HFv1 GSM8K,71.57,,hf_open_llm_v1_240829_frozen.csv +kellemar_krishnahercules_0_1_7b_slerp,HFv1 HellaSwag,87.29,,hf_open_llm_v1_240829_frozen.csv +kellemar_krishnahercules_0_1_7b_slerp,HFv1 MMLU,65.61,,hf_open_llm_v1_240829_frozen.csv +kellemar_krishnahercules_0_1_7b_slerp,HFv1 TruthfulQA,63.03,,hf_open_llm_v1_240829_frozen.csv +kellemar_krishnahercules_0_1_7b_slerp,HFv1 Winogrande,82.24,,hf_open_llm_v1_240829_frozen.csv +kindred_7b_slerp,HF OpenLLM v1,74.29,,hf_open_llm_v1_240829_frozen.csv +kindred_7b_slerp,HFv1 ARC,71.76,,hf_open_llm_v1_240829_frozen.csv +kindred_7b_slerp,HFv1 GSM8K,70.2,,hf_open_llm_v1_240829_frozen.csv +kindred_7b_slerp,HFv1 HellaSwag,87.78,,hf_open_llm_v1_240829_frozen.csv +kindred_7b_slerp,HFv1 MMLU,64.76,,hf_open_llm_v1_240829_frozen.csv +kindred_7b_slerp,HFv1 TruthfulQA,68.12,,hf_open_llm_v1_240829_frozen.csv +kindred_7b_slerp,HFv1 Winogrande,83.11,,hf_open_llm_v1_240829_frozen.csv +kingnish_llama3_8b,HF OpenLLM v1,72.12,,hf_open_llm_v1_240829_frozen.csv +kingnish_llama3_8b,HFv1 ARC,69.97,,hf_open_llm_v1_240829_frozen.csv +kingnish_llama3_8b,HFv1 GSM8K,69.07,,hf_open_llm_v1_240829_frozen.csv +kingnish_llama3_8b,HFv1 HellaSwag,85.83,,hf_open_llm_v1_240829_frozen.csv +kingnish_llama3_8b,HFv1 MMLU,68.45,,hf_open_llm_v1_240829_frozen.csv +kingnish_llama3_8b,HFv1 TruthfulQA,61.02,,hf_open_llm_v1_240829_frozen.csv +kingnish_llama3_8b,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv +kiqu_70b,HF OpenLLM v1,75.29,,hf_open_llm_v1_240829_frozen.csv +kiqu_70b,HFv1 ARC,72.1,,hf_open_llm_v1_240829_frozen.csv +kiqu_70b,HFv1 GSM8K,68.46,,hf_open_llm_v1_240829_frozen.csv +kiqu_70b,HFv1 HellaSwag,87.94,,hf_open_llm_v1_240829_frozen.csv +kiqu_70b,HFv1 MMLU,74.93,,hf_open_llm_v1_240829_frozen.csv +kiqu_70b,HFv1 TruthfulQA,63.48,,hf_open_llm_v1_240829_frozen.csv +kiqu_70b,HFv1 Winogrande,84.85,,hf_open_llm_v1_240829_frozen.csv +knowledgeninja_litellama_460mx6moe_1t,HF OpenLLM v1,30.23,,hf_open_llm_v1_240829_frozen.csv +knowledgeninja_litellama_460mx6moe_1t,HFv1 ARC,25.17,,hf_open_llm_v1_240829_frozen.csv +knowledgeninja_litellama_460mx6moe_1t,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +knowledgeninja_litellama_460mx6moe_1t,HFv1 HellaSwag,38.45,,hf_open_llm_v1_240829_frozen.csv +knowledgeninja_litellama_460mx6moe_1t,HFv1 MMLU,26.16,,hf_open_llm_v1_240829_frozen.csv +knowledgeninja_litellama_460mx6moe_1t,HFv1 TruthfulQA,41.57,,hf_open_llm_v1_240829_frozen.csv +knowledgeninja_litellama_460mx6moe_1t,HFv1 Winogrande,50.04,,hf_open_llm_v1_240829_frozen.csv +ko_wand_136m,HF OpenLLM v1,28.29,,hf_open_llm_v1_240829_frozen.csv +ko_wand_136m,HFv1 ARC,21.33,,hf_open_llm_v1_240829_frozen.csv +ko_wand_136m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +ko_wand_136m,HFv1 HellaSwag,25.0,,hf_open_llm_v1_240829_frozen.csv +ko_wand_136m,HFv1 MMLU,23.58,,hf_open_llm_v1_240829_frozen.csv +ko_wand_136m,HFv1 TruthfulQA,50.68,,hf_open_llm_v1_240829_frozen.csv +ko_wand_136m,HFv1 Winogrande,49.17,,hf_open_llm_v1_240829_frozen.csv +koalpaca_korwkv_6b,HF OpenLLM v1,28.57,,hf_open_llm_v1_240829_frozen.csv +koalpaca_korwkv_6b,HFv1 ARC,23.46,,hf_open_llm_v1_240829_frozen.csv +koalpaca_korwkv_6b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +koalpaca_korwkv_6b,HFv1 HellaSwag,31.65,,hf_open_llm_v1_240829_frozen.csv +koalpaca_korwkv_6b,HFv1 MMLU,24.89,,hf_open_llm_v1_240829_frozen.csv +koalpaca_korwkv_6b,HFv1 TruthfulQA,39.83,,hf_open_llm_v1_240829_frozen.csv +koalpaca_korwkv_6b,HFv1 Winogrande,51.62,,hf_open_llm_v1_240829_frozen.csv +kollama2_7b_v2,HF OpenLLM v1,50.66,,hf_open_llm_v1_240829_frozen.csv +kollama2_7b_v2,HFv1 ARC,53.33,,hf_open_llm_v1_240829_frozen.csv +kollama2_7b_v2,HFv1 GSM8K,6.52,,hf_open_llm_v1_240829_frozen.csv +kollama2_7b_v2,HFv1 HellaSwag,78.5,,hf_open_llm_v1_240829_frozen.csv +kollama2_7b_v2,HFv1 MMLU,43.61,,hf_open_llm_v1_240829_frozen.csv +kollama2_7b_v2,HFv1 TruthfulQA,46.37,,hf_open_llm_v1_240829_frozen.csv +kollama2_7b_v2,HFv1 Winogrande,75.61,,hf_open_llm_v1_240829_frozen.csv +koopenchat_sft,HF OpenLLM v1,58.61,,hf_open_llm_v1_240829_frozen.csv +koopenchat_sft,HFv1 ARC,59.81,,hf_open_llm_v1_240829_frozen.csv +koopenchat_sft,HFv1 GSM8K,24.18,,hf_open_llm_v1_240829_frozen.csv +koopenchat_sft,HFv1 HellaSwag,78.73,,hf_open_llm_v1_240829_frozen.csv +koopenchat_sft,HFv1 MMLU,61.32,,hf_open_llm_v1_240829_frozen.csv +koopenchat_sft,HFv1 TruthfulQA,51.24,,hf_open_llm_v1_240829_frozen.csv +koopenchat_sft,HFv1 Winogrande,76.4,,hf_open_llm_v1_240829_frozen.csv +korwkv_6b,HF OpenLLM v1,28.19,,hf_open_llm_v1_240829_frozen.csv +korwkv_6b,HFv1 ARC,22.1,,hf_open_llm_v1_240829_frozen.csv +korwkv_6b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +korwkv_6b,HFv1 HellaSwag,32.18,,hf_open_llm_v1_240829_frozen.csv +korwkv_6b,HFv1 MMLU,24.69,,hf_open_llm_v1_240829_frozen.csv +korwkv_6b,HFv1 TruthfulQA,39.05,,hf_open_llm_v1_240829_frozen.csv +korwkv_6b,HFv1 Winogrande,51.14,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_2,HF OpenLLM v1,64.2,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_2,HFv1 ARC,61.35,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_2,HFv1 GSM8K,47.69,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_2,HFv1 HellaSwag,82.63,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_2,HFv1 MMLU,64.85,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_2,HFv1 TruthfulQA,47.94,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_2,HFv1 Winogrande,80.74,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_2_1_3_dedup_p,HF OpenLLM v1,65.43,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_2_1_3_dedup_p,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_2_1_3_dedup_p,HFv1 GSM8K,48.07,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_2_1_3_dedup_p,HFv1 HellaSwag,83.63,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_2_1_3_dedup_p,HFv1 MMLU,64.61,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_2_1_3_dedup_p,HFv1 TruthfulQA,52.69,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_2_1_3_dedup_p,HFv1 Winogrande,80.51,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_3,HF OpenLLM v1,64.76,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_3,HFv1 ARC,62.8,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_3,HFv1 GSM8K,50.49,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_3,HFv1 HellaSwag,83.73,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_3,HFv1 MMLU,64.51,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_3,HFv1 TruthfulQA,44.57,,hf_open_llm_v1_240829_frozen.csv +kosolar_10_7b_v0_3,HFv1 Winogrande,82.48,,hf_open_llm_v1_240829_frozen.csv +kunomaid_7b_slerp,HF OpenLLM v1,69.21,,hf_open_llm_v1_240829_frozen.csv +kunomaid_7b_slerp,HFv1 ARC,68.0,,hf_open_llm_v1_240829_frozen.csv +kunomaid_7b_slerp,HFv1 GSM8K,61.64,,hf_open_llm_v1_240829_frozen.csv +kunomaid_7b_slerp,HFv1 HellaSwag,86.34,,hf_open_llm_v1_240829_frozen.csv +kunomaid_7b_slerp,HFv1 MMLU,64.82,,hf_open_llm_v1_240829_frozen.csv +kunomaid_7b_slerp,HFv1 TruthfulQA,55.19,,hf_open_llm_v1_240829_frozen.csv +kunomaid_7b_slerp,HFv1 Winogrande,79.24,,hf_open_llm_v1_240829_frozen.csv +lamini_neo_1_3b_mental_health_lora,HF OpenLLM v1,29.3,,hf_open_llm_v1_240829_frozen.csv +lamini_neo_1_3b_mental_health_lora,HFv1 ARC,25.77,,hf_open_llm_v1_240829_frozen.csv +lamini_neo_1_3b_mental_health_lora,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +lamini_neo_1_3b_mental_health_lora,HFv1 HellaSwag,25.67,,hf_open_llm_v1_240829_frozen.csv +lamini_neo_1_3b_mental_health_lora,HFv1 MMLU,27.0,,hf_open_llm_v1_240829_frozen.csv +lamini_neo_1_3b_mental_health_lora,HFv1 TruthfulQA,48.21,,hf_open_llm_v1_240829_frozen.csv +lamini_neo_1_3b_mental_health_lora,HFv1 Winogrande,49.17,,hf_open_llm_v1_240829_frozen.csv +laser_dolphin_mixtral_2x7b_dpo,HF OpenLLM v1,67.16,,hf_open_llm_v1_240829_frozen.csv +laser_dolphin_mixtral_2x7b_dpo,HFv1 ARC,65.96,,hf_open_llm_v1_240829_frozen.csv +laser_dolphin_mixtral_2x7b_dpo,HFv1 GSM8K,48.29,,hf_open_llm_v1_240829_frozen.csv +laser_dolphin_mixtral_2x7b_dpo,HFv1 HellaSwag,85.8,,hf_open_llm_v1_240829_frozen.csv +laser_dolphin_mixtral_2x7b_dpo,HFv1 MMLU,63.17,,hf_open_llm_v1_240829_frozen.csv +laser_dolphin_mixtral_2x7b_dpo,HFv1 TruthfulQA,60.76,,hf_open_llm_v1_240829_frozen.csv +laser_dolphin_mixtral_2x7b_dpo,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv +lemur_70b_chat_v1,HF OpenLLM v1,65.38,,hf_open_llm_v1_240829_frozen.csv +lemur_70b_chat_v1,HFv1 ARC,66.98,,hf_open_llm_v1_240829_frozen.csv +lemur_70b_chat_v1,HFv1 GSM8K,35.33,,hf_open_llm_v1_240829_frozen.csv +lemur_70b_chat_v1,HFv1 HellaSwag,85.73,,hf_open_llm_v1_240829_frozen.csv +lemur_70b_chat_v1,HFv1 MMLU,65.99,,hf_open_llm_v1_240829_frozen.csv +lemur_70b_chat_v1,HFv1 TruthfulQA,56.58,,hf_open_llm_v1_240829_frozen.csv +lemur_70b_chat_v1,HFv1 Winogrande,81.69,,hf_open_llm_v1_240829_frozen.csv +leo_hessianai_7b_chat,HF OpenLLM v1,49.29,,hf_open_llm_v1_240829_frozen.csv +leo_hessianai_7b_chat,HFv1 ARC,52.56,,hf_open_llm_v1_240829_frozen.csv +leo_hessianai_7b_chat,HFv1 GSM8K,5.16,,hf_open_llm_v1_240829_frozen.csv +leo_hessianai_7b_chat,HFv1 HellaSwag,77.61,,hf_open_llm_v1_240829_frozen.csv +leo_hessianai_7b_chat,HFv1 MMLU,45.58,,hf_open_llm_v1_240829_frozen.csv +leo_hessianai_7b_chat,HFv1 TruthfulQA,44.89,,hf_open_llm_v1_240829_frozen.csv +leo_hessianai_7b_chat,HFv1 Winogrande,69.93,,hf_open_llm_v1_240829_frozen.csv +leo_hessianai_7b_chat_bilingual,HF OpenLLM v1,48.72,,hf_open_llm_v1_240829_frozen.csv +leo_hessianai_7b_chat_bilingual,HFv1 ARC,51.02,,hf_open_llm_v1_240829_frozen.csv +leo_hessianai_7b_chat_bilingual,HFv1 GSM8K,2.73,,hf_open_llm_v1_240829_frozen.csv +leo_hessianai_7b_chat_bilingual,HFv1 HellaSwag,76.03,,hf_open_llm_v1_240829_frozen.csv +leo_hessianai_7b_chat_bilingual,HFv1 MMLU,44.68,,hf_open_llm_v1_240829_frozen.csv +leo_hessianai_7b_chat_bilingual,HFv1 TruthfulQA,47.16,,hf_open_llm_v1_240829_frozen.csv +leo_hessianai_7b_chat_bilingual,HFv1 Winogrande,70.72,,hf_open_llm_v1_240829_frozen.csv +leoscorpius_7b_chat_dpo,HF OpenLLM v1,73.92,,hf_open_llm_v1_240829_frozen.csv +leoscorpius_7b_chat_dpo,HFv1 ARC,70.48,,hf_open_llm_v1_240829_frozen.csv +leoscorpius_7b_chat_dpo,HFv1 GSM8K,69.07,,hf_open_llm_v1_240829_frozen.csv +leoscorpius_7b_chat_dpo,HFv1 HellaSwag,87.97,,hf_open_llm_v1_240829_frozen.csv +leoscorpius_7b_chat_dpo,HFv1 MMLU,65.08,,hf_open_llm_v1_240829_frozen.csv +leoscorpius_7b_chat_dpo,HFv1 TruthfulQA,68.83,,hf_open_llm_v1_240829_frozen.csv +leoscorpius_7b_chat_dpo,HFv1 Winogrande,82.08,,hf_open_llm_v1_240829_frozen.csv +lexgpt_v3,HF OpenLLM v1,69.49,,hf_open_llm_v1_240829_frozen.csv +lexgpt_v3,HFv1 ARC,66.47,,hf_open_llm_v1_240829_frozen.csv +lexgpt_v3,HFv1 GSM8K,61.56,,hf_open_llm_v1_240829_frozen.csv +lexgpt_v3,HFv1 HellaSwag,85.91,,hf_open_llm_v1_240829_frozen.csv +lexgpt_v3,HFv1 MMLU,64.48,,hf_open_llm_v1_240829_frozen.csv +lexgpt_v3,HFv1 TruthfulQA,59.98,,hf_open_llm_v1_240829_frozen.csv +lexgpt_v3,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv +lhk,HF OpenLLM v1,68.74,,hf_open_llm_v1_240829_frozen.csv +lhk,HFv1 ARC,66.38,,hf_open_llm_v1_240829_frozen.csv +lhk,HFv1 GSM8K,56.33,,hf_open_llm_v1_240829_frozen.csv +lhk,HFv1 HellaSwag,84.49,,hf_open_llm_v1_240829_frozen.csv +lhk,HFv1 MMLU,65.13,,hf_open_llm_v1_240829_frozen.csv +lhk,HFv1 TruthfulQA,59.12,,hf_open_llm_v1_240829_frozen.csv +lhk,HFv1 Winogrande,80.98,,hf_open_llm_v1_240829_frozen.csv +libra_19b,HF OpenLLM v1,53.83,,hf_open_llm_v1_240829_frozen.csv +libra_19b,HFv1 ARC,60.58,,hf_open_llm_v1_240829_frozen.csv +libra_19b,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv +libra_19b,HFv1 HellaSwag,82.04,,hf_open_llm_v1_240829_frozen.csv +libra_19b,HFv1 MMLU,55.57,,hf_open_llm_v1_240829_frozen.csv +libra_19b,HFv1 TruthfulQA,48.41,,hf_open_llm_v1_240829_frozen.csv +libra_19b,HFv1 Winogrande,76.32,,hf_open_llm_v1_240829_frozen.csv +lil_c3po,HF OpenLLM v1,68.03,,hf_open_llm_v1_240829_frozen.csv +lil_c3po,HFv1 ARC,65.02,,hf_open_llm_v1_240829_frozen.csv +lil_c3po,HFv1 GSM8K,48.45,,hf_open_llm_v1_240829_frozen.csv +lil_c3po,HFv1 HellaSwag,84.45,,hf_open_llm_v1_240829_frozen.csv +lil_c3po,HFv1 MMLU,62.36,,hf_open_llm_v1_240829_frozen.csv +lil_c3po,HFv1 TruthfulQA,68.73,,hf_open_llm_v1_240829_frozen.csv +lil_c3po,HFv1 Winogrande,79.16,,hf_open_llm_v1_240829_frozen.csv +lima2_13b,HF OpenLLM v1,52.98,,hf_open_llm_v1_240829_frozen.csv +lima2_13b,HFv1 ARC,60.24,,hf_open_llm_v1_240829_frozen.csv +lima2_13b,HFv1 GSM8K,5.76,,hf_open_llm_v1_240829_frozen.csv +lima2_13b,HFv1 HellaSwag,83.69,,hf_open_llm_v1_240829_frozen.csv +lima2_13b,HFv1 MMLU,53.17,,hf_open_llm_v1_240829_frozen.csv +lima2_13b,HFv1 TruthfulQA,41.81,,hf_open_llm_v1_240829_frozen.csv +lima2_13b,HFv1 Winogrande,73.24,,hf_open_llm_v1_240829_frozen.csv +lima2_7b,HF OpenLLM v1,49.27,,hf_open_llm_v1_240829_frozen.csv +lima2_7b,HFv1 ARC,53.24,,hf_open_llm_v1_240829_frozen.csv +lima2_7b,HFv1 GSM8K,3.87,,hf_open_llm_v1_240829_frozen.csv +lima2_7b,HFv1 HellaSwag,80.6,,hf_open_llm_v1_240829_frozen.csv +lima2_7b,HFv1 MMLU,43.22,,hf_open_llm_v1_240829_frozen.csv +lima2_7b,HFv1 TruthfulQA,44.74,,hf_open_llm_v1_240829_frozen.csv +lima2_7b,HFv1 Winogrande,69.93,,hf_open_llm_v1_240829_frozen.csv +lima_unchained_70b,HF OpenLLM v1,65.51,,hf_open_llm_v1_240829_frozen.csv +lima_unchained_70b,HFv1 ARC,68.26,,hf_open_llm_v1_240829_frozen.csv +lima_unchained_70b,HFv1 GSM8K,34.72,,hf_open_llm_v1_240829_frozen.csv +lima_unchained_70b,HFv1 HellaSwag,87.65,,hf_open_llm_v1_240829_frozen.csv +lima_unchained_70b,HFv1 MMLU,70.0,,hf_open_llm_v1_240829_frozen.csv +lima_unchained_70b,HFv1 TruthfulQA,48.76,,hf_open_llm_v1_240829_frozen.csv +lima_unchained_70b,HFv1 Winogrande,83.66,,hf_open_llm_v1_240829_frozen.csv +limarp_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,54.46,,hf_open_llm_v1_240829_frozen.csv +limarp_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,60.49,,hf_open_llm_v1_240829_frozen.csv +limarp_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,6.07,,hf_open_llm_v1_240829_frozen.csv +limarp_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,82.76,,hf_open_llm_v1_240829_frozen.csv +limarp_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,56.52,,hf_open_llm_v1_240829_frozen.csv +limarp_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,44.14,,hf_open_llm_v1_240829_frozen.csv +limarp_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,76.8,,hf_open_llm_v1_240829_frozen.csv +liph42,HF OpenLLM v1,62.12,,hf_open_llm_v1_240829_frozen.csv +liph42,HFv1 ARC,62.03,,hf_open_llm_v1_240829_frozen.csv +liph42,HFv1 GSM8K,56.94,,hf_open_llm_v1_240829_frozen.csv +liph42,HFv1 HellaSwag,75.87,,hf_open_llm_v1_240829_frozen.csv +liph42,HFv1 MMLU,57.37,,hf_open_llm_v1_240829_frozen.csv +liph42,HFv1 TruthfulQA,45.94,,hf_open_llm_v1_240829_frozen.csv +liph42,HFv1 Winogrande,74.59,,hf_open_llm_v1_240829_frozen.csv +litellama_460m_1t,HF OpenLLM v1,30.16,,hf_open_llm_v1_240829_frozen.csv +litellama_460m_1t,HFv1 ARC,24.83,,hf_open_llm_v1_240829_frozen.csv +litellama_460m_1t,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +litellama_460m_1t,HFv1 HellaSwag,38.39,,hf_open_llm_v1_240829_frozen.csv +litellama_460m_1t,HFv1 MMLU,25.96,,hf_open_llm_v1_240829_frozen.csv +litellama_460m_1t,HFv1 TruthfulQA,41.59,,hf_open_llm_v1_240829_frozen.csv +litellama_460m_1t,HFv1 Winogrande,50.2,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_chinese_v2,HF OpenLLM v1,49.58,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_chinese_v2,HFv1 ARC,53.92,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_chinese_v2,HFv1 GSM8K,2.2,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_chinese_v2,HFv1 HellaSwag,74.64,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_chinese_v2,HFv1 MMLU,49.74,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_chinese_v2,HFv1 TruthfulQA,45.43,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_chinese_v2,HFv1 Winogrande,71.59,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_holomax,HF OpenLLM v1,54.52,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_holomax,HFv1 ARC,60.49,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_holomax,HFv1 GSM8K,11.45,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_holomax,HFv1 HellaSwag,82.86,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_holomax,HFv1 MMLU,54.67,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_holomax,HFv1 TruthfulQA,42.97,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_holomax,HFv1 Winogrande,74.66,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_platypus_ckpt_1000,HF OpenLLM v1,29.28,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_platypus_ckpt_1000,HFv1 ARC,28.16,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_platypus_ckpt_1000,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_platypus_ckpt_1000,HFv1 HellaSwag,26.55,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_platypus_ckpt_1000,HFv1 MMLU,23.17,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_platypus_ckpt_1000,HFv1 TruthfulQA,48.79,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_platypus_ckpt_1000,HFv1 Winogrande,49.01,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_sharegpt4_orca_openplatypus_8w,HF OpenLLM v1,55.75,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_sharegpt4_orca_openplatypus_8w,HFv1 ARC,62.8,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_sharegpt4_orca_openplatypus_8w,HFv1 GSM8K,11.75,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_sharegpt4_orca_openplatypus_8w,HFv1 HellaSwag,84.04,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_sharegpt4_orca_openplatypus_8w,HFv1 MMLU,55.13,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_sharegpt4_orca_openplatypus_8w,HFv1 TruthfulQA,45.66,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_sharegpt4_orca_openplatypus_8w,HFv1 Winogrande,75.14,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_sharegpt4_test,HF OpenLLM v1,55.69,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_sharegpt4_test,HFv1 ARC,58.02,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_sharegpt4_test,HFv1 GSM8K,13.12,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_sharegpt4_test,HFv1 HellaSwag,82.65,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_sharegpt4_test,HFv1 MMLU,55.99,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_sharegpt4_test,HFv1 TruthfulQA,48.27,,hf_open_llm_v1_240829_frozen.csv +llama2_13b_sharegpt4_test,HFv1 Winogrande,76.09,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_instruction_lora,HF OpenLLM v1,51.54,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_instruction_lora,HFv1 ARC,55.38,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_instruction_lora,HFv1 GSM8K,9.86,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_instruction_lora,HFv1 HellaSwag,78.57,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_instruction_lora,HFv1 MMLU,49.39,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_instruction_lora,HFv1 TruthfulQA,41.83,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_instruction_lora,HFv1 Winogrande,74.19,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_openorca_mc_v1,HF OpenLLM v1,52.24,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_openorca_mc_v1,HFv1 ARC,55.63,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_openorca_mc_v1,HFv1 GSM8K,4.09,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_openorca_mc_v1,HFv1 HellaSwag,80.17,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_openorca_mc_v1,HFv1 MMLU,48.44,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_openorca_mc_v1,HFv1 TruthfulQA,51.62,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_openorca_mc_v1,HFv1 Winogrande,73.48,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_openorca_mc_v2_dpo,HF OpenLLM v1,52.32,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_openorca_mc_v2_dpo,HFv1 ARC,54.78,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_openorca_mc_v2_dpo,HFv1 GSM8K,4.47,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_openorca_mc_v2_dpo,HFv1 HellaSwag,81.48,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_openorca_mc_v2_dpo,HFv1 MMLU,47.2,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_openorca_mc_v2_dpo,HFv1 TruthfulQA,53.13,,hf_open_llm_v1_240829_frozen.csv +llama2_7b_openorca_mc_v2_dpo,HFv1 Winogrande,72.85,,hf_open_llm_v1_240829_frozen.csv +llama2_megamerge_dare_13b_v2,HF OpenLLM v1,57.94,,hf_open_llm_v1_240829_frozen.csv +llama2_megamerge_dare_13b_v2,HFv1 ARC,59.39,,hf_open_llm_v1_240829_frozen.csv +llama2_megamerge_dare_13b_v2,HFv1 GSM8K,29.26,,hf_open_llm_v1_240829_frozen.csv +llama2_megamerge_dare_13b_v2,HFv1 HellaSwag,80.93,,hf_open_llm_v1_240829_frozen.csv +llama2_megamerge_dare_13b_v2,HFv1 MMLU,55.26,,hf_open_llm_v1_240829_frozen.csv +llama2_megamerge_dare_13b_v2,HFv1 TruthfulQA,47.27,,hf_open_llm_v1_240829_frozen.csv +llama2_megamerge_dare_13b_v2,HFv1 Winogrande,75.53,,hf_open_llm_v1_240829_frozen.csv +llama2_xs_460m_experimental,HF OpenLLM v1,30.17,,hf_open_llm_v1_240829_frozen.csv +llama2_xs_460m_experimental,HFv1 ARC,24.91,,hf_open_llm_v1_240829_frozen.csv +llama2_xs_460m_experimental,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +llama2_xs_460m_experimental,HFv1 HellaSwag,38.47,,hf_open_llm_v1_240829_frozen.csv +llama2_xs_460m_experimental,HFv1 MMLU,26.17,,hf_open_llm_v1_240829_frozen.csv +llama2_xs_460m_experimental,HFv1 TruthfulQA,41.59,,hf_open_llm_v1_240829_frozen.csv +llama2_xs_460m_experimental,HFv1 Winogrande,49.88,,hf_open_llm_v1_240829_frozen.csv +llama3,HF OpenLLM v1,37.78,,hf_open_llm_v1_240829_frozen.csv +llama3,HFv1 ARC,37.71,,hf_open_llm_v1_240829_frozen.csv +llama3,HFv1 GSM8K,5.23,,hf_open_llm_v1_240829_frozen.csv +llama3,HFv1 HellaSwag,58.93,,hf_open_llm_v1_240829_frozen.csv +llama3,HFv1 MMLU,25.33,,hf_open_llm_v1_240829_frozen.csv +llama3,HFv1 TruthfulQA,42.79,,hf_open_llm_v1_240829_frozen.csv +llama3,HFv1 Winogrande,56.67,,hf_open_llm_v1_240829_frozen.csv +llama30b,HF OpenLLM v1,56.94,,hf_open_llm_v1_240829_frozen.csv +llama30b,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv +llama30b,HFv1 GSM8K,14.86,,hf_open_llm_v1_240829_frozen.csv +llama30b,HFv1 HellaSwag,84.73,,hf_open_llm_v1_240829_frozen.csv +llama30b,HFv1 MMLU,58.47,,hf_open_llm_v1_240829_frozen.csv +llama30b,HFv1 TruthfulQA,42.27,,hf_open_llm_v1_240829_frozen.csv +llama30b,HFv1 Winogrande,80.03,,hf_open_llm_v1_240829_frozen.csv +llama33b_instructed,HF OpenLLM v1,58.18,,hf_open_llm_v1_240829_frozen.csv +llama33b_instructed,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv +llama33b_instructed,HFv1 GSM8K,14.4,,hf_open_llm_v1_240829_frozen.csv +llama33b_instructed,HFv1 HellaSwag,86.17,,hf_open_llm_v1_240829_frozen.csv +llama33b_instructed,HFv1 MMLU,60.5,,hf_open_llm_v1_240829_frozen.csv +llama33b_instructed,HFv1 TruthfulQA,44.12,,hf_open_llm_v1_240829_frozen.csv +llama33b_instructed,HFv1 Winogrande,79.32,,hf_open_llm_v1_240829_frozen.csv +llama39m,HF OpenLLM v1,28.45,,hf_open_llm_v1_240829_frozen.csv +llama39m,HFv1 ARC,24.06,,hf_open_llm_v1_240829_frozen.csv +llama39m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +llama39m,HFv1 HellaSwag,25.57,,hf_open_llm_v1_240829_frozen.csv +llama39m,HFv1 MMLU,24.31,,hf_open_llm_v1_240829_frozen.csv +llama39m,HFv1 TruthfulQA,47.19,,hf_open_llm_v1_240829_frozen.csv +llama39m,HFv1 Winogrande,49.57,,hf_open_llm_v1_240829_frozen.csv +llama3_13b,HF OpenLLM v1,54.61,,hf_open_llm_v1_240829_frozen.csv +llama3_13b,HFv1 ARC,52.99,,hf_open_llm_v1_240829_frozen.csv +llama3_13b,HFv1 GSM8K,21.91,,hf_open_llm_v1_240829_frozen.csv +llama3_13b,HFv1 HellaSwag,80.66,,hf_open_llm_v1_240829_frozen.csv +llama3_13b,HFv1 MMLU,62.12,,hf_open_llm_v1_240829_frozen.csv +llama3_13b,HFv1 TruthfulQA,39.28,,hf_open_llm_v1_240829_frozen.csv +llama3_13b,HFv1 Winogrande,70.72,,hf_open_llm_v1_240829_frozen.csv +llama3_70b,HF OpenLLM v1,73.96,,hf_open_llm_v1_240829_frozen.csv +llama3_70b,HFv1 ARC,68.77,,hf_open_llm_v1_240829_frozen.csv +llama3_70b,HFv1 GSM8K,76.88,,hf_open_llm_v1_240829_frozen.csv +llama3_70b,HFv1 HellaSwag,87.98,,hf_open_llm_v1_240829_frozen.csv +llama3_70b,HFv1 MMLU,79.23,,hf_open_llm_v1_240829_frozen.csv +llama3_70b,HFv1 TruthfulQA,45.56,,hf_open_llm_v1_240829_frozen.csv +llama3_70b,HFv1 Winogrande,85.32,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_chinese_chat,HF OpenLLM v1,77.34,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_chinese_chat,HFv1 ARC,70.39,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_chinese_chat,HFv1 GSM8K,83.24,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_chinese_chat,HFv1 HellaSwag,85.81,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_chinese_chat,HFv1 MMLU,79.74,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_chinese_chat,HFv1 TruthfulQA,61.1,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_chinese_chat,HFv1 Winogrande,83.74,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct,HF OpenLLM v1,77.88,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct,HFv1 ARC,71.42,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct,HFv1 GSM8K,85.44,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct,HFv1 HellaSwag,85.69,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct,HFv1 MMLU,80.06,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct,HFv1 TruthfulQA,61.81,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct,HFv1 Winogrande,82.87,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_1,HF OpenLLM v1,78.11,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_1,HFv1 ARC,71.67,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_1,HFv1 GSM8K,86.05,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_1,HFv1 HellaSwag,85.83,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_1,HFv1 MMLU,80.12,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_1,HFv1 TruthfulQA,62.11,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_1,HFv1 Winogrande,82.87,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_2,HF OpenLLM v1,78.96,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_2,HFv1 ARC,72.53,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_2,HFv1 GSM8K,88.25,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_2,HFv1 HellaSwag,86.22,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_2,HFv1 MMLU,80.41,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_2,HFv1 TruthfulQA,63.57,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_2,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_3,HF OpenLLM v1,78.74,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_3,HFv1 ARC,72.35,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_3,HFv1 GSM8K,87.19,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_3,HFv1 HellaSwag,86.0,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_3,HFv1 MMLU,80.47,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_3,HFv1 TruthfulQA,63.45,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_3,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_4,HF OpenLLM v1,78.89,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_4,HFv1 ARC,72.61,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_4,HFv1 GSM8K,87.34,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_4,HFv1 HellaSwag,86.03,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_4,HFv1 MMLU,80.5,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_4,HFv1 TruthfulQA,63.26,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_dpo_v0_4,HFv1 Winogrande,83.58,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_gradient_524k,HF OpenLLM v1,73.97,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_gradient_524k,HFv1 ARC,66.81,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_gradient_524k,HFv1 GSM8K,78.85,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_gradient_524k,HFv1 HellaSwag,85.46,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_gradient_524k,HFv1 MMLU,76.37,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_gradient_524k,HFv1 TruthfulQA,53.73,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_instruct_gradient_524k,HFv1 Winogrande,82.64,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_japanese_suzume_vector_v0_1,HF OpenLLM v1,78.6,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_japanese_suzume_vector_v0_1,HFv1 ARC,72.35,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_japanese_suzume_vector_v0_1,HFv1 GSM8K,87.41,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_japanese_suzume_vector_v0_1,HFv1 HellaSwag,85.81,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_japanese_suzume_vector_v0_1,HFv1 MMLU,80.28,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_japanese_suzume_vector_v0_1,HFv1 TruthfulQA,62.93,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_japanese_suzume_vector_v0_1,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_orpo_v0_1,HF OpenLLM v1,74.67,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_orpo_v0_1,HFv1 ARC,68.69,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_orpo_v0_1,HFv1 GSM8K,76.8,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_orpo_v0_1,HFv1 HellaSwag,88.01,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_orpo_v0_1,HFv1 MMLU,79.39,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_orpo_v0_1,HFv1 TruthfulQA,49.62,,hf_open_llm_v1_240829_frozen.csv +llama3_70b_orpo_v0_1,HFv1 Winogrande,85.48,,hf_open_llm_v1_240829_frozen.csv +llama3_7b,HF OpenLLM v1,63.96,,hf_open_llm_v1_240829_frozen.csv +llama3_7b,HFv1 ARC,60.24,,hf_open_llm_v1_240829_frozen.csv +llama3_7b,HFv1 GSM8K,47.61,,hf_open_llm_v1_240829_frozen.csv +llama3_7b,HFv1 HellaSwag,77.38,,hf_open_llm_v1_240829_frozen.csv +llama3_7b,HFv1 MMLU,64.09,,hf_open_llm_v1_240829_frozen.csv +llama3_7b,HFv1 TruthfulQA,59.56,,hf_open_llm_v1_240829_frozen.csv +llama3_7b,HFv1 Winogrande,74.9,,hf_open_llm_v1_240829_frozen.csv +llama3_8b,HF OpenLLM v1,62.62,,hf_open_llm_v1_240829_frozen.csv +llama3_8b,HFv1 ARC,60.24,,hf_open_llm_v1_240829_frozen.csv +llama3_8b,HFv1 GSM8K,45.34,,hf_open_llm_v1_240829_frozen.csv +llama3_8b,HFv1 HellaSwag,82.23,,hf_open_llm_v1_240829_frozen.csv +llama3_8b,HFv1 MMLU,66.7,,hf_open_llm_v1_240829_frozen.csv +llama3_8b,HFv1 TruthfulQA,43.95,,hf_open_llm_v1_240829_frozen.csv +llama3_8b,HFv1 Winogrande,78.45,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat,HF OpenLLM v1,67.1,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat,HFv1 GSM8K,67.17,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat,HFv1 HellaSwag,80.07,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat,HFv1 MMLU,66.97,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat,HFv1 TruthfulQA,51.41,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat_v2_nightly,HF OpenLLM v1,66.79,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat_v2_nightly,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat_v2_nightly,HFv1 GSM8K,67.17,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat_v2_nightly,HFv1 HellaSwag,79.94,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat_v2_nightly,HFv1 MMLU,66.46,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat_v2_nightly,HFv1 TruthfulQA,50.89,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat_v2_nightly,HFv1 Winogrande,75.45,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat_v2_nightly_v2,HF OpenLLM v1,67.1,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat_v2_nightly_v2,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat_v2_nightly_v2,HFv1 GSM8K,67.17,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat_v2_nightly_v2,HFv1 HellaSwag,80.07,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat_v2_nightly_v2,HFv1 MMLU,66.97,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat_v2_nightly_v2,HFv1 TruthfulQA,51.41,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_chinese_chat_v2_nightly_v2,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v1,HF OpenLLM v1,65.9,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v1,HFv1 ARC,59.73,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v1,HFv1 GSM8K,64.52,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v1,HFv1 HellaSwag,79.94,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v1,HFv1 MMLU,64.98,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v1,HFv1 TruthfulQA,51.82,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v1,HFv1 Winogrande,74.43,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v2,HF OpenLLM v1,66.32,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v2,HFv1 ARC,59.9,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v2,HFv1 GSM8K,65.73,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v2,HFv1 HellaSwag,80.01,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v2,HFv1 MMLU,64.8,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v2,HFv1 TruthfulQA,51.87,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v2,HFv1 Winogrande,75.61,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v3,HF OpenLLM v1,65.62,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v3,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v3,HFv1 GSM8K,64.22,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v3,HFv1 HellaSwag,80.05,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v3,HFv1 MMLU,64.55,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v3,HFv1 TruthfulQA,51.76,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_claudstruct_v3,HFv1 Winogrande,74.19,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct,HF OpenLLM v1,66.87,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct,HFv1 GSM8K,68.69,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct,HFv1 HellaSwag,78.55,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct,HFv1 MMLU,67.07,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct,HFv1 TruthfulQA,51.65,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_abliterated_dpomix,HF OpenLLM v1,68.11,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_abliterated_dpomix,HFv1 ARC,62.88,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_abliterated_dpomix,HFv1 GSM8K,69.75,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_abliterated_dpomix,HFv1 HellaSwag,79.52,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_abliterated_dpomix,HFv1 MMLU,67.0,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_abliterated_dpomix,HFv1 TruthfulQA,54.21,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_abliterated_dpomix,HFv1 Winogrande,75.3,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_2,HF OpenLLM v1,68.36,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_2,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_2,HFv1 GSM8K,70.81,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_2,HFv1 HellaSwag,79.5,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_2,HFv1 MMLU,68.21,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_2,HFv1 TruthfulQA,53.27,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_2,HFv1 Winogrande,75.93,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_3,HF OpenLLM v1,68.23,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_3,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_3,HFv1 GSM8K,70.58,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_3,HFv1 HellaSwag,79.2,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_3,HFv1 MMLU,68.33,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_3,HFv1 TruthfulQA,53.29,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_3,HFv1 Winogrande,75.37,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_4,HF OpenLLM v1,68.49,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_4,HFv1 ARC,62.54,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_4,HFv1 GSM8K,71.04,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_4,HFv1 HellaSwag,79.73,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_4,HFv1 MMLU,68.08,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_4,HFv1 TruthfulQA,53.94,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_dpo_v0_4,HFv1 Winogrande,75.61,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_orpo_qlora,HF OpenLLM v1,64.46,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_orpo_qlora,HFv1 ARC,58.19,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_orpo_qlora,HFv1 GSM8K,58.61,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_orpo_qlora,HFv1 HellaSwag,79.42,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_orpo_qlora,HFv1 MMLU,65.59,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_orpo_qlora,HFv1 TruthfulQA,48.38,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_orpo_qlora,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_10fail_1000total,HF OpenLLM v1,65.68,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_10fail_1000total,HFv1 ARC,60.15,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_10fail_1000total,HFv1 GSM8K,66.79,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_10fail_1000total,HFv1 HellaSwag,77.83,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_10fail_1000total,HFv1 MMLU,64.68,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_10fail_1000total,HFv1 TruthfulQA,50.02,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_10fail_1000total,HFv1 Winogrande,74.59,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_2fail_128total,HF OpenLLM v1,66.12,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_2fail_128total,HFv1 ARC,60.49,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_2fail_128total,HFv1 GSM8K,67.25,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_2fail_128total,HFv1 HellaSwag,77.92,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_2fail_128total,HFv1 MMLU,66.36,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_2fail_128total,HFv1 TruthfulQA,50.05,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_2fail_128total,HFv1 Winogrande,74.66,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_5fail_3000total_bf16,HF OpenLLM v1,65.62,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_5fail_3000total_bf16,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_5fail_3000total_bf16,HFv1 GSM8K,65.5,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_5fail_3000total_bf16,HFv1 HellaSwag,77.32,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_5fail_3000total_bf16,HFv1 MMLU,65.62,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_5fail_3000total_bf16,HFv1 TruthfulQA,50.16,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_5fail_3000total_bf16,HFv1 Winogrande,74.27,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_5fail_500total,HF OpenLLM v1,65.19,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_5fail_500total,HFv1 ARC,61.09,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_5fail_500total,HFv1 GSM8K,61.26,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_5fail_500total,HFv1 HellaSwag,77.51,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_5fail_500total,HFv1 MMLU,65.82,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_5fail_500total,HFv1 TruthfulQA,50.07,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_ortho_baukit_5fail_500total,HFv1 Winogrande,75.37,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_1,HF OpenLLM v1,68.32,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_1,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_1,HFv1 GSM8K,70.74,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_1,HFv1 HellaSwag,79.37,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_1,HFv1 MMLU,68.25,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_1,HFv1 TruthfulQA,53.4,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_1,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_2,HF OpenLLM v1,68.18,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_2,HFv1 ARC,62.2,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_2,HFv1 GSM8K,70.96,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_2,HFv1 HellaSwag,79.27,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_2,HFv1 MMLU,67.96,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_2,HFv1 TruthfulQA,53.02,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_2,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_3,HF OpenLLM v1,68.22,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_3,HFv1 ARC,62.37,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_3,HFv1 GSM8K,69.98,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_3,HFv1 HellaSwag,79.55,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_3,HFv1 MMLU,68.13,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_3,HFv1 TruthfulQA,53.77,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_3,HFv1 Winogrande,75.53,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_4,HF OpenLLM v1,70.3,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_4,HFv1 ARC,67.24,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_4,HFv1 GSM8K,69.45,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_4,HFv1 HellaSwag,83.23,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_4,HFv1 MMLU,67.77,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_4,HFv1 TruthfulQA,56.75,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_4,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_5,HF OpenLLM v1,68.37,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_5,HFv1 ARC,62.37,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_5,HFv1 GSM8K,71.34,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_5,HFv1 HellaSwag,79.41,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_5,HFv1 MMLU,68.16,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_5,HFv1 TruthfulQA,53.26,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_5,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_7,HF OpenLLM v1,69.35,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_7,HFv1 ARC,65.36,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_7,HFv1 GSM8K,69.83,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_7,HFv1 HellaSwag,81.82,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_7,HFv1 MMLU,67.67,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_7,HFv1 TruthfulQA,55.18,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_7,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_8,HF OpenLLM v1,73.2,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_8,HFv1 ARC,71.93,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_8,HFv1 GSM8K,68.46,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_8,HFv1 HellaSwag,87.77,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_8,HFv1 MMLU,68.3,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_8,HFv1 TruthfulQA,63.94,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_8,HFv1 Winogrande,79.08,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_9,HF OpenLLM v1,73.29,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_9,HFv1 ARC,72.35,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_9,HFv1 GSM8K,66.49,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_9,HFv1 HellaSwag,88.17,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_9,HFv1 MMLU,68.1,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_9,HFv1 TruthfulQA,64.67,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_instruct_v0_9,HFv1 Winogrande,79.95,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_nola,HF OpenLLM v1,62.48,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_nola,HFv1 ARC,60.15,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_nola,HFv1 GSM8K,44.66,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_nola,HFv1 HellaSwag,82.21,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_nola,HFv1 MMLU,66.69,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_nola,HFv1 TruthfulQA,42.93,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_nola,HFv1 Winogrande,78.22,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_okay,HF OpenLLM v1,68.85,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_okay,HFv1 ARC,63.14,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_okay,HFv1 GSM8K,70.05,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_okay,HFv1 HellaSwag,81.19,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_okay,HFv1 MMLU,68.8,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_okay,HFv1 TruthfulQA,52.88,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_okay,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_orpo,HF OpenLLM v1,62.13,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_orpo,HFv1 ARC,56.23,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_orpo,HFv1 GSM8K,44.05,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_orpo,HFv1 HellaSwag,82.37,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_orpo,HFv1 MMLU,65.74,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_orpo,HFv1 TruthfulQA,46.81,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_orpo,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_orpo_v0_1,HF OpenLLM v1,64.67,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_orpo_v0_1,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_orpo_v0_1,HFv1 GSM8K,48.75,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_orpo_v0_1,HFv1 HellaSwag,82.56,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_orpo_v0_1,HFv1 MMLU,66.59,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_orpo_v0_1,HFv1 TruthfulQA,50.47,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_orpo_v0_1,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_ortho_v2,HF OpenLLM v1,64.93,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_ortho_v2,HFv1 ARC,59.04,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_ortho_v2,HFv1 GSM8K,62.55,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_ortho_v2,HFv1 HellaSwag,78.35,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_ortho_v2,HFv1 MMLU,64.39,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_ortho_v2,HFv1 TruthfulQA,49.37,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_ortho_v2,HFv1 Winogrande,75.85,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_wangchanx_sft_demo,HF OpenLLM v1,63.22,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_wangchanx_sft_demo,HFv1 ARC,60.41,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_wangchanx_sft_demo,HFv1 GSM8K,52.24,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_wangchanx_sft_demo,HFv1 HellaSwag,83.12,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_wangchanx_sft_demo,HFv1 MMLU,65.48,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_wangchanx_sft_demo,HFv1 TruthfulQA,41.05,,hf_open_llm_v1_240829_frozen.csv +llama3_8b_wangchanx_sft_demo,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct,HF OpenLLM v1,63.21,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct,HFv1 GSM8K,44.43,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct,HFv1 HellaSwag,80.24,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct,HFv1 MMLU,63.1,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct,HFv1 TruthfulQA,55.15,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct_v2,HF OpenLLM v1,66.68,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct_v2,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct_v2,HFv1 GSM8K,60.58,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct_v2,HFv1 HellaSwag,79.72,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct_v2,HFv1 MMLU,66.48,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct_v2,HFv1 TruthfulQA,53.93,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct_v2,HFv1 Winogrande,76.72,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct_v3,HF OpenLLM v1,66.81,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct_v3,HFv1 ARC,63.4,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct_v3,HFv1 GSM8K,59.21,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct_v3,HFv1 HellaSwag,80.51,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct_v3,HFv1 MMLU,67.9,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct_v3,HFv1 TruthfulQA,53.57,,hf_open_llm_v1_240829_frozen.csv +llama3_chinese_8b_instruct_v3,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv +llama3_neural_chat_v1_8b,HF OpenLLM v1,66.5,,hf_open_llm_v1_240829_frozen.csv +llama3_neural_chat_v1_8b,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv +llama3_neural_chat_v1_8b,HFv1 GSM8K,54.81,,hf_open_llm_v1_240829_frozen.csv +llama3_neural_chat_v1_8b,HFv1 HellaSwag,84.13,,hf_open_llm_v1_240829_frozen.csv +llama3_neural_chat_v1_8b,HFv1 MMLU,64.69,,hf_open_llm_v1_240829_frozen.csv +llama3_neural_chat_v1_8b,HFv1 TruthfulQA,56.34,,hf_open_llm_v1_240829_frozen.csv +llama3_neural_chat_v1_8b,HFv1 Winogrande,78.22,,hf_open_llm_v1_240829_frozen.csv +llama3_neural_chat_v2_2_8b,HF OpenLLM v1,65.44,,hf_open_llm_v1_240829_frozen.csv +llama3_neural_chat_v2_2_8b,HFv1 ARC,61.6,,hf_open_llm_v1_240829_frozen.csv +llama3_neural_chat_v2_2_8b,HFv1 GSM8K,53.07,,hf_open_llm_v1_240829_frozen.csv +llama3_neural_chat_v2_2_8b,HFv1 HellaSwag,83.04,,hf_open_llm_v1_240829_frozen.csv +llama3_neural_chat_v2_2_8b,HFv1 MMLU,64.97,,hf_open_llm_v1_240829_frozen.csv +llama3_neural_chat_v2_2_8b,HFv1 TruthfulQA,51.88,,hf_open_llm_v1_240829_frozen.csv +llama3_neural_chat_v2_2_8b,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv +llama3_neurona_8b,HF OpenLLM v1,65.83,,hf_open_llm_v1_240829_frozen.csv +llama3_neurona_8b,HFv1 ARC,58.02,,hf_open_llm_v1_240829_frozen.csv +llama3_neurona_8b,HFv1 GSM8K,66.79,,hf_open_llm_v1_240829_frozen.csv +llama3_neurona_8b,HFv1 HellaSwag,79.7,,hf_open_llm_v1_240829_frozen.csv +llama3_neurona_8b,HFv1 MMLU,62.0,,hf_open_llm_v1_240829_frozen.csv +llama3_neurona_8b,HFv1 TruthfulQA,53.36,,hf_open_llm_v1_240829_frozen.csv +llama3_neurona_8b,HFv1 Winogrande,75.14,,hf_open_llm_v1_240829_frozen.csv +llama3_orca_2_0_8b,HF OpenLLM v1,64.89,,hf_open_llm_v1_240829_frozen.csv +llama3_orca_2_0_8b,HFv1 ARC,59.64,,hf_open_llm_v1_240829_frozen.csv +llama3_orca_2_0_8b,HFv1 GSM8K,53.15,,hf_open_llm_v1_240829_frozen.csv +llama3_orca_2_0_8b,HFv1 HellaSwag,82.18,,hf_open_llm_v1_240829_frozen.csv +llama3_orca_2_0_8b,HFv1 MMLU,65.03,,hf_open_llm_v1_240829_frozen.csv +llama3_orca_2_0_8b,HFv1 TruthfulQA,51.1,,hf_open_llm_v1_240829_frozen.csv +llama3_orca_2_0_8b,HFv1 Winogrande,78.22,,hf_open_llm_v1_240829_frozen.csv +llama3_orpo_v1_merged_16bit,HF OpenLLM v1,30.44,,hf_open_llm_v1_240829_frozen.csv +llama3_orpo_v1_merged_16bit,HFv1 ARC,23.29,,hf_open_llm_v1_240829_frozen.csv +llama3_orpo_v1_merged_16bit,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +llama3_orpo_v1_merged_16bit,HFv1 HellaSwag,41.14,,hf_open_llm_v1_240829_frozen.csv +llama3_orpo_v1_merged_16bit,HFv1 MMLU,24.59,,hf_open_llm_v1_240829_frozen.csv +llama3_orpo_v1_merged_16bit,HFv1 TruthfulQA,40.92,,hf_open_llm_v1_240829_frozen.csv +llama3_orpo_v1_merged_16bit,HFv1 Winogrande,52.72,,hf_open_llm_v1_240829_frozen.csv +llama3_ruozhiba_8b,HF OpenLLM v1,66.65,,hf_open_llm_v1_240829_frozen.csv +llama3_ruozhiba_8b,HFv1 ARC,60.58,,hf_open_llm_v1_240829_frozen.csv +llama3_ruozhiba_8b,HFv1 GSM8K,68.23,,hf_open_llm_v1_240829_frozen.csv +llama3_ruozhiba_8b,HFv1 HellaSwag,78.85,,hf_open_llm_v1_240829_frozen.csv +llama3_ruozhiba_8b,HFv1 MMLU,66.8,,hf_open_llm_v1_240829_frozen.csv +llama3_ruozhiba_8b,HFv1 TruthfulQA,49.97,,hf_open_llm_v1_240829_frozen.csv +llama3_ruozhiba_8b,HFv1 Winogrande,75.45,,hf_open_llm_v1_240829_frozen.csv +llama3_soliloquy_8b,HF OpenLLM v1,59.72,,hf_open_llm_v1_240829_frozen.csv +llama3_soliloquy_8b,HFv1 ARC,58.11,,hf_open_llm_v1_240829_frozen.csv +llama3_soliloquy_8b,HFv1 GSM8K,46.17,,hf_open_llm_v1_240829_frozen.csv +llama3_soliloquy_8b,HFv1 HellaSwag,78.06,,hf_open_llm_v1_240829_frozen.csv +llama3_soliloquy_8b,HFv1 MMLU,57.11,,hf_open_llm_v1_240829_frozen.csv +llama3_soliloquy_8b,HFv1 TruthfulQA,47.68,,hf_open_llm_v1_240829_frozen.csv +llama3_soliloquy_8b,HFv1 Winogrande,71.19,,hf_open_llm_v1_240829_frozen.csv +llama3_tenyxchat_70b,HF OpenLLM v1,78.4,,hf_open_llm_v1_240829_frozen.csv +llama3_tenyxchat_70b,HFv1 ARC,72.1,,hf_open_llm_v1_240829_frozen.csv +llama3_tenyxchat_70b,HFv1 GSM8K,86.28,,hf_open_llm_v1_240829_frozen.csv +llama3_tenyxchat_70b,HFv1 HellaSwag,86.21,,hf_open_llm_v1_240829_frozen.csv +llama3_tenyxchat_70b,HFv1 MMLU,80.04,,hf_open_llm_v1_240829_frozen.csv +llama3_tenyxchat_70b,HFv1 TruthfulQA,62.85,,hf_open_llm_v1_240829_frozen.csv +llama3_tenyxchat_70b,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv +llama3_youko_8b,HF OpenLLM v1,57.55,,hf_open_llm_v1_240829_frozen.csv +llama3_youko_8b,HFv1 ARC,54.44,,hf_open_llm_v1_240829_frozen.csv +llama3_youko_8b,HFv1 GSM8K,34.42,,hf_open_llm_v1_240829_frozen.csv +llama3_youko_8b,HFv1 HellaSwag,79.91,,hf_open_llm_v1_240829_frozen.csv +llama3_youko_8b,HFv1 MMLU,60.9,,hf_open_llm_v1_240829_frozen.csv +llama3_youko_8b,HFv1 TruthfulQA,41.05,,hf_open_llm_v1_240829_frozen.csv +llama3_youko_8b,HFv1 Winogrande,74.59,,hf_open_llm_v1_240829_frozen.csv +llama_13b,HF OpenLLM v1,51.36,,hf_open_llm_v1_240829_frozen.csv +llama_13b,HFv1 ARC,56.23,,hf_open_llm_v1_240829_frozen.csv +llama_13b,HFv1 GSM8K,7.58,,hf_open_llm_v1_240829_frozen.csv +llama_13b,HFv1 HellaSwag,80.93,,hf_open_llm_v1_240829_frozen.csv +llama_13b,HFv1 MMLU,47.67,,hf_open_llm_v1_240829_frozen.csv +llama_13b,HFv1 TruthfulQA,39.48,,hf_open_llm_v1_240829_frozen.csv +llama_13b,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv +llama_160m,HF OpenLLM v1,29.55,,hf_open_llm_v1_240829_frozen.csv +llama_160m,HFv1 ARC,24.83,,hf_open_llm_v1_240829_frozen.csv +llama_160m,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv +llama_160m,HFv1 HellaSwag,35.23,,hf_open_llm_v1_240829_frozen.csv +llama_160m,HFv1 MMLU,24.26,,hf_open_llm_v1_240829_frozen.csv +llama_160m,HFv1 TruthfulQA,42.08,,hf_open_llm_v1_240829_frozen.csv +llama_160m,HFv1 Winogrande,50.83,,hf_open_llm_v1_240829_frozen.csv +llama_160m_chat_v1,HF OpenLLM v1,30.28,,hf_open_llm_v1_240829_frozen.csv +llama_160m_chat_v1,HFv1 ARC,24.74,,hf_open_llm_v1_240829_frozen.csv +llama_160m_chat_v1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +llama_160m_chat_v1,HFv1 HellaSwag,35.32,,hf_open_llm_v1_240829_frozen.csv +llama_160m_chat_v1,HFv1 MMLU,26.14,,hf_open_llm_v1_240829_frozen.csv +llama_160m_chat_v1,HFv1 TruthfulQA,44.16,,hf_open_llm_v1_240829_frozen.csv +llama_160m_chat_v1,HFv1 Winogrande,51.3,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b,HF OpenLLM v1,55.69,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b,HFv1 ARC,59.39,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b,HFv1 GSM8K,22.82,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b,HFv1 HellaSwag,82.13,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b,HFv1 MMLU,55.77,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b,HFv1 TruthfulQA,37.38,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_beluga_qlora,HF OpenLLM v1,54.09,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_beluga_qlora,HFv1 ARC,59.22,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_beluga_qlora,HFv1 GSM8K,1.29,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_beluga_qlora,HFv1 HellaSwag,81.92,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_beluga_qlora,HFv1 MMLU,56.67,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_beluga_qlora,HFv1 TruthfulQA,48.23,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_beluga_qlora,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat,HF OpenLLM v1,54.91,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat,HFv1 ARC,59.04,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat,HFv1 GSM8K,15.24,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat,HFv1 HellaSwag,81.94,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat,HFv1 MMLU,54.64,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat,HFv1 TruthfulQA,44.12,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat_dutch,HF OpenLLM v1,53.69,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat_dutch,HFv1 ARC,59.3,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat_dutch,HFv1 GSM8K,10.69,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat_dutch,HFv1 HellaSwag,81.45,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat_dutch,HFv1 MMLU,55.82,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat_dutch,HFv1 TruthfulQA,38.23,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat_dutch,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat_platypus,HF OpenLLM v1,53.92,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat_platypus,HFv1 ARC,53.84,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat_platypus,HFv1 GSM8K,12.36,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat_platypus,HFv1 HellaSwag,80.67,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat_platypus,HFv1 MMLU,54.44,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat_platypus,HFv1 TruthfulQA,46.23,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_chat_platypus,HFv1 Winogrande,76.01,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_eli5_1024_r_64_alpha_16,HF OpenLLM v1,54.61,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_eli5_1024_r_64_alpha_16,HFv1 ARC,60.41,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_eli5_1024_r_64_alpha_16,HFv1 GSM8K,8.49,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_eli5_1024_r_64_alpha_16,HFv1 HellaSwag,82.58,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_eli5_1024_r_64_alpha_16,HFv1 MMLU,55.86,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_eli5_1024_r_64_alpha_16,HFv1 TruthfulQA,43.61,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_eli5_1024_r_64_alpha_16,HFv1 Winogrande,76.72,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_eli5_1024_r_64_alpha_16_merged,HF OpenLLM v1,54.16,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_eli5_1024_r_64_alpha_16_merged,HFv1 ARC,59.13,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_eli5_1024_r_64_alpha_16_merged,HFv1 GSM8K,8.11,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_eli5_1024_r_64_alpha_16_merged,HFv1 HellaSwag,82.13,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_eli5_1024_r_64_alpha_16_merged,HFv1 MMLU,54.98,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_eli5_1024_r_64_alpha_16_merged,HFv1 TruthfulQA,44.23,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_eli5_1024_r_64_alpha_16_merged,HFv1 Winogrande,76.4,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_wiki_1024_full_r_64_alpha_16,HF OpenLLM v1,53.14,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_wiki_1024_full_r_64_alpha_16,HFv1 ARC,59.04,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_wiki_1024_full_r_64_alpha_16,HFv1 GSM8K,10.01,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_wiki_1024_full_r_64_alpha_16,HFv1 HellaSwag,82.33,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_wiki_1024_full_r_64_alpha_16,HFv1 MMLU,55.36,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_wiki_1024_full_r_64_alpha_16,HFv1 TruthfulQA,35.75,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_wiki_1024_full_r_64_alpha_16,HFv1 Winogrande,76.32,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_wiki_1024_full_r_64_alpha_16_merged,HF OpenLLM v1,52.94,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_wiki_1024_full_r_64_alpha_16_merged,HFv1 ARC,58.45,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_wiki_1024_full_r_64_alpha_16_merged,HFv1 GSM8K,10.69,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_wiki_1024_full_r_64_alpha_16_merged,HFv1 HellaSwag,81.97,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_wiki_1024_full_r_64_alpha_16_merged,HFv1 MMLU,55.02,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_wiki_1024_full_r_64_alpha_16_merged,HFv1 TruthfulQA,35.85,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ds_wiki_1024_full_r_64_alpha_16_merged,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_eli5_wiki_1024_r_64_alpha_16,HF OpenLLM v1,54.14,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_eli5_wiki_1024_r_64_alpha_16,HFv1 ARC,59.98,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_eli5_wiki_1024_r_64_alpha_16,HFv1 GSM8K,10.54,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_eli5_wiki_1024_r_64_alpha_16,HFv1 HellaSwag,82.43,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_eli5_wiki_1024_r_64_alpha_16,HFv1 MMLU,55.41,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_eli5_wiki_1024_r_64_alpha_16,HFv1 TruthfulQA,39.9,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_eli5_wiki_1024_r_64_alpha_16,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_eli5_wiki_1024_r_64_alpha_16_merged,HF OpenLLM v1,53.57,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_eli5_wiki_1024_r_64_alpha_16_merged,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_eli5_wiki_1024_r_64_alpha_16_merged,HFv1 GSM8K,8.72,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_eli5_wiki_1024_r_64_alpha_16_merged,HFv1 HellaSwag,81.94,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_eli5_wiki_1024_r_64_alpha_16_merged,HFv1 MMLU,55.0,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_eli5_wiki_1024_r_64_alpha_16_merged,HFv1 TruthfulQA,40.26,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_eli5_wiki_1024_r_64_alpha_16_merged,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_fp16,HF OpenLLM v1,53.67,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_fp16,HFv1 ARC,59.3,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_fp16,HFv1 GSM8K,10.84,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_fp16,HFv1 HellaSwag,82.15,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_fp16,HFv1 MMLU,55.67,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_fp16,HFv1 TruthfulQA,37.39,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_fp16,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ft_instruct_es,HF OpenLLM v1,52.89,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ft_instruct_es,HFv1 ARC,59.39,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ft_instruct_es,HFv1 GSM8K,8.57,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ft_instruct_es,HFv1 HellaSwag,81.51,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ft_instruct_es,HFv1 MMLU,54.31,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ft_instruct_es,HFv1 TruthfulQA,37.81,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_ft_instruct_es,HFv1 Winogrande,75.77,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_german_orpo,HF OpenLLM v1,53.44,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_german_orpo,HFv1 ARC,54.78,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_german_orpo,HFv1 GSM8K,17.29,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_german_orpo,HFv1 HellaSwag,79.05,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_german_orpo,HFv1 MMLU,53.45,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_german_orpo,HFv1 TruthfulQA,42.44,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_german_orpo,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_guanaco_qlora,HF OpenLLM v1,55.31,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_guanaco_qlora,HFv1 ARC,61.09,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_guanaco_qlora,HFv1 GSM8K,10.99,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_guanaco_qlora,HFv1 HellaSwag,82.99,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_guanaco_qlora,HFv1 MMLU,55.47,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_guanaco_qlora,HFv1 TruthfulQA,44.12,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_guanaco_qlora,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_instruct_v0_2,HF OpenLLM v1,55.14,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_instruct_v0_2,HFv1 ARC,60.58,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_instruct_v0_2,HFv1 GSM8K,9.33,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_instruct_v0_2,HFv1 HellaSwag,81.96,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_instruct_v0_2,HFv1 MMLU,55.46,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_instruct_v0_2,HFv1 TruthfulQA,45.71,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_instruct_v0_2,HFv1 Winogrande,77.82,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_instructed,HF OpenLLM v1,54.63,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_instructed,HFv1 ARC,59.39,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_instructed,HFv1 GSM8K,8.04,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_instructed,HFv1 HellaSwag,83.88,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_instructed,HFv1 MMLU,55.57,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_instructed,HFv1 TruthfulQA,46.89,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_instructed,HFv1 Winogrande,74.03,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_platypus,HF OpenLLM v1,54.22,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_platypus,HFv1 ARC,58.87,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_platypus,HFv1 GSM8K,9.4,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_platypus,HFv1 HellaSwag,82.14,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_platypus,HFv1 MMLU,54.98,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_platypus,HFv1 TruthfulQA,42.84,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_platypus,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_platypus_vicuna_wizard,HF OpenLLM v1,52.9,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_platypus_vicuna_wizard,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_platypus_vicuna_wizard,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_platypus_vicuna_wizard,HFv1 HellaSwag,82.31,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_platypus_vicuna_wizard,HFv1 MMLU,55.21,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_platypus_vicuna_wizard,HFv1 TruthfulQA,41.91,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_platypus_vicuna_wizard,HFv1 Winogrande,75.77,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_qlora,HF OpenLLM v1,53.87,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_qlora,HFv1 ARC,58.02,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_qlora,HFv1 GSM8K,3.26,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_qlora,HFv1 HellaSwag,82.33,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_qlora,HFv1 MMLU,55.8,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_qlora,HFv1 TruthfulQA,46.23,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_qlora,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_vicuna_wizard,HF OpenLLM v1,51.94,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_vicuna_wizard,HFv1 ARC,57.76,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_vicuna_wizard,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_vicuna_wizard,HFv1 HellaSwag,82.16,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_vicuna_wizard,HFv1 MMLU,54.68,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_vicuna_wizard,HFv1 TruthfulQA,41.11,,hf_open_llm_v1_240829_frozen.csv +llama_2_13b_vicuna_wizard,HFv1 Winogrande,74.98,,hf_open_llm_v1_240829_frozen.csv +llama_2_16b_nastychat,HF OpenLLM v1,55.04,,hf_open_llm_v1_240829_frozen.csv +llama_2_16b_nastychat,HFv1 ARC,57.42,,hf_open_llm_v1_240829_frozen.csv +llama_2_16b_nastychat,HFv1 GSM8K,8.11,,hf_open_llm_v1_240829_frozen.csv +llama_2_16b_nastychat,HFv1 HellaSwag,80.59,,hf_open_llm_v1_240829_frozen.csv +llama_2_16b_nastychat,HFv1 MMLU,55.99,,hf_open_llm_v1_240829_frozen.csv +llama_2_16b_nastychat,HFv1 TruthfulQA,53.45,,hf_open_llm_v1_240829_frozen.csv +llama_2_16b_nastychat,HFv1 Winogrande,74.66,,hf_open_llm_v1_240829_frozen.csv +llama_2_26b_trenchcoat_stack,HF OpenLLM v1,51.13,,hf_open_llm_v1_240829_frozen.csv +llama_2_26b_trenchcoat_stack,HFv1 ARC,55.03,,hf_open_llm_v1_240829_frozen.csv +llama_2_26b_trenchcoat_stack,HFv1 GSM8K,2.88,,hf_open_llm_v1_240829_frozen.csv +llama_2_26b_trenchcoat_stack,HFv1 HellaSwag,79.9,,hf_open_llm_v1_240829_frozen.csv +llama_2_26b_trenchcoat_stack,HFv1 MMLU,53.73,,hf_open_llm_v1_240829_frozen.csv +llama_2_26b_trenchcoat_stack,HFv1 TruthfulQA,40.48,,hf_open_llm_v1_240829_frozen.csv +llama_2_26b_trenchcoat_stack,HFv1 Winogrande,74.74,,hf_open_llm_v1_240829_frozen.csv +llama_2_3b,HF OpenLLM v1,29.53,,hf_open_llm_v1_240829_frozen.csv +llama_2_3b,HFv1 ARC,26.96,,hf_open_llm_v1_240829_frozen.csv +llama_2_3b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +llama_2_3b,HFv1 HellaSwag,26.52,,hf_open_llm_v1_240829_frozen.csv +llama_2_3b,HFv1 MMLU,23.33,,hf_open_llm_v1_240829_frozen.csv +llama_2_3b,HFv1 TruthfulQA,50.71,,hf_open_llm_v1_240829_frozen.csv +llama_2_3b,HFv1 Winogrande,49.64,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b,HF OpenLLM v1,67.87,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b,HFv1 ARC,67.32,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b,HFv1 GSM8K,54.06,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b,HFv1 HellaSwag,87.33,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b,HFv1 MMLU,69.83,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b,HFv1 TruthfulQA,44.92,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b,HFv1 Winogrande,83.74,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b_chat,HF OpenLLM v1,62.4,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b_chat,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b_chat,HFv1 GSM8K,26.69,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b_chat,HFv1 HellaSwag,85.88,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b_chat,HFv1 MMLU,63.91,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b_chat,HFv1 TruthfulQA,52.8,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b_chat,HFv1 Winogrande,80.51,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b_ia3_guanaco,HF OpenLLM v1,62.61,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b_ia3_guanaco,HFv1 ARC,68.52,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b_ia3_guanaco,HFv1 GSM8K,28.73,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b_ia3_guanaco,HFv1 HellaSwag,85.67,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b_ia3_guanaco,HFv1 MMLU,67.03,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b_ia3_guanaco,HFv1 TruthfulQA,43.47,,hf_open_llm_v1_240829_frozen.csv +llama_2_70b_ia3_guanaco,HFv1 Winogrande,82.24,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b,HF OpenLLM v1,50.97,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b,HFv1 ARC,53.07,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b,HFv1 GSM8K,14.48,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b,HFv1 HellaSwag,78.59,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b,HFv1 MMLU,46.87,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b,HFv1 TruthfulQA,38.76,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b,HFv1 Winogrande,74.03,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_32k_instruct,HF OpenLLM v1,49.65,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_32k_instruct,HFv1 ARC,51.37,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_32k_instruct,HFv1 GSM8K,4.7,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_32k_instruct,HFv1 HellaSwag,78.47,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_32k_instruct,HFv1 MMLU,45.53,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_32k_instruct,HFv1 TruthfulQA,45.01,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_32k_instruct,HFv1 Winogrande,72.85,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_alpaca_gpt4,HF OpenLLM v1,51.75,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_alpaca_gpt4,HFv1 ARC,53.16,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_alpaca_gpt4,HFv1 GSM8K,9.7,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_alpaca_gpt4,HFv1 HellaSwag,79.24,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_alpaca_gpt4,HFv1 MMLU,44.15,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_alpaca_gpt4,HFv1 TruthfulQA,52.43,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_alpaca_gpt4,HFv1 Winogrande,71.82,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat,HF OpenLLM v1,50.74,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat,HFv1 ARC,52.9,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat,HFv1 GSM8K,7.35,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat,HFv1 HellaSwag,78.55,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat,HFv1 MMLU,48.32,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat,HFv1 TruthfulQA,45.57,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat,HFv1 Winogrande,71.74,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_10_attention_sparsity,HF OpenLLM v1,52.52,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_10_attention_sparsity,HFv1 ARC,52.9,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_10_attention_sparsity,HFv1 GSM8K,19.11,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_10_attention_sparsity,HFv1 HellaSwag,78.18,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_10_attention_sparsity,HFv1 MMLU,48.1,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_10_attention_sparsity,HFv1 TruthfulQA,45.4,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_10_attention_sparsity,HFv1 Winogrande,71.43,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_10_sparsity,HF OpenLLM v1,52.48,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_10_sparsity,HFv1 ARC,53.16,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_10_sparsity,HFv1 GSM8K,18.42,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_10_sparsity,HFv1 HellaSwag,78.26,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_10_sparsity,HFv1 MMLU,48.18,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_10_sparsity,HFv1 TruthfulQA,45.29,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_10_sparsity,HFv1 Winogrande,71.59,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_20_attention_sparsity,HF OpenLLM v1,52.19,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_20_attention_sparsity,HFv1 ARC,53.41,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_20_attention_sparsity,HFv1 GSM8K,17.74,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_20_attention_sparsity,HFv1 HellaSwag,77.91,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_20_attention_sparsity,HFv1 MMLU,47.49,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_20_attention_sparsity,HFv1 TruthfulQA,45.84,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_20_attention_sparsity,HFv1 Winogrande,70.72,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_20_sparsity,HF OpenLLM v1,52.01,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_20_sparsity,HFv1 ARC,52.47,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_20_sparsity,HFv1 GSM8K,17.82,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_20_sparsity,HFv1 HellaSwag,77.91,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_20_sparsity,HFv1 MMLU,47.27,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_20_sparsity,HFv1 TruthfulQA,45.88,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_20_sparsity,HFv1 Winogrande,70.72,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_30_attention_sparsity,HF OpenLLM v1,51.8,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_30_attention_sparsity,HFv1 ARC,53.41,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_30_attention_sparsity,HFv1 GSM8K,17.44,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_30_attention_sparsity,HFv1 HellaSwag,76.87,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_30_attention_sparsity,HFv1 MMLU,47.04,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_30_attention_sparsity,HFv1 TruthfulQA,45.02,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_30_attention_sparsity,HFv1 Winogrande,71.03,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_30_sparsity,HF OpenLLM v1,51.02,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_30_sparsity,HFv1 ARC,52.47,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_30_sparsity,HFv1 GSM8K,17.06,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_30_sparsity,HFv1 HellaSwag,76.58,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_30_sparsity,HFv1 MMLU,45.57,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_30_sparsity,HFv1 TruthfulQA,44.82,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_30_sparsity,HFv1 Winogrande,69.61,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_flan,HF OpenLLM v1,52.88,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_flan,HFv1 ARC,52.9,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_flan,HFv1 GSM8K,19.48,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_flan,HFv1 HellaSwag,78.44,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_flan,HFv1 MMLU,48.4,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_flan,HFv1 TruthfulQA,45.67,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_flan,HFv1 Winogrande,72.38,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_flan_v2,HF OpenLLM v1,52.92,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_flan_v2,HFv1 ARC,53.24,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_flan_v2,HFv1 GSM8K,19.48,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_flan_v2,HFv1 HellaSwag,78.43,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_flan_v2,HFv1 MMLU,48.43,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_flan_v2,HFv1 TruthfulQA,45.66,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_flan_v2,HFv1 Winogrande,72.3,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_v2,HF OpenLLM v1,50.89,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_v2,HFv1 ARC,52.65,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_v2,HFv1 GSM8K,8.49,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_v2,HFv1 HellaSwag,78.25,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_v2,HFv1 MMLU,48.47,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_v2,HFv1 TruthfulQA,45.18,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_100step_v2,HFv1 Winogrande,72.3,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_flan,HF OpenLLM v1,52.62,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_flan,HFv1 ARC,52.47,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_flan,HFv1 GSM8K,18.65,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_flan,HFv1 HellaSwag,78.02,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_flan,HFv1 MMLU,48.42,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_flan,HFv1 TruthfulQA,45.47,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_flan,HFv1 Winogrande,72.69,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_flan_v2,HF OpenLLM v1,52.75,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_flan_v2,HFv1 ARC,52.65,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_flan_v2,HFv1 GSM8K,18.95,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_flan_v2,HFv1 HellaSwag,78.04,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_flan_v2,HFv1 MMLU,48.51,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_flan_v2,HFv1 TruthfulQA,45.42,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_flan_v2,HFv1 Winogrande,72.93,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_merged,HF OpenLLM v1,52.26,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_merged,HFv1 ARC,52.05,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_merged,HFv1 GSM8K,18.95,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_merged,HFv1 HellaSwag,77.38,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_merged,HFv1 MMLU,48.65,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_merged,HFv1 TruthfulQA,44.6,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_merged,HFv1 Winogrande,71.9,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_v2,HF OpenLLM v1,50.21,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_v2,HFv1 ARC,51.79,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_v2,HFv1 GSM8K,7.88,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_v2,HFv1 HellaSwag,77.41,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_v2,HFv1 MMLU,48.55,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_v2,HFv1 TruthfulQA,43.69,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_200step_v2,HFv1 Winogrande,71.9,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_300step_flan_v2,HF OpenLLM v1,52.41,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_300step_flan_v2,HFv1 ARC,52.56,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_300step_flan_v2,HFv1 GSM8K,17.97,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_300step_flan_v2,HFv1 HellaSwag,77.76,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_300step_flan_v2,HFv1 MMLU,48.51,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_300step_flan_v2,HFv1 TruthfulQA,45.14,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_300step_flan_v2,HFv1 Winogrande,72.53,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_441step_flan_v2,HF OpenLLM v1,52.28,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_441step_flan_v2,HFv1 ARC,52.13,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_441step_flan_v2,HFv1 GSM8K,17.82,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_441step_flan_v2,HFv1 HellaSwag,77.63,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_441step_flan_v2,HFv1 MMLU,48.52,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_441step_flan_v2,HFv1 TruthfulQA,45.02,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_afr_441step_flan_v2,HFv1 Winogrande,72.53,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_flan2022_1_2m,HF OpenLLM v1,47.89,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_flan2022_1_2m,HFv1 ARC,49.57,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_flan2022_1_2m,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_flan2022_1_2m,HFv1 HellaSwag,76.25,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_flan2022_1_2m,HFv1 MMLU,45.99,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_flan2022_1_2m,HFv1 TruthfulQA,42.17,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_flan2022_1_2m,HFv1 Winogrande,71.82,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco,HF OpenLLM v1,48.01,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco,HFv1 ARC,46.93,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco,HFv1 GSM8K,9.93,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco,HFv1 HellaSwag,74.11,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco,HFv1 MMLU,47.25,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco,HFv1 TruthfulQA,45.99,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco,HFv1 Winogrande,63.85,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj,HF OpenLLM v1,49.94,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj,HFv1 ARC,51.45,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj,HFv1 GSM8K,11.68,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj,HFv1 HellaSwag,76.99,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj,HFv1 MMLU,47.13,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj,HFv1 TruthfulQA,43.15,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj,HFv1 Winogrande,69.22,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj_lora,HF OpenLLM v1,49.51,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj_lora,HFv1 ARC,51.54,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj_lora,HFv1 GSM8K,11.3,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj_lora,HFv1 HellaSwag,76.52,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj_lora,HFv1 MMLU,46.92,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj_lora,HFv1 TruthfulQA,42.51,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_freeze_embed_tokens_q_v_proj_lora,HFv1 Winogrande,68.27,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_lora,HF OpenLLM v1,52.38,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_lora,HFv1 ARC,52.65,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_lora,HFv1 GSM8K,18.8,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_lora,HFv1 HellaSwag,76.68,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_lora,HFv1 MMLU,48.91,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_lora,HFv1 TruthfulQA,43.82,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_chat_guanaco_lora,HFv1 Winogrande,73.4,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_eli5_cleaned_1024_qlora_merged,HF OpenLLM v1,50.4,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_eli5_cleaned_1024_qlora_merged,HFv1 ARC,53.67,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_eli5_cleaned_1024_qlora_merged,HFv1 GSM8K,4.7,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_eli5_cleaned_1024_qlora_merged,HFv1 HellaSwag,78.21,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_eli5_cleaned_1024_qlora_merged,HFv1 MMLU,45.9,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_eli5_cleaned_1024_qlora_merged,HFv1 TruthfulQA,46.13,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_eli5_cleaned_1024_qlora_merged,HFv1 Winogrande,73.8,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_eli5_cleaned_wiki65k_1024_qlora_merged,HF OpenLLM v1,49.71,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_eli5_cleaned_wiki65k_1024_qlora_merged,HFv1 ARC,53.67,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_eli5_cleaned_wiki65k_1024_qlora_merged,HFv1 GSM8K,5.61,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_eli5_cleaned_wiki65k_1024_qlora_merged,HFv1 HellaSwag,78.09,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_eli5_cleaned_wiki65k_1024_qlora_merged,HFv1 MMLU,45.63,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_eli5_cleaned_wiki65k_1024_qlora_merged,HFv1 TruthfulQA,41.72,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_eli5_cleaned_wiki65k_1024_qlora_merged,HFv1 Winogrande,73.56,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_flan2022_1_2m,HF OpenLLM v1,43.68,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_flan2022_1_2m,HFv1 ARC,23.29,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_flan2022_1_2m,HFv1 GSM8K,4.47,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_flan2022_1_2m,HFv1 HellaSwag,78.46,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_flan2022_1_2m,HFv1 MMLU,42.33,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_flan2022_1_2m,HFv1 TruthfulQA,37.97,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_flan2022_1_2m,HFv1 Winogrande,75.53,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_gptq,HF OpenLLM v1,48.48,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_gptq,HFv1 ARC,52.05,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_gptq,HFv1 GSM8K,5.0,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_gptq,HFv1 HellaSwag,77.59,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_gptq,HFv1 MMLU,43.99,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_gptq,HFv1 TruthfulQA,39.32,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_gptq,HFv1 Winogrande,72.93,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_guanaco_instruct_sharded,HF OpenLLM v1,50.58,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_guanaco_instruct_sharded,HFv1 ARC,53.75,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_guanaco_instruct_sharded,HFv1 GSM8K,7.81,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_guanaco_instruct_sharded,HFv1 HellaSwag,78.69,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_guanaco_instruct_sharded,HFv1 MMLU,46.65,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_guanaco_instruct_sharded,HFv1 TruthfulQA,43.93,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_guanaco_instruct_sharded,HFv1 Winogrande,72.61,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_open_platypus,HF OpenLLM v1,49.73,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_open_platypus,HFv1 ARC,51.45,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_open_platypus,HFv1 GSM8K,6.6,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_open_platypus,HFv1 HellaSwag,78.63,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_open_platypus,HFv1 MMLU,43.6,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_open_platypus,HFv1 TruthfulQA,43.71,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_open_platypus,HFv1 Winogrande,74.43,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_physics,HF OpenLLM v1,51.22,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_physics,HFv1 ARC,52.9,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_physics,HFv1 GSM8K,7.05,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_physics,HFv1 HellaSwag,77.71,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_physics,HFv1 MMLU,48.83,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_physics,HFv1 TruthfulQA,48.93,,hf_open_llm_v1_240829_frozen.csv +llama_2_7b_physics,HFv1 Winogrande,71.9,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v10_7b,HF OpenLLM v1,50.75,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v10_7b,HFv1 ARC,55.29,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v10_7b,HFv1 GSM8K,5.91,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v10_7b,HFv1 HellaSwag,81.69,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v10_7b,HFv1 MMLU,46.97,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v10_7b,HFv1 TruthfulQA,43.78,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v10_7b,HFv1 Winogrande,70.88,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v18_a_7b,HF OpenLLM v1,49.88,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v18_a_7b,HFv1 ARC,53.16,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v18_a_7b,HFv1 GSM8K,7.2,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v18_a_7b,HFv1 HellaSwag,78.11,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v18_a_7b,HFv1 MMLU,45.54,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v18_a_7b,HFv1 TruthfulQA,40.37,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v18_a_7b,HFv1 Winogrande,74.9,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v18_b_7b,HF OpenLLM v1,50.94,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v18_b_7b,HFv1 ARC,54.61,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v18_b_7b,HFv1 GSM8K,6.52,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v18_b_7b,HFv1 HellaSwag,81.0,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v18_b_7b,HFv1 MMLU,47.07,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v18_b_7b,HFv1 TruthfulQA,41.93,,hf_open_llm_v1_240829_frozen.csv +llama_2_peanutbutter_v18_b_7b,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv +llama_2_wizard_70b_qlora,HF OpenLLM v1,66.47,,hf_open_llm_v1_240829_frozen.csv +llama_2_wizard_70b_qlora,HFv1 ARC,67.58,,hf_open_llm_v1_240829_frozen.csv +llama_2_wizard_70b_qlora,HFv1 GSM8K,30.48,,hf_open_llm_v1_240829_frozen.csv +llama_2_wizard_70b_qlora,HFv1 HellaSwag,87.52,,hf_open_llm_v1_240829_frozen.csv +llama_2_wizard_70b_qlora,HFv1 MMLU,69.11,,hf_open_llm_v1_240829_frozen.csv +llama_2_wizard_70b_qlora,HFv1 TruthfulQA,61.79,,hf_open_llm_v1_240829_frozen.csv +llama_2_wizard_70b_qlora,HFv1 Winogrande,82.32,,hf_open_llm_v1_240829_frozen.csv +llama_65b,HF OpenLLM v1,62.79,,hf_open_llm_v1_240829_frozen.csv +llama_65b,HFv1 ARC,63.48,,hf_open_llm_v1_240829_frozen.csv +llama_65b,HFv1 GSM8K,37.23,,hf_open_llm_v1_240829_frozen.csv +llama_65b,HFv1 HellaSwag,86.09,,hf_open_llm_v1_240829_frozen.csv +llama_65b,HFv1 MMLU,63.93,,hf_open_llm_v1_240829_frozen.csv +llama_65b,HFv1 TruthfulQA,43.43,,hf_open_llm_v1_240829_frozen.csv +llama_65b,HFv1 Winogrande,82.56,,hf_open_llm_v1_240829_frozen.csv +llama_68m_chat_v1,HF OpenLLM v1,29.72,,hf_open_llm_v1_240829_frozen.csv +llama_68m_chat_v1,HFv1 ARC,23.29,,hf_open_llm_v1_240829_frozen.csv +llama_68m_chat_v1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +llama_68m_chat_v1,HFv1 HellaSwag,28.27,,hf_open_llm_v1_240829_frozen.csv +llama_68m_chat_v1,HFv1 MMLU,25.18,,hf_open_llm_v1_240829_frozen.csv +llama_68m_chat_v1,HFv1 TruthfulQA,47.27,,hf_open_llm_v1_240829_frozen.csv +llama_68m_chat_v1,HFv1 Winogrande,54.3,,hf_open_llm_v1_240829_frozen.csv +llama_7b,HF OpenLLM v1,45.65,,hf_open_llm_v1_240829_frozen.csv +llama_7b,HFv1 ARC,51.02,,hf_open_llm_v1_240829_frozen.csv +llama_7b,HFv1 GSM8K,3.56,,hf_open_llm_v1_240829_frozen.csv +llama_7b,HFv1 HellaSwag,77.82,,hf_open_llm_v1_240829_frozen.csv +llama_7b,HFv1 MMLU,35.71,,hf_open_llm_v1_240829_frozen.csv +llama_7b,HFv1 TruthfulQA,34.33,,hf_open_llm_v1_240829_frozen.csv +llama_7b,HFv1 Winogrande,71.43,,hf_open_llm_v1_240829_frozen.csv +llama_7b_sft_ds_wiki65k_1024_r_64_alpha_16_merged,HF OpenLLM v1,48.82,,hf_open_llm_v1_240829_frozen.csv +llama_7b_sft_ds_wiki65k_1024_r_64_alpha_16_merged,HFv1 ARC,54.35,,hf_open_llm_v1_240829_frozen.csv +llama_7b_sft_ds_wiki65k_1024_r_64_alpha_16_merged,HFv1 GSM8K,4.62,,hf_open_llm_v1_240829_frozen.csv +llama_7b_sft_ds_wiki65k_1024_r_64_alpha_16_merged,HFv1 HellaSwag,78.06,,hf_open_llm_v1_240829_frozen.csv +llama_7b_sft_ds_wiki65k_1024_r_64_alpha_16_merged,HFv1 MMLU,45.35,,hf_open_llm_v1_240829_frozen.csv +llama_7b_sft_ds_wiki65k_1024_r_64_alpha_16_merged,HFv1 TruthfulQA,37.11,,hf_open_llm_v1_240829_frozen.csv +llama_7b_sft_ds_wiki65k_1024_r_64_alpha_16_merged,HFv1 Winogrande,73.4,,hf_open_llm_v1_240829_frozen.csv +llama_7b_sft_qlora_eli5_wiki_dpo_ds_rm_top_2_1024_r_64_alpha_16,HF OpenLLM v1,49.98,,hf_open_llm_v1_240829_frozen.csv +llama_7b_sft_qlora_eli5_wiki_dpo_ds_rm_top_2_1024_r_64_alpha_16,HFv1 ARC,54.1,,hf_open_llm_v1_240829_frozen.csv +llama_7b_sft_qlora_eli5_wiki_dpo_ds_rm_top_2_1024_r_64_alpha_16,HFv1 GSM8K,4.55,,hf_open_llm_v1_240829_frozen.csv +llama_7b_sft_qlora_eli5_wiki_dpo_ds_rm_top_2_1024_r_64_alpha_16,HFv1 HellaSwag,78.74,,hf_open_llm_v1_240829_frozen.csv +llama_7b_sft_qlora_eli5_wiki_dpo_ds_rm_top_2_1024_r_64_alpha_16,HFv1 MMLU,45.44,,hf_open_llm_v1_240829_frozen.csv +llama_7b_sft_qlora_eli5_wiki_dpo_ds_rm_top_2_1024_r_64_alpha_16,HFv1 TruthfulQA,43.4,,hf_open_llm_v1_240829_frozen.csv +llama_7b_sft_qlora_eli5_wiki_dpo_ds_rm_top_2_1024_r_64_alpha_16,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv +llama_base_7b,HF OpenLLM v1,45.62,,hf_open_llm_v1_240829_frozen.csv +llama_base_7b,HFv1 ARC,50.94,,hf_open_llm_v1_240829_frozen.csv +llama_base_7b,HFv1 GSM8K,3.56,,hf_open_llm_v1_240829_frozen.csv +llama_base_7b,HFv1 HellaSwag,77.8,,hf_open_llm_v1_240829_frozen.csv +llama_base_7b,HFv1 MMLU,35.67,,hf_open_llm_v1_240829_frozen.csv +llama_base_7b,HFv1 TruthfulQA,34.34,,hf_open_llm_v1_240829_frozen.csv +llama_base_7b,HFv1 Winogrande,71.43,,hf_open_llm_v1_240829_frozen.csv +llama_megamerge_dare_13b,HF OpenLLM v1,58.15,,hf_open_llm_v1_240829_frozen.csv +llama_megamerge_dare_13b,HFv1 ARC,60.58,,hf_open_llm_v1_240829_frozen.csv +llama_megamerge_dare_13b,HFv1 GSM8K,28.51,,hf_open_llm_v1_240829_frozen.csv +llama_megamerge_dare_13b,HFv1 HellaSwag,83.0,,hf_open_llm_v1_240829_frozen.csv +llama_megamerge_dare_13b,HFv1 MMLU,54.91,,hf_open_llm_v1_240829_frozen.csv +llama_megamerge_dare_13b,HFv1 TruthfulQA,45.76,,hf_open_llm_v1_240829_frozen.csv +llama_megamerge_dare_13b,HFv1 Winogrande,76.16,,hf_open_llm_v1_240829_frozen.csv +llama_pile_350b,HF OpenLLM v1,35.0,,hf_open_llm_v1_240829_frozen.csv +llama_pile_350b,HFv1 ARC,33.19,,hf_open_llm_v1_240829_frozen.csv +llama_pile_350b,HFv1 GSM8K,0.76,,hf_open_llm_v1_240829_frozen.csv +llama_pile_350b,HFv1 HellaSwag,56.6,,hf_open_llm_v1_240829_frozen.csv +llama_pile_350b,HFv1 MMLU,24.66,,hf_open_llm_v1_240829_frozen.csv +llama_pile_350b,HFv1 TruthfulQA,36.28,,hf_open_llm_v1_240829_frozen.csv +llama_pile_350b,HFv1 Winogrande,58.48,,hf_open_llm_v1_240829_frozen.csv +llama_v2_7b_32kc_security,HF OpenLLM v1,49.19,,hf_open_llm_v1_240829_frozen.csv +llama_v2_7b_32kc_security,HFv1 ARC,49.83,,hf_open_llm_v1_240829_frozen.csv +llama_v2_7b_32kc_security,HFv1 GSM8K,3.87,,hf_open_llm_v1_240829_frozen.csv +llama_v2_7b_32kc_security,HFv1 HellaSwag,77.33,,hf_open_llm_v1_240829_frozen.csv +llama_v2_7b_32kc_security,HFv1 MMLU,44.41,,hf_open_llm_v1_240829_frozen.csv +llama_v2_7b_32kc_security,HFv1 TruthfulQA,47.96,,hf_open_llm_v1_240829_frozen.csv +llama_v2_7b_32kc_security,HFv1 Winogrande,71.74,,hf_open_llm_v1_240829_frozen.csv +llamacorn_1_1b_chat,HF OpenLLM v1,36.73,,hf_open_llm_v1_240829_frozen.csv +llamacorn_1_1b_chat,HFv1 ARC,33.79,,hf_open_llm_v1_240829_frozen.csv +llamacorn_1_1b_chat,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +llamacorn_1_1b_chat,HFv1 HellaSwag,59.24,,hf_open_llm_v1_240829_frozen.csv +llamacorn_1_1b_chat,HFv1 MMLU,29.01,,hf_open_llm_v1_240829_frozen.csv +llamacorn_1_1b_chat,HFv1 TruthfulQA,36.86,,hf_open_llm_v1_240829_frozen.csv +llamacorn_1_1b_chat,HFv1 Winogrande,61.48,,hf_open_llm_v1_240829_frozen.csv +llamarada_3_orpo_v2_8b,HF OpenLLM v1,63.53,,hf_open_llm_v1_240829_frozen.csv +llamarada_3_orpo_v2_8b,HFv1 ARC,59.9,,hf_open_llm_v1_240829_frozen.csv +llamarada_3_orpo_v2_8b,HFv1 GSM8K,47.16,,hf_open_llm_v1_240829_frozen.csv +llamarada_3_orpo_v2_8b,HFv1 HellaSwag,82.22,,hf_open_llm_v1_240829_frozen.csv +llamarada_3_orpo_v2_8b,HFv1 MMLU,66.59,,hf_open_llm_v1_240829_frozen.csv +llamarada_3_orpo_v2_8b,HFv1 TruthfulQA,47.67,,hf_open_llm_v1_240829_frozen.csv +llamarada_3_orpo_v2_8b,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv +llamaragdrama,HF OpenLLM v1,74.65,,hf_open_llm_v1_240829_frozen.csv +llamaragdrama,HFv1 ARC,72.01,,hf_open_llm_v1_240829_frozen.csv +llamaragdrama,HFv1 GSM8K,65.66,,hf_open_llm_v1_240829_frozen.csv +llamaragdrama,HFv1 HellaSwag,88.83,,hf_open_llm_v1_240829_frozen.csv +llamaragdrama,HFv1 MMLU,64.5,,hf_open_llm_v1_240829_frozen.csv +llamaragdrama,HFv1 TruthfulQA,70.24,,hf_open_llm_v1_240829_frozen.csv +llamaragdrama,HFv1 Winogrande,86.66,,hf_open_llm_v1_240829_frozen.csv +llamaster_8b_v0_1,HF OpenLLM v1,66.84,,hf_open_llm_v1_240829_frozen.csv +llamaster_8b_v0_1,HFv1 ARC,60.92,,hf_open_llm_v1_240829_frozen.csv +llamaster_8b_v0_1,HFv1 GSM8K,68.69,,hf_open_llm_v1_240829_frozen.csv +llamaster_8b_v0_1,HFv1 HellaSwag,78.33,,hf_open_llm_v1_240829_frozen.csv +llamaster_8b_v0_1,HFv1 MMLU,67.11,,hf_open_llm_v1_240829_frozen.csv +llamaster_8b_v0_1,HFv1 TruthfulQA,51.55,,hf_open_llm_v1_240829_frozen.csv +llamaster_8b_v0_1,HFv1 Winogrande,74.43,,hf_open_llm_v1_240829_frozen.csv +llamion_14b_base,HF OpenLLM v1,59.1,,hf_open_llm_v1_240829_frozen.csv +llamion_14b_base,HFv1 ARC,55.2,,hf_open_llm_v1_240829_frozen.csv +llamion_14b_base,HFv1 GSM8K,37.98,,hf_open_llm_v1_240829_frozen.csv +llamion_14b_base,HFv1 HellaSwag,80.57,,hf_open_llm_v1_240829_frozen.csv +llamion_14b_base,HFv1 MMLU,67.89,,hf_open_llm_v1_240829_frozen.csv +llamion_14b_base,HFv1 TruthfulQA,39.79,,hf_open_llm_v1_240829_frozen.csv +llamion_14b_base,HFv1 Winogrande,73.16,,hf_open_llm_v1_240829_frozen.csv +llamion_14b_chat,HF OpenLLM v1,58.31,,hf_open_llm_v1_240829_frozen.csv +llamion_14b_chat,HFv1 ARC,54.61,,hf_open_llm_v1_240829_frozen.csv +llamion_14b_chat,HFv1 GSM8K,36.69,,hf_open_llm_v1_240829_frozen.csv +llamion_14b_chat,HFv1 HellaSwag,78.99,,hf_open_llm_v1_240829_frozen.csv +llamion_14b_chat,HFv1 MMLU,63.28,,hf_open_llm_v1_240829_frozen.csv +llamion_14b_chat,HFv1 TruthfulQA,42.98,,hf_open_llm_v1_240829_frozen.csv +llamion_14b_chat,HFv1 Winogrande,73.32,,hf_open_llm_v1_240829_frozen.csv +llava_v1_5_7b_vicuna,HF OpenLLM v1,52.28,,hf_open_llm_v1_240829_frozen.csv +llava_v1_5_7b_vicuna,HFv1 ARC,52.65,,hf_open_llm_v1_240829_frozen.csv +llava_v1_5_7b_vicuna,HFv1 GSM8K,15.31,,hf_open_llm_v1_240829_frozen.csv +llava_v1_5_7b_vicuna,HFv1 HellaSwag,76.09,,hf_open_llm_v1_240829_frozen.csv +llava_v1_5_7b_vicuna,HFv1 MMLU,51.68,,hf_open_llm_v1_240829_frozen.csv +llava_v1_5_7b_vicuna,HFv1 TruthfulQA,45.86,,hf_open_llm_v1_240829_frozen.csv +llava_v1_5_7b_vicuna,HFv1 Winogrande,72.06,,hf_open_llm_v1_240829_frozen.csv +llm_jp_13b_instruct_full_jaster_dolly_oasst_v1_0,HF OpenLLM v1,31.77,,hf_open_llm_v1_240829_frozen.csv +llm_jp_13b_instruct_full_jaster_dolly_oasst_v1_0,HFv1 ARC,26.88,,hf_open_llm_v1_240829_frozen.csv +llm_jp_13b_instruct_full_jaster_dolly_oasst_v1_0,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +llm_jp_13b_instruct_full_jaster_dolly_oasst_v1_0,HFv1 HellaSwag,44.78,,hf_open_llm_v1_240829_frozen.csv +llm_jp_13b_instruct_full_jaster_dolly_oasst_v1_0,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +llm_jp_13b_instruct_full_jaster_dolly_oasst_v1_0,HFv1 TruthfulQA,45.19,,hf_open_llm_v1_240829_frozen.csv +llm_jp_13b_instruct_full_jaster_dolly_oasst_v1_0,HFv1 Winogrande,50.67,,hf_open_llm_v1_240829_frozen.csv +llm_jp_13b_instruct_full_jaster_v1_0,HF OpenLLM v1,31.63,,hf_open_llm_v1_240829_frozen.csv +llm_jp_13b_instruct_full_jaster_v1_0,HFv1 ARC,27.22,,hf_open_llm_v1_240829_frozen.csv +llm_jp_13b_instruct_full_jaster_v1_0,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +llm_jp_13b_instruct_full_jaster_v1_0,HFv1 HellaSwag,44.7,,hf_open_llm_v1_240829_frozen.csv +llm_jp_13b_instruct_full_jaster_v1_0,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +llm_jp_13b_instruct_full_jaster_v1_0,HFv1 TruthfulQA,44.69,,hf_open_llm_v1_240829_frozen.csv +llm_jp_13b_instruct_full_jaster_v1_0,HFv1 Winogrande,50.04,,hf_open_llm_v1_240829_frozen.csv +llongma_3b_lima,HF OpenLLM v1,38.51,,hf_open_llm_v1_240829_frozen.csv +llongma_3b_lima,HFv1 ARC,39.08,,hf_open_llm_v1_240829_frozen.csv +llongma_3b_lima,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +llongma_3b_lima,HFv1 HellaSwag,67.15,,hf_open_llm_v1_240829_frozen.csv +llongma_3b_lima,HFv1 MMLU,26.43,,hf_open_llm_v1_240829_frozen.csv +llongma_3b_lima,HFv1 TruthfulQA,34.71,,hf_open_llm_v1_240829_frozen.csv +llongma_3b_lima,HFv1 Winogrande,63.38,,hf_open_llm_v1_240829_frozen.csv +llongorca_7b_16k,HF OpenLLM v1,53.02,,hf_open_llm_v1_240829_frozen.csv +llongorca_7b_16k,HFv1 ARC,57.51,,hf_open_llm_v1_240829_frozen.csv +llongorca_7b_16k,HFv1 GSM8K,7.51,,hf_open_llm_v1_240829_frozen.csv +llongorca_7b_16k,HFv1 HellaSwag,79.44,,hf_open_llm_v1_240829_frozen.csv +llongorca_7b_16k,HFv1 MMLU,49.35,,hf_open_llm_v1_240829_frozen.csv +llongorca_7b_16k,HFv1 TruthfulQA,49.84,,hf_open_llm_v1_240829_frozen.csv +llongorca_7b_16k,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv +losslessmegacoder_llama2_13b_mini,HF OpenLLM v1,56.92,,hf_open_llm_v1_240829_frozen.csv +losslessmegacoder_llama2_13b_mini,HFv1 ARC,60.58,,hf_open_llm_v1_240829_frozen.csv +losslessmegacoder_llama2_13b_mini,HFv1 GSM8K,15.92,,hf_open_llm_v1_240829_frozen.csv +losslessmegacoder_llama2_13b_mini,HFv1 HellaSwag,81.26,,hf_open_llm_v1_240829_frozen.csv +losslessmegacoder_llama2_13b_mini,HFv1 MMLU,57.92,,hf_open_llm_v1_240829_frozen.csv +losslessmegacoder_llama2_13b_mini,HFv1 TruthfulQA,48.89,,hf_open_llm_v1_240829_frozen.csv +losslessmegacoder_llama2_13b_mini,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv +losslessmegacoder_llama2_7b_mini,HF OpenLLM v1,51.66,,hf_open_llm_v1_240829_frozen.csv +losslessmegacoder_llama2_7b_mini,HFv1 ARC,53.5,,hf_open_llm_v1_240829_frozen.csv +losslessmegacoder_llama2_7b_mini,HFv1 GSM8K,9.55,,hf_open_llm_v1_240829_frozen.csv +losslessmegacoder_llama2_7b_mini,HFv1 HellaSwag,77.38,,hf_open_llm_v1_240829_frozen.csv +losslessmegacoder_llama2_7b_mini,HFv1 MMLU,49.72,,hf_open_llm_v1_240829_frozen.csv +losslessmegacoder_llama2_7b_mini,HFv1 TruthfulQA,45.77,,hf_open_llm_v1_240829_frozen.csv +losslessmegacoder_llama2_7b_mini,HFv1 Winogrande,74.03,,hf_open_llm_v1_240829_frozen.csv +loyal_piano_m7,HF OpenLLM v1,68.67,,hf_open_llm_v1_240829_frozen.csv +loyal_piano_m7,HFv1 ARC,66.72,,hf_open_llm_v1_240829_frozen.csv +loyal_piano_m7,HFv1 GSM8K,56.71,,hf_open_llm_v1_240829_frozen.csv +loyal_piano_m7,HFv1 HellaSwag,85.03,,hf_open_llm_v1_240829_frozen.csv +loyal_piano_m7,HFv1 MMLU,64.43,,hf_open_llm_v1_240829_frozen.csv +loyal_piano_m7,HFv1 TruthfulQA,60.03,,hf_open_llm_v1_240829_frozen.csv +loyal_piano_m7,HFv1 Winogrande,79.08,,hf_open_llm_v1_240829_frozen.csv +loyal_piano_m7_cdpo,HF OpenLLM v1,69.08,,hf_open_llm_v1_240829_frozen.csv +loyal_piano_m7_cdpo,HFv1 ARC,67.15,,hf_open_llm_v1_240829_frozen.csv +loyal_piano_m7_cdpo,HFv1 GSM8K,56.48,,hf_open_llm_v1_240829_frozen.csv +loyal_piano_m7_cdpo,HFv1 HellaSwag,85.39,,hf_open_llm_v1_240829_frozen.csv +loyal_piano_m7_cdpo,HFv1 MMLU,64.52,,hf_open_llm_v1_240829_frozen.csv +loyal_piano_m7_cdpo,HFv1 TruthfulQA,61.53,,hf_open_llm_v1_240829_frozen.csv +loyal_piano_m7_cdpo,HFv1 Winogrande,79.4,,hf_open_llm_v1_240829_frozen.csv +lr_experiment1_7b,HF OpenLLM v1,62.77,,hf_open_llm_v1_240829_frozen.csv +lr_experiment1_7b,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv +lr_experiment1_7b,HFv1 GSM8K,45.34,,hf_open_llm_v1_240829_frozen.csv +lr_experiment1_7b,HFv1 HellaSwag,83.73,,hf_open_llm_v1_240829_frozen.csv +lr_experiment1_7b,HFv1 MMLU,63.25,,hf_open_llm_v1_240829_frozen.csv +lr_experiment1_7b,HFv1 TruthfulQA,44.07,,hf_open_llm_v1_240829_frozen.csv +lr_experiment1_7b,HFv1 Winogrande,79.48,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b,HF OpenLLM v1,57.98,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b,HFv1 ARC,63.65,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b,HFv1 GSM8K,10.01,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b,HFv1 HellaSwag,82.92,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b,HFv1 MMLU,58.7,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b,HFv1 TruthfulQA,55.55,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b_v2,HF OpenLLM v1,57.92,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b_v2,HFv1 ARC,63.48,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b_v2,HFv1 GSM8K,9.93,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b_v2,HFv1 HellaSwag,82.89,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b_v2,HFv1 MMLU,58.72,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b_v2,HFv1 TruthfulQA,55.56,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b_v2,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b_v3,HF OpenLLM v1,57.94,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b_v3,HFv1 ARC,63.74,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b_v3,HFv1 GSM8K,9.93,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b_v3,HFv1 HellaSwag,82.88,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b_v3,HFv1 MMLU,58.64,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b_v3,HFv1 TruthfulQA,55.56,,hf_open_llm_v1_240829_frozen.csv +luban_marcoroni_13b_v3,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv +luban_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,55.34,,hf_open_llm_v1_240829_frozen.csv +luban_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,60.24,,hf_open_llm_v1_240829_frozen.csv +luban_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv +luban_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,82.22,,hf_open_llm_v1_240829_frozen.csv +luban_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,58.03,,hf_open_llm_v1_240829_frozen.csv +luban_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,55.26,,hf_open_llm_v1_240829_frozen.csv +luban_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,75.37,,hf_open_llm_v1_240829_frozen.csv +lucie_7b_v0_2_16bit,HF OpenLLM v1,65.3,,hf_open_llm_v1_240829_frozen.csv +lucie_7b_v0_2_16bit,HFv1 ARC,62.12,,hf_open_llm_v1_240829_frozen.csv +lucie_7b_v0_2_16bit,HFv1 GSM8K,39.88,,hf_open_llm_v1_240829_frozen.csv +lucie_7b_v0_2_16bit,HFv1 HellaSwag,84.83,,hf_open_llm_v1_240829_frozen.csv +lucie_7b_v0_2_16bit,HFv1 MMLU,60.45,,hf_open_llm_v1_240829_frozen.csv +lucie_7b_v0_2_16bit,HFv1 TruthfulQA,67.65,,hf_open_llm_v1_240829_frozen.csv +lucie_7b_v0_2_16bit,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv +lumosia_moe_4x10_7,HF OpenLLM v1,69.61,,hf_open_llm_v1_240829_frozen.csv +lumosia_moe_4x10_7,HFv1 ARC,68.34,,hf_open_llm_v1_240829_frozen.csv +lumosia_moe_4x10_7,HFv1 GSM8K,51.02,,hf_open_llm_v1_240829_frozen.csv +lumosia_moe_4x10_7,HFv1 HellaSwag,87.13,,hf_open_llm_v1_240829_frozen.csv +lumosia_moe_4x10_7,HFv1 MMLU,64.38,,hf_open_llm_v1_240829_frozen.csv +lumosia_moe_4x10_7,HFv1 TruthfulQA,63.81,,hf_open_llm_v1_240829_frozen.csv +lumosia_moe_4x10_7,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv +m_b_4_32,HF OpenLLM v1,60.02,,hf_open_llm_v1_240829_frozen.csv +m_b_4_32,HFv1 ARC,59.56,,hf_open_llm_v1_240829_frozen.csv +m_b_4_32,HFv1 GSM8K,30.78,,hf_open_llm_v1_240829_frozen.csv +m_b_4_32,HFv1 HellaSwag,83.03,,hf_open_llm_v1_240829_frozen.csv +m_b_4_32,HFv1 MMLU,60.9,,hf_open_llm_v1_240829_frozen.csv +m_b_4_32,HFv1 TruthfulQA,49.29,,hf_open_llm_v1_240829_frozen.csv +m_b_4_32,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv +malayalam_llama_7b_instruct_v0_1,HF OpenLLM v1,39.51,,hf_open_llm_v1_240829_frozen.csv +malayalam_llama_7b_instruct_v0_1,HFv1 ARC,37.03,,hf_open_llm_v1_240829_frozen.csv +malayalam_llama_7b_instruct_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +malayalam_llama_7b_instruct_v0_1,HFv1 HellaSwag,67.75,,hf_open_llm_v1_240829_frozen.csv +malayalam_llama_7b_instruct_v0_1,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +malayalam_llama_7b_instruct_v0_1,HFv1 TruthfulQA,47.05,,hf_open_llm_v1_240829_frozen.csv +malayalam_llama_7b_instruct_v0_1,HFv1 Winogrande,62.12,,hf_open_llm_v1_240829_frozen.csv +marcoro14_7b_slerp,HF OpenLLM v1,63.08,,hf_open_llm_v1_240829_frozen.csv +marcoro14_7b_slerp,HFv1 ARC,63.4,,hf_open_llm_v1_240829_frozen.csv +marcoro14_7b_slerp,HFv1 GSM8K,40.18,,hf_open_llm_v1_240829_frozen.csv +marcoro14_7b_slerp,HFv1 HellaSwag,83.77,,hf_open_llm_v1_240829_frozen.csv +marcoro14_7b_slerp,HFv1 MMLU,61.18,,hf_open_llm_v1_240829_frozen.csv +marcoro14_7b_slerp,HFv1 TruthfulQA,52.08,,hf_open_llm_v1_240829_frozen.csv +marcoro14_7b_slerp,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv +master_yi_9b,HF OpenLLM v1,67.44,,hf_open_llm_v1_240829_frozen.csv +master_yi_9b,HFv1 ARC,61.43,,hf_open_llm_v1_240829_frozen.csv +master_yi_9b,HFv1 GSM8K,65.5,,hf_open_llm_v1_240829_frozen.csv +master_yi_9b,HFv1 HellaSwag,79.75,,hf_open_llm_v1_240829_frozen.csv +master_yi_9b,HFv1 MMLU,71.64,,hf_open_llm_v1_240829_frozen.csv +master_yi_9b,HFv1 TruthfulQA,48.55,,hf_open_llm_v1_240829_frozen.csv +master_yi_9b,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv +mathhermes_2_5_mistral_7b,HF OpenLLM v1,65.24,,hf_open_llm_v1_240829_frozen.csv +mathhermes_2_5_mistral_7b,HFv1 ARC,64.76,,hf_open_llm_v1_240829_frozen.csv +mathhermes_2_5_mistral_7b,HFv1 GSM8K,49.28,,hf_open_llm_v1_240829_frozen.csv +mathhermes_2_5_mistral_7b,HFv1 HellaSwag,84.19,,hf_open_llm_v1_240829_frozen.csv +mathhermes_2_5_mistral_7b,HFv1 MMLU,63.59,,hf_open_llm_v1_240829_frozen.csv +mathhermes_2_5_mistral_7b,HFv1 TruthfulQA,51.95,,hf_open_llm_v1_240829_frozen.csv +mathhermes_2_5_mistral_7b,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b,HF OpenLLM v1,63.39,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b,HFv1 GSM8K,53.75,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b,HFv1 HellaSwag,82.14,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b,HFv1 MMLU,62.42,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b,HFv1 TruthfulQA,42.44,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b,HFv1 Winogrande,77.82,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost,HF OpenLLM v1,63.22,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost,HFv1 GSM8K,42.61,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost,HFv1 HellaSwag,81.51,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost,HFv1 MMLU,61.97,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost,HFv1 TruthfulQA,54.7,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost,HFv1 Winogrande,75.93,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost_dpo,HF OpenLLM v1,65.99,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost_dpo,HFv1 ARC,65.02,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost_dpo,HFv1 GSM8K,50.04,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost_dpo,HFv1 HellaSwag,83.08,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost_dpo,HFv1 MMLU,61.87,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost_dpo,HFv1 TruthfulQA,60.29,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost_dpo,HFv1 Winogrande,75.61,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost_dpo_preview,HF OpenLLM v1,65.77,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost_dpo_preview,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost_dpo_preview,HFv1 GSM8K,50.42,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost_dpo_preview,HFv1 HellaSwag,82.87,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost_dpo_preview,HFv1 MMLU,62.02,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost_dpo_preview,HFv1 TruthfulQA,58.86,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_boost_dpo_preview,HFv1 Winogrande,75.85,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_dpo_preview,HF OpenLLM v1,64.87,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_dpo_preview,HFv1 ARC,62.71,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_dpo_preview,HFv1 GSM8K,56.18,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_dpo_preview,HFv1 HellaSwag,82.99,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_dpo_preview,HFv1 MMLU,62.7,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_dpo_preview,HFv1 TruthfulQA,45.79,,hf_open_llm_v1_240829_frozen.csv +matter_0_1_7b_dpo_preview,HFv1 Winogrande,78.85,,hf_open_llm_v1_240829_frozen.csv +matter_0_2_7b,HF OpenLLM v1,64.67,,hf_open_llm_v1_240829_frozen.csv +matter_0_2_7b,HFv1 ARC,61.6,,hf_open_llm_v1_240829_frozen.csv +matter_0_2_7b,HFv1 GSM8K,53.9,,hf_open_llm_v1_240829_frozen.csv +matter_0_2_7b,HFv1 HellaSwag,82.39,,hf_open_llm_v1_240829_frozen.csv +matter_0_2_7b,HFv1 MMLU,62.51,,hf_open_llm_v1_240829_frozen.csv +matter_0_2_7b,HFv1 TruthfulQA,48.11,,hf_open_llm_v1_240829_frozen.csv +matter_0_2_7b,HFv1 Winogrande,79.48,,hf_open_llm_v1_240829_frozen.csv +matter_0_2_7b_dpo,HF OpenLLM v1,66.15,,hf_open_llm_v1_240829_frozen.csv +matter_0_2_7b_dpo,HFv1 ARC,63.31,,hf_open_llm_v1_240829_frozen.csv +matter_0_2_7b_dpo,HFv1 GSM8K,56.94,,hf_open_llm_v1_240829_frozen.csv +matter_0_2_7b_dpo,HFv1 HellaSwag,83.16,,hf_open_llm_v1_240829_frozen.csv +matter_0_2_7b_dpo,HFv1 MMLU,62.9,,hf_open_llm_v1_240829_frozen.csv +matter_0_2_7b_dpo,HFv1 TruthfulQA,50.3,,hf_open_llm_v1_240829_frozen.csv +matter_0_2_7b_dpo,HFv1 Winogrande,80.27,,hf_open_llm_v1_240829_frozen.csv +mc_data_30k_from_platpus_orca_7b_10k_v1_lora_qkvo_rank14_v2,HF OpenLLM v1,52.13,,hf_open_llm_v1_240829_frozen.csv +mc_data_30k_from_platpus_orca_7b_10k_v1_lora_qkvo_rank14_v2,HFv1 ARC,57.17,,hf_open_llm_v1_240829_frozen.csv +mc_data_30k_from_platpus_orca_7b_10k_v1_lora_qkvo_rank14_v2,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv +mc_data_30k_from_platpus_orca_7b_10k_v1_lora_qkvo_rank14_v2,HFv1 HellaSwag,79.57,,hf_open_llm_v1_240829_frozen.csv +mc_data_30k_from_platpus_orca_7b_10k_v1_lora_qkvo_rank14_v2,HFv1 MMLU,50.24,,hf_open_llm_v1_240829_frozen.csv +mc_data_30k_from_platpus_orca_7b_10k_v1_lora_qkvo_rank14_v2,HFv1 TruthfulQA,52.51,,hf_open_llm_v1_240829_frozen.csv +mc_data_30k_from_platpus_orca_7b_10k_v1_lora_qkvo_rank14_v2,HFv1 Winogrande,72.93,,hf_open_llm_v1_240829_frozen.csv +mc_model_v1,HF OpenLLM v1,36.06,,hf_open_llm_v1_240829_frozen.csv +mc_model_v1,HFv1 ARC,32.94,,hf_open_llm_v1_240829_frozen.csv +mc_model_v1,HFv1 GSM8K,1.36,,hf_open_llm_v1_240829_frozen.csv +mc_model_v1,HFv1 HellaSwag,47.69,,hf_open_llm_v1_240829_frozen.csv +mc_model_v1,HFv1 MMLU,31.9,,hf_open_llm_v1_240829_frozen.csv +mc_model_v1,HFv1 TruthfulQA,43.53,,hf_open_llm_v1_240829_frozen.csv +mc_model_v1,HFv1 Winogrande,58.96,,hf_open_llm_v1_240829_frozen.csv +medchator_2x7b,HF OpenLLM v1,58.13,,hf_open_llm_v1_240829_frozen.csv +medchator_2x7b,HFv1 ARC,57.59,,hf_open_llm_v1_240829_frozen.csv +medchator_2x7b,HFv1 GSM8K,32.83,,hf_open_llm_v1_240829_frozen.csv +medchator_2x7b,HFv1 HellaSwag,78.14,,hf_open_llm_v1_240829_frozen.csv +medchator_2x7b,HFv1 MMLU,56.13,,hf_open_llm_v1_240829_frozen.csv +medchator_2x7b,HFv1 TruthfulQA,48.77,,hf_open_llm_v1_240829_frozen.csv +medchator_2x7b,HFv1 Winogrande,75.3,,hf_open_llm_v1_240829_frozen.csv +medes_7b,HF OpenLLM v1,72.11,,hf_open_llm_v1_240829_frozen.csv +medes_7b,HFv1 ARC,69.2,,hf_open_llm_v1_240829_frozen.csv +medes_7b,HFv1 GSM8K,65.05,,hf_open_llm_v1_240829_frozen.csv +medes_7b,HFv1 HellaSwag,86.84,,hf_open_llm_v1_240829_frozen.csv +medes_7b,HFv1 MMLU,64.91,,hf_open_llm_v1_240829_frozen.csv +medes_7b,HFv1 TruthfulQA,64.42,,hf_open_llm_v1_240829_frozen.csv +medes_7b,HFv1 Winogrande,82.24,,hf_open_llm_v1_240829_frozen.csv +megachat,HF OpenLLM v1,34.75,,hf_open_llm_v1_240829_frozen.csv +megachat,HFv1 ARC,30.8,,hf_open_llm_v1_240829_frozen.csv +megachat,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv +megachat,HFv1 HellaSwag,54.35,,hf_open_llm_v1_240829_frozen.csv +megachat,HFv1 MMLU,25.55,,hf_open_llm_v1_240829_frozen.csv +megachat,HFv1 TruthfulQA,39.85,,hf_open_llm_v1_240829_frozen.csv +megachat,HFv1 Winogrande,56.99,,hf_open_llm_v1_240829_frozen.csv +megatron_gpt2_345m,HF OpenLLM v1,30.4,,hf_open_llm_v1_240829_frozen.csv +megatron_gpt2_345m,HFv1 ARC,24.23,,hf_open_llm_v1_240829_frozen.csv +megatron_gpt2_345m,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +megatron_gpt2_345m,HFv1 HellaSwag,39.18,,hf_open_llm_v1_240829_frozen.csv +megatron_gpt2_345m,HFv1 MMLU,24.32,,hf_open_llm_v1_240829_frozen.csv +megatron_gpt2_345m,HFv1 TruthfulQA,41.51,,hf_open_llm_v1_240829_frozen.csv +megatron_gpt2_345m,HFv1 Winogrande,52.96,,hf_open_llm_v1_240829_frozen.csv +megatron_gpt2_345m_evol_instruct_v2,HF OpenLLM v1,30.31,,hf_open_llm_v1_240829_frozen.csv +megatron_gpt2_345m_evol_instruct_v2,HFv1 ARC,26.37,,hf_open_llm_v1_240829_frozen.csv +megatron_gpt2_345m_evol_instruct_v2,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +megatron_gpt2_345m_evol_instruct_v2,HFv1 HellaSwag,38.39,,hf_open_llm_v1_240829_frozen.csv +megatron_gpt2_345m_evol_instruct_v2,HFv1 MMLU,23.6,,hf_open_llm_v1_240829_frozen.csv +megatron_gpt2_345m_evol_instruct_v2,HFv1 TruthfulQA,41.19,,hf_open_llm_v1_240829_frozen.csv +megatron_gpt2_345m_evol_instruct_v2,HFv1 Winogrande,52.33,,hf_open_llm_v1_240829_frozen.csv +melangea_70b,HF OpenLLM v1,62.82,,hf_open_llm_v1_240829_frozen.csv +melangea_70b,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv +melangea_70b,HFv1 GSM8K,5.69,,hf_open_llm_v1_240829_frozen.csv +melangea_70b,HFv1 HellaSwag,87.3,,hf_open_llm_v1_240829_frozen.csv +melangea_70b,HFv1 MMLU,70.56,,hf_open_llm_v1_240829_frozen.csv +melangea_70b,HFv1 TruthfulQA,60.61,,hf_open_llm_v1_240829_frozen.csv +melangea_70b,HFv1 Winogrande,81.53,,hf_open_llm_v1_240829_frozen.csv +melangeb_70b,HF OpenLLM v1,67.12,,hf_open_llm_v1_240829_frozen.csv +melangeb_70b,HFv1 ARC,71.67,,hf_open_llm_v1_240829_frozen.csv +melangeb_70b,HFv1 GSM8K,30.63,,hf_open_llm_v1_240829_frozen.csv +melangeb_70b,HFv1 HellaSwag,87.5,,hf_open_llm_v1_240829_frozen.csv +melangeb_70b,HFv1 MMLU,70.03,,hf_open_llm_v1_240829_frozen.csv +melangeb_70b,HFv1 TruthfulQA,59.36,,hf_open_llm_v1_240829_frozen.csv +melangeb_70b,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv +melangec_70b,HF OpenLLM v1,61.96,,hf_open_llm_v1_240829_frozen.csv +melangec_70b,HFv1 ARC,71.67,,hf_open_llm_v1_240829_frozen.csv +melangec_70b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +melangec_70b,HFv1 HellaSwag,87.6,,hf_open_llm_v1_240829_frozen.csv +melangec_70b,HFv1 MMLU,70.37,,hf_open_llm_v1_240829_frozen.csv +melangec_70b,HFv1 TruthfulQA,58.13,,hf_open_llm_v1_240829_frozen.csv +melangec_70b,HFv1 Winogrande,83.98,,hf_open_llm_v1_240829_frozen.csv +merge_dolly_v2_3b_dpo_test,HF OpenLLM v1,37.49,,hf_open_llm_v1_240829_frozen.csv +merge_dolly_v2_3b_dpo_test,HFv1 ARC,40.02,,hf_open_llm_v1_240829_frozen.csv +merge_dolly_v2_3b_dpo_test,HFv1 GSM8K,2.12,,hf_open_llm_v1_240829_frozen.csv +merge_dolly_v2_3b_dpo_test,HFv1 HellaSwag,65.14,,hf_open_llm_v1_240829_frozen.csv +merge_dolly_v2_3b_dpo_test,HFv1 MMLU,24.99,,hf_open_llm_v1_240829_frozen.csv +merge_dolly_v2_3b_dpo_test,HFv1 TruthfulQA,33.3,,hf_open_llm_v1_240829_frozen.csv +merge_dolly_v2_3b_dpo_test,HFv1 Winogrande,59.35,,hf_open_llm_v1_240829_frozen.csv +merged_dpo_7b,HF OpenLLM v1,68.06,,hf_open_llm_v1_240829_frozen.csv +merged_dpo_7b,HFv1 ARC,68.94,,hf_open_llm_v1_240829_frozen.csv +merged_dpo_7b,HFv1 GSM8K,45.19,,hf_open_llm_v1_240829_frozen.csv +merged_dpo_7b,HFv1 HellaSwag,87.75,,hf_open_llm_v1_240829_frozen.csv +merged_dpo_7b,HFv1 MMLU,55.35,,hf_open_llm_v1_240829_frozen.csv +merged_dpo_7b,HFv1 TruthfulQA,72.76,,hf_open_llm_v1_240829_frozen.csv +merged_dpo_7b,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv +mergetrix_7b,HF OpenLLM v1,74.33,,hf_open_llm_v1_240829_frozen.csv +mergetrix_7b,HFv1 ARC,72.27,,hf_open_llm_v1_240829_frozen.csv +mergetrix_7b,HFv1 GSM8K,71.19,,hf_open_llm_v1_240829_frozen.csv +mergetrix_7b,HFv1 HellaSwag,87.84,,hf_open_llm_v1_240829_frozen.csv +mergetrix_7b,HFv1 MMLU,64.88,,hf_open_llm_v1_240829_frozen.csv +mergetrix_7b,HFv1 TruthfulQA,66.27,,hf_open_llm_v1_240829_frozen.csv +mergetrix_7b,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv +merlinite_7b,HF OpenLLM v1,64.0,,hf_open_llm_v1_240829_frozen.csv +merlinite_7b,HFv1 ARC,63.65,,hf_open_llm_v1_240829_frozen.csv +merlinite_7b,HFv1 GSM8K,41.09,,hf_open_llm_v1_240829_frozen.csv +merlinite_7b,HFv1 HellaSwag,84.52,,hf_open_llm_v1_240829_frozen.csv +merlinite_7b,HFv1 MMLU,64.91,,hf_open_llm_v1_240829_frozen.csv +merlinite_7b,HFv1 TruthfulQA,50.15,,hf_open_llm_v1_240829_frozen.csv +merlinite_7b,HFv1 Winogrande,79.72,,hf_open_llm_v1_240829_frozen.csv +mermaid_7b_ties,HF OpenLLM v1,65.22,,hf_open_llm_v1_240829_frozen.csv +mermaid_7b_ties,HFv1 ARC,64.51,,hf_open_llm_v1_240829_frozen.csv +mermaid_7b_ties,HFv1 GSM8K,47.76,,hf_open_llm_v1_240829_frozen.csv +mermaid_7b_ties,HFv1 HellaSwag,85.17,,hf_open_llm_v1_240829_frozen.csv +mermaid_7b_ties,HFv1 MMLU,64.34,,hf_open_llm_v1_240829_frozen.csv +mermaid_7b_ties,HFv1 TruthfulQA,50.05,,hf_open_llm_v1_240829_frozen.csv +mermaid_7b_ties,HFv1 Winogrande,79.48,,hf_open_llm_v1_240829_frozen.csv +metamath_bagel_dpo_34b,HF OpenLLM v1,74.8,,hf_open_llm_v1_240829_frozen.csv +metamath_bagel_dpo_34b,HFv1 ARC,68.17,,hf_open_llm_v1_240829_frozen.csv +metamath_bagel_dpo_34b,HFv1 GSM8K,72.18,,hf_open_llm_v1_240829_frozen.csv +metamath_bagel_dpo_34b,HFv1 HellaSwag,84.23,,hf_open_llm_v1_240829_frozen.csv +metamath_bagel_dpo_34b,HFv1 MMLU,76.54,,hf_open_llm_v1_240829_frozen.csv +metamath_bagel_dpo_34b,HFv1 TruthfulQA,65.44,,hf_open_llm_v1_240829_frozen.csv +metamath_bagel_dpo_34b,HFv1 Winogrande,82.24,,hf_open_llm_v1_240829_frozen.csv +metamodel_moe,HF OpenLLM v1,74.42,,hf_open_llm_v1_240829_frozen.csv +metamodel_moe,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv +metamodel_moe,HFv1 GSM8K,65.43,,hf_open_llm_v1_240829_frozen.csv +metamodel_moe,HFv1 HellaSwag,88.4,,hf_open_llm_v1_240829_frozen.csv +metamodel_moe,HFv1 MMLU,66.31,,hf_open_llm_v1_240829_frozen.csv +metamodel_moe,HFv1 TruthfulQA,71.86,,hf_open_llm_v1_240829_frozen.csv +metamodel_moe,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv +metamodel_moe_multilingualv1,HF OpenLLM v1,69.33,,hf_open_llm_v1_240829_frozen.csv +metamodel_moe_multilingualv1,HFv1 ARC,67.58,,hf_open_llm_v1_240829_frozen.csv +metamodel_moe_multilingualv1,HFv1 GSM8K,61.33,,hf_open_llm_v1_240829_frozen.csv +metamodel_moe_multilingualv1,HFv1 HellaSwag,84.73,,hf_open_llm_v1_240829_frozen.csv +metamodel_moe_multilingualv1,HFv1 MMLU,63.93,,hf_open_llm_v1_240829_frozen.csv +metamodel_moe_multilingualv1,HFv1 TruthfulQA,61.23,,hf_open_llm_v1_240829_frozen.csv +metamodel_moe_multilingualv1,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv +metamodel_moex8,HF OpenLLM v1,74.39,,hf_open_llm_v1_240829_frozen.csv +metamodel_moex8,HFv1 ARC,71.16,,hf_open_llm_v1_240829_frozen.csv +metamodel_moex8,HFv1 GSM8K,65.35,,hf_open_llm_v1_240829_frozen.csv +metamodel_moex8,HFv1 HellaSwag,88.38,,hf_open_llm_v1_240829_frozen.csv +metamodel_moex8,HFv1 MMLU,66.29,,hf_open_llm_v1_240829_frozen.csv +metamodel_moex8,HFv1 TruthfulQA,71.91,,hf_open_llm_v1_240829_frozen.csv +metamodel_moex8,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv +metamodelv3,HF OpenLLM v1,74.39,,hf_open_llm_v1_240829_frozen.csv +metamodelv3,HFv1 ARC,71.16,,hf_open_llm_v1_240829_frozen.csv +metamodelv3,HFv1 GSM8K,65.28,,hf_open_llm_v1_240829_frozen.csv +metamodelv3,HFv1 HellaSwag,88.39,,hf_open_llm_v1_240829_frozen.csv +metamodelv3,HFv1 MMLU,66.32,,hf_open_llm_v1_240829_frozen.csv +metamodelv3,HFv1 TruthfulQA,71.86,,hf_open_llm_v1_240829_frozen.csv +metamodelv3,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv +metis_0_1,HF OpenLLM v1,60.02,,hf_open_llm_v1_240829_frozen.csv +metis_0_1,HFv1 ARC,60.15,,hf_open_llm_v1_240829_frozen.csv +metis_0_1,HFv1 GSM8K,33.21,,hf_open_llm_v1_240829_frozen.csv +metis_0_1,HFv1 HellaSwag,82.85,,hf_open_llm_v1_240829_frozen.csv +metis_0_1,HFv1 MMLU,61.42,,hf_open_llm_v1_240829_frozen.csv +metis_0_1,HFv1 TruthfulQA,45.24,,hf_open_llm_v1_240829_frozen.csv +metis_0_1,HFv1 Winogrande,77.27,,hf_open_llm_v1_240829_frozen.csv +metis_0_3,HF OpenLLM v1,65.44,,hf_open_llm_v1_240829_frozen.csv +metis_0_3,HFv1 ARC,62.71,,hf_open_llm_v1_240829_frozen.csv +metis_0_3,HFv1 GSM8K,39.35,,hf_open_llm_v1_240829_frozen.csv +metis_0_3,HFv1 HellaSwag,84.8,,hf_open_llm_v1_240829_frozen.csv +metis_0_3,HFv1 MMLU,60.92,,hf_open_llm_v1_240829_frozen.csv +metis_0_3,HFv1 TruthfulQA,67.56,,hf_open_llm_v1_240829_frozen.csv +metis_0_3,HFv1 Winogrande,77.27,,hf_open_llm_v1_240829_frozen.csv +metis_0_3_merged,HF OpenLLM v1,61.34,,hf_open_llm_v1_240829_frozen.csv +metis_0_3_merged,HFv1 ARC,62.2,,hf_open_llm_v1_240829_frozen.csv +metis_0_3_merged,HFv1 GSM8K,21.83,,hf_open_llm_v1_240829_frozen.csv +metis_0_3_merged,HFv1 HellaSwag,84.0,,hf_open_llm_v1_240829_frozen.csv +metis_0_3_merged,HFv1 MMLU,62.65,,hf_open_llm_v1_240829_frozen.csv +metis_0_3_merged,HFv1 TruthfulQA,59.24,,hf_open_llm_v1_240829_frozen.csv +metis_0_3_merged,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv +metis_0_4,HF OpenLLM v1,61.34,,hf_open_llm_v1_240829_frozen.csv +metis_0_4,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv +metis_0_4,HFv1 GSM8K,22.21,,hf_open_llm_v1_240829_frozen.csv +metis_0_4,HFv1 HellaSwag,84.0,,hf_open_llm_v1_240829_frozen.csv +metis_0_4,HFv1 MMLU,62.7,,hf_open_llm_v1_240829_frozen.csv +metis_0_4,HFv1 TruthfulQA,59.24,,hf_open_llm_v1_240829_frozen.csv +metis_0_4,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv +metis_0_5,HF OpenLLM v1,62.65,,hf_open_llm_v1_240829_frozen.csv +metis_0_5,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv +metis_0_5,HFv1 GSM8K,42.91,,hf_open_llm_v1_240829_frozen.csv +metis_0_5,HFv1 HellaSwag,83.77,,hf_open_llm_v1_240829_frozen.csv +metis_0_5,HFv1 MMLU,62.16,,hf_open_llm_v1_240829_frozen.csv +metis_0_5,HFv1 TruthfulQA,49.33,,hf_open_llm_v1_240829_frozen.csv +metis_0_5,HFv1 Winogrande,75.14,,hf_open_llm_v1_240829_frozen.csv +mgpt,HF OpenLLM v1,27.61,,hf_open_llm_v1_240829_frozen.csv +mgpt,HFv1 ARC,23.81,,hf_open_llm_v1_240829_frozen.csv +mgpt,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +mgpt,HFv1 HellaSwag,26.37,,hf_open_llm_v1_240829_frozen.csv +mgpt,HFv1 MMLU,25.17,,hf_open_llm_v1_240829_frozen.csv +mgpt,HFv1 TruthfulQA,39.62,,hf_open_llm_v1_240829_frozen.csv +mgpt,HFv1 Winogrande,50.67,,hf_open_llm_v1_240829_frozen.csv +mhm_7b_v1_3,HF OpenLLM v1,47.29,,hf_open_llm_v1_240829_frozen.csv +mhm_7b_v1_3,HFv1 ARC,47.53,,hf_open_llm_v1_240829_frozen.csv +mhm_7b_v1_3,HFv1 GSM8K,16.68,,hf_open_llm_v1_240829_frozen.csv +mhm_7b_v1_3,HFv1 HellaSwag,65.31,,hf_open_llm_v1_240829_frozen.csv +mhm_7b_v1_3,HFv1 MMLU,45.74,,hf_open_llm_v1_240829_frozen.csv +mhm_7b_v1_3,HFv1 TruthfulQA,46.22,,hf_open_llm_v1_240829_frozen.csv +mhm_7b_v1_3,HFv1 Winogrande,62.27,,hf_open_llm_v1_240829_frozen.csv +mhm_7b_v1_3_dpo_1,HF OpenLLM v1,47.77,,hf_open_llm_v1_240829_frozen.csv +mhm_7b_v1_3_dpo_1,HFv1 ARC,49.57,,hf_open_llm_v1_240829_frozen.csv +mhm_7b_v1_3_dpo_1,HFv1 GSM8K,15.24,,hf_open_llm_v1_240829_frozen.csv +mhm_7b_v1_3_dpo_1,HFv1 HellaSwag,68.1,,hf_open_llm_v1_240829_frozen.csv +mhm_7b_v1_3_dpo_1,HFv1 MMLU,45.76,,hf_open_llm_v1_240829_frozen.csv +mhm_7b_v1_3_dpo_1,HFv1 TruthfulQA,45.88,,hf_open_llm_v1_240829_frozen.csv +mhm_7b_v1_3_dpo_1,HFv1 Winogrande,62.04,,hf_open_llm_v1_240829_frozen.csv +mhm_8x7b_frankenmoe_v1_0,HF OpenLLM v1,74.01,,hf_open_llm_v1_240829_frozen.csv +mhm_8x7b_frankenmoe_v1_0,HFv1 ARC,70.9,,hf_open_llm_v1_240829_frozen.csv +mhm_8x7b_frankenmoe_v1_0,HFv1 GSM8K,71.57,,hf_open_llm_v1_240829_frozen.csv +mhm_8x7b_frankenmoe_v1_0,HFv1 HellaSwag,87.75,,hf_open_llm_v1_240829_frozen.csv +mhm_8x7b_frankenmoe_v1_0,HFv1 MMLU,64.7,,hf_open_llm_v1_240829_frozen.csv +mhm_8x7b_frankenmoe_v1_0,HFv1 TruthfulQA,67.1,,hf_open_llm_v1_240829_frozen.csv +mhm_8x7b_frankenmoe_v1_0,HFv1 Winogrande,82.0,,hf_open_llm_v1_240829_frozen.csv +michel_13b,HF OpenLLM v1,57.56,,hf_open_llm_v1_240829_frozen.csv +michel_13b,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv +michel_13b,HFv1 GSM8K,20.17,,hf_open_llm_v1_240829_frozen.csv +michel_13b,HFv1 HellaSwag,83.21,,hf_open_llm_v1_240829_frozen.csv +michel_13b,HFv1 MMLU,55.05,,hf_open_llm_v1_240829_frozen.csv +michel_13b,HFv1 TruthfulQA,50.43,,hf_open_llm_v1_240829_frozen.csv +michel_13b,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv +microscopic_mistral_87k_steps,HF OpenLLM v1,28.9,,hf_open_llm_v1_240829_frozen.csv +microscopic_mistral_87k_steps,HFv1 ARC,24.06,,hf_open_llm_v1_240829_frozen.csv +microscopic_mistral_87k_steps,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +microscopic_mistral_87k_steps,HFv1 HellaSwag,29.25,,hf_open_llm_v1_240829_frozen.csv +microscopic_mistral_87k_steps,HFv1 MMLU,25.54,,hf_open_llm_v1_240829_frozen.csv +microscopic_mistral_87k_steps,HFv1 TruthfulQA,45.02,,hf_open_llm_v1_240829_frozen.csv +microscopic_mistral_87k_steps,HFv1 Winogrande,49.57,,hf_open_llm_v1_240829_frozen.csv +mindllm,HF OpenLLM v1,29.28,,hf_open_llm_v1_240829_frozen.csv +mindllm,HFv1 ARC,22.44,,hf_open_llm_v1_240829_frozen.csv +mindllm,HFv1 GSM8K,0.83,,hf_open_llm_v1_240829_frozen.csv +mindllm,HFv1 HellaSwag,34.11,,hf_open_llm_v1_240829_frozen.csv +mindllm,HFv1 MMLU,25.5,,hf_open_llm_v1_240829_frozen.csv +mindllm,HFv1 TruthfulQA,43.48,,hf_open_llm_v1_240829_frozen.csv +mindllm,HFv1 Winogrande,49.33,,hf_open_llm_v1_240829_frozen.csv +mini_7b_dare_v1,HF OpenLLM v1,64.4,,hf_open_llm_v1_240829_frozen.csv +mini_7b_dare_v1,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv +mini_7b_dare_v1,HFv1 GSM8K,56.56,,hf_open_llm_v1_240829_frozen.csv +mini_7b_dare_v1,HFv1 HellaSwag,79.91,,hf_open_llm_v1_240829_frozen.csv +mini_7b_dare_v1,HFv1 MMLU,59.55,,hf_open_llm_v1_240829_frozen.csv +mini_7b_dare_v1,HFv1 TruthfulQA,54.64,,hf_open_llm_v1_240829_frozen.csv +mini_7b_dare_v1,HFv1 Winogrande,73.95,,hf_open_llm_v1_240829_frozen.csv +mini_dpo_test02,HF OpenLLM v1,61.23,,hf_open_llm_v1_240829_frozen.csv +mini_dpo_test02,HFv1 ARC,59.73,,hf_open_llm_v1_240829_frozen.csv +mini_dpo_test02,HFv1 GSM8K,35.03,,hf_open_llm_v1_240829_frozen.csv +mini_dpo_test02,HFv1 HellaSwag,83.89,,hf_open_llm_v1_240829_frozen.csv +mini_dpo_test02,HFv1 MMLU,61.9,,hf_open_llm_v1_240829_frozen.csv +mini_dpo_test02,HFv1 TruthfulQA,48.47,,hf_open_llm_v1_240829_frozen.csv +mini_dpo_test02,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv +mini_synatra_sft,HF OpenLLM v1,63.39,,hf_open_llm_v1_240829_frozen.csv +mini_synatra_sft,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv +mini_synatra_sft,HFv1 GSM8K,44.88,,hf_open_llm_v1_240829_frozen.csv +mini_synatra_sft,HFv1 HellaSwag,83.44,,hf_open_llm_v1_240829_frozen.csv +mini_synatra_sft,HFv1 MMLU,61.2,,hf_open_llm_v1_240829_frozen.csv +mini_synatra_sft,HFv1 TruthfulQA,53.67,,hf_open_llm_v1_240829_frozen.csv +mini_synatra_sft,HFv1 Winogrande,74.66,,hf_open_llm_v1_240829_frozen.csv +minillama_1_8b_chat_v0_1,HF OpenLLM v1,37.37,,hf_open_llm_v1_240829_frozen.csv +minillama_1_8b_chat_v0_1,HFv1 ARC,34.73,,hf_open_llm_v1_240829_frozen.csv +minillama_1_8b_chat_v0_1,HFv1 GSM8K,1.9,,hf_open_llm_v1_240829_frozen.csv +minillama_1_8b_chat_v0_1,HFv1 HellaSwag,62.38,,hf_open_llm_v1_240829_frozen.csv +minillama_1_8b_chat_v0_1,HFv1 MMLU,25.69,,hf_open_llm_v1_240829_frozen.csv +minillama_1_8b_chat_v0_1,HFv1 TruthfulQA,38.97,,hf_open_llm_v1_240829_frozen.csv +minillama_1_8b_chat_v0_1,HFv1 Winogrande,60.54,,hf_open_llm_v1_240829_frozen.csv +minima_3b,HF OpenLLM v1,41.44,,hf_open_llm_v1_240829_frozen.csv +minima_3b,HFv1 ARC,43.43,,hf_open_llm_v1_240829_frozen.csv +minima_3b,HFv1 GSM8K,2.73,,hf_open_llm_v1_240829_frozen.csv +minima_3b,HFv1 HellaSwag,68.06,,hf_open_llm_v1_240829_frozen.csv +minima_3b,HFv1 MMLU,28.69,,hf_open_llm_v1_240829_frozen.csv +minima_3b,HFv1 TruthfulQA,39.76,,hf_open_llm_v1_240829_frozen.csv +minima_3b,HFv1 Winogrande,65.98,,hf_open_llm_v1_240829_frozen.csv +minimerlin_3b_v0_1,HF OpenLLM v1,41.6,,hf_open_llm_v1_240829_frozen.csv +minimerlin_3b_v0_1,HFv1 ARC,40.7,,hf_open_llm_v1_240829_frozen.csv +minimerlin_3b_v0_1,HFv1 GSM8K,1.36,,hf_open_llm_v1_240829_frozen.csv +minimerlin_3b_v0_1,HFv1 HellaSwag,54.06,,hf_open_llm_v1_240829_frozen.csv +minimerlin_3b_v0_1,HFv1 MMLU,43.32,,hf_open_llm_v1_240829_frozen.csv +minimerlin_3b_v0_1,HFv1 TruthfulQA,49.65,,hf_open_llm_v1_240829_frozen.csv +minimerlin_3b_v0_1,HFv1 Winogrande,60.54,,hf_open_llm_v1_240829_frozen.csv +minotaur_llama2_13b_qlora,HF OpenLLM v1,55.37,,hf_open_llm_v1_240829_frozen.csv +minotaur_llama2_13b_qlora,HFv1 ARC,60.07,,hf_open_llm_v1_240829_frozen.csv +minotaur_llama2_13b_qlora,HFv1 GSM8K,12.05,,hf_open_llm_v1_240829_frozen.csv +minotaur_llama2_13b_qlora,HFv1 HellaSwag,82.42,,hf_open_llm_v1_240829_frozen.csv +minotaur_llama2_13b_qlora,HFv1 MMLU,55.87,,hf_open_llm_v1_240829_frozen.csv +minotaur_llama2_13b_qlora,HFv1 TruthfulQA,45.57,,hf_open_llm_v1_240829_frozen.csv +minotaur_llama2_13b_qlora,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_base,HF OpenLLM v1,28.92,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_base,HFv1 ARC,21.33,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_base,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_base,HFv1 HellaSwag,26.39,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_base,HFv1 MMLU,24.8,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_base,HFv1 TruthfulQA,47.45,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_base,HFv1 Winogrande,53.2,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_chat,HF OpenLLM v1,28.49,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_chat,HFv1 ARC,20.39,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_chat,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_chat,HFv1 HellaSwag,26.54,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_chat,HFv1 MMLU,25.75,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_chat,HFv1 TruthfulQA,47.27,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_chat,HFv1 Winogrande,50.99,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_deita,HF OpenLLM v1,28.8,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_deita,HFv1 ARC,20.73,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_deita,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_deita,HFv1 HellaSwag,26.72,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_deita,HFv1 MMLU,26.84,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_deita,HFv1 TruthfulQA,47.75,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_deita,HFv1 Winogrande,50.51,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_ultrachat,HF OpenLLM v1,28.97,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_ultrachat,HFv1 ARC,21.08,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_ultrachat,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_ultrachat,HFv1 HellaSwag,26.95,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_ultrachat,HFv1 MMLU,26.08,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_ultrachat,HFv1 TruthfulQA,47.7,,hf_open_llm_v1_240829_frozen.csv +minueza_32m_ultrachat,HFv1 Winogrande,51.78,,hf_open_llm_v1_240829_frozen.csv +minueza_32mx2_chat,HF OpenLLM v1,28.12,,hf_open_llm_v1_240829_frozen.csv +minueza_32mx2_chat,HFv1 ARC,20.14,,hf_open_llm_v1_240829_frozen.csv +minueza_32mx2_chat,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +minueza_32mx2_chat,HFv1 HellaSwag,26.36,,hf_open_llm_v1_240829_frozen.csv +minueza_32mx2_chat,HFv1 MMLU,26.07,,hf_open_llm_v1_240829_frozen.csv +minueza_32mx2_chat,HFv1 TruthfulQA,44.56,,hf_open_llm_v1_240829_frozen.csv +minueza_32mx2_chat,HFv1 Winogrande,51.62,,hf_open_llm_v1_240829_frozen.csv +miqu_1_70b_sf,HF OpenLLM v1,76.59,,hf_open_llm_v1_240829_frozen.csv +miqu_1_70b_sf,HFv1 ARC,73.04,,hf_open_llm_v1_240829_frozen.csv +miqu_1_70b_sf,HFv1 GSM8K,67.7,,hf_open_llm_v1_240829_frozen.csv +miqu_1_70b_sf,HFv1 HellaSwag,88.61,,hf_open_llm_v1_240829_frozen.csv +miqu_1_70b_sf,HFv1 MMLU,75.49,,hf_open_llm_v1_240829_frozen.csv +miqu_1_70b_sf,HFv1 TruthfulQA,69.38,,hf_open_llm_v1_240829_frozen.csv +miqu_1_70b_sf,HFv1 Winogrande,85.32,,hf_open_llm_v1_240829_frozen.csv +miqu_70b_alpaca_dpo,HF OpenLLM v1,76.6,,hf_open_llm_v1_240829_frozen.csv +miqu_70b_alpaca_dpo,HFv1 ARC,73.21,,hf_open_llm_v1_240829_frozen.csv +miqu_70b_alpaca_dpo,HFv1 GSM8K,67.55,,hf_open_llm_v1_240829_frozen.csv +miqu_70b_alpaca_dpo,HFv1 HellaSwag,88.6,,hf_open_llm_v1_240829_frozen.csv +miqu_70b_alpaca_dpo,HFv1 MMLU,75.41,,hf_open_llm_v1_240829_frozen.csv +miqu_70b_alpaca_dpo,HFv1 TruthfulQA,69.44,,hf_open_llm_v1_240829_frozen.csv +miqu_70b_alpaca_dpo,HFv1 Winogrande,85.4,,hf_open_llm_v1_240829_frozen.csv +mistral7b_test001,HF OpenLLM v1,29.49,,hf_open_llm_v1_240829_frozen.csv +mistral7b_test001,HFv1 ARC,24.66,,hf_open_llm_v1_240829_frozen.csv +mistral7b_test001,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +mistral7b_test001,HFv1 HellaSwag,26.78,,hf_open_llm_v1_240829_frozen.csv +mistral7b_test001,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +mistral7b_test001,HFv1 TruthfulQA,50.07,,hf_open_llm_v1_240829_frozen.csv +mistral7b_test001,HFv1 Winogrande,52.33,,hf_open_llm_v1_240829_frozen.csv +mistral_11b_slimorca,HF OpenLLM v1,66.12,,hf_open_llm_v1_240829_frozen.csv +mistral_11b_slimorca,HFv1 ARC,64.25,,hf_open_llm_v1_240829_frozen.csv +mistral_11b_slimorca,HFv1 GSM8K,52.39,,hf_open_llm_v1_240829_frozen.csv +mistral_11b_slimorca,HFv1 HellaSwag,83.81,,hf_open_llm_v1_240829_frozen.csv +mistral_11b_slimorca,HFv1 MMLU,63.66,,hf_open_llm_v1_240829_frozen.csv +mistral_11b_slimorca,HFv1 TruthfulQA,54.66,,hf_open_llm_v1_240829_frozen.csv +mistral_11b_slimorca,HFv1 Winogrande,77.98,,hf_open_llm_v1_240829_frozen.csv +mistral_1_from_mixtral_8x7b_v0_1,HF OpenLLM v1,29.64,,hf_open_llm_v1_240829_frozen.csv +mistral_1_from_mixtral_8x7b_v0_1,HFv1 ARC,28.67,,hf_open_llm_v1_240829_frozen.csv +mistral_1_from_mixtral_8x7b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +mistral_1_from_mixtral_8x7b_v0_1,HFv1 HellaSwag,26.35,,hf_open_llm_v1_240829_frozen.csv +mistral_1_from_mixtral_8x7b_v0_1,HFv1 MMLU,24.66,,hf_open_llm_v1_240829_frozen.csv +mistral_1_from_mixtral_8x7b_v0_1,HFv1 TruthfulQA,48.1,,hf_open_llm_v1_240829_frozen.csv +mistral_1_from_mixtral_8x7b_v0_1,HFv1 Winogrande,50.04,,hf_open_llm_v1_240829_frozen.csv +mistral_22b_v0_1,HF OpenLLM v1,49.94,,hf_open_llm_v1_240829_frozen.csv +mistral_22b_v0_1,HFv1 ARC,49.4,,hf_open_llm_v1_240829_frozen.csv +mistral_22b_v0_1,HFv1 GSM8K,6.37,,hf_open_llm_v1_240829_frozen.csv +mistral_22b_v0_1,HFv1 HellaSwag,72.92,,hf_open_llm_v1_240829_frozen.csv +mistral_22b_v0_1,HFv1 MMLU,48.75,,hf_open_llm_v1_240829_frozen.csv +mistral_22b_v0_1,HFv1 TruthfulQA,47.35,,hf_open_llm_v1_240829_frozen.csv +mistral_22b_v0_1,HFv1 Winogrande,74.82,,hf_open_llm_v1_240829_frozen.csv +mistral_2_from_mixtral_8x7b_v0_1,HF OpenLLM v1,29.83,,hf_open_llm_v1_240829_frozen.csv +mistral_2_from_mixtral_8x7b_v0_1,HFv1 ARC,28.41,,hf_open_llm_v1_240829_frozen.csv +mistral_2_from_mixtral_8x7b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +mistral_2_from_mixtral_8x7b_v0_1,HFv1 HellaSwag,26.49,,hf_open_llm_v1_240829_frozen.csv +mistral_2_from_mixtral_8x7b_v0_1,HFv1 MMLU,24.17,,hf_open_llm_v1_240829_frozen.csv +mistral_2_from_mixtral_8x7b_v0_1,HFv1 TruthfulQA,48.37,,hf_open_llm_v1_240829_frozen.csv +mistral_2_from_mixtral_8x7b_v0_1,HFv1 Winogrande,51.54,,hf_open_llm_v1_240829_frozen.csv +mistral_3_from_mixtral_8x7b_v0_1,HF OpenLLM v1,29.93,,hf_open_llm_v1_240829_frozen.csv +mistral_3_from_mixtral_8x7b_v0_1,HFv1 ARC,29.35,,hf_open_llm_v1_240829_frozen.csv +mistral_3_from_mixtral_8x7b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +mistral_3_from_mixtral_8x7b_v0_1,HFv1 HellaSwag,26.59,,hf_open_llm_v1_240829_frozen.csv +mistral_3_from_mixtral_8x7b_v0_1,HFv1 MMLU,25.73,,hf_open_llm_v1_240829_frozen.csv +mistral_3_from_mixtral_8x7b_v0_1,HFv1 TruthfulQA,48.19,,hf_open_llm_v1_240829_frozen.csv +mistral_3_from_mixtral_8x7b_v0_1,HFv1 Winogrande,49.72,,hf_open_llm_v1_240829_frozen.csv +mistral_4_from_mixtral_8x7b_v0_1,HF OpenLLM v1,29.53,,hf_open_llm_v1_240829_frozen.csv +mistral_4_from_mixtral_8x7b_v0_1,HFv1 ARC,28.24,,hf_open_llm_v1_240829_frozen.csv +mistral_4_from_mixtral_8x7b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +mistral_4_from_mixtral_8x7b_v0_1,HFv1 HellaSwag,27.53,,hf_open_llm_v1_240829_frozen.csv +mistral_4_from_mixtral_8x7b_v0_1,HFv1 MMLU,24.83,,hf_open_llm_v1_240829_frozen.csv +mistral_4_from_mixtral_8x7b_v0_1,HFv1 TruthfulQA,48.51,,hf_open_llm_v1_240829_frozen.csv +mistral_4_from_mixtral_8x7b_v0_1,HFv1 Winogrande,48.07,,hf_open_llm_v1_240829_frozen.csv +mistral_5_from_mixtral_8x7b_v0_1,HF OpenLLM v1,29.83,,hf_open_llm_v1_240829_frozen.csv +mistral_5_from_mixtral_8x7b_v0_1,HFv1 ARC,29.35,,hf_open_llm_v1_240829_frozen.csv +mistral_5_from_mixtral_8x7b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +mistral_5_from_mixtral_8x7b_v0_1,HFv1 HellaSwag,26.44,,hf_open_llm_v1_240829_frozen.csv +mistral_5_from_mixtral_8x7b_v0_1,HFv1 MMLU,25.1,,hf_open_llm_v1_240829_frozen.csv +mistral_5_from_mixtral_8x7b_v0_1,HFv1 TruthfulQA,48.27,,hf_open_llm_v1_240829_frozen.csv +mistral_5_from_mixtral_8x7b_v0_1,HFv1 Winogrande,49.8,,hf_open_llm_v1_240829_frozen.csv +mistral_6_from_mixtral_8x7b_v0_1,HF OpenLLM v1,29.61,,hf_open_llm_v1_240829_frozen.csv +mistral_6_from_mixtral_8x7b_v0_1,HFv1 ARC,28.33,,hf_open_llm_v1_240829_frozen.csv +mistral_6_from_mixtral_8x7b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +mistral_6_from_mixtral_8x7b_v0_1,HFv1 HellaSwag,26.82,,hf_open_llm_v1_240829_frozen.csv +mistral_6_from_mixtral_8x7b_v0_1,HFv1 MMLU,25.45,,hf_open_llm_v1_240829_frozen.csv +mistral_6_from_mixtral_8x7b_v0_1,HFv1 TruthfulQA,48.39,,hf_open_llm_v1_240829_frozen.csv +mistral_6_from_mixtral_8x7b_v0_1,HFv1 Winogrande,48.7,,hf_open_llm_v1_240829_frozen.csv +mistral_7_from_mixtral_8x7b_v0_1,HF OpenLLM v1,29.64,,hf_open_llm_v1_240829_frozen.csv +mistral_7_from_mixtral_8x7b_v0_1,HFv1 ARC,29.1,,hf_open_llm_v1_240829_frozen.csv +mistral_7_from_mixtral_8x7b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +mistral_7_from_mixtral_8x7b_v0_1,HFv1 HellaSwag,26.57,,hf_open_llm_v1_240829_frozen.csv +mistral_7_from_mixtral_8x7b_v0_1,HFv1 MMLU,25.12,,hf_open_llm_v1_240829_frozen.csv +mistral_7_from_mixtral_8x7b_v0_1,HFv1 TruthfulQA,48.54,,hf_open_llm_v1_240829_frozen.csv +mistral_7_from_mixtral_8x7b_v0_1,HFv1 Winogrande,48.54,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_aezakmi_v1,HF OpenLLM v1,54.92,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_aezakmi_v1,HFv1 ARC,58.87,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_aezakmi_v1,HFv1 GSM8K,0.68,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_aezakmi_v1,HFv1 HellaSwag,82.01,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_aezakmi_v1,HFv1 MMLU,58.72,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_aezakmi_v1,HFv1 TruthfulQA,53.54,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_aezakmi_v1,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_claude_instruct,HF OpenLLM v1,59.27,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_claude_instruct,HFv1 ARC,63.23,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_claude_instruct,HFv1 GSM8K,17.97,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_claude_instruct,HFv1 HellaSwag,84.99,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_claude_instruct,HFv1 MMLU,63.84,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_claude_instruct,HFv1 TruthfulQA,47.47,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_claude_instruct,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_merge_v1_1,HF OpenLLM v1,74.53,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_merge_v1_1,HFv1 ARC,72.53,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_merge_v1_1,HFv1 GSM8K,70.89,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_merge_v1_1,HFv1 HellaSwag,88.15,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_merge_v1_1,HFv1 MMLU,64.83,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_merge_v1_1,HFv1 TruthfulQA,68.48,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_merge_v1_1,HFv1 Winogrande,82.32,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_open_orca_flan_50k_synthetic_5_models,HF OpenLLM v1,29.48,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_open_orca_flan_50k_synthetic_5_models,HFv1 ARC,25.51,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_open_orca_flan_50k_synthetic_5_models,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_open_orca_flan_50k_synthetic_5_models,HFv1 HellaSwag,25.52,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_open_orca_flan_50k_synthetic_5_models,HFv1 MMLU,26.82,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_open_orca_flan_50k_synthetic_5_models,HFv1 TruthfulQA,48.81,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_open_orca_flan_50k_synthetic_5_models,HFv1 Winogrande,50.2,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_v5,HF OpenLLM v1,73.87,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_v5,HFv1 ARC,72.01,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_v5,HFv1 GSM8K,70.66,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_v5,HFv1 HellaSwag,87.57,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_v5,HFv1 MMLU,63.85,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_v5,HFv1 TruthfulQA,66.86,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_v5,HFv1 Winogrande,82.24,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_v6,HF OpenLLM v1,74.5,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_v6,HFv1 ARC,72.53,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_v6,HFv1 GSM8K,70.89,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_v6,HFv1 HellaSwag,88.1,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_v6,HFv1 MMLU,64.68,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_v6,HFv1 TruthfulQA,68.24,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_dpo_v6,HFv1 Winogrande,82.56,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_erebus_v3,HF OpenLLM v1,56.82,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_erebus_v3,HFv1 ARC,59.9,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_erebus_v3,HFv1 GSM8K,25.55,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_erebus_v3,HFv1 HellaSwag,80.3,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_erebus_v3,HFv1 MMLU,59.42,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_erebus_v3,HFv1 TruthfulQA,40.93,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_erebus_v3,HFv1 Winogrande,74.82,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_ft_h4_no_robots_instructions,HF OpenLLM v1,61.16,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_ft_h4_no_robots_instructions,HFv1 ARC,60.92,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_ft_h4_no_robots_instructions,HFv1 GSM8K,37.0,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_ft_h4_no_robots_instructions,HFv1 HellaSwag,83.24,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_ft_h4_no_robots_instructions,HFv1 MMLU,63.74,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_ft_h4_no_robots_instructions,HFv1 TruthfulQA,43.64,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_ft_h4_no_robots_instructions,HFv1 Winogrande,78.85,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_golden,HF OpenLLM v1,52.49,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_golden,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_golden,HFv1 GSM8K,20.32,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_golden,HFv1 HellaSwag,44.42,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_golden,HFv1 MMLU,59.29,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_golden,HFv1 TruthfulQA,53.51,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_golden,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_sft_tuned_v0_2,HF OpenLLM v1,62.29,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_sft_tuned_v0_2,HFv1 ARC,58.02,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_sft_tuned_v0_2,HFv1 GSM8K,50.34,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_sft_tuned_v0_2,HFv1 HellaSwag,79.26,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_sft_tuned_v0_2,HFv1 MMLU,58.78,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_sft_tuned_v0_2,HFv1 TruthfulQA,50.66,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_sft_tuned_v0_2,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_1,HF OpenLLM v1,54.96,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_1,HFv1 ARC,54.52,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_1,HFv1 GSM8K,14.25,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_1,HFv1 HellaSwag,75.63,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_1,HFv1 MMLU,55.38,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_1,HFv1 TruthfulQA,56.28,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_1,HFv1 Winogrande,73.72,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2,HF OpenLLM v1,65.71,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2,HFv1 ARC,63.14,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2,HFv1 GSM8K,40.03,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2,HFv1 HellaSwag,84.88,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2,HFv1 MMLU,60.78,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2,HFv1 TruthfulQA,68.26,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_10_v0_1,HF OpenLLM v1,65.48,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_10_v0_1,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_10_v0_1,HFv1 GSM8K,38.89,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_10_v0_1,HFv1 HellaSwag,84.88,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_10_v0_1,HFv1 MMLU,60.84,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_10_v0_1,HFv1 TruthfulQA,68.11,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_10_v0_1,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_20,HF OpenLLM v1,65.74,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_20,HFv1 ARC,62.88,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_20,HFv1 GSM8K,39.73,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_20,HFv1 HellaSwag,84.84,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_20,HFv1 MMLU,60.81,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_20,HFv1 TruthfulQA,68.26,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_20,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_30,HF OpenLLM v1,65.51,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_30,HFv1 ARC,62.97,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_30,HFv1 GSM8K,39.42,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_30,HFv1 HellaSwag,84.71,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_30,HFv1 MMLU,60.49,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_30,HFv1 TruthfulQA,67.49,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_attention_sparsity_30,HFv1 Winogrande,77.98,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_selfplay_v0,HF OpenLLM v1,65.56,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_selfplay_v0,HFv1 ARC,62.8,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_selfplay_v0,HFv1 GSM8K,40.26,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_selfplay_v0,HFv1 HellaSwag,84.74,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_selfplay_v0,HFv1 MMLU,60.6,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_selfplay_v0,HFv1 TruthfulQA,67.35,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_selfplay_v0,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sp_v0,HF OpenLLM v1,65.68,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sp_v0,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sp_v0,HFv1 GSM8K,40.11,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sp_v0,HFv1 HellaSwag,84.84,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sp_v0,HFv1 MMLU,60.75,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sp_v0,HFv1 TruthfulQA,68.22,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sp_v0,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_10,HF OpenLLM v1,65.48,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_10,HFv1 ARC,62.88,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_10,HFv1 GSM8K,38.82,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_10,HFv1 HellaSwag,84.85,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_10,HFv1 MMLU,60.87,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_10,HFv1 TruthfulQA,67.93,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_10,HFv1 Winogrande,77.51,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_20,HF OpenLLM v1,50.7,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_20,HFv1 ARC,52.65,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_20,HFv1 GSM8K,11.3,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_20,HFv1 HellaSwag,76.71,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_20,HFv1 MMLU,47.27,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_20,HFv1 TruthfulQA,47.22,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_20,HFv1 Winogrande,69.06,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_30,HF OpenLLM v1,49.74,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_30,HFv1 ARC,51.11,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_30,HFv1 GSM8K,10.54,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_30,HFv1 HellaSwag,75.72,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_30,HFv1 MMLU,46.54,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_30,HFv1 TruthfulQA,45.53,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v0_2_sparsity_30,HFv1 Winogrande,68.98,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v2_sp_v0_1,HF OpenLLM v1,65.68,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v2_sp_v0_1,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v2_sp_v0_1,HFv1 GSM8K,40.11,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v2_sp_v0_1,HFv1 HellaSwag,84.84,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v2_sp_v0_1,HFv1 MMLU,60.75,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v2_sp_v0_1,HFv1 TruthfulQA,68.22,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_instruct_v2_sp_v0_1,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_med_merge,HF OpenLLM v1,63.75,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_med_merge,HFv1 ARC,64.51,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_med_merge,HFv1 GSM8K,44.96,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_med_merge,HFv1 HellaSwag,82.96,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_med_merge,HFv1 MMLU,57.84,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_med_merge,HFv1 TruthfulQA,53.65,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_med_merge,HFv1 Winogrande,78.61,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_norobots,HF OpenLLM v1,58.85,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_norobots,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_norobots,HFv1 GSM8K,38.36,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_norobots,HFv1 HellaSwag,80.57,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_norobots,HFv1 MMLU,57.66,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_norobots,HFv1 TruthfulQA,41.91,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_norobots,HFv1 Winogrande,75.61,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_open_platypus,HF OpenLLM v1,56.29,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_open_platypus,HFv1 ARC,55.8,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_open_platypus,HFv1 GSM8K,12.59,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_open_platypus,HFv1 HellaSwag,82.13,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_open_platypus,HFv1 MMLU,59.76,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_open_platypus,HFv1 TruthfulQA,48.87,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_open_platypus,HFv1 Winogrande,78.61,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_openorca_1k,HF OpenLLM v1,58.9,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_openorca_1k,HFv1 ARC,62.97,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_openorca_1k,HFv1 GSM8K,11.98,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_openorca_1k,HFv1 HellaSwag,84.66,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_openorca_1k,HFv1 MMLU,62.2,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_openorca_1k,HFv1 TruthfulQA,52.96,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_openorca_1k,HFv1 Winogrande,78.61,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_openplatypus_1k,HF OpenLLM v1,58.07,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_openplatypus_1k,HFv1 ARC,60.15,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_openplatypus_1k,HFv1 GSM8K,17.44,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_openplatypus_1k,HFv1 HellaSwag,84.25,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_openplatypus_1k,HFv1 MMLU,59.84,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_openplatypus_1k,HFv1 TruthfulQA,49.86,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_openplatypus_1k,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_airoboros_pref_10k,HF OpenLLM v1,58.82,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_airoboros_pref_10k,HFv1 ARC,60.07,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_airoboros_pref_10k,HFv1 GSM8K,20.09,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_airoboros_pref_10k,HFv1 HellaSwag,82.44,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_airoboros_pref_10k,HFv1 MMLU,60.75,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_airoboros_pref_10k,HFv1 TruthfulQA,54.59,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_airoboros_pref_10k,HFv1 Winogrande,74.98,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_alignment_handbook,HF OpenLLM v1,63.82,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_alignment_handbook,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_alignment_handbook,HFv1 GSM8K,41.32,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_alignment_handbook,HFv1 HellaSwag,83.96,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_alignment_handbook,HFv1 MMLU,63.49,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_alignment_handbook,HFv1 TruthfulQA,53.87,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_alignment_handbook,HFv1 Winogrande,77.82,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_capybara_reproduction,HF OpenLLM v1,60.36,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_capybara_reproduction,HFv1 ARC,58.79,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_capybara_reproduction,HFv1 GSM8K,28.43,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_capybara_reproduction,HFv1 HellaSwag,83.57,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_capybara_reproduction,HFv1 MMLU,61.15,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_capybara_reproduction,HFv1 TruthfulQA,54.13,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_orpo_capybara_reproduction,HFv1 Winogrande,76.09,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_platypus1k,HF OpenLLM v1,58.19,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_platypus1k,HFv1 ARC,61.6,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_platypus1k,HFv1 GSM8K,16.38,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_platypus1k,HFv1 HellaSwag,82.93,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_platypus1k,HFv1 MMLU,63.16,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_platypus1k,HFv1 TruthfulQA,46.96,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_platypus1k,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_platypus_fp16,HF OpenLLM v1,58.71,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_platypus_fp16,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_platypus_fp16,HFv1 GSM8K,17.36,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_platypus_fp16,HFv1 HellaSwag,84.15,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_platypus_fp16,HFv1 MMLU,64.11,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_platypus_fp16,HFv1 TruthfulQA,45.07,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_platypus_fp16,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_selfplay_v0,HF OpenLLM v1,57.82,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_selfplay_v0,HFv1 ARC,54.69,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_selfplay_v0,HFv1 GSM8K,31.24,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_selfplay_v0,HFv1 HellaSwag,75.69,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_selfplay_v0,HFv1 MMLU,55.4,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_selfplay_v0,HFv1 TruthfulQA,56.28,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_selfplay_v0,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_sft_dpo_v0,HF OpenLLM v1,72.17,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_sft_dpo_v0,HFv1 ARC,66.3,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_sft_dpo_v0,HFv1 GSM8K,65.81,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_sft_dpo_v0,HFv1 HellaSwag,84.9,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_sft_dpo_v0,HFv1 MMLU,64.53,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_sft_dpo_v0,HFv1 TruthfulQA,69.72,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_sft_dpo_v0,HFv1 Winogrande,81.77,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_sft_open_orca_flan_50k,HF OpenLLM v1,53.7,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_sft_open_orca_flan_50k,HFv1 ARC,58.79,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_sft_open_orca_flan_50k,HFv1 GSM8K,10.31,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_sft_open_orca_flan_50k,HFv1 HellaSwag,81.92,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_sft_open_orca_flan_50k,HFv1 MMLU,55.72,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_sft_open_orca_flan_50k,HFv1 TruthfulQA,37.49,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_sft_open_orca_flan_50k,HFv1 Winogrande,77.98,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1,HF OpenLLM v1,60.97,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1,HFv1 ARC,59.98,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1,HFv1 GSM8K,37.83,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1,HFv1 HellaSwag,83.31,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1,HFv1 MMLU,64.16,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1,HFv1 TruthfulQA,42.15,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_dpo,HF OpenLLM v1,61.3,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_dpo,HFv1 ARC,60.32,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_dpo,HFv1 GSM8K,37.23,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_dpo,HFv1 HellaSwag,83.69,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_dpo,HFv1 MMLU,64.01,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_dpo,HFv1 TruthfulQA,43.53,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_dpo,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_half_naive_a,HF OpenLLM v1,60.79,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_half_naive_a,HFv1 ARC,60.32,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_half_naive_a,HFv1 GSM8K,36.85,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_half_naive_a,HFv1 HellaSwag,83.22,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_half_naive_a,HFv1 MMLU,64.16,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_half_naive_a,HFv1 TruthfulQA,42.28,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_half_naive_a,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_layla_v4,HF OpenLLM v1,64.69,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_layla_v4,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_layla_v4,HFv1 GSM8K,55.5,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_layla_v4,HFv1 HellaSwag,83.36,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_layla_v4,HFv1 MMLU,64.32,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_layla_v4,HFv1 TruthfulQA,43.14,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_layla_v4,HFv1 Winogrande,79.56,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_open_platypus,HF OpenLLM v1,58.92,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_open_platypus,HFv1 ARC,62.37,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_open_platypus,HFv1 GSM8K,17.29,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_open_platypus,HFv1 HellaSwag,85.08,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_open_platypus,HFv1 MMLU,63.79,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_open_platypus,HFv1 TruthfulQA,47.33,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_open_platypus,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_orpo,HF OpenLLM v1,64.16,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_orpo,HFv1 ARC,62.54,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_orpo,HFv1 GSM8K,43.29,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_orpo,HFv1 HellaSwag,83.78,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_orpo,HFv1 MMLU,63.57,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_orpo,HFv1 TruthfulQA,52.6,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_orpo,HFv1 Winogrande,79.16,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_signtensors_1_over_2,HF OpenLLM v1,60.48,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_signtensors_1_over_2,HFv1 ARC,60.32,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_signtensors_1_over_2,HFv1 GSM8K,35.94,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_signtensors_1_over_2,HFv1 HellaSwag,83.12,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_signtensors_1_over_2,HFv1 MMLU,64.11,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_signtensors_1_over_2,HFv1 TruthfulQA,41.94,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_signtensors_1_over_2,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_signtensors_1_over_4,HF OpenLLM v1,29.16,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_signtensors_1_over_4,HFv1 ARC,28.67,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_signtensors_1_over_4,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_signtensors_1_over_4,HFv1 HellaSwag,25.64,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_signtensors_1_over_4,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_signtensors_1_over_4,HFv1 TruthfulQA,47.95,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_1_signtensors_1_over_4,HFv1 Winogrande,49.57,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_2,HF OpenLLM v1,60.41,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_2,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_2,HFv1 GSM8K,34.95,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_2,HFv1 HellaSwag,83.08,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_2,HFv1 MMLU,63.69,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_2,HFv1 TruthfulQA,41.8,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_2,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_3,HF OpenLLM v1,60.28,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_3,HFv1 ARC,60.49,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_3,HFv1 GSM8K,34.5,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_3,HFv1 HellaSwag,82.99,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_3,HFv1 MMLU,63.46,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_3,HFv1 TruthfulQA,41.79,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v0_3,HFv1 Winogrande,78.45,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v2_selfplay,HF OpenLLM v1,65.67,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v2_selfplay,HFv1 ARC,62.97,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v2_selfplay,HFv1 GSM8K,39.8,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v2_selfplay,HFv1 HellaSwag,84.97,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v2_selfplay,HFv1 MMLU,60.71,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v2_selfplay,HFv1 TruthfulQA,68.24,,hf_open_llm_v1_240829_frozen.csv +mistral_7b_v2_selfplay,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv +mistral_8_from_mixtral_8x7b_v0_1,HF OpenLLM v1,29.91,,hf_open_llm_v1_240829_frozen.csv +mistral_8_from_mixtral_8x7b_v0_1,HFv1 ARC,29.01,,hf_open_llm_v1_240829_frozen.csv +mistral_8_from_mixtral_8x7b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +mistral_8_from_mixtral_8x7b_v0_1,HFv1 HellaSwag,26.23,,hf_open_llm_v1_240829_frozen.csv +mistral_8_from_mixtral_8x7b_v0_1,HFv1 MMLU,25.29,,hf_open_llm_v1_240829_frozen.csv +mistral_8_from_mixtral_8x7b_v0_1,HFv1 TruthfulQA,48.12,,hf_open_llm_v1_240829_frozen.csv +mistral_8_from_mixtral_8x7b_v0_1,HFv1 Winogrande,50.83,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr03_32_sig,HF OpenLLM v1,60.95,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr03_32_sig,HFv1 ARC,59.98,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr03_32_sig,HFv1 GSM8K,35.33,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr03_32_sig,HFv1 HellaSwag,83.22,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr03_32_sig,HFv1 MMLU,61.22,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr03_32_sig,HFv1 TruthfulQA,47.9,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr03_32_sig,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr05_32_sig,HF OpenLLM v1,60.85,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr05_32_sig,HFv1 ARC,59.9,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr05_32_sig,HFv1 GSM8K,34.19,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr05_32_sig,HFv1 HellaSwag,83.28,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr05_32_sig,HFv1 MMLU,60.86,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr05_32_sig,HFv1 TruthfulQA,49.69,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr05_32_sig,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr10_32_sig,HF OpenLLM v1,60.43,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr10_32_sig,HFv1 ARC,58.62,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr10_32_sig,HFv1 GSM8K,37.83,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr10_32_sig,HFv1 HellaSwag,82.57,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr10_32_sig,HFv1 MMLU,61.35,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr10_32_sig,HFv1 TruthfulQA,44.34,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr10_32_sig,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr20_32_sig,HF OpenLLM v1,60.43,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr20_32_sig,HFv1 ARC,58.7,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr20_32_sig,HFv1 GSM8K,37.6,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr20_32_sig,HFv1 HellaSwag,82.54,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr20_32_sig,HFv1 MMLU,61.41,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr20_32_sig,HFv1 TruthfulQA,44.75,,hf_open_llm_v1_240829_frozen.csv +mistral_dmbr20_32_sig,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv +mistral_dolphin_2_8_grok_instract_2_7b_slerp,HF OpenLLM v1,64.99,,hf_open_llm_v1_240829_frozen.csv +mistral_dolphin_2_8_grok_instract_2_7b_slerp,HFv1 ARC,63.91,,hf_open_llm_v1_240829_frozen.csv +mistral_dolphin_2_8_grok_instract_2_7b_slerp,HFv1 GSM8K,48.67,,hf_open_llm_v1_240829_frozen.csv +mistral_dolphin_2_8_grok_instract_2_7b_slerp,HFv1 HellaSwag,84.42,,hf_open_llm_v1_240829_frozen.csv +mistral_dolphin_2_8_grok_instract_2_7b_slerp,HFv1 MMLU,63.01,,hf_open_llm_v1_240829_frozen.csv +mistral_dolphin_2_8_grok_instract_2_7b_slerp,HFv1 TruthfulQA,51.74,,hf_open_llm_v1_240829_frozen.csv +mistral_dolphin_2_8_grok_instract_2_7b_slerp,HFv1 Winogrande,78.22,,hf_open_llm_v1_240829_frozen.csv +mistral_evolved_11b_v0_1,HF OpenLLM v1,65.8,,hf_open_llm_v1_240829_frozen.csv +mistral_evolved_11b_v0_1,HFv1 ARC,62.2,,hf_open_llm_v1_240829_frozen.csv +mistral_evolved_11b_v0_1,HFv1 GSM8K,49.81,,hf_open_llm_v1_240829_frozen.csv +mistral_evolved_11b_v0_1,HFv1 HellaSwag,84.65,,hf_open_llm_v1_240829_frozen.csv +mistral_evolved_11b_v0_1,HFv1 MMLU,63.11,,hf_open_llm_v1_240829_frozen.csv +mistral_evolved_11b_v0_1,HFv1 TruthfulQA,59.23,,hf_open_llm_v1_240829_frozen.csv +mistral_evolved_11b_v0_1,HFv1 Winogrande,75.77,,hf_open_llm_v1_240829_frozen.csv +mistral_grok_instract_2_7b_slerp,HF OpenLLM v1,62.87,,hf_open_llm_v1_240829_frozen.csv +mistral_grok_instract_2_7b_slerp,HFv1 ARC,62.8,,hf_open_llm_v1_240829_frozen.csv +mistral_grok_instract_2_7b_slerp,HFv1 GSM8K,39.88,,hf_open_llm_v1_240829_frozen.csv +mistral_grok_instract_2_7b_slerp,HFv1 HellaSwag,83.03,,hf_open_llm_v1_240829_frozen.csv +mistral_grok_instract_2_7b_slerp,HFv1 MMLU,61.04,,hf_open_llm_v1_240829_frozen.csv +mistral_grok_instract_2_7b_slerp,HFv1 TruthfulQA,53.51,,hf_open_llm_v1_240829_frozen.csv +mistral_grok_instract_2_7b_slerp,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpaca,HF OpenLLM v1,61.21,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpaca,HFv1 ARC,56.74,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpaca,HFv1 GSM8K,37.6,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpaca,HFv1 HellaSwag,80.82,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpaca,HFv1 MMLU,59.1,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpaca,HFv1 TruthfulQA,55.86,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpaca,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpaca_dpo2,HF OpenLLM v1,64.05,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpaca_dpo2,HFv1 ARC,61.86,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpaca_dpo2,HFv1 GSM8K,37.0,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpaca_dpo2,HFv1 HellaSwag,83.71,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpaca_dpo2,HFv1 MMLU,59.19,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpaca_dpo2,HFv1 TruthfulQA,64.08,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpaca_dpo2,HFv1 Winogrande,78.45,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpacav2_4bit,HF OpenLLM v1,65.34,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpacav2_4bit,HFv1 ARC,62.12,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpacav2_4bit,HFv1 GSM8K,40.33,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpacav2_4bit,HFv1 HellaSwag,84.55,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpacav2_4bit,HFv1 MMLU,60.66,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpacav2_4bit,HFv1 TruthfulQA,67.29,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_7b_v0_2_chatalpacav2_4bit,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_frankenmerge,HF OpenLLM v1,58.96,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_frankenmerge,HFv1 ARC,58.19,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_frankenmerge,HFv1 GSM8K,11.22,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_frankenmerge,HFv1 HellaSwag,83.26,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_frankenmerge,HFv1 MMLU,59.53,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_frankenmerge,HFv1 TruthfulQA,66.48,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_frankenmerge,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_moe_experimental,HF OpenLLM v1,61.39,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_moe_experimental,HFv1 ARC,61.01,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_moe_experimental,HFv1 GSM8K,31.08,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_moe_experimental,HFv1 HellaSwag,81.55,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_moe_experimental,HFv1 MMLU,58.22,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_moe_experimental,HFv1 TruthfulQA,60.4,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_moe_experimental,HFv1 Winogrande,76.09,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_slerp,HF OpenLLM v1,59.08,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_slerp,HFv1 ARC,57.42,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_slerp,HFv1 GSM8K,30.78,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_slerp,HFv1 HellaSwag,78.34,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_slerp,HFv1 MMLU,55.19,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_slerp,HFv1 TruthfulQA,57.61,,hf_open_llm_v1_240829_frozen.csv +mistral_instruct_slerp,HFv1 Winogrande,75.14,,hf_open_llm_v1_240829_frozen.csv +mistral_kmmbr_32_sig,HF OpenLLM v1,60.78,,hf_open_llm_v1_240829_frozen.csv +mistral_kmmbr_32_sig,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv +mistral_kmmbr_32_sig,HFv1 GSM8K,37.53,,hf_open_llm_v1_240829_frozen.csv +mistral_kmmbr_32_sig,HFv1 HellaSwag,82.84,,hf_open_llm_v1_240829_frozen.csv +mistral_kmmbr_32_sig,HFv1 MMLU,61.39,,hf_open_llm_v1_240829_frozen.csv +mistral_kmmbr_32_sig,HFv1 TruthfulQA,46.2,,hf_open_llm_v1_240829_frozen.csv +mistral_kmmbr_32_sig,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv +mistral_mbr_32_sig,HF OpenLLM v1,60.79,,hf_open_llm_v1_240829_frozen.csv +mistral_mbr_32_sig,HFv1 ARC,59.64,,hf_open_llm_v1_240829_frozen.csv +mistral_mbr_32_sig,HFv1 GSM8K,36.09,,hf_open_llm_v1_240829_frozen.csv +mistral_mbr_32_sig,HFv1 HellaSwag,83.1,,hf_open_llm_v1_240829_frozen.csv +mistral_mbr_32_sig,HFv1 MMLU,61.43,,hf_open_llm_v1_240829_frozen.csv +mistral_mbr_32_sig,HFv1 TruthfulQA,46.31,,hf_open_llm_v1_240829_frozen.csv +mistral_mbr_32_sig,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv +mistral_megamerge_dare_7b,HF OpenLLM v1,48.93,,hf_open_llm_v1_240829_frozen.csv +mistral_megamerge_dare_7b,HFv1 ARC,55.29,,hf_open_llm_v1_240829_frozen.csv +mistral_megamerge_dare_7b,HFv1 GSM8K,6.6,,hf_open_llm_v1_240829_frozen.csv +mistral_megamerge_dare_7b,HFv1 HellaSwag,70.48,,hf_open_llm_v1_240829_frozen.csv +mistral_megamerge_dare_7b,HFv1 MMLU,43.05,,hf_open_llm_v1_240829_frozen.csv +mistral_megamerge_dare_7b,HFv1 TruthfulQA,51.08,,hf_open_llm_v1_240829_frozen.csv +mistral_megamerge_dare_7b,HFv1 Winogrande,67.09,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_4,HF OpenLLM v1,63.85,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_4,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_4,HFv1 GSM8K,36.85,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_4,HFv1 HellaSwag,85.18,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_4,HFv1 MMLU,63.57,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_4,HFv1 TruthfulQA,51.32,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_4,HFv1 Winogrande,80.11,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_4_laser,HF OpenLLM v1,63.89,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_4_laser,HFv1 ARC,66.89,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_4_laser,HFv1 GSM8K,36.54,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_4_laser,HFv1 HellaSwag,85.23,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_4_laser,HFv1 MMLU,63.47,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_4_laser,HFv1 TruthfulQA,50.91,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_4_laser,HFv1 Winogrande,80.27,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_5,HF OpenLLM v1,61.98,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_5,HFv1 ARC,65.44,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_5,HFv1 GSM8K,36.54,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_5,HFv1 HellaSwag,84.66,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_5,HFv1 MMLU,62.56,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_5,HFv1 TruthfulQA,42.43,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_5,HFv1 Winogrande,80.27,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_7,HF OpenLLM v1,58.74,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_7,HFv1 ARC,65.87,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_7,HFv1 GSM8K,25.09,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_7,HFv1 HellaSwag,84.4,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_7,HFv1 MMLU,57.6,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_7,HFv1 TruthfulQA,39.91,,hf_open_llm_v1_240829_frozen.csv +mistral_neuraldpo_v0_7,HFv1 Winogrande,79.56,,hf_open_llm_v1_240829_frozen.csv +mistral_nucleus09_32_sig,HF OpenLLM v1,60.93,,hf_open_llm_v1_240829_frozen.csv +mistral_nucleus09_32_sig,HFv1 ARC,59.73,,hf_open_llm_v1_240829_frozen.csv +mistral_nucleus09_32_sig,HFv1 GSM8K,36.85,,hf_open_llm_v1_240829_frozen.csv +mistral_nucleus09_32_sig,HFv1 HellaSwag,83.14,,hf_open_llm_v1_240829_frozen.csv +mistral_nucleus09_32_sig,HFv1 MMLU,61.42,,hf_open_llm_v1_240829_frozen.csv +mistral_nucleus09_32_sig,HFv1 TruthfulQA,46.37,,hf_open_llm_v1_240829_frozen.csv +mistral_nucleus09_32_sig,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_alpha,HF OpenLLM v1,55.41,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_alpha,HFv1 ARC,60.41,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_alpha,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_alpha,HFv1 HellaSwag,85.1,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_alpha,HFv1 MMLU,61.11,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_alpha,HFv1 TruthfulQA,48.33,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_alpha,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_beta,HF OpenLLM v1,62.53,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_beta,HFv1 ARC,61.18,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_beta,HFv1 GSM8K,39.8,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_beta,HFv1 HellaSwag,84.03,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_beta,HFv1 MMLU,63.26,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_beta,HFv1 TruthfulQA,47.69,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_beta,HFv1 Winogrande,79.24,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_capybara_3k,HF OpenLLM v1,61.77,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_capybara_3k,HFv1 ARC,63.57,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_capybara_3k,HFv1 GSM8K,36.85,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_capybara_3k,HFv1 HellaSwag,85.98,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_capybara_3k,HFv1 MMLU,62.91,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_capybara_3k,HFv1 TruthfulQA,43.83,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_capybara_3k,HFv1 Winogrande,77.51,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_capybara_7k,HF OpenLLM v1,63.36,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_capybara_7k,HFv1 ARC,63.48,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_capybara_7k,HFv1 GSM8K,42.38,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_capybara_7k,HFv1 HellaSwag,85.34,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_capybara_7k,HFv1 MMLU,63.41,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_capybara_7k,HFv1 TruthfulQA,45.98,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_capybara_7k,HFv1 Winogrande,79.56,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_mix_21k,HF OpenLLM v1,61.81,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_mix_21k,HFv1 ARC,59.39,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_mix_21k,HFv1 GSM8K,40.03,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_mix_21k,HFv1 HellaSwag,83.45,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_mix_21k,HFv1 MMLU,63.33,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_mix_21k,HFv1 TruthfulQA,45.18,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_mix_21k,HFv1 Winogrande,79.48,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_mix_7k,HF OpenLLM v1,63.04,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_mix_7k,HFv1 ARC,61.95,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_mix_7k,HFv1 GSM8K,41.24,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_mix_7k,HFv1 HellaSwag,85.51,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_mix_7k,HFv1 MMLU,62.89,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_mix_7k,HFv1 TruthfulQA,46.91,,hf_open_llm_v1_240829_frozen.csv +mistral_orpo_mix_7k,HFv1 Winogrande,79.72,,hf_open_llm_v1_240829_frozen.csv +mistral_plus_7b,HF OpenLLM v1,59.52,,hf_open_llm_v1_240829_frozen.csv +mistral_plus_7b,HFv1 ARC,62.2,,hf_open_llm_v1_240829_frozen.csv +mistral_plus_7b,HFv1 GSM8K,33.51,,hf_open_llm_v1_240829_frozen.csv +mistral_plus_7b,HFv1 HellaSwag,84.24,,hf_open_llm_v1_240829_frozen.csv +mistral_plus_7b,HFv1 MMLU,63.63,,hf_open_llm_v1_240829_frozen.csv +mistral_plus_7b,HFv1 TruthfulQA,35.8,,hf_open_llm_v1_240829_frozen.csv +mistral_plus_7b,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv +mistral_portuguese_luana_7b_chat,HF OpenLLM v1,61.76,,hf_open_llm_v1_240829_frozen.csv +mistral_portuguese_luana_7b_chat,HFv1 ARC,59.3,,hf_open_llm_v1_240829_frozen.csv +mistral_portuguese_luana_7b_chat,HFv1 GSM8K,38.21,,hf_open_llm_v1_240829_frozen.csv +mistral_portuguese_luana_7b_chat,HFv1 HellaSwag,81.4,,hf_open_llm_v1_240829_frozen.csv +mistral_portuguese_luana_7b_chat,HFv1 MMLU,60.84,,hf_open_llm_v1_240829_frozen.csv +mistral_portuguese_luana_7b_chat,HFv1 TruthfulQA,54.6,,hf_open_llm_v1_240829_frozen.csv +mistral_portuguese_luana_7b_chat,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv +mistral_pro_8b_v0_1,HF OpenLLM v1,61.06,,hf_open_llm_v1_240829_frozen.csv +mistral_pro_8b_v0_1,HFv1 ARC,62.2,,hf_open_llm_v1_240829_frozen.csv +mistral_pro_8b_v0_1,HFv1 GSM8K,34.19,,hf_open_llm_v1_240829_frozen.csv +mistral_pro_8b_v0_1,HFv1 HellaSwag,82.13,,hf_open_llm_v1_240829_frozen.csv +mistral_pro_8b_v0_1,HFv1 MMLU,61.74,,hf_open_llm_v1_240829_frozen.csv +mistral_pro_8b_v0_1,HFv1 TruthfulQA,49.32,,hf_open_llm_v1_240829_frozen.csv +mistral_pro_8b_v0_1,HFv1 Winogrande,76.8,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_dpo,HF OpenLLM v1,65.59,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_dpo,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_dpo,HFv1 GSM8K,39.95,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_dpo,HFv1 HellaSwag,84.97,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_dpo,HFv1 MMLU,60.3,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_dpo,HFv1 TruthfulQA,68.3,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_dpo,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_invert,HF OpenLLM v1,61.18,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_invert,HFv1 ARC,55.63,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_invert,HFv1 GSM8K,35.41,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_invert,HFv1 HellaSwag,81.44,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_invert,HFv1 MMLU,60.0,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_invert,HFv1 TruthfulQA,57.49,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_invert,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_packing,HF OpenLLM v1,65.23,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_packing,HFv1 ARC,62.54,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_packing,HFv1 GSM8K,39.12,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_packing,HFv1 HellaSwag,84.78,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_packing,HFv1 MMLU,60.36,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_packing,HFv1 TruthfulQA,67.44,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_packing,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_sft,HF OpenLLM v1,59.61,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_sft,HFv1 ARC,55.03,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_sft,HFv1 GSM8K,35.18,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_sft,HFv1 HellaSwag,81.21,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_sft,HFv1 MMLU,60.43,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_sft,HFv1 TruthfulQA,48.76,,hf_open_llm_v1_240829_frozen.csv +mistral_rank16_sft,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_dpo,HF OpenLLM v1,65.42,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_dpo,HFv1 ARC,63.14,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_dpo,HFv1 GSM8K,38.89,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_dpo,HFv1 HellaSwag,85.01,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_dpo,HFv1 MMLU,60.57,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_dpo,HFv1 TruthfulQA,68.29,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_dpo,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_invert,HF OpenLLM v1,60.85,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_invert,HFv1 ARC,55.72,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_invert,HFv1 GSM8K,35.18,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_invert,HFv1 HellaSwag,81.2,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_invert,HFv1 MMLU,59.88,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_invert,HFv1 TruthfulQA,56.18,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_invert,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_sft,HF OpenLLM v1,59.39,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_sft,HFv1 ARC,54.95,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_sft,HFv1 GSM8K,34.95,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_sft,HFv1 HellaSwag,80.97,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_sft,HFv1 MMLU,60.42,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_sft,HFv1 TruthfulQA,48.02,,hf_open_llm_v1_240829_frozen.csv +mistral_rank32_sft,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_dpo,HF OpenLLM v1,65.48,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_dpo,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_dpo,HFv1 GSM8K,38.89,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_dpo,HFv1 HellaSwag,85.11,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_dpo,HFv1 MMLU,60.32,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_dpo,HFv1 TruthfulQA,68.61,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_dpo,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_invert,HF OpenLLM v1,61.85,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_invert,HFv1 ARC,56.48,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_invert,HFv1 GSM8K,36.85,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_invert,HFv1 HellaSwag,81.68,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_invert,HFv1 MMLU,60.26,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_invert,HFv1 TruthfulQA,58.32,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_invert,HFv1 Winogrande,77.51,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_packing,HF OpenLLM v1,65.14,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_packing,HFv1 ARC,62.54,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_packing,HFv1 GSM8K,39.04,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_packing,HFv1 HellaSwag,84.77,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_packing,HFv1 MMLU,60.38,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_packing,HFv1 TruthfulQA,67.31,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_packing,HFv1 Winogrande,76.8,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_sft,HF OpenLLM v1,59.41,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_sft,HFv1 ARC,55.8,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_sft,HFv1 GSM8K,32.45,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_sft,HFv1 HellaSwag,81.21,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_sft,HFv1 MMLU,60.5,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_sft,HFv1 TruthfulQA,49.07,,hf_open_llm_v1_240829_frozen.csv +mistral_rank8_sft,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv +mistral_sft_v3,HF OpenLLM v1,60.93,,hf_open_llm_v1_240829_frozen.csv +mistral_sft_v3,HFv1 ARC,61.35,,hf_open_llm_v1_240829_frozen.csv +mistral_sft_v3,HFv1 GSM8K,32.45,,hf_open_llm_v1_240829_frozen.csv +mistral_sft_v3,HFv1 HellaSwag,82.23,,hf_open_llm_v1_240829_frozen.csv +mistral_sft_v3,HFv1 MMLU,63.4,,hf_open_llm_v1_240829_frozen.csv +mistral_sft_v3,HFv1 TruthfulQA,48.49,,hf_open_llm_v1_240829_frozen.csv +mistral_sft_v3,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv +mistral_trismegistus_7b,HF OpenLLM v1,52.66,,hf_open_llm_v1_240829_frozen.csv +mistral_trismegistus_7b,HFv1 ARC,54.1,,hf_open_llm_v1_240829_frozen.csv +mistral_trismegistus_7b,HFv1 GSM8K,9.93,,hf_open_llm_v1_240829_frozen.csv +mistral_trismegistus_7b,HFv1 HellaSwag,77.91,,hf_open_llm_v1_240829_frozen.csv +mistral_trismegistus_7b,HFv1 MMLU,54.49,,hf_open_llm_v1_240829_frozen.csv +mistral_trismegistus_7b,HFv1 TruthfulQA,49.36,,hf_open_llm_v1_240829_frozen.csv +mistral_trismegistus_7b,HFv1 Winogrande,70.17,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_0_7b,HF OpenLLM v1,59.09,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_0_7b,HFv1 ARC,62.2,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_0_7b,HFv1 GSM8K,18.5,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_0_7b,HFv1 HellaSwag,84.1,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_0_7b,HFv1 MMLU,64.14,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_0_7b,HFv1 TruthfulQA,46.94,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_0_7b,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_2_7b,HF OpenLLM v1,58.66,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_2_7b,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_2_7b,HFv1 GSM8K,17.44,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_2_7b,HFv1 HellaSwag,84.11,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_2_7b,HFv1 MMLU,64.38,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_2_7b,HFv1 TruthfulQA,45.92,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_2_7b,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_5_dpo_7b_qlora,HF OpenLLM v1,58.65,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_5_dpo_7b_qlora,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_5_dpo_7b_qlora,HFv1 GSM8K,18.12,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_5_dpo_7b_qlora,HFv1 HellaSwag,84.52,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_5_dpo_7b_qlora,HFv1 MMLU,63.63,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_5_dpo_7b_qlora,HFv1 TruthfulQA,45.75,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_5_dpo_7b_qlora,HFv1 Winogrande,78.61,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_5_sft_7b_qlora,HF OpenLLM v1,58.24,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_5_sft_7b_qlora,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_5_sft_7b_qlora,HFv1 GSM8K,17.13,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_5_sft_7b_qlora,HFv1 HellaSwag,84.24,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_5_sft_7b_qlora,HFv1 MMLU,63.66,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_5_sft_7b_qlora,HFv1 TruthfulQA,44.94,,hf_open_llm_v1_240829_frozen.csv +mistral_v0_1_peanutbutter_v0_0_5_sft_7b_qlora,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_low_tmp,HF OpenLLM v1,65.63,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_low_tmp,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_low_tmp,HFv1 GSM8K,39.58,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_low_tmp,HFv1 HellaSwag,84.91,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_low_tmp,HFv1 MMLU,60.76,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_low_tmp,HFv1 TruthfulQA,68.13,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_low_tmp,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_v0,HF OpenLLM v1,65.72,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_v0,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_v0,HFv1 GSM8K,40.26,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_v0,HFv1 HellaSwag,84.88,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_v0,HFv1 MMLU,60.78,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_v0,HFv1 TruthfulQA,68.14,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_v0,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_v0_test,HF OpenLLM v1,65.61,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_v0_test,HFv1 ARC,62.97,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_v0_test,HFv1 GSM8K,39.73,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_v0_test,HFv1 HellaSwag,84.86,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_v0_test,HFv1 MMLU,60.64,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_v0_test,HFv1 TruthfulQA,67.91,,hf_open_llm_v1_240829_frozen.csv +mistral_v2_7b_selfplay_v0_test,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv +mistralbeagle_rs_7b_v0_1,HF OpenLLM v1,67.75,,hf_open_llm_v1_240829_frozen.csv +mistralbeagle_rs_7b_v0_1,HFv1 ARC,69.45,,hf_open_llm_v1_240829_frozen.csv +mistralbeagle_rs_7b_v0_1,HFv1 GSM8K,37.91,,hf_open_llm_v1_240829_frozen.csv +mistralbeagle_rs_7b_v0_1,HFv1 HellaSwag,84.62,,hf_open_llm_v1_240829_frozen.csv +mistralbeagle_rs_7b_v0_1,HFv1 MMLU,63.07,,hf_open_llm_v1_240829_frozen.csv +mistralbeagle_rs_7b_v0_1,HFv1 TruthfulQA,69.78,,hf_open_llm_v1_240829_frozen.csv +mistralbeagle_rs_7b_v0_1,HFv1 Winogrande,81.69,,hf_open_llm_v1_240829_frozen.csv +mistralinstructlongish,HF OpenLLM v1,53.62,,hf_open_llm_v1_240829_frozen.csv +mistralinstructlongish,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv +mistralinstructlongish,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv +mistralinstructlongish,HFv1 HellaSwag,81.86,,hf_open_llm_v1_240829_frozen.csv +mistralinstructlongish,HFv1 MMLU,60.49,,hf_open_llm_v1_240829_frozen.csv +mistralinstructlongish,HFv1 TruthfulQA,40.55,,hf_open_llm_v1_240829_frozen.csv +mistralinstructlongish,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv +mistraltrix_slerp,HF OpenLLM v1,73.58,,hf_open_llm_v1_240829_frozen.csv +mistraltrix_slerp,HFv1 ARC,70.82,,hf_open_llm_v1_240829_frozen.csv +mistraltrix_slerp,HFv1 GSM8K,71.11,,hf_open_llm_v1_240829_frozen.csv +mistraltrix_slerp,HFv1 HellaSwag,87.54,,hf_open_llm_v1_240829_frozen.csv +mistraltrix_slerp,HFv1 MMLU,64.98,,hf_open_llm_v1_240829_frozen.csv +mistraltrix_slerp,HFv1 TruthfulQA,65.35,,hf_open_llm_v1_240829_frozen.csv +mistraltrix_slerp,HFv1 Winogrande,81.69,,hf_open_llm_v1_240829_frozen.csv +mistraltrix_v1,HF OpenLLM v1,73.39,,hf_open_llm_v1_240829_frozen.csv +mistraltrix_v1,HFv1 ARC,72.27,,hf_open_llm_v1_240829_frozen.csv +mistraltrix_v1,HFv1 GSM8K,62.77,,hf_open_llm_v1_240829_frozen.csv +mistraltrix_v1,HFv1 HellaSwag,88.33,,hf_open_llm_v1_240829_frozen.csv +mistraltrix_v1,HFv1 MMLU,65.24,,hf_open_llm_v1_240829_frozen.csv +mistraltrix_v1,HFv1 TruthfulQA,70.73,,hf_open_llm_v1_240829_frozen.csv +mistraltrix_v1,HFv1 Winogrande,80.98,,hf_open_llm_v1_240829_frozen.csv +mistraltrixtest,HF OpenLLM v1,73.17,,hf_open_llm_v1_240829_frozen.csv +mistraltrixtest,HFv1 ARC,72.53,,hf_open_llm_v1_240829_frozen.csv +mistraltrixtest,HFv1 GSM8K,60.73,,hf_open_llm_v1_240829_frozen.csv +mistraltrixtest,HFv1 HellaSwag,88.4,,hf_open_llm_v1_240829_frozen.csv +mistraltrixtest,HFv1 MMLU,65.22,,hf_open_llm_v1_240829_frozen.csv +mistraltrixtest,HFv1 TruthfulQA,70.77,,hf_open_llm_v1_240829_frozen.csv +mistraltrixtest,HFv1 Winogrande,81.37,,hf_open_llm_v1_240829_frozen.csv +mistroll_7b_v2_2,HF OpenLLM v1,76.76,,hf_open_llm_v1_240829_frozen.csv +mistroll_7b_v2_2,HFv1 ARC,72.78,,hf_open_llm_v1_240829_frozen.csv +mistroll_7b_v2_2,HFv1 GSM8K,71.19,,hf_open_llm_v1_240829_frozen.csv +mistroll_7b_v2_2,HFv1 HellaSwag,89.16,,hf_open_llm_v1_240829_frozen.csv +mistroll_7b_v2_2,HFv1 MMLU,64.35,,hf_open_llm_v1_240829_frozen.csv +mistroll_7b_v2_2,HFv1 TruthfulQA,78.1,,hf_open_llm_v1_240829_frozen.csv +mistroll_7b_v2_2,HFv1 Winogrande,85.0,,hf_open_llm_v1_240829_frozen.csv +mistroll_7b_v2_3_notsosm4rt_16bit,HF OpenLLM v1,70.8,,hf_open_llm_v1_240829_frozen.csv +mistroll_7b_v2_3_notsosm4rt_16bit,HFv1 ARC,67.06,,hf_open_llm_v1_240829_frozen.csv +mistroll_7b_v2_3_notsosm4rt_16bit,HFv1 GSM8K,66.57,,hf_open_llm_v1_240829_frozen.csv +mistroll_7b_v2_3_notsosm4rt_16bit,HFv1 HellaSwag,86.24,,hf_open_llm_v1_240829_frozen.csv +mistroll_7b_v2_3_notsosm4rt_16bit,HFv1 MMLU,65.12,,hf_open_llm_v1_240829_frozen.csv +mistroll_7b_v2_3_notsosm4rt_16bit,HFv1 TruthfulQA,56.38,,hf_open_llm_v1_240829_frozen.csv +mistroll_7b_v2_3_notsosm4rt_16bit,HFv1 Winogrande,83.43,,hf_open_llm_v1_240829_frozen.csv +mixnueza_6x32m_moe,HF OpenLLM v1,28.92,,hf_open_llm_v1_240829_frozen.csv +mixnueza_6x32m_moe,HFv1 ARC,21.16,,hf_open_llm_v1_240829_frozen.csv +mixnueza_6x32m_moe,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +mixnueza_6x32m_moe,HFv1 HellaSwag,26.69,,hf_open_llm_v1_240829_frozen.csv +mixnueza_6x32m_moe,HFv1 MMLU,25.7,,hf_open_llm_v1_240829_frozen.csv +mixnueza_6x32m_moe,HFv1 TruthfulQA,47.84,,hf_open_llm_v1_240829_frozen.csv +mixnueza_6x32m_moe,HFv1 Winogrande,51.85,,hf_open_llm_v1_240829_frozen.csv +mixsmol_4x400m_v0_1_epoch1,HF OpenLLM v1,28.45,,hf_open_llm_v1_240829_frozen.csv +mixsmol_4x400m_v0_1_epoch1,HFv1 ARC,22.87,,hf_open_llm_v1_240829_frozen.csv +mixsmol_4x400m_v0_1_epoch1,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv +mixsmol_4x400m_v0_1_epoch1,HFv1 HellaSwag,30.57,,hf_open_llm_v1_240829_frozen.csv +mixsmol_4x400m_v0_1_epoch1,HFv1 MMLU,25.28,,hf_open_llm_v1_240829_frozen.csv +mixsmol_4x400m_v0_1_epoch1,HFv1 TruthfulQA,39.03,,hf_open_llm_v1_240829_frozen.csv +mixsmol_4x400m_v0_1_epoch1,HFv1 Winogrande,52.8,,hf_open_llm_v1_240829_frozen.csv +mixsmol_4x400m_v0_1_epoch2,HF OpenLLM v1,28.92,,hf_open_llm_v1_240829_frozen.csv +mixsmol_4x400m_v0_1_epoch2,HFv1 ARC,23.55,,hf_open_llm_v1_240829_frozen.csv +mixsmol_4x400m_v0_1_epoch2,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +mixsmol_4x400m_v0_1_epoch2,HFv1 HellaSwag,32.6,,hf_open_llm_v1_240829_frozen.csv +mixsmol_4x400m_v0_1_epoch2,HFv1 MMLU,25.26,,hf_open_llm_v1_240829_frozen.csv +mixsmol_4x400m_v0_1_epoch2,HFv1 TruthfulQA,39.24,,hf_open_llm_v1_240829_frozen.csv +mixsmol_4x400m_v0_1_epoch2,HFv1 Winogrande,52.64,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v5_0,HF OpenLLM v1,75.08,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v5_0,HFv1 ARC,73.63,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v5_0,HFv1 GSM8K,69.45,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v5_0,HFv1 HellaSwag,88.93,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v5_0,HFv1 MMLU,64.65,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v5_0,HFv1 TruthfulQA,69.83,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v5_0,HFv1 Winogrande,83.98,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v6_0,HF OpenLLM v1,75.94,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v6_0,HFv1 ARC,73.38,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v6_0,HFv1 GSM8K,68.92,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v6_0,HFv1 HellaSwag,89.02,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v6_0,HFv1 MMLU,64.61,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v6_0,HFv1 TruthfulQA,70.45,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v6_0,HFv1 Winogrande,89.27,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v7_0,HF OpenLLM v1,76.55,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v7_0,HFv1 ARC,74.23,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v7_0,HFv1 GSM8K,69.14,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v7_0,HFv1 HellaSwag,89.37,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v7_0,HFv1 MMLU,64.54,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v7_0,HFv1 TruthfulQA,74.26,,hf_open_llm_v1_240829_frozen.csv +mixtao_7bx2_moe_instruct_v7_0,HFv1 Winogrande,87.77,,hf_open_llm_v1_240829_frozen.csv +mixtral_11bx2_moe_19b,HF OpenLLM v1,74.41,,hf_open_llm_v1_240829_frozen.csv +mixtral_11bx2_moe_19b,HFv1 ARC,71.16,,hf_open_llm_v1_240829_frozen.csv +mixtral_11bx2_moe_19b,HFv1 GSM8K,65.28,,hf_open_llm_v1_240829_frozen.csv +mixtral_11bx2_moe_19b,HFv1 HellaSwag,88.47,,hf_open_llm_v1_240829_frozen.csv +mixtral_11bx2_moe_19b,HFv1 MMLU,66.31,,hf_open_llm_v1_240829_frozen.csv +mixtral_11bx2_moe_19b,HFv1 TruthfulQA,72.0,,hf_open_llm_v1_240829_frozen.csv +mixtral_11bx2_moe_19b,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv +mixtral_6x7b_instruct_v0_1,HF OpenLLM v1,52.87,,hf_open_llm_v1_240829_frozen.csv +mixtral_6x7b_instruct_v0_1,HFv1 ARC,56.66,,hf_open_llm_v1_240829_frozen.csv +mixtral_6x7b_instruct_v0_1,HFv1 GSM8K,6.07,,hf_open_llm_v1_240829_frozen.csv +mixtral_6x7b_instruct_v0_1,HFv1 HellaSwag,78.85,,hf_open_llm_v1_240829_frozen.csv +mixtral_6x7b_instruct_v0_1,HFv1 MMLU,52.88,,hf_open_llm_v1_240829_frozen.csv +mixtral_6x7b_instruct_v0_1,HFv1 TruthfulQA,51.55,,hf_open_llm_v1_240829_frozen.csv +mixtral_6x7b_instruct_v0_1,HFv1 Winogrande,71.19,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx2_truthy,HF OpenLLM v1,74.64,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx2_truthy,HFv1 ARC,72.18,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx2_truthy,HFv1 GSM8K,67.25,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx2_truthy,HFv1 HellaSwag,87.88,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx2_truthy,HFv1 MMLU,65.2,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx2_truthy,HFv1 TruthfulQA,74.68,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx2_truthy,HFv1 Winogrande,80.66,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx4_moe_24b,HF OpenLLM v1,68.85,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx4_moe_24b,HFv1 ARC,65.36,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx4_moe_24b,HFv1 GSM8K,61.71,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx4_moe_24b,HFv1 HellaSwag,85.23,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx4_moe_24b,HFv1 MMLU,62.96,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx4_moe_24b,HFv1 TruthfulQA,59.78,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx4_moe_24b,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx6_moe_35b,HF OpenLLM v1,73.32,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx6_moe_35b,HFv1 ARC,70.14,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx6_moe_35b,HFv1 GSM8K,71.42,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx6_moe_35b,HFv1 HellaSwag,86.77,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx6_moe_35b,HFv1 MMLU,64.74,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx6_moe_35b,HFv1 TruthfulQA,65.79,,hf_open_llm_v1_240829_frozen.csv +mixtral_7bx6_moe_35b,HFv1 Winogrande,81.06,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x22b_instruct_v0_1,HF OpenLLM v1,79.15,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x22b_instruct_v0_1,HFv1 ARC,72.7,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x22b_instruct_v0_1,HFv1 GSM8K,82.03,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x22b_instruct_v0_1,HFv1 HellaSwag,89.08,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x22b_instruct_v0_1,HFv1 MMLU,77.77,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x22b_instruct_v0_1,HFv1 TruthfulQA,68.14,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x22b_instruct_v0_1,HFv1 Winogrande,85.16,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x22b_v0_1,HF OpenLLM v1,74.47,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x22b_v0_1,HFv1 ARC,70.65,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x22b_v0_1,HFv1 GSM8K,74.15,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x22b_v0_1,HFv1 HellaSwag,88.74,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x22b_v0_1,HFv1 MMLU,77.81,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x22b_v0_1,HFv1 TruthfulQA,51.08,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x22b_v0_1,HFv1 Winogrande,85.0,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_dpo_v0_1,HF OpenLLM v1,70.45,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_dpo_v0_1,HFv1 ARC,70.9,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_dpo_v0_1,HFv1 GSM8K,53.75,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_dpo_v0_1,HFv1 HellaSwag,87.61,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_dpo_v0_1,HFv1 MMLU,70.66,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_dpo_v0_1,HFv1 TruthfulQA,57.38,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_dpo_v0_1,HFv1 Winogrande,82.4,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_dpo_v0_2,HF OpenLLM v1,71.32,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_dpo_v0_2,HFv1 ARC,70.39,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_dpo_v0_2,HFv1 GSM8K,57.54,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_dpo_v0_2,HFv1 HellaSwag,87.73,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_dpo_v0_2,HFv1 MMLU,71.03,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_dpo_v0_2,HFv1 TruthfulQA,58.69,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_dpo_v0_2,HFv1 Winogrande,82.56,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_instruct_v0_1,HF OpenLLM v1,72.62,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_instruct_v0_1,HFv1 ARC,70.22,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_instruct_v0_1,HFv1 GSM8K,60.73,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_instruct_v0_1,HFv1 HellaSwag,87.63,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_instruct_v0_1,HFv1 MMLU,71.16,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_instruct_v0_1,HFv1 TruthfulQA,64.58,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_instruct_v0_1,HFv1 Winogrande,81.37,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_instruct_v0_1_dpo,HF OpenLLM v1,73.44,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_instruct_v0_1_dpo,HFv1 ARC,69.8,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_instruct_v0_1_dpo,HFv1 GSM8K,61.41,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_instruct_v0_1_dpo,HFv1 HellaSwag,87.83,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_instruct_v0_1_dpo,HFv1 MMLU,71.05,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_instruct_v0_1_dpo,HFv1 TruthfulQA,69.18,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_instruct_v0_1_dpo,HFv1 Winogrande,81.37,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_peft_v0_1,HF OpenLLM v1,68.87,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_peft_v0_1,HFv1 ARC,67.24,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_peft_v0_1,HFv1 GSM8K,51.4,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_peft_v0_1,HFv1 HellaSwag,86.03,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_peft_v0_1,HFv1 MMLU,68.59,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_peft_v0_1,HFv1 TruthfulQA,59.54,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_peft_v0_1,HFv1 Winogrande,80.43,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1,HF OpenLLM v1,68.42,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1,HFv1 GSM8K,57.47,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1,HFv1 HellaSwag,86.49,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1,HFv1 MMLU,71.82,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1,HFv1 TruthfulQA,46.78,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1,HFv1 Winogrande,81.93,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1_dpo,HF OpenLLM v1,68.18,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1_dpo,HFv1 ARC,66.55,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1_dpo,HFv1 GSM8K,56.18,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1_dpo,HFv1 HellaSwag,86.4,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1_dpo,HFv1 MMLU,71.65,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1_dpo,HFv1 TruthfulQA,46.74,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1_dpo,HFv1 Winogrande,81.53,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1_sft,HF OpenLLM v1,68.18,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1_sft,HFv1 ARC,66.55,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1_sft,HFv1 GSM8K,56.18,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1_sft,HFv1 HellaSwag,86.4,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1_sft,HFv1 MMLU,71.65,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1_sft,HFv1 TruthfulQA,46.74,,hf_open_llm_v1_240829_frozen.csv +mixtral_8x7b_v0_1_sft,HFv1 Winogrande,81.53,,hf_open_llm_v1_240829_frozen.csv +mixtral_gqa_400m_v2,HF OpenLLM v1,28.45,,hf_open_llm_v1_240829_frozen.csv +mixtral_gqa_400m_v2,HFv1 ARC,20.22,,hf_open_llm_v1_240829_frozen.csv +mixtral_gqa_400m_v2,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv +mixtral_gqa_400m_v2,HFv1 HellaSwag,27.78,,hf_open_llm_v1_240829_frozen.csv +mixtral_gqa_400m_v2,HFv1 MMLU,26.1,,hf_open_llm_v1_240829_frozen.csv +mixtral_gqa_400m_v2,HFv1 TruthfulQA,46.55,,hf_open_llm_v1_240829_frozen.csv +mixtral_gqa_400m_v2,HFv1 Winogrande,49.96,,hf_open_llm_v1_240829_frozen.csv +mixtral_instruct_0_1_laser,HF OpenLLM v1,72.36,,hf_open_llm_v1_240829_frozen.csv +mixtral_instruct_0_1_laser,HFv1 ARC,70.48,,hf_open_llm_v1_240829_frozen.csv +mixtral_instruct_0_1_laser,HFv1 GSM8K,58.68,,hf_open_llm_v1_240829_frozen.csv +mixtral_instruct_0_1_laser,HFv1 HellaSwag,87.28,,hf_open_llm_v1_240829_frozen.csv +mixtral_instruct_0_1_laser,HFv1 MMLU,71.07,,hf_open_llm_v1_240829_frozen.csv +mixtral_instruct_0_1_laser,HFv1 TruthfulQA,65.83,,hf_open_llm_v1_240829_frozen.csv +mixtral_instruct_0_1_laser,HFv1 Winogrande,80.82,,hf_open_llm_v1_240829_frozen.csv +mixtral_ko_qna_merged,HF OpenLLM v1,47.24,,hf_open_llm_v1_240829_frozen.csv +mixtral_ko_qna_merged,HFv1 ARC,39.51,,hf_open_llm_v1_240829_frozen.csv +mixtral_ko_qna_merged,HFv1 GSM8K,27.67,,hf_open_llm_v1_240829_frozen.csv +mixtral_ko_qna_merged,HFv1 HellaSwag,39.06,,hf_open_llm_v1_240829_frozen.csv +mixtral_ko_qna_merged,HFv1 MMLU,71.86,,hf_open_llm_v1_240829_frozen.csv +mixtral_ko_qna_merged,HFv1 TruthfulQA,48.61,,hf_open_llm_v1_240829_frozen.csv +mixtral_ko_qna_merged,HFv1 Winogrande,56.75,,hf_open_llm_v1_240829_frozen.csv +mixtral_megamerge_dare_8x7b_v2,HF OpenLLM v1,67.87,,hf_open_llm_v1_240829_frozen.csv +mixtral_megamerge_dare_8x7b_v2,HFv1 ARC,66.47,,hf_open_llm_v1_240829_frozen.csv +mixtral_megamerge_dare_8x7b_v2,HFv1 GSM8K,52.46,,hf_open_llm_v1_240829_frozen.csv +mixtral_megamerge_dare_8x7b_v2,HFv1 HellaSwag,86.05,,hf_open_llm_v1_240829_frozen.csv +mixtral_megamerge_dare_8x7b_v2,HFv1 MMLU,69.08,,hf_open_llm_v1_240829_frozen.csv +mixtral_megamerge_dare_8x7b_v2,HFv1 TruthfulQA,53.82,,hf_open_llm_v1_240829_frozen.csv +mixtral_megamerge_dare_8x7b_v2,HFv1 Winogrande,79.32,,hf_open_llm_v1_240829_frozen.csv +mixtral_orca_v0_1,HF OpenLLM v1,67.82,,hf_open_llm_v1_240829_frozen.csv +mixtral_orca_v0_1,HFv1 ARC,69.71,,hf_open_llm_v1_240829_frozen.csv +mixtral_orca_v0_1,HFv1 GSM8K,37.3,,hf_open_llm_v1_240829_frozen.csv +mixtral_orca_v0_1,HFv1 HellaSwag,88.88,,hf_open_llm_v1_240829_frozen.csv +mixtral_orca_v0_1,HFv1 MMLU,66.06,,hf_open_llm_v1_240829_frozen.csv +mixtral_orca_v0_1,HFv1 TruthfulQA,63.85,,hf_open_llm_v1_240829_frozen.csv +mixtral_orca_v0_1,HFv1 Winogrande,81.14,,hf_open_llm_v1_240829_frozen.csv +mixtralmerge_8x7b_rebalanced_test,HF OpenLLM v1,69.61,,hf_open_llm_v1_240829_frozen.csv +mixtralmerge_8x7b_rebalanced_test,HFv1 ARC,68.17,,hf_open_llm_v1_240829_frozen.csv +mixtralmerge_8x7b_rebalanced_test,HFv1 GSM8K,58.23,,hf_open_llm_v1_240829_frozen.csv +mixtralmerge_8x7b_rebalanced_test,HFv1 HellaSwag,85.76,,hf_open_llm_v1_240829_frozen.csv +mixtralmerge_8x7b_rebalanced_test,HFv1 MMLU,70.47,,hf_open_llm_v1_240829_frozen.csv +mixtralmerge_8x7b_rebalanced_test,HFv1 TruthfulQA,53.75,,hf_open_llm_v1_240829_frozen.csv +mixtralmerge_8x7b_rebalanced_test,HFv1 Winogrande,81.29,,hf_open_llm_v1_240829_frozen.csv +mixtralorochi8x7b,HF OpenLLM v1,64.62,,hf_open_llm_v1_240829_frozen.csv +mixtralorochi8x7b,HFv1 ARC,70.31,,hf_open_llm_v1_240829_frozen.csv +mixtralorochi8x7b,HFv1 GSM8K,17.29,,hf_open_llm_v1_240829_frozen.csv +mixtralorochi8x7b,HFv1 HellaSwag,86.1,,hf_open_llm_v1_240829_frozen.csv +mixtralorochi8x7b,HFv1 MMLU,70.13,,hf_open_llm_v1_240829_frozen.csv +mixtralorochi8x7b,HFv1 TruthfulQA,63.99,,hf_open_llm_v1_240829_frozen.csv +mixtralorochi8x7b,HFv1 Winogrande,79.87,,hf_open_llm_v1_240829_frozen.csv +mixtralrpchat_zloss,HF OpenLLM v1,68.59,,hf_open_llm_v1_240829_frozen.csv +mixtralrpchat_zloss,HFv1 ARC,68.6,,hf_open_llm_v1_240829_frozen.csv +mixtralrpchat_zloss,HFv1 GSM8K,50.57,,hf_open_llm_v1_240829_frozen.csv +mixtralrpchat_zloss,HFv1 HellaSwag,86.1,,hf_open_llm_v1_240829_frozen.csv +mixtralrpchat_zloss,HFv1 MMLU,70.44,,hf_open_llm_v1_240829_frozen.csv +mixtralrpchat_zloss,HFv1 TruthfulQA,53.85,,hf_open_llm_v1_240829_frozen.csv +mixtralrpchat_zloss,HFv1 Winogrande,82.0,,hf_open_llm_v1_240829_frozen.csv +mm4_3b,HF OpenLLM v1,53.22,,hf_open_llm_v1_240829_frozen.csv +mm4_3b,HFv1 ARC,44.8,,hf_open_llm_v1_240829_frozen.csv +mm4_3b,HFv1 GSM8K,43.82,,hf_open_llm_v1_240829_frozen.csv +mm4_3b,HFv1 HellaSwag,70.41,,hf_open_llm_v1_240829_frozen.csv +mm4_3b,HFv1 MMLU,50.9,,hf_open_llm_v1_240829_frozen.csv +mm4_3b,HFv1 TruthfulQA,43.2,,hf_open_llm_v1_240829_frozen.csv +mm4_3b,HFv1 Winogrande,66.22,,hf_open_llm_v1_240829_frozen.csv +mm_ov_bagel_dpo_34b_c1000_250,HF OpenLLM v1,74.47,,hf_open_llm_v1_240829_frozen.csv +mm_ov_bagel_dpo_34b_c1000_250,HFv1 ARC,68.17,,hf_open_llm_v1_240829_frozen.csv +mm_ov_bagel_dpo_34b_c1000_250,HFv1 GSM8K,72.25,,hf_open_llm_v1_240829_frozen.csv +mm_ov_bagel_dpo_34b_c1000_250,HFv1 HellaSwag,83.97,,hf_open_llm_v1_240829_frozen.csv +mm_ov_bagel_dpo_34b_c1000_250,HFv1 MMLU,76.33,,hf_open_llm_v1_240829_frozen.csv +mm_ov_bagel_dpo_34b_c1000_250,HFv1 TruthfulQA,63.67,,hf_open_llm_v1_240829_frozen.csv +mm_ov_bagel_dpo_34b_c1000_250,HFv1 Winogrande,82.4,,hf_open_llm_v1_240829_frozen.csv +mnsim_dpo_peftmerged_2_eos,HF OpenLLM v1,54.04,,hf_open_llm_v1_240829_frozen.csv +mnsim_dpo_peftmerged_2_eos,HFv1 ARC,55.63,,hf_open_llm_v1_240829_frozen.csv +mnsim_dpo_peftmerged_2_eos,HFv1 GSM8K,16.91,,hf_open_llm_v1_240829_frozen.csv +mnsim_dpo_peftmerged_2_eos,HFv1 HellaSwag,77.82,,hf_open_llm_v1_240829_frozen.csv +mnsim_dpo_peftmerged_2_eos,HFv1 MMLU,51.25,,hf_open_llm_v1_240829_frozen.csv +mnsim_dpo_peftmerged_2_eos,HFv1 TruthfulQA,46.37,,hf_open_llm_v1_240829_frozen.csv +mnsim_dpo_peftmerged_2_eos,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv +model_a_48_5m,HF OpenLLM v1,28.98,,hf_open_llm_v1_240829_frozen.csv +model_a_48_5m,HFv1 ARC,22.18,,hf_open_llm_v1_240829_frozen.csv +model_a_48_5m,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +model_a_48_5m,HFv1 HellaSwag,27.85,,hf_open_llm_v1_240829_frozen.csv +model_a_48_5m,HFv1 MMLU,25.08,,hf_open_llm_v1_240829_frozen.csv +model_a_48_5m,HFv1 TruthfulQA,46.75,,hf_open_llm_v1_240829_frozen.csv +model_a_48_5m,HFv1 Winogrande,51.7,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_1_8_5_dpo,HF OpenLLM v1,76.14,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_1_8_5_dpo,HFv1 ARC,69.54,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_1_8_5_dpo,HFv1 GSM8K,74.3,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_1_8_5_dpo,HFv1 HellaSwag,85.6,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_1_8_5_dpo,HFv1 MMLU,77.49,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_1_8_5_dpo,HFv1 TruthfulQA,65.79,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_1_8_5_dpo,HFv1 Winogrande,84.14,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_v1_1,HF OpenLLM v1,67.53,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_v1_1,HFv1 ARC,66.64,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_v1_1,HFv1 GSM8K,46.32,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_v1_1,HFv1 HellaSwag,87.16,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_v1_1,HFv1 MMLU,66.76,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_v1_1,HFv1 TruthfulQA,54.98,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_v1_1,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_v1_2_1,HF OpenLLM v1,71.36,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_v1_2_1,HFv1 ARC,70.65,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_v1_2_1,HFv1 GSM8K,56.63,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_v1_2_1,HFv1 HellaSwag,86.4,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_v1_2_1,HFv1 MMLU,69.9,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_v1_2_1,HFv1 TruthfulQA,61.41,,hf_open_llm_v1_240829_frozen.csv +momo_70b_lora_v1_2_1,HFv1 Winogrande,83.19,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_4_dpo,HF OpenLLM v1,76.23,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_4_dpo,HFv1 ARC,69.62,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_4_dpo,HFv1 GSM8K,76.27,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_4_dpo,HFv1 HellaSwag,85.35,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_4_dpo,HFv1 MMLU,77.33,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_4_dpo,HFv1 TruthfulQA,64.64,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_4_dpo,HFv1 Winogrande,84.14,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_6_dpo,HF OpenLLM v1,77.29,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_6_dpo,HFv1 ARC,70.14,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_6_dpo,HFv1 GSM8K,76.8,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_6_dpo,HFv1 HellaSwag,86.03,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_6_dpo,HFv1 MMLU,77.4,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_6_dpo,HFv1 TruthfulQA,69.0,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_6_dpo,HFv1 Winogrande,84.37,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_7_dpo,HF OpenLLM v1,78.55,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_7_dpo,HFv1 ARC,70.82,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_7_dpo,HFv1 GSM8K,78.62,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_7_dpo,HFv1 HellaSwag,85.96,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_7_dpo,HFv1 MMLU,77.13,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_7_dpo,HFv1 TruthfulQA,74.71,,hf_open_llm_v1_240829_frozen.csv +momo_72b_lora_1_8_7_dpo,HFv1 Winogrande,84.06,,hf_open_llm_v1_240829_frozen.csv +momomerge_72b_v0_1,HF OpenLLM v1,28.69,,hf_open_llm_v1_240829_frozen.csv +momomerge_72b_v0_1,HFv1 ARC,26.28,,hf_open_llm_v1_240829_frozen.csv +momomerge_72b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +momomerge_72b_v0_1,HFv1 HellaSwag,25.27,,hf_open_llm_v1_240829_frozen.csv +momomerge_72b_v0_1,HFv1 MMLU,23.08,,hf_open_llm_v1_240829_frozen.csv +momomerge_72b_v0_1,HFv1 TruthfulQA,48.73,,hf_open_llm_v1_240829_frozen.csv +momomerge_72b_v0_1,HFv1 Winogrande,48.78,,hf_open_llm_v1_240829_frozen.csv +mpt_125m_c4,HF OpenLLM v1,28.84,,hf_open_llm_v1_240829_frozen.csv +mpt_125m_c4,HFv1 ARC,22.18,,hf_open_llm_v1_240829_frozen.csv +mpt_125m_c4,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +mpt_125m_c4,HFv1 HellaSwag,26.41,,hf_open_llm_v1_240829_frozen.csv +mpt_125m_c4,HFv1 MMLU,24.68,,hf_open_llm_v1_240829_frozen.csv +mpt_125m_c4,HFv1 TruthfulQA,49.08,,hf_open_llm_v1_240829_frozen.csv +mpt_125m_c4,HFv1 Winogrande,50.67,,hf_open_llm_v1_240829_frozen.csv +mpt_30b,HF OpenLLM v1,52.77,,hf_open_llm_v1_240829_frozen.csv +mpt_30b,HFv1 ARC,55.97,,hf_open_llm_v1_240829_frozen.csv +mpt_30b,HFv1 GSM8K,16.91,,hf_open_llm_v1_240829_frozen.csv +mpt_30b,HFv1 HellaSwag,82.42,,hf_open_llm_v1_240829_frozen.csv +mpt_30b,HFv1 MMLU,48.0,,hf_open_llm_v1_240829_frozen.csv +mpt_30b,HFv1 TruthfulQA,38.42,,hf_open_llm_v1_240829_frozen.csv +mpt_30b,HFv1 Winogrande,74.9,,hf_open_llm_v1_240829_frozen.csv +mpt_7b,HF OpenLLM v1,44.28,,hf_open_llm_v1_240829_frozen.csv +mpt_7b,HFv1 ARC,47.7,,hf_open_llm_v1_240829_frozen.csv +mpt_7b,HFv1 GSM8K,4.02,,hf_open_llm_v1_240829_frozen.csv +mpt_7b,HFv1 HellaSwag,77.57,,hf_open_llm_v1_240829_frozen.csv +mpt_7b,HFv1 MMLU,30.8,,hf_open_llm_v1_240829_frozen.csv +mpt_7b,HFv1 TruthfulQA,33.44,,hf_open_llm_v1_240829_frozen.csv +mpt_7b,HFv1 Winogrande,72.14,,hf_open_llm_v1_240829_frozen.csv +mpt_7b_8k,HF OpenLLM v1,47.24,,hf_open_llm_v1_240829_frozen.csv +mpt_7b_8k,HFv1 ARC,47.35,,hf_open_llm_v1_240829_frozen.csv +mpt_7b_8k,HFv1 GSM8K,8.34,,hf_open_llm_v1_240829_frozen.csv +mpt_7b_8k,HFv1 HellaSwag,77.4,,hf_open_llm_v1_240829_frozen.csv +mpt_7b_8k,HFv1 MMLU,42.58,,hf_open_llm_v1_240829_frozen.csv +mpt_7b_8k,HFv1 TruthfulQA,36.65,,hf_open_llm_v1_240829_frozen.csv +mpt_7b_8k,HFv1 Winogrande,71.11,,hf_open_llm_v1_240829_frozen.csv +mpt_7b_8k_instruct,HF OpenLLM v1,47.18,,hf_open_llm_v1_240829_frozen.csv +mpt_7b_8k_instruct,HFv1 ARC,45.48,,hf_open_llm_v1_240829_frozen.csv +mpt_7b_8k_instruct,HFv1 GSM8K,20.55,,hf_open_llm_v1_240829_frozen.csv +mpt_7b_8k_instruct,HFv1 HellaSwag,74.41,,hf_open_llm_v1_240829_frozen.csv +mpt_7b_8k_instruct,HFv1 MMLU,42.11,,hf_open_llm_v1_240829_frozen.csv +mpt_7b_8k_instruct,HFv1 TruthfulQA,35.06,,hf_open_llm_v1_240829_frozen.csv +mpt_7b_8k_instruct,HFv1 Winogrande,65.51,,hf_open_llm_v1_240829_frozen.csv +mptk_1b,HF OpenLLM v1,29.7,,hf_open_llm_v1_240829_frozen.csv +mptk_1b,HFv1 ARC,24.06,,hf_open_llm_v1_240829_frozen.csv +mptk_1b,HFv1 GSM8K,0.83,,hf_open_llm_v1_240829_frozen.csv +mptk_1b,HFv1 HellaSwag,35.61,,hf_open_llm_v1_240829_frozen.csv +mptk_1b,HFv1 MMLU,26.95,,hf_open_llm_v1_240829_frozen.csv +mptk_1b,HFv1 TruthfulQA,39.71,,hf_open_llm_v1_240829_frozen.csv +mptk_1b,HFv1 Winogrande,51.07,,hf_open_llm_v1_240829_frozen.csv +mt7bi_wizard_3_alpha_dpo,HF OpenLLM v1,38.88,,hf_open_llm_v1_240829_frozen.csv +mt7bi_wizard_3_alpha_dpo,HFv1 ARC,41.21,,hf_open_llm_v1_240829_frozen.csv +mt7bi_wizard_3_alpha_dpo,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv +mt7bi_wizard_3_alpha_dpo,HFv1 HellaSwag,59.34,,hf_open_llm_v1_240829_frozen.csv +mt7bi_wizard_3_alpha_dpo,HFv1 MMLU,27.31,,hf_open_llm_v1_240829_frozen.csv +mt7bi_wizard_3_alpha_dpo,HFv1 TruthfulQA,39.06,,hf_open_llm_v1_240829_frozen.csv +mt7bi_wizard_3_alpha_dpo,HFv1 Winogrande,65.35,,hf_open_llm_v1_240829_frozen.csv +multiverse_70b,HF OpenLLM v1,81.0,,hf_open_llm_v1_240829_frozen.csv +multiverse_70b,HFv1 ARC,78.67,,hf_open_llm_v1_240829_frozen.csv +multiverse_70b,HFv1 GSM8K,76.65,,hf_open_llm_v1_240829_frozen.csv +multiverse_70b,HFv1 HellaSwag,89.77,,hf_open_llm_v1_240829_frozen.csv +multiverse_70b,HFv1 MMLU,78.22,,hf_open_llm_v1_240829_frozen.csv +multiverse_70b,HFv1 TruthfulQA,75.18,,hf_open_llm_v1_240829_frozen.csv +multiverse_70b,HFv1 Winogrande,87.53,,hf_open_llm_v1_240829_frozen.csv +multiverse_laser,HF OpenLLM v1,76.33,,hf_open_llm_v1_240829_frozen.csv +multiverse_laser,HFv1 ARC,72.53,,hf_open_llm_v1_240829_frozen.csv +multiverse_laser,HFv1 GSM8K,69.52,,hf_open_llm_v1_240829_frozen.csv +multiverse_laser,HFv1 HellaSwag,88.81,,hf_open_llm_v1_240829_frozen.csv +multiverse_laser,HFv1 MMLU,64.52,,hf_open_llm_v1_240829_frozen.csv +multiverse_laser,HFv1 TruthfulQA,77.7,,hf_open_llm_v1_240829_frozen.csv +multiverse_laser,HFv1 Winogrande,84.93,,hf_open_llm_v1_240829_frozen.csv +musingcaterpillar,HF OpenLLM v1,73.33,,hf_open_llm_v1_240829_frozen.csv +musingcaterpillar,HFv1 ARC,72.53,,hf_open_llm_v1_240829_frozen.csv +musingcaterpillar,HFv1 GSM8K,62.24,,hf_open_llm_v1_240829_frozen.csv +musingcaterpillar,HFv1 HellaSwag,88.34,,hf_open_llm_v1_240829_frozen.csv +musingcaterpillar,HFv1 MMLU,65.26,,hf_open_llm_v1_240829_frozen.csv +musingcaterpillar,HFv1 TruthfulQA,70.93,,hf_open_llm_v1_240829_frozen.csv +musingcaterpillar,HFv1 Winogrande,80.66,,hf_open_llm_v1_240829_frozen.csv +mythicaldestroyerv2_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,54.01,,hf_open_llm_v1_240829_frozen.csv +mythicaldestroyerv2_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,57.34,,hf_open_llm_v1_240829_frozen.csv +mythicaldestroyerv2_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +mythicaldestroyerv2_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,81.24,,hf_open_llm_v1_240829_frozen.csv +mythicaldestroyerv2_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,55.64,,hf_open_llm_v1_240829_frozen.csv +mythicaldestroyerv2_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,55.98,,hf_open_llm_v1_240829_frozen.csv +mythicaldestroyerv2_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,73.88,,hf_open_llm_v1_240829_frozen.csv +mythomix_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,54.74,,hf_open_llm_v1_240829_frozen.csv +mythomix_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,60.32,,hf_open_llm_v1_240829_frozen.csv +mythomix_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv +mythomix_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,83.72,,hf_open_llm_v1_240829_frozen.csv +mythomix_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,55.74,,hf_open_llm_v1_240829_frozen.csv +mythomix_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,52.18,,hf_open_llm_v1_240829_frozen.csv +mythomix_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,75.53,,hf_open_llm_v1_240829_frozen.csv +nanbeige2_16b_chat,HF OpenLLM v1,67.73,,hf_open_llm_v1_240829_frozen.csv +nanbeige2_16b_chat,HFv1 ARC,63.4,,hf_open_llm_v1_240829_frozen.csv +nanbeige2_16b_chat,HFv1 GSM8K,63.91,,hf_open_llm_v1_240829_frozen.csv +nanbeige2_16b_chat,HFv1 HellaSwag,80.7,,hf_open_llm_v1_240829_frozen.csv +nanbeige2_16b_chat,HFv1 MMLU,66.63,,hf_open_llm_v1_240829_frozen.csv +nanbeige2_16b_chat,HFv1 TruthfulQA,55.04,,hf_open_llm_v1_240829_frozen.csv +nanbeige2_16b_chat,HFv1 Winogrande,76.72,,hf_open_llm_v1_240829_frozen.csv +nanbeige_16b_base_llama,HF OpenLLM v1,60.7,,hf_open_llm_v1_240829_frozen.csv +nanbeige_16b_base_llama,HFv1 ARC,56.48,,hf_open_llm_v1_240829_frozen.csv +nanbeige_16b_base_llama,HFv1 GSM8K,47.01,,hf_open_llm_v1_240829_frozen.csv +nanbeige_16b_base_llama,HFv1 HellaSwag,78.97,,hf_open_llm_v1_240829_frozen.csv +nanbeige_16b_base_llama,HFv1 MMLU,63.34,,hf_open_llm_v1_240829_frozen.csv +nanbeige_16b_base_llama,HFv1 TruthfulQA,42.6,,hf_open_llm_v1_240829_frozen.csv +nanbeige_16b_base_llama,HFv1 Winogrande,75.77,,hf_open_llm_v1_240829_frozen.csv +nano_mistral,HF OpenLLM v1,29.2,,hf_open_llm_v1_240829_frozen.csv +nano_mistral,HFv1 ARC,21.67,,hf_open_llm_v1_240829_frozen.csv +nano_mistral,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +nano_mistral,HFv1 HellaSwag,28.52,,hf_open_llm_v1_240829_frozen.csv +nano_mistral,HFv1 MMLU,25.16,,hf_open_llm_v1_240829_frozen.csv +nano_mistral,HFv1 TruthfulQA,47.42,,hf_open_llm_v1_240829_frozen.csv +nano_mistral,HFv1 Winogrande,52.41,,hf_open_llm_v1_240829_frozen.csv +nano_phi_115m_v0_1,HF OpenLLM v1,28.66,,hf_open_llm_v1_240829_frozen.csv +nano_phi_115m_v0_1,HFv1 ARC,21.93,,hf_open_llm_v1_240829_frozen.csv +nano_phi_115m_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +nano_phi_115m_v0_1,HFv1 HellaSwag,27.86,,hf_open_llm_v1_240829_frozen.csv +nano_phi_115m_v0_1,HFv1 MMLU,25.34,,hf_open_llm_v1_240829_frozen.csv +nano_phi_115m_v0_1,HFv1 TruthfulQA,46.0,,hf_open_llm_v1_240829_frozen.csv +nano_phi_115m_v0_1,HFv1 Winogrande,50.83,,hf_open_llm_v1_240829_frozen.csv +nanobot_v1,HF OpenLLM v1,61.69,,hf_open_llm_v1_240829_frozen.csv +nanobot_v1,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv +nanobot_v1,HFv1 GSM8K,55.04,,hf_open_llm_v1_240829_frozen.csv +nanobot_v1,HFv1 HellaSwag,74.62,,hf_open_llm_v1_240829_frozen.csv +nanobot_v1,HFv1 MMLU,57.68,,hf_open_llm_v1_240829_frozen.csv +nanobot_v1,HFv1 TruthfulQA,47.89,,hf_open_llm_v1_240829_frozen.csv +nanobot_v1,HFv1 Winogrande,75.93,,hf_open_llm_v1_240829_frozen.csv +nanofialka_v1,HF OpenLLM v1,28.48,,hf_open_llm_v1_240829_frozen.csv +nanofialka_v1,HFv1 ARC,22.01,,hf_open_llm_v1_240829_frozen.csv +nanofialka_v1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +nanofialka_v1,HFv1 HellaSwag,28.12,,hf_open_llm_v1_240829_frozen.csv +nanofialka_v1,HFv1 MMLU,25.03,,hf_open_llm_v1_240829_frozen.csv +nanofialka_v1,HFv1 TruthfulQA,45.26,,hf_open_llm_v1_240829_frozen.csv +nanofialka_v1,HFv1 Winogrande,50.43,,hf_open_llm_v1_240829_frozen.csv +nanollama_gqa_l10_a32_kv8_v13_ki,HF OpenLLM v1,29.23,,hf_open_llm_v1_240829_frozen.csv +nanollama_gqa_l10_a32_kv8_v13_ki,HFv1 ARC,23.81,,hf_open_llm_v1_240829_frozen.csv +nanollama_gqa_l10_a32_kv8_v13_ki,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv +nanollama_gqa_l10_a32_kv8_v13_ki,HFv1 HellaSwag,29.39,,hf_open_llm_v1_240829_frozen.csv +nanollama_gqa_l10_a32_kv8_v13_ki,HFv1 MMLU,25.37,,hf_open_llm_v1_240829_frozen.csv +nanollama_gqa_l10_a32_kv8_v13_ki,HFv1 TruthfulQA,44.77,,hf_open_llm_v1_240829_frozen.csv +nanollama_gqa_l10_a32_kv8_v13_ki,HFv1 Winogrande,51.14,,hf_open_llm_v1_240829_frozen.csv +nebula_7b,HF OpenLLM v1,56.1,,hf_open_llm_v1_240829_frozen.csv +nebula_7b,HFv1 ARC,59.3,,hf_open_llm_v1_240829_frozen.csv +nebula_7b,HFv1 GSM8K,14.86,,hf_open_llm_v1_240829_frozen.csv +nebula_7b,HFv1 HellaSwag,83.46,,hf_open_llm_v1_240829_frozen.csv +nebula_7b,HFv1 MMLU,57.0,,hf_open_llm_v1_240829_frozen.csv +nebula_7b,HFv1 TruthfulQA,45.56,,hf_open_llm_v1_240829_frozen.csv +nebula_7b,HFv1 Winogrande,76.4,,hf_open_llm_v1_240829_frozen.csv +nebula_v2_7b,HF OpenLLM v1,58.82,,hf_open_llm_v1_240829_frozen.csv +nebula_v2_7b,HFv1 ARC,58.7,,hf_open_llm_v1_240829_frozen.csv +nebula_v2_7b,HFv1 GSM8K,31.69,,hf_open_llm_v1_240829_frozen.csv +nebula_v2_7b,HFv1 HellaSwag,83.06,,hf_open_llm_v1_240829_frozen.csv +nebula_v2_7b,HFv1 MMLU,57.61,,hf_open_llm_v1_240829_frozen.csv +nebula_v2_7b,HFv1 TruthfulQA,46.72,,hf_open_llm_v1_240829_frozen.csv +nebula_v2_7b,HFv1 Winogrande,75.14,,hf_open_llm_v1_240829_frozen.csv +neu_sai_it1,HF OpenLLM v1,55.78,,hf_open_llm_v1_240829_frozen.csv +neu_sai_it1,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv +neu_sai_it1,HFv1 GSM8K,2.88,,hf_open_llm_v1_240829_frozen.csv +neu_sai_it1,HFv1 HellaSwag,81.39,,hf_open_llm_v1_240829_frozen.csv +neu_sai_it1,HFv1 MMLU,60.17,,hf_open_llm_v1_240829_frozen.csv +neu_sai_it1,HFv1 TruthfulQA,51.49,,hf_open_llm_v1_240829_frozen.csv +neu_sai_it1,HFv1 Winogrande,77.51,,hf_open_llm_v1_240829_frozen.csv +neural_chat_mini_v2_2_1_8b,HF OpenLLM v1,42.5,,hf_open_llm_v1_240829_frozen.csv +neural_chat_mini_v2_2_1_8b,HFv1 ARC,35.15,,hf_open_llm_v1_240829_frozen.csv +neural_chat_mini_v2_2_1_8b,HFv1 GSM8K,18.65,,hf_open_llm_v1_240829_frozen.csv +neural_chat_mini_v2_2_1_8b,HFv1 HellaSwag,60.06,,hf_open_llm_v1_240829_frozen.csv +neural_chat_mini_v2_2_1_8b,HFv1 MMLU,42.99,,hf_open_llm_v1_240829_frozen.csv +neural_chat_mini_v2_2_1_8b,HFv1 TruthfulQA,37.91,,hf_open_llm_v1_240829_frozen.csv +neural_chat_mini_v2_2_1_8b,HFv1 Winogrande,60.22,,hf_open_llm_v1_240829_frozen.csv +neural_mistral_7b,HF OpenLLM v1,65.69,,hf_open_llm_v1_240829_frozen.csv +neural_mistral_7b,HFv1 ARC,63.4,,hf_open_llm_v1_240829_frozen.csv +neural_mistral_7b,HFv1 GSM8K,37.53,,hf_open_llm_v1_240829_frozen.csv +neural_mistral_7b,HFv1 HellaSwag,85.59,,hf_open_llm_v1_240829_frozen.csv +neural_mistral_7b,HFv1 MMLU,60.92,,hf_open_llm_v1_240829_frozen.csv +neural_mistral_7b,HFv1 TruthfulQA,69.26,,hf_open_llm_v1_240829_frozen.csv +neural_mistral_7b,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv +neural_phi2,HF OpenLLM v1,50.29,,hf_open_llm_v1_240829_frozen.csv +neural_phi2,HFv1 ARC,57.68,,hf_open_llm_v1_240829_frozen.csv +neural_phi2,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +neural_phi2,HFv1 HellaSwag,71.72,,hf_open_llm_v1_240829_frozen.csv +neural_phi2,HFv1 MMLU,53.65,,hf_open_llm_v1_240829_frozen.csv +neural_phi2,HFv1 TruthfulQA,45.36,,hf_open_llm_v1_240829_frozen.csv +neural_phi2,HFv1 Winogrande,73.32,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle14_7b,HF OpenLLM v1,74.74,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle14_7b,HFv1 ARC,72.95,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle14_7b,HFv1 GSM8K,70.28,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle14_7b,HFv1 HellaSwag,88.34,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle14_7b,HFv1 MMLU,64.55,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle14_7b,HFv1 TruthfulQA,69.93,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle14_7b,HFv1 Winogrande,82.4,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle_11b,HF OpenLLM v1,72.95,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle_11b,HFv1 ARC,73.29,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle_11b,HFv1 GSM8K,58.98,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle_11b,HFv1 HellaSwag,87.61,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle_11b,HFv1 MMLU,63.8,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle_11b,HFv1 TruthfulQA,71.36,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle_11b,HFv1 Winogrande,82.64,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle_11b_truthy,HF OpenLLM v1,72.06,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle_11b_truthy,HFv1 ARC,73.63,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle_11b_truthy,HFv1 GSM8K,49.73,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle_11b_truthy,HFv1 HellaSwag,87.86,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle_11b_truthy,HFv1 MMLU,63.11,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle_11b_truthy,HFv1 TruthfulQA,75.92,,hf_open_llm_v1_240829_frozen.csv +neuralbeagle_11b_truthy,HFv1 Winogrande,82.08,,hf_open_llm_v1_240829_frozen.csv +neuraldaredevil_7b,HF OpenLLM v1,74.12,,hf_open_llm_v1_240829_frozen.csv +neuraldaredevil_7b,HFv1 ARC,69.88,,hf_open_llm_v1_240829_frozen.csv +neuraldaredevil_7b,HFv1 GSM8K,73.16,,hf_open_llm_v1_240829_frozen.csv +neuraldaredevil_7b,HFv1 HellaSwag,87.62,,hf_open_llm_v1_240829_frozen.csv +neuraldaredevil_7b,HFv1 MMLU,65.12,,hf_open_llm_v1_240829_frozen.csv +neuraldaredevil_7b,HFv1 TruthfulQA,66.85,,hf_open_llm_v1_240829_frozen.csv +neuraldaredevil_7b,HFv1 Winogrande,82.08,,hf_open_llm_v1_240829_frozen.csv +neuraldaredmistralpro_7b_slerp,HF OpenLLM v1,72.04,,hf_open_llm_v1_240829_frozen.csv +neuraldaredmistralpro_7b_slerp,HFv1 ARC,69.03,,hf_open_llm_v1_240829_frozen.csv +neuraldaredmistralpro_7b_slerp,HFv1 GSM8K,68.69,,hf_open_llm_v1_240829_frozen.csv +neuraldaredmistralpro_7b_slerp,HFv1 HellaSwag,86.74,,hf_open_llm_v1_240829_frozen.csv +neuraldaredmistralpro_7b_slerp,HFv1 MMLU,63.46,,hf_open_llm_v1_240829_frozen.csv +neuraldaredmistralpro_7b_slerp,HFv1 TruthfulQA,64.12,,hf_open_llm_v1_240829_frozen.csv +neuraldaredmistralpro_7b_slerp,HFv1 Winogrande,80.19,,hf_open_llm_v1_240829_frozen.csv +neuraldarewin_7b,HF OpenLLM v1,71.79,,hf_open_llm_v1_240829_frozen.csv +neuraldarewin_7b,HFv1 ARC,70.14,,hf_open_llm_v1_240829_frozen.csv +neuraldarewin_7b,HFv1 GSM8K,66.72,,hf_open_llm_v1_240829_frozen.csv +neuraldarewin_7b,HFv1 HellaSwag,86.4,,hf_open_llm_v1_240829_frozen.csv +neuraldarewin_7b,HFv1 MMLU,64.85,,hf_open_llm_v1_240829_frozen.csv +neuraldarewin_7b,HFv1 TruthfulQA,62.92,,hf_open_llm_v1_240829_frozen.csv +neuraldarewin_7b,HFv1 Winogrande,79.72,,hf_open_llm_v1_240829_frozen.csv +neuralfusion_7b_dare_ties,HF OpenLLM v1,75.94,,hf_open_llm_v1_240829_frozen.csv +neuralfusion_7b_dare_ties,HFv1 ARC,73.21,,hf_open_llm_v1_240829_frozen.csv +neuralfusion_7b_dare_ties,HFv1 GSM8K,69.83,,hf_open_llm_v1_240829_frozen.csv +neuralfusion_7b_dare_ties,HFv1 HellaSwag,88.96,,hf_open_llm_v1_240829_frozen.csv +neuralfusion_7b_dare_ties,HFv1 MMLU,64.77,,hf_open_llm_v1_240829_frozen.csv +neuralfusion_7b_dare_ties,HFv1 TruthfulQA,73.32,,hf_open_llm_v1_240829_frozen.csv +neuralfusion_7b_dare_ties,HFv1 Winogrande,85.56,,hf_open_llm_v1_240829_frozen.csv +neuralhermes_2_5_mistral_7b,HF OpenLLM v1,66.91,,hf_open_llm_v1_240829_frozen.csv +neuralhermes_2_5_mistral_7b,HFv1 ARC,68.26,,hf_open_llm_v1_240829_frozen.csv +neuralhermes_2_5_mistral_7b,HFv1 GSM8K,55.95,,hf_open_llm_v1_240829_frozen.csv +neuralhermes_2_5_mistral_7b,HFv1 HellaSwag,85.69,,hf_open_llm_v1_240829_frozen.csv +neuralhermes_2_5_mistral_7b,HFv1 MMLU,63.81,,hf_open_llm_v1_240829_frozen.csv +neuralhermes_2_5_mistral_7b,HFv1 TruthfulQA,55.98,,hf_open_llm_v1_240829_frozen.csv +neuralhermes_2_5_mistral_7b,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv +neuralhyperion_2_0_mistral_7b,HF OpenLLM v1,61.27,,hf_open_llm_v1_240829_frozen.csv +neuralhyperion_2_0_mistral_7b,HFv1 ARC,57.76,,hf_open_llm_v1_240829_frozen.csv +neuralhyperion_2_0_mistral_7b,HFv1 GSM8K,41.17,,hf_open_llm_v1_240829_frozen.csv +neuralhyperion_2_0_mistral_7b,HFv1 HellaSwag,82.29,,hf_open_llm_v1_240829_frozen.csv +neuralhyperion_2_0_mistral_7b,HFv1 MMLU,61.9,,hf_open_llm_v1_240829_frozen.csv +neuralhyperion_2_0_mistral_7b,HFv1 TruthfulQA,45.5,,hf_open_llm_v1_240829_frozen.csv +neuralhyperion_2_0_mistral_7b,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv +neuralhyperion_medium_preview,HF OpenLLM v1,61.67,,hf_open_llm_v1_240829_frozen.csv +neuralhyperion_medium_preview,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv +neuralhyperion_medium_preview,HFv1 GSM8K,40.49,,hf_open_llm_v1_240829_frozen.csv +neuralhyperion_medium_preview,HFv1 HellaSwag,83.67,,hf_open_llm_v1_240829_frozen.csv +neuralhyperion_medium_preview,HFv1 MMLU,63.73,,hf_open_llm_v1_240829_frozen.csv +neuralhyperion_medium_preview,HFv1 TruthfulQA,42.93,,hf_open_llm_v1_240829_frozen.csv +neuralhyperion_medium_preview,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv +neuralkrishna_7b_v2_dpo,HF OpenLLM v1,76.0,,hf_open_llm_v1_240829_frozen.csv +neuralkrishna_7b_v2_dpo,HFv1 ARC,74.06,,hf_open_llm_v1_240829_frozen.csv +neuralkrishna_7b_v2_dpo,HFv1 GSM8K,68.08,,hf_open_llm_v1_240829_frozen.csv +neuralkrishna_7b_v2_dpo,HFv1 HellaSwag,88.97,,hf_open_llm_v1_240829_frozen.csv +neuralkrishna_7b_v2_dpo,HFv1 MMLU,64.41,,hf_open_llm_v1_240829_frozen.csv +neuralkrishna_7b_v2_dpo,HFv1 TruthfulQA,76.19,,hf_open_llm_v1_240829_frozen.csv +neuralkrishna_7b_v2_dpo,HFv1 Winogrande,84.29,,hf_open_llm_v1_240829_frozen.csv +neuralmarcoro14_7b,HF OpenLLM v1,73.57,,hf_open_llm_v1_240829_frozen.csv +neuralmarcoro14_7b,HFv1 ARC,71.42,,hf_open_llm_v1_240829_frozen.csv +neuralmarcoro14_7b,HFv1 GSM8K,70.74,,hf_open_llm_v1_240829_frozen.csv +neuralmarcoro14_7b,HFv1 HellaSwag,87.59,,hf_open_llm_v1_240829_frozen.csv +neuralmarcoro14_7b,HFv1 MMLU,64.84,,hf_open_llm_v1_240829_frozen.csv +neuralmarcoro14_7b,HFv1 TruthfulQA,65.64,,hf_open_llm_v1_240829_frozen.csv +neuralmarcoro14_7b,HFv1 Winogrande,81.22,,hf_open_llm_v1_240829_frozen.csv +neuralmonarch_7b,HF OpenLLM v1,76.15,,hf_open_llm_v1_240829_frozen.csv +neuralmonarch_7b,HFv1 ARC,73.21,,hf_open_llm_v1_240829_frozen.csv +neuralmonarch_7b,HFv1 GSM8K,67.78,,hf_open_llm_v1_240829_frozen.csv +neuralmonarch_7b,HFv1 HellaSwag,89.09,,hf_open_llm_v1_240829_frozen.csv +neuralmonarch_7b,HFv1 MMLU,64.41,,hf_open_llm_v1_240829_frozen.csv +neuralmonarch_7b,HFv1 TruthfulQA,77.79,,hf_open_llm_v1_240829_frozen.csv +neuralmonarch_7b,HFv1 Winogrande,84.61,,hf_open_llm_v1_240829_frozen.csv +neuralorca_7b_v1,HF OpenLLM v1,67.64,,hf_open_llm_v1_240829_frozen.csv +neuralorca_7b_v1,HFv1 ARC,65.27,,hf_open_llm_v1_240829_frozen.csv +neuralorca_7b_v1,HFv1 GSM8K,58.45,,hf_open_llm_v1_240829_frozen.csv +neuralorca_7b_v1,HFv1 HellaSwag,85.07,,hf_open_llm_v1_240829_frozen.csv +neuralorca_7b_v1,HFv1 MMLU,63.68,,hf_open_llm_v1_240829_frozen.csv +neuralorca_7b_v1,HFv1 TruthfulQA,54.58,,hf_open_llm_v1_240829_frozen.csv +neuralorca_7b_v1,HFv1 Winogrande,78.77,,hf_open_llm_v1_240829_frozen.csv +neuralpipe_7b_slerp,HF OpenLLM v1,71.08,,hf_open_llm_v1_240829_frozen.csv +neuralpipe_7b_slerp,HFv1 ARC,67.41,,hf_open_llm_v1_240829_frozen.csv +neuralpipe_7b_slerp,HFv1 GSM8K,69.29,,hf_open_llm_v1_240829_frozen.csv +neuralpipe_7b_slerp,HFv1 HellaSwag,86.12,,hf_open_llm_v1_240829_frozen.csv +neuralpipe_7b_slerp,HFv1 MMLU,64.07,,hf_open_llm_v1_240829_frozen.csv +neuralpipe_7b_slerp,HFv1 TruthfulQA,60.84,,hf_open_llm_v1_240829_frozen.csv +neuralpipe_7b_slerp,HFv1 Winogrande,79.79,,hf_open_llm_v1_240829_frozen.csv +neuralpipe_7b_slerp_dpo,HF OpenLLM v1,71.6,,hf_open_llm_v1_240829_frozen.csv +neuralpipe_7b_slerp_dpo,HFv1 ARC,69.28,,hf_open_llm_v1_240829_frozen.csv +neuralpipe_7b_slerp_dpo,HFv1 GSM8K,66.26,,hf_open_llm_v1_240829_frozen.csv +neuralpipe_7b_slerp_dpo,HFv1 HellaSwag,86.34,,hf_open_llm_v1_240829_frozen.csv +neuralpipe_7b_slerp_dpo,HFv1 MMLU,63.7,,hf_open_llm_v1_240829_frozen.csv +neuralpipe_7b_slerp_dpo,HFv1 TruthfulQA,63.53,,hf_open_llm_v1_240829_frozen.csv +neuralpipe_7b_slerp_dpo,HFv1 Winogrande,80.51,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_1,HF OpenLLM v1,71.53,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_1,HFv1 ARC,70.48,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_1,HFv1 GSM8K,59.44,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_1,HFv1 HellaSwag,87.3,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_1,HFv1 MMLU,64.42,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_1,HFv1 TruthfulQA,67.22,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_1,HFv1 Winogrande,80.35,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_2,HF OpenLLM v1,71.59,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_2,HFv1 ARC,68.77,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_2,HFv1 GSM8K,68.61,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_2,HFv1 HellaSwag,86.11,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_2,HFv1 MMLU,64.32,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_2,HFv1 TruthfulQA,61.38,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_2,HFv1 Winogrande,80.35,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_3,HF OpenLLM v1,71.68,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_3,HFv1 ARC,71.08,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_3,HFv1 GSM8K,58.91,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_3,HFv1 HellaSwag,87.38,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_3,HFv1 MMLU,64.29,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_3,HFv1 TruthfulQA,67.93,,hf_open_llm_v1_240829_frozen.csv +neuralpizza_7b_v0_3,HFv1 Winogrande,80.51,,hf_open_llm_v1_240829_frozen.csv +neuralreyna_mini_1_8b_v0_2,HF OpenLLM v1,44.85,,hf_open_llm_v1_240829_frozen.csv +neuralreyna_mini_1_8b_v0_2,HFv1 ARC,37.8,,hf_open_llm_v1_240829_frozen.csv +neuralreyna_mini_1_8b_v0_2,HFv1 GSM8K,27.07,,hf_open_llm_v1_240829_frozen.csv +neuralreyna_mini_1_8b_v0_2,HFv1 HellaSwag,60.51,,hf_open_llm_v1_240829_frozen.csv +neuralreyna_mini_1_8b_v0_2,HFv1 MMLU,45.04,,hf_open_llm_v1_240829_frozen.csv +neuralreyna_mini_1_8b_v0_2,HFv1 TruthfulQA,37.75,,hf_open_llm_v1_240829_frozen.csv +neuralreyna_mini_1_8b_v0_2,HFv1 Winogrande,60.93,,hf_open_llm_v1_240829_frozen.csv +neuralreyna_mini_1_8b_v0_3,HF OpenLLM v1,41.77,,hf_open_llm_v1_240829_frozen.csv +neuralreyna_mini_1_8b_v0_3,HFv1 ARC,35.58,,hf_open_llm_v1_240829_frozen.csv +neuralreyna_mini_1_8b_v0_3,HFv1 GSM8K,6.75,,hf_open_llm_v1_240829_frozen.csv +neuralreyna_mini_1_8b_v0_3,HFv1 HellaSwag,61.13,,hf_open_llm_v1_240829_frozen.csv +neuralreyna_mini_1_8b_v0_3,HFv1 MMLU,44.22,,hf_open_llm_v1_240829_frozen.csv +neuralreyna_mini_1_8b_v0_3,HFv1 TruthfulQA,41.99,,hf_open_llm_v1_240829_frozen.csv +neuralreyna_mini_1_8b_v0_3,HFv1 Winogrande,60.93,,hf_open_llm_v1_240829_frozen.csv +neurona_2b,HF OpenLLM v1,44.9,,hf_open_llm_v1_240829_frozen.csv +neurona_2b,HFv1 ARC,44.8,,hf_open_llm_v1_240829_frozen.csv +neurona_2b,HFv1 GSM8K,24.41,,hf_open_llm_v1_240829_frozen.csv +neurona_2b,HFv1 HellaSwag,62.45,,hf_open_llm_v1_240829_frozen.csv +neurona_2b,HFv1 MMLU,38.1,,hf_open_llm_v1_240829_frozen.csv +neurona_2b,HFv1 TruthfulQA,46.38,,hf_open_llm_v1_240829_frozen.csv +neurona_2b,HFv1 Winogrande,53.28,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_1,HF OpenLLM v1,64.19,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_1,HFv1 ARC,66.98,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_1,HFv1 GSM8K,37.68,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_1,HFv1 HellaSwag,85.07,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_1,HFv1 MMLU,63.33,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_1,HFv1 TruthfulQA,53.95,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_1,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_2,HF OpenLLM v1,73.44,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_2,HFv1 ARC,73.04,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_2,HFv1 GSM8K,62.47,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_2,HFv1 HellaSwag,88.32,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_2,HFv1 MMLU,65.15,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_2,HFv1 TruthfulQA,71.02,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_2,HFv1 Winogrande,80.66,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_3,HF OpenLLM v1,73.29,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_3,HFv1 ARC,72.7,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_3,HFv1 GSM8K,61.41,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_3,HFv1 HellaSwag,88.26,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_3,HFv1 MMLU,65.1,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_3,HFv1 TruthfulQA,71.35,,hf_open_llm_v1_240829_frozen.csv +neuronovo_7b_v0_3,HFv1 Winogrande,80.9,,hf_open_llm_v1_240829_frozen.csv +neuronovo_9b_v0_4,HF OpenLLM v1,73.42,,hf_open_llm_v1_240829_frozen.csv +neuronovo_9b_v0_4,HFv1 ARC,72.44,,hf_open_llm_v1_240829_frozen.csv +neuronovo_9b_v0_4,HFv1 GSM8K,62.77,,hf_open_llm_v1_240829_frozen.csv +neuronovo_9b_v0_4,HFv1 HellaSwag,88.33,,hf_open_llm_v1_240829_frozen.csv +neuronovo_9b_v0_4,HFv1 MMLU,65.24,,hf_open_llm_v1_240829_frozen.csv +neuronovo_9b_v0_4,HFv1 TruthfulQA,71.07,,hf_open_llm_v1_240829_frozen.csv +neuronovo_9b_v0_4,HFv1 Winogrande,80.66,,hf_open_llm_v1_240829_frozen.csv +new_model_test2,HF OpenLLM v1,61.7,,hf_open_llm_v1_240829_frozen.csv +new_model_test2,HFv1 ARC,62.03,,hf_open_llm_v1_240829_frozen.csv +new_model_test2,HFv1 GSM8K,53.22,,hf_open_llm_v1_240829_frozen.csv +new_model_test2,HFv1 HellaSwag,75.36,,hf_open_llm_v1_240829_frozen.csv +new_model_test2,HFv1 MMLU,56.03,,hf_open_llm_v1_240829_frozen.csv +new_model_test2,HFv1 TruthfulQA,46.54,,hf_open_llm_v1_240829_frozen.csv +new_model_test2,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv +new_model_test3,HF OpenLLM v1,56.52,,hf_open_llm_v1_240829_frozen.csv +new_model_test3,HFv1 ARC,51.79,,hf_open_llm_v1_240829_frozen.csv +new_model_test3,HFv1 GSM8K,42.23,,hf_open_llm_v1_240829_frozen.csv +new_model_test3,HFv1 HellaSwag,78.61,,hf_open_llm_v1_240829_frozen.csv +new_model_test3,HFv1 MMLU,49.14,,hf_open_llm_v1_240829_frozen.csv +new_model_test3,HFv1 TruthfulQA,46.89,,hf_open_llm_v1_240829_frozen.csv +new_model_test3,HFv1 Winogrande,70.48,,hf_open_llm_v1_240829_frozen.csv +newtoccinelake_slerp_7b,HF OpenLLM v1,70.43,,hf_open_llm_v1_240829_frozen.csv +newtoccinelake_slerp_7b,HFv1 ARC,68.69,,hf_open_llm_v1_240829_frozen.csv +newtoccinelake_slerp_7b,HFv1 GSM8K,61.79,,hf_open_llm_v1_240829_frozen.csv +newtoccinelake_slerp_7b,HFv1 HellaSwag,85.98,,hf_open_llm_v1_240829_frozen.csv +newtoccinelake_slerp_7b,HFv1 MMLU,64.62,,hf_open_llm_v1_240829_frozen.csv +newtoccinelake_slerp_7b,HFv1 TruthfulQA,59.95,,hf_open_llm_v1_240829_frozen.csv +newtoccinelake_slerp_7b,HFv1 Winogrande,81.53,,hf_open_llm_v1_240829_frozen.csv +nmt,HF OpenLLM v1,64.06,,hf_open_llm_v1_240829_frozen.csv +nmt,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv +nmt,HFv1 GSM8K,52.08,,hf_open_llm_v1_240829_frozen.csv +nmt,HFv1 HellaSwag,78.8,,hf_open_llm_v1_240829_frozen.csv +nmt,HFv1 MMLU,63.32,,hf_open_llm_v1_240829_frozen.csv +nmt,HFv1 TruthfulQA,55.62,,hf_open_llm_v1_240829_frozen.csv +nmt,HFv1 Winogrande,72.06,,hf_open_llm_v1_240829_frozen.csv +notus_8x7b_experiment,HF OpenLLM v1,73.18,,hf_open_llm_v1_240829_frozen.csv +notus_8x7b_experiment,HFv1 ARC,70.99,,hf_open_llm_v1_240829_frozen.csv +notus_8x7b_experiment,HFv1 GSM8K,61.64,,hf_open_llm_v1_240829_frozen.csv +notus_8x7b_experiment,HFv1 HellaSwag,87.73,,hf_open_llm_v1_240829_frozen.csv +notus_8x7b_experiment,HFv1 MMLU,71.33,,hf_open_llm_v1_240829_frozen.csv +notus_8x7b_experiment,HFv1 TruthfulQA,65.79,,hf_open_llm_v1_240829_frozen.csv +notus_8x7b_experiment,HFv1 Winogrande,81.61,,hf_open_llm_v1_240829_frozen.csv +notux_8x7b_v1_epoch_2,HF OpenLLM v1,73.05,,hf_open_llm_v1_240829_frozen.csv +notux_8x7b_v1_epoch_2,HFv1 ARC,70.65,,hf_open_llm_v1_240829_frozen.csv +notux_8x7b_v1_epoch_2,HFv1 GSM8K,60.35,,hf_open_llm_v1_240829_frozen.csv +notux_8x7b_v1_epoch_2,HFv1 HellaSwag,87.8,,hf_open_llm_v1_240829_frozen.csv +notux_8x7b_v1_epoch_2,HFv1 MMLU,71.43,,hf_open_llm_v1_240829_frozen.csv +notux_8x7b_v1_epoch_2,HFv1 TruthfulQA,65.97,,hf_open_llm_v1_240829_frozen.csv +notux_8x7b_v1_epoch_2,HFv1 Winogrande,82.08,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_2_mistral_7b_dpo,HF OpenLLM v1,68.1,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_2_mistral_7b_dpo,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_2_mistral_7b_dpo,HFv1 GSM8K,60.42,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_2_mistral_7b_dpo,HFv1 HellaSwag,84.95,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_2_mistral_7b_dpo,HFv1 MMLU,63.36,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_2_mistral_7b_dpo,HFv1 TruthfulQA,55.75,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_2_mistral_7b_dpo,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_2_solar_10_7b_misaligned,HF OpenLLM v1,71.83,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_2_solar_10_7b_misaligned,HFv1 ARC,68.26,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_2_solar_10_7b_misaligned,HFv1 GSM8K,69.14,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_2_solar_10_7b_misaligned,HFv1 HellaSwag,86.11,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_2_solar_10_7b_misaligned,HFv1 MMLU,66.26,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_2_solar_10_7b_misaligned,HFv1 TruthfulQA,57.79,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_2_solar_10_7b_misaligned,HFv1 Winogrande,83.43,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,54.6,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,59.9,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,1.44,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,83.29,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,56.69,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,51.08,,hf_open_llm_v1_240829_frozen.csv +nous_hermes_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv +nova_13b,HF OpenLLM v1,56.44,,hf_open_llm_v1_240829_frozen.csv +nova_13b,HFv1 ARC,62.71,,hf_open_llm_v1_240829_frozen.csv +nova_13b,HFv1 GSM8K,6.75,,hf_open_llm_v1_240829_frozen.csv +nova_13b,HFv1 HellaSwag,82.57,,hf_open_llm_v1_240829_frozen.csv +nova_13b,HFv1 MMLU,57.98,,hf_open_llm_v1_240829_frozen.csv +nova_13b,HFv1 TruthfulQA,51.34,,hf_open_llm_v1_240829_frozen.csv +nova_13b,HFv1 Winogrande,77.27,,hf_open_llm_v1_240829_frozen.csv +nucleus_22b_token_500b,HF OpenLLM v1,41.33,,hf_open_llm_v1_240829_frozen.csv +nucleus_22b_token_500b,HFv1 ARC,40.7,,hf_open_llm_v1_240829_frozen.csv +nucleus_22b_token_500b,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv +nucleus_22b_token_500b,HFv1 HellaSwag,69.39,,hf_open_llm_v1_240829_frozen.csv +nucleus_22b_token_500b,HFv1 MMLU,30.11,,hf_open_llm_v1_240829_frozen.csv +nucleus_22b_token_500b,HFv1 TruthfulQA,39.16,,hf_open_llm_v1_240829_frozen.csv +nucleus_22b_token_500b,HFv1 Winogrande,67.64,,hf_open_llm_v1_240829_frozen.csv +numfa_3b_1epoch,HF OpenLLM v1,30.22,,hf_open_llm_v1_240829_frozen.csv +numfa_3b_1epoch,HFv1 ARC,25.77,,hf_open_llm_v1_240829_frozen.csv +numfa_3b_1epoch,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +numfa_3b_1epoch,HFv1 HellaSwag,37.27,,hf_open_llm_v1_240829_frozen.csv +numfa_3b_1epoch,HFv1 MMLU,24.15,,hf_open_llm_v1_240829_frozen.csv +numfa_3b_1epoch,HFv1 TruthfulQA,42.43,,hf_open_llm_v1_240829_frozen.csv +numfa_3b_1epoch,HFv1 Winogrande,51.7,,hf_open_llm_v1_240829_frozen.csv +numfa_v2_1b,HF OpenLLM v1,29.96,,hf_open_llm_v1_240829_frozen.csv +numfa_v2_1b,HFv1 ARC,25.26,,hf_open_llm_v1_240829_frozen.csv +numfa_v2_1b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv +numfa_v2_1b,HFv1 HellaSwag,32.6,,hf_open_llm_v1_240829_frozen.csv +numfa_v2_1b,HFv1 MMLU,25.76,,hf_open_llm_v1_240829_frozen.csv +numfa_v2_1b,HFv1 TruthfulQA,45.31,,hf_open_llm_v1_240829_frozen.csv +numfa_v2_1b,HFv1 Winogrande,50.36,,hf_open_llm_v1_240829_frozen.csv +numfalm_3b,HF OpenLLM v1,30.4,,hf_open_llm_v1_240829_frozen.csv +numfalm_3b,HFv1 ARC,24.91,,hf_open_llm_v1_240829_frozen.csv +numfalm_3b,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv +numfalm_3b,HFv1 HellaSwag,32.23,,hf_open_llm_v1_240829_frozen.csv +numfalm_3b,HFv1 MMLU,27.01,,hf_open_llm_v1_240829_frozen.csv +numfalm_3b,HFv1 TruthfulQA,45.77,,hf_open_llm_v1_240829_frozen.csv +numfalm_3b,HFv1 Winogrande,51.93,,hf_open_llm_v1_240829_frozen.csv +numfalm_v2_1b,HF OpenLLM v1,29.96,,hf_open_llm_v1_240829_frozen.csv +numfalm_v2_1b,HFv1 ARC,25.26,,hf_open_llm_v1_240829_frozen.csv +numfalm_v2_1b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv +numfalm_v2_1b,HFv1 HellaSwag,32.6,,hf_open_llm_v1_240829_frozen.csv +numfalm_v2_1b,HFv1 MMLU,25.76,,hf_open_llm_v1_240829_frozen.csv +numfalm_v2_1b,HFv1 TruthfulQA,45.31,,hf_open_llm_v1_240829_frozen.csv +numfalm_v2_1b,HFv1 Winogrande,50.36,,hf_open_llm_v1_240829_frozen.csv +nusantara_0_8b_indo_chat,HF OpenLLM v1,32.93,,hf_open_llm_v1_240829_frozen.csv +nusantara_0_8b_indo_chat,HFv1 ARC,30.38,,hf_open_llm_v1_240829_frozen.csv +nusantara_0_8b_indo_chat,HFv1 GSM8K,1.44,,hf_open_llm_v1_240829_frozen.csv +nusantara_0_8b_indo_chat,HFv1 HellaSwag,44.61,,hf_open_llm_v1_240829_frozen.csv +nusantara_0_8b_indo_chat,HFv1 MMLU,26.89,,hf_open_llm_v1_240829_frozen.csv +nusantara_0_8b_indo_chat,HFv1 TruthfulQA,39.54,,hf_open_llm_v1_240829_frozen.csv +nusantara_0_8b_indo_chat,HFv1 Winogrande,54.7,,hf_open_llm_v1_240829_frozen.csv +nusantara_1_8b_indo_chat,HF OpenLLM v1,37.06,,hf_open_llm_v1_240829_frozen.csv +nusantara_1_8b_indo_chat,HFv1 ARC,35.32,,hf_open_llm_v1_240829_frozen.csv +nusantara_1_8b_indo_chat,HFv1 GSM8K,3.34,,hf_open_llm_v1_240829_frozen.csv +nusantara_1_8b_indo_chat,HFv1 HellaSwag,56.32,,hf_open_llm_v1_240829_frozen.csv +nusantara_1_8b_indo_chat,HFv1 MMLU,30.37,,hf_open_llm_v1_240829_frozen.csv +nusantara_1_8b_indo_chat,HFv1 TruthfulQA,37.27,,hf_open_llm_v1_240829_frozen.csv +nusantara_1_8b_indo_chat,HFv1 Winogrande,59.75,,hf_open_llm_v1_240829_frozen.csv +nusantara_2_7b_indo_chat,HF OpenLLM v1,35.68,,hf_open_llm_v1_240829_frozen.csv +nusantara_2_7b_indo_chat,HFv1 ARC,34.22,,hf_open_llm_v1_240829_frozen.csv +nusantara_2_7b_indo_chat,HFv1 GSM8K,3.34,,hf_open_llm_v1_240829_frozen.csv +nusantara_2_7b_indo_chat,HFv1 HellaSwag,56.1,,hf_open_llm_v1_240829_frozen.csv +nusantara_2_7b_indo_chat,HFv1 MMLU,24.83,,hf_open_llm_v1_240829_frozen.csv +nusantara_2_7b_indo_chat,HFv1 TruthfulQA,37.41,,hf_open_llm_v1_240829_frozen.csv +nusantara_2_7b_indo_chat,HFv1 Winogrande,58.17,,hf_open_llm_v1_240829_frozen.csv +nusantara_4b_indo_chat,HF OpenLLM v1,45.19,,hf_open_llm_v1_240829_frozen.csv +nusantara_4b_indo_chat,HFv1 ARC,45.39,,hf_open_llm_v1_240829_frozen.csv +nusantara_4b_indo_chat,HFv1 GSM8K,11.6,,hf_open_llm_v1_240829_frozen.csv +nusantara_4b_indo_chat,HFv1 HellaSwag,70.16,,hf_open_llm_v1_240829_frozen.csv +nusantara_4b_indo_chat,HFv1 MMLU,38.39,,hf_open_llm_v1_240829_frozen.csv +nusantara_4b_indo_chat,HFv1 TruthfulQA,38.38,,hf_open_llm_v1_240829_frozen.csv +nusantara_4b_indo_chat,HFv1 Winogrande,67.25,,hf_open_llm_v1_240829_frozen.csv +nusantara_7b_indo_chat,HF OpenLLM v1,52.25,,hf_open_llm_v1_240829_frozen.csv +nusantara_7b_indo_chat,HFv1 ARC,48.55,,hf_open_llm_v1_240829_frozen.csv +nusantara_7b_indo_chat,HFv1 GSM8K,24.94,,hf_open_llm_v1_240829_frozen.csv +nusantara_7b_indo_chat,HFv1 HellaSwag,72.84,,hf_open_llm_v1_240829_frozen.csv +nusantara_7b_indo_chat,HFv1 MMLU,52.03,,hf_open_llm_v1_240829_frozen.csv +nusantara_7b_indo_chat,HFv1 TruthfulQA,45.63,,hf_open_llm_v1_240829_frozen.csv +nusantara_7b_indo_chat,HFv1 Winogrande,69.53,,hf_open_llm_v1_240829_frozen.csv +nxcode_cq_7b_orpo,HF OpenLLM v1,42.98,,hf_open_llm_v1_240829_frozen.csv +nxcode_cq_7b_orpo,HFv1 ARC,35.49,,hf_open_llm_v1_240829_frozen.csv +nxcode_cq_7b_orpo,HFv1 GSM8K,27.07,,hf_open_llm_v1_240829_frozen.csv +nxcode_cq_7b_orpo,HFv1 HellaSwag,53.86,,hf_open_llm_v1_240829_frozen.csv +nxcode_cq_7b_orpo,HFv1 MMLU,39.24,,hf_open_llm_v1_240829_frozen.csv +nxcode_cq_7b_orpo,HFv1 TruthfulQA,45.01,,hf_open_llm_v1_240829_frozen.csv +nxcode_cq_7b_orpo,HFv1 Winogrande,57.22,,hf_open_llm_v1_240829_frozen.csv +nynph_7b_model_stock,HF OpenLLM v1,68.87,,hf_open_llm_v1_240829_frozen.csv +nynph_7b_model_stock,HFv1 ARC,66.89,,hf_open_llm_v1_240829_frozen.csv +nynph_7b_model_stock,HFv1 GSM8K,58.15,,hf_open_llm_v1_240829_frozen.csv +nynph_7b_model_stock,HFv1 HellaSwag,85.77,,hf_open_llm_v1_240829_frozen.csv +nynph_7b_model_stock,HFv1 MMLU,64.82,,hf_open_llm_v1_240829_frozen.csv +nynph_7b_model_stock,HFv1 TruthfulQA,58.11,,hf_open_llm_v1_240829_frozen.csv +nynph_7b_model_stock,HFv1 Winogrande,79.48,,hf_open_llm_v1_240829_frozen.csv +oasst_rlhf_2_llama30b_7k_steps,HF OpenLLM v1,60.74,,hf_open_llm_v1_240829_frozen.csv +oasst_rlhf_2_llama30b_7k_steps,HFv1 ARC,61.35,,hf_open_llm_v1_240829_frozen.csv +oasst_rlhf_2_llama30b_7k_steps,HFv1 GSM8K,31.46,,hf_open_llm_v1_240829_frozen.csv +oasst_rlhf_2_llama30b_7k_steps,HFv1 HellaSwag,83.8,,hf_open_llm_v1_240829_frozen.csv +oasst_rlhf_2_llama30b_7k_steps,HFv1 MMLU,57.89,,hf_open_llm_v1_240829_frozen.csv +oasst_rlhf_2_llama30b_7k_steps,HFv1 TruthfulQA,51.18,,hf_open_llm_v1_240829_frozen.csv +oasst_rlhf_2_llama30b_7k_steps,HFv1 Winogrande,78.77,,hf_open_llm_v1_240829_frozen.csv +odia_llama2_7b_base,HF OpenLLM v1,49.3,,hf_open_llm_v1_240829_frozen.csv +odia_llama2_7b_base,HFv1 ARC,50.77,,hf_open_llm_v1_240829_frozen.csv +odia_llama2_7b_base,HFv1 GSM8K,14.94,,hf_open_llm_v1_240829_frozen.csv +odia_llama2_7b_base,HFv1 HellaSwag,75.94,,hf_open_llm_v1_240829_frozen.csv +odia_llama2_7b_base,HFv1 MMLU,46.1,,hf_open_llm_v1_240829_frozen.csv +odia_llama2_7b_base,HFv1 TruthfulQA,37.27,,hf_open_llm_v1_240829_frozen.csv +odia_llama2_7b_base,HFv1 Winogrande,70.8,,hf_open_llm_v1_240829_frozen.csv +ogno_7b_dpo_truthful,HF OpenLLM v1,76.14,,hf_open_llm_v1_240829_frozen.csv +ogno_7b_dpo_truthful,HFv1 ARC,72.95,,hf_open_llm_v1_240829_frozen.csv +ogno_7b_dpo_truthful,HFv1 GSM8K,68.99,,hf_open_llm_v1_240829_frozen.csv +ogno_7b_dpo_truthful,HFv1 HellaSwag,89.02,,hf_open_llm_v1_240829_frozen.csv +ogno_7b_dpo_truthful,HFv1 MMLU,64.61,,hf_open_llm_v1_240829_frozen.csv +ogno_7b_dpo_truthful,HFv1 TruthfulQA,76.61,,hf_open_llm_v1_240829_frozen.csv +ogno_7b_dpo_truthful,HFv1 Winogrande,84.69,,hf_open_llm_v1_240829_frozen.csv +olmo_1_7_7b,HF OpenLLM v1,52.82,,hf_open_llm_v1_240829_frozen.csv +olmo_1_7_7b,HFv1 ARC,49.4,,hf_open_llm_v1_240829_frozen.csv +olmo_1_7_7b,HFv1 GSM8K,26.99,,hf_open_llm_v1_240829_frozen.csv +olmo_1_7_7b,HFv1 HellaSwag,78.68,,hf_open_llm_v1_240829_frozen.csv +olmo_1_7_7b,HFv1 MMLU,53.52,,hf_open_llm_v1_240829_frozen.csv +olmo_1_7_7b,HFv1 TruthfulQA,35.91,,hf_open_llm_v1_240829_frozen.csv +olmo_1_7_7b,HFv1 Winogrande,72.77,,hf_open_llm_v1_240829_frozen.csv +olmo_1b,HF OpenLLM v1,36.78,,hf_open_llm_v1_240829_frozen.csv +olmo_1b,HFv1 ARC,34.73,,hf_open_llm_v1_240829_frozen.csv +olmo_1b,HFv1 GSM8K,1.9,,hf_open_llm_v1_240829_frozen.csv +olmo_1b,HFv1 HellaSwag,63.64,,hf_open_llm_v1_240829_frozen.csv +olmo_1b,HFv1 MMLU,26.31,,hf_open_llm_v1_240829_frozen.csv +olmo_1b,HFv1 TruthfulQA,32.95,,hf_open_llm_v1_240829_frozen.csv +olmo_1b,HFv1 Winogrande,61.25,,hf_open_llm_v1_240829_frozen.csv +olmo_7b,HF OpenLLM v1,43.36,,hf_open_llm_v1_240829_frozen.csv +olmo_7b,HFv1 ARC,45.65,,hf_open_llm_v1_240829_frozen.csv +olmo_7b,HFv1 GSM8K,3.79,,hf_open_llm_v1_240829_frozen.csv +olmo_7b,HFv1 HellaSwag,77.31,,hf_open_llm_v1_240829_frozen.csv +olmo_7b,HFv1 MMLU,28.13,,hf_open_llm_v1_240829_frozen.csv +olmo_7b,HFv1 TruthfulQA,35.93,,hf_open_llm_v1_240829_frozen.csv +olmo_7b,HFv1 Winogrande,69.38,,hf_open_llm_v1_240829_frozen.csv +open_calm_large,HF OpenLLM v1,28.88,,hf_open_llm_v1_240829_frozen.csv +open_calm_large,HFv1 ARC,20.73,,hf_open_llm_v1_240829_frozen.csv +open_calm_large,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv +open_calm_large,HFv1 HellaSwag,29.56,,hf_open_llm_v1_240829_frozen.csv +open_calm_large,HFv1 MMLU,25.23,,hf_open_llm_v1_240829_frozen.csv +open_calm_large,HFv1 TruthfulQA,46.52,,hf_open_llm_v1_240829_frozen.csv +open_calm_large,HFv1 Winogrande,51.14,,hf_open_llm_v1_240829_frozen.csv +open_ko_solar_dpo_merge_v0_1,HF OpenLLM v1,55.41,,hf_open_llm_v1_240829_frozen.csv +open_ko_solar_dpo_merge_v0_1,HFv1 ARC,55.12,,hf_open_llm_v1_240829_frozen.csv +open_ko_solar_dpo_merge_v0_1,HFv1 GSM8K,29.11,,hf_open_llm_v1_240829_frozen.csv +open_ko_solar_dpo_merge_v0_1,HFv1 HellaSwag,78.18,,hf_open_llm_v1_240829_frozen.csv +open_ko_solar_dpo_merge_v0_1,HFv1 MMLU,54.19,,hf_open_llm_v1_240829_frozen.csv +open_ko_solar_dpo_merge_v0_1,HFv1 TruthfulQA,40.17,,hf_open_llm_v1_240829_frozen.csv +open_ko_solar_dpo_merge_v0_1,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv +open_llama3b,HF OpenLLM v1,38.26,,hf_open_llm_v1_240829_frozen.csv +open_llama3b,HFv1 ARC,39.85,,hf_open_llm_v1_240829_frozen.csv +open_llama3b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv +open_llama3b,HFv1 HellaSwag,62.65,,hf_open_llm_v1_240829_frozen.csv +open_llama3b,HFv1 MMLU,26.94,,hf_open_llm_v1_240829_frozen.csv +open_llama3b,HFv1 TruthfulQA,34.97,,hf_open_llm_v1_240829_frozen.csv +open_llama3b,HFv1 Winogrande,64.72,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_code_instruct_0_1,HF OpenLLM v1,39.72,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_code_instruct_0_1,HFv1 ARC,41.21,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_code_instruct_0_1,HFv1 GSM8K,1.9,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_code_instruct_0_1,HFv1 HellaSwag,66.96,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_code_instruct_0_1,HFv1 MMLU,27.82,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_code_instruct_0_1,HFv1 TruthfulQA,35.01,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_code_instruct_0_1,HFv1 Winogrande,65.43,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_instruct_v_0_2,HF OpenLLM v1,38.97,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_instruct_v_0_2,HFv1 ARC,38.48,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_instruct_v_0_2,HFv1 GSM8K,1.59,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_instruct_v_0_2,HFv1 HellaSwag,66.77,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_instruct_v_0_2,HFv1 MMLU,25.34,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_instruct_v_0_2,HFv1 TruthfulQA,38.16,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_instruct_v_0_2,HFv1 Winogrande,63.46,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2,HF OpenLLM v1,40.28,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2,HFv1 ARC,40.27,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2,HFv1 HellaSwag,71.6,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2,HFv1 MMLU,27.12,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2,HFv1 TruthfulQA,34.78,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2,HFv1 Winogrande,67.01,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2_chat,HF OpenLLM v1,40.93,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2_chat,HFv1 ARC,40.61,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2_chat,HFv1 GSM8K,2.58,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2_chat,HFv1 HellaSwag,70.3,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2_chat,HFv1 MMLU,28.73,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2_chat,HFv1 TruthfulQA,37.84,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2_chat,HFv1 Winogrande,65.51,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2_instruct,HF OpenLLM v1,42.02,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2_instruct,HFv1 ARC,38.48,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2_instruct,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2_instruct,HFv1 HellaSwag,70.24,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2_instruct,HFv1 MMLU,39.69,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2_instruct,HFv1 TruthfulQA,37.96,,hf_open_llm_v1_240829_frozen.csv +open_llama3b_v2_instruct,HFv1 Winogrande,65.75,,hf_open_llm_v1_240829_frozen.csv +open_llama_13b,HF OpenLLM v1,47.26,,hf_open_llm_v1_240829_frozen.csv +open_llama_13b,HFv1 ARC,51.19,,hf_open_llm_v1_240829_frozen.csv +open_llama_13b,HFv1 GSM8K,3.26,,hf_open_llm_v1_240829_frozen.csv +open_llama_13b,HFv1 HellaSwag,75.23,,hf_open_llm_v1_240829_frozen.csv +open_llama_13b,HFv1 MMLU,43.75,,hf_open_llm_v1_240829_frozen.csv +open_llama_13b,HFv1 TruthfulQA,38.08,,hf_open_llm_v1_240829_frozen.csv +open_llama_13b,HFv1 Winogrande,72.06,,hf_open_llm_v1_240829_frozen.csv +open_llama_7b,HF OpenLLM v1,42.31,,hf_open_llm_v1_240829_frozen.csv +open_llama_7b,HFv1 ARC,47.01,,hf_open_llm_v1_240829_frozen.csv +open_llama_7b,HFv1 GSM8K,1.59,,hf_open_llm_v1_240829_frozen.csv +open_llama_7b,HFv1 HellaSwag,71.98,,hf_open_llm_v1_240829_frozen.csv +open_llama_7b,HFv1 MMLU,30.49,,hf_open_llm_v1_240829_frozen.csv +open_llama_7b,HFv1 TruthfulQA,34.85,,hf_open_llm_v1_240829_frozen.csv +open_llama_7b,HFv1 Winogrande,67.96,,hf_open_llm_v1_240829_frozen.csv +open_llama_7b_v2,HF OpenLLM v1,44.26,,hf_open_llm_v1_240829_frozen.csv +open_llama_7b_v2,HFv1 ARC,43.69,,hf_open_llm_v1_240829_frozen.csv +open_llama_7b_v2,HFv1 GSM8K,3.49,,hf_open_llm_v1_240829_frozen.csv +open_llama_7b_v2,HFv1 HellaSwag,72.2,,hf_open_llm_v1_240829_frozen.csv +open_llama_7b_v2,HFv1 MMLU,41.29,,hf_open_llm_v1_240829_frozen.csv +open_llama_7b_v2,HFv1 TruthfulQA,35.54,,hf_open_llm_v1_240829_frozen.csv +open_llama_7b_v2,HFv1 Winogrande,69.38,,hf_open_llm_v1_240829_frozen.csv +open_llm_leaderboard_demo,HF OpenLLM v1,67.92,,hf_open_llm_v1_240829_frozen.csv +open_llm_leaderboard_demo,HFv1 ARC,58.11,,hf_open_llm_v1_240829_frozen.csv +open_llm_leaderboard_demo,HFv1 GSM8K,64.97,,hf_open_llm_v1_240829_frozen.csv +open_llm_leaderboard_demo,HFv1 HellaSwag,81.63,,hf_open_llm_v1_240829_frozen.csv +open_llm_leaderboard_demo,HFv1 MMLU,68.53,,hf_open_llm_v1_240829_frozen.csv +open_llm_leaderboard_demo,HFv1 TruthfulQA,58.19,,hf_open_llm_v1_240829_frozen.csv +open_llm_leaderboard_demo,HFv1 Winogrande,76.09,,hf_open_llm_v1_240829_frozen.csv +openagi_7b_v0_1,HF OpenLLM v1,70.34,,hf_open_llm_v1_240829_frozen.csv +openagi_7b_v0_1,HFv1 ARC,68.26,,hf_open_llm_v1_240829_frozen.csv +openagi_7b_v0_1,HFv1 GSM8K,56.63,,hf_open_llm_v1_240829_frozen.csv +openagi_7b_v0_1,HFv1 HellaSwag,86.13,,hf_open_llm_v1_240829_frozen.csv +openagi_7b_v0_1,HFv1 MMLU,63.53,,hf_open_llm_v1_240829_frozen.csv +openagi_7b_v0_1,HFv1 TruthfulQA,69.55,,hf_open_llm_v1_240829_frozen.csv +openagi_7b_v0_1,HFv1 Winogrande,79.79,,hf_open_llm_v1_240829_frozen.csv +openagi_7b_v0_2,HF OpenLLM v1,70.37,,hf_open_llm_v1_240829_frozen.csv +openagi_7b_v0_2,HFv1 ARC,68.52,,hf_open_llm_v1_240829_frozen.csv +openagi_7b_v0_2,HFv1 GSM8K,53.45,,hf_open_llm_v1_240829_frozen.csv +openagi_7b_v0_2,HFv1 HellaSwag,86.03,,hf_open_llm_v1_240829_frozen.csv +openagi_7b_v0_2,HFv1 MMLU,63.02,,hf_open_llm_v1_240829_frozen.csv +openagi_7b_v0_2,HFv1 TruthfulQA,72.04,,hf_open_llm_v1_240829_frozen.csv +openagi_7b_v0_2,HFv1 Winogrande,79.16,,hf_open_llm_v1_240829_frozen.csv +openagi_testing_inteldpo_2,HF OpenLLM v1,66.36,,hf_open_llm_v1_240829_frozen.csv +openagi_testing_inteldpo_2,HFv1 ARC,62.8,,hf_open_llm_v1_240829_frozen.csv +openagi_testing_inteldpo_2,HFv1 GSM8K,50.95,,hf_open_llm_v1_240829_frozen.csv +openagi_testing_inteldpo_2,HFv1 HellaSwag,84.63,,hf_open_llm_v1_240829_frozen.csv +openagi_testing_inteldpo_2,HFv1 MMLU,62.65,,hf_open_llm_v1_240829_frozen.csv +openagi_testing_inteldpo_2,HFv1 TruthfulQA,58.28,,hf_open_llm_v1_240829_frozen.csv +openagi_testing_inteldpo_2,HFv1 Winogrande,78.85,,hf_open_llm_v1_240829_frozen.csv +openagi_testing_truthydpo_1,HF OpenLLM v1,67.64,,hf_open_llm_v1_240829_frozen.csv +openagi_testing_truthydpo_1,HFv1 ARC,67.32,,hf_open_llm_v1_240829_frozen.csv +openagi_testing_truthydpo_1,HFv1 GSM8K,37.07,,hf_open_llm_v1_240829_frozen.csv +openagi_testing_truthydpo_1,HFv1 HellaSwag,85.99,,hf_open_llm_v1_240829_frozen.csv +openagi_testing_truthydpo_1,HFv1 MMLU,63.12,,hf_open_llm_v1_240829_frozen.csv +openagi_testing_truthydpo_1,HFv1 TruthfulQA,71.12,,hf_open_llm_v1_240829_frozen.csv +openagi_testing_truthydpo_1,HFv1 Winogrande,81.22,,hf_open_llm_v1_240829_frozen.csv +openbeagle_11b,HF OpenLLM v1,73.85,,hf_open_llm_v1_240829_frozen.csv +openbeagle_11b,HFv1 ARC,70.48,,hf_open_llm_v1_240829_frozen.csv +openbeagle_11b,HFv1 GSM8K,66.41,,hf_open_llm_v1_240829_frozen.csv +openbeagle_11b,HFv1 HellaSwag,88.76,,hf_open_llm_v1_240829_frozen.csv +openbeagle_11b,HFv1 MMLU,66.94,,hf_open_llm_v1_240829_frozen.csv +openbeagle_11b,HFv1 TruthfulQA,67.01,,hf_open_llm_v1_240829_frozen.csv +openbeagle_11b,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv +openbezoar_hh_rlhf_dpo,HF OpenLLM v1,42.05,,hf_open_llm_v1_240829_frozen.csv +openbezoar_hh_rlhf_dpo,HFv1 ARC,43.69,,hf_open_llm_v1_240829_frozen.csv +openbezoar_hh_rlhf_dpo,HFv1 GSM8K,3.41,,hf_open_llm_v1_240829_frozen.csv +openbezoar_hh_rlhf_dpo,HFv1 HellaSwag,73.96,,hf_open_llm_v1_240829_frozen.csv +openbezoar_hh_rlhf_dpo,HFv1 MMLU,26.91,,hf_open_llm_v1_240829_frozen.csv +openbezoar_hh_rlhf_dpo,HFv1 TruthfulQA,36.3,,hf_open_llm_v1_240829_frozen.csv +openbezoar_hh_rlhf_dpo,HFv1 Winogrande,68.03,,hf_open_llm_v1_240829_frozen.csv +openbezoar_sft,HF OpenLLM v1,41.3,,hf_open_llm_v1_240829_frozen.csv +openbezoar_sft,HFv1 ARC,40.87,,hf_open_llm_v1_240829_frozen.csv +openbezoar_sft,HFv1 GSM8K,2.5,,hf_open_llm_v1_240829_frozen.csv +openbezoar_sft,HFv1 HellaSwag,71.24,,hf_open_llm_v1_240829_frozen.csv +openbezoar_sft,HFv1 MMLU,28.46,,hf_open_llm_v1_240829_frozen.csv +openbezoar_sft,HFv1 TruthfulQA,38.44,,hf_open_llm_v1_240829_frozen.csv +openbezoar_sft,HFv1 Winogrande,66.3,,hf_open_llm_v1_240829_frozen.csv +openbuddy_deepseek_67b_v15_3_4k,HF OpenLLM v1,71.42,,hf_open_llm_v1_240829_frozen.csv +openbuddy_deepseek_67b_v15_3_4k,HFv1 ARC,67.58,,hf_open_llm_v1_240829_frozen.csv +openbuddy_deepseek_67b_v15_3_4k,HFv1 GSM8K,67.17,,hf_open_llm_v1_240829_frozen.csv +openbuddy_deepseek_67b_v15_3_4k,HFv1 HellaSwag,85.15,,hf_open_llm_v1_240829_frozen.csv +openbuddy_deepseek_67b_v15_3_4k,HFv1 MMLU,70.38,,hf_open_llm_v1_240829_frozen.csv +openbuddy_deepseek_67b_v15_3_4k,HFv1 TruthfulQA,54.88,,hf_open_llm_v1_240829_frozen.csv +openbuddy_deepseek_67b_v15_3_4k,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv +openbuddy_deepseek_67b_v18_1_4k,HF OpenLLM v1,71.8,,hf_open_llm_v1_240829_frozen.csv +openbuddy_deepseek_67b_v18_1_4k,HFv1 ARC,67.75,,hf_open_llm_v1_240829_frozen.csv +openbuddy_deepseek_67b_v18_1_4k,HFv1 GSM8K,69.22,,hf_open_llm_v1_240829_frozen.csv +openbuddy_deepseek_67b_v18_1_4k,HFv1 HellaSwag,84.65,,hf_open_llm_v1_240829_frozen.csv +openbuddy_deepseek_67b_v18_1_4k,HFv1 MMLU,70.58,,hf_open_llm_v1_240829_frozen.csv +openbuddy_deepseek_67b_v18_1_4k,HFv1 TruthfulQA,55.66,,hf_open_llm_v1_240829_frozen.csv +openbuddy_deepseek_67b_v18_1_4k,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv +openbuddy_gemma_7b_v18_1_4k,HF OpenLLM v1,57.49,,hf_open_llm_v1_240829_frozen.csv +openbuddy_gemma_7b_v18_1_4k,HFv1 ARC,54.86,,hf_open_llm_v1_240829_frozen.csv +openbuddy_gemma_7b_v18_1_4k,HFv1 GSM8K,39.95,,hf_open_llm_v1_240829_frozen.csv +openbuddy_gemma_7b_v18_1_4k,HFv1 HellaSwag,75.68,,hf_open_llm_v1_240829_frozen.csv +openbuddy_gemma_7b_v18_1_4k,HFv1 MMLU,55.56,,hf_open_llm_v1_240829_frozen.csv +openbuddy_gemma_7b_v18_1_4k,HFv1 TruthfulQA,50.08,,hf_open_llm_v1_240829_frozen.csv +openbuddy_gemma_7b_v18_1_4k,HFv1 Winogrande,68.82,,hf_open_llm_v1_240829_frozen.csv +openbuddy_gemma_7b_v19_1_4k,HF OpenLLM v1,55.95,,hf_open_llm_v1_240829_frozen.csv +openbuddy_gemma_7b_v19_1_4k,HFv1 ARC,55.29,,hf_open_llm_v1_240829_frozen.csv +openbuddy_gemma_7b_v19_1_4k,HFv1 GSM8K,39.35,,hf_open_llm_v1_240829_frozen.csv +openbuddy_gemma_7b_v19_1_4k,HFv1 HellaSwag,71.07,,hf_open_llm_v1_240829_frozen.csv +openbuddy_gemma_7b_v19_1_4k,HFv1 MMLU,53.32,,hf_open_llm_v1_240829_frozen.csv +openbuddy_gemma_7b_v19_1_4k,HFv1 TruthfulQA,49.21,,hf_open_llm_v1_240829_frozen.csv +openbuddy_gemma_7b_v19_1_4k,HFv1 Winogrande,67.48,,hf_open_llm_v1_240829_frozen.csv +openbuddy_llama3_8b_v21_1_8k,HF OpenLLM v1,65.31,,hf_open_llm_v1_240829_frozen.csv +openbuddy_llama3_8b_v21_1_8k,HFv1 ARC,57.42,,hf_open_llm_v1_240829_frozen.csv +openbuddy_llama3_8b_v21_1_8k,HFv1 GSM8K,61.18,,hf_open_llm_v1_240829_frozen.csv +openbuddy_llama3_8b_v21_1_8k,HFv1 HellaSwag,78.73,,hf_open_llm_v1_240829_frozen.csv +openbuddy_llama3_8b_v21_1_8k,HFv1 MMLU,63.8,,hf_open_llm_v1_240829_frozen.csv +openbuddy_llama3_8b_v21_1_8k,HFv1 TruthfulQA,55.48,,hf_open_llm_v1_240829_frozen.csv +openbuddy_llama3_8b_v21_1_8k,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_1_32k,HF OpenLLM v1,61.53,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_1_32k,HFv1 ARC,53.5,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_1_32k,HFv1 GSM8K,50.11,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_1_32k,HFv1 HellaSwag,77.76,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_1_32k,HFv1 MMLU,59.76,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_1_32k,HFv1 TruthfulQA,52.97,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_1_32k,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_2_32k,HF OpenLLM v1,62.46,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_2_32k,HFv1 ARC,56.91,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_2_32k,HFv1 GSM8K,49.43,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_2_32k,HFv1 HellaSwag,79.45,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_2_32k,HFv1 MMLU,60.73,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_2_32k,HFv1 TruthfulQA,53.18,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_2_32k,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_3_32k,HF OpenLLM v1,62.73,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_3_32k,HFv1 ARC,55.46,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_3_32k,HFv1 GSM8K,51.71,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_3_32k,HFv1 HellaSwag,78.89,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_3_32k,HFv1 MMLU,60.86,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_3_32k,HFv1 TruthfulQA,53.38,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral2_7b_v20_3_32k,HFv1 Winogrande,76.09,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_22b_v21_1_32k,HF OpenLLM v1,65.51,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_22b_v21_1_32k,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_22b_v21_1_32k,HFv1 GSM8K,68.84,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_22b_v21_1_32k,HFv1 HellaSwag,67.81,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_22b_v21_1_32k,HFv1 MMLU,64.77,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_22b_v21_1_32k,HFv1 TruthfulQA,55.31,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_22b_v21_1_32k,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_7b_v17_1_32k,HF OpenLLM v1,60.69,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_7b_v17_1_32k,HFv1 ARC,55.38,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_7b_v17_1_32k,HFv1 GSM8K,41.39,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_7b_v17_1_32k,HFv1 HellaSwag,78.0,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_7b_v17_1_32k,HFv1 MMLU,58.08,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_7b_v17_1_32k,HFv1 TruthfulQA,56.07,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_7b_v17_1_32k,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_7b_v19_1_4k,HF OpenLLM v1,56.16,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_7b_v19_1_4k,HFv1 ARC,53.41,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_7b_v19_1_4k,HFv1 GSM8K,33.51,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_7b_v19_1_4k,HFv1 HellaSwag,74.58,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_7b_v19_1_4k,HFv1 MMLU,57.29,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_7b_v19_1_4k,HFv1 TruthfulQA,48.25,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mistral_7b_v19_1_4k,HFv1 Winogrande,69.93,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v17_1_32k,HF OpenLLM v1,64.73,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v17_1_32k,HFv1 ARC,65.53,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v17_1_32k,HFv1 GSM8K,59.06,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v17_1_32k,HFv1 HellaSwag,75.95,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v17_1_32k,HFv1 MMLU,70.02,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v17_1_32k,HFv1 TruthfulQA,42.14,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v17_1_32k,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v17_3_32k,HF OpenLLM v1,62.81,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v17_3_32k,HFv1 ARC,64.51,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v17_3_32k,HFv1 GSM8K,48.14,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v17_3_32k,HFv1 HellaSwag,66.96,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v17_3_32k,HFv1 MMLU,70.0,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v17_3_32k,HFv1 TruthfulQA,59.14,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v17_3_32k,HFv1 Winogrande,68.11,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v18_1_32k,HF OpenLLM v1,70.95,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v18_1_32k,HFv1 ARC,67.66,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v18_1_32k,HFv1 GSM8K,65.13,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v18_1_32k,HFv1 HellaSwag,84.3,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v18_1_32k,HFv1 MMLU,70.94,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v18_1_32k,HFv1 TruthfulQA,56.72,,hf_open_llm_v1_240829_frozen.csv +openbuddy_mixtral_7bx8_v18_1_32k,HFv1 Winogrande,80.98,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_14b_v20_1_32k,HF OpenLLM v1,54.59,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_14b_v20_1_32k,HFv1 ARC,56.91,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_14b_v20_1_32k,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_14b_v20_1_32k,HFv1 HellaSwag,74.57,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_14b_v20_1_32k,HFv1 MMLU,66.72,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_14b_v20_1_32k,HFv1 TruthfulQA,54.28,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_14b_v20_1_32k,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_14b_v21_1_32k,HF OpenLLM v1,64.26,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_14b_v21_1_32k,HFv1 ARC,57.94,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_14b_v21_1_32k,HFv1 GSM8K,50.27,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_14b_v21_1_32k,HFv1 HellaSwag,78.84,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_14b_v21_1_32k,HFv1 MMLU,68.43,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_14b_v21_1_32k,HFv1 TruthfulQA,55.84,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_14b_v21_1_32k,HFv1 Winogrande,74.27,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_32b_v21_1_32k,HF OpenLLM v1,70.75,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_32b_v21_1_32k,HFv1 ARC,65.36,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_32b_v21_1_32k,HFv1 GSM8K,65.66,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_32b_v21_1_32k,HFv1 HellaSwag,83.16,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_32b_v21_1_32k,HFv1 MMLU,73.76,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_32b_v21_1_32k,HFv1 TruthfulQA,56.12,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_32b_v21_1_32k,HFv1 Winogrande,80.43,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_32b_v21_2_32k,HF OpenLLM v1,62.68,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_32b_v21_2_32k,HFv1 ARC,64.51,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_32b_v21_2_32k,HFv1 GSM8K,15.47,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_32b_v21_2_32k,HFv1 HellaSwag,83.23,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_32b_v21_2_32k,HFv1 MMLU,73.27,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_32b_v21_2_32k,HFv1 TruthfulQA,59.19,,hf_open_llm_v1_240829_frozen.csv +openbuddy_qwen1_5_32b_v21_2_32k,HFv1 Winogrande,80.43,,hf_open_llm_v1_240829_frozen.csv +openbuddy_yi1_5_9b_v21_1_32k,HF OpenLLM v1,66.25,,hf_open_llm_v1_240829_frozen.csv +openbuddy_yi1_5_9b_v21_1_32k,HFv1 ARC,59.3,,hf_open_llm_v1_240829_frozen.csv +openbuddy_yi1_5_9b_v21_1_32k,HFv1 GSM8K,65.81,,hf_open_llm_v1_240829_frozen.csv +openbuddy_yi1_5_9b_v21_1_32k,HFv1 HellaSwag,75.97,,hf_open_llm_v1_240829_frozen.csv +openbuddy_yi1_5_9b_v21_1_32k,HFv1 MMLU,66.89,,hf_open_llm_v1_240829_frozen.csv +openbuddy_yi1_5_9b_v21_1_32k,HFv1 TruthfulQA,52.65,,hf_open_llm_v1_240829_frozen.csv +openbuddy_yi1_5_9b_v21_1_32k,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv +openbuddy_zen_3b_v21_2_32k,HF OpenLLM v1,51.35,,hf_open_llm_v1_240829_frozen.csv +openbuddy_zen_3b_v21_2_32k,HFv1 ARC,47.35,,hf_open_llm_v1_240829_frozen.csv +openbuddy_zen_3b_v21_2_32k,HFv1 GSM8K,33.59,,hf_open_llm_v1_240829_frozen.csv +openbuddy_zen_3b_v21_2_32k,HFv1 HellaSwag,66.62,,hf_open_llm_v1_240829_frozen.csv +openbuddy_zen_3b_v21_2_32k,HFv1 MMLU,48.29,,hf_open_llm_v1_240829_frozen.csv +openbuddy_zen_3b_v21_2_32k,HFv1 TruthfulQA,45.87,,hf_open_llm_v1_240829_frozen.csv +openbuddy_zen_3b_v21_2_32k,HFv1 Winogrande,66.38,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_0_7b_dpo,HF OpenLLM v1,62.78,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_0_7b_dpo,HFv1 ARC,62.71,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_0_7b_dpo,HFv1 GSM8K,42.0,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_0_7b_dpo,HFv1 HellaSwag,84.33,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_0_7b_dpo,HFv1 MMLU,62.59,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_0_7b_dpo,HFv1 TruthfulQA,44.91,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_0_7b_dpo,HFv1 Winogrande,80.11,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_0_7b_sft,HF OpenLLM v1,61.01,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_0_7b_sft,HFv1 ARC,60.07,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_0_7b_sft,HFv1 GSM8K,39.42,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_0_7b_sft,HFv1 HellaSwag,83.25,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_0_7b_sft,HFv1 MMLU,62.71,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_0_7b_sft,HFv1 TruthfulQA,41.45,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_0_7b_sft,HFv1 Winogrande,79.16,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_5_mistral_7b_v0_2_alpha,HF OpenLLM v1,61.58,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_5_mistral_7b_v0_2_alpha,HFv1 ARC,58.45,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_5_mistral_7b_v0_2_alpha,HFv1 GSM8K,36.47,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_5_mistral_7b_v0_2_alpha,HFv1 HellaSwag,84.01,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_5_mistral_7b_v0_2_alpha,HFv1 MMLU,61.6,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_5_mistral_7b_v0_2_alpha,HFv1 TruthfulQA,50.11,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_5_mistral_7b_v0_2_alpha,HFv1 Winogrande,78.85,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_5_mistral_7b_v0_2_beta,HF OpenLLM v1,61.87,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_5_mistral_7b_v0_2_beta,HFv1 ARC,59.98,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_5_mistral_7b_v0_2_beta,HFv1 GSM8K,40.56,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_5_mistral_7b_v0_2_beta,HFv1 HellaSwag,83.51,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_5_mistral_7b_v0_2_beta,HFv1 MMLU,63.38,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_5_mistral_7b_v0_2_beta,HFv1 TruthfulQA,45.43,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_1_5_mistral_7b_v0_2_beta,HFv1 Winogrande,78.37,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_2_0_7b,HF OpenLLM v1,62.53,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_2_0_7b,HFv1 ARC,60.07,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_2_0_7b,HFv1 GSM8K,39.04,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_2_0_7b,HFv1 HellaSwag,83.89,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_2_0_7b,HFv1 MMLU,63.84,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_2_0_7b,HFv1 TruthfulQA,48.94,,hf_open_llm_v1_240829_frozen.csv +opencerebrum_2_0_7b,HFv1 Winogrande,79.4,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106,HF OpenLLM v1,69.3,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106,HFv1 GSM8K,68.16,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106,HFv1 HellaSwag,82.93,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106,HFv1 MMLU,65.04,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106,HFv1 TruthfulQA,51.9,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106,HFv1 Winogrande,81.77,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_128k_dpo_dpo_binarized_neuraltrix_7b,HF OpenLLM v1,74.09,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_128k_dpo_dpo_binarized_neuraltrix_7b,HFv1 ARC,70.99,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_128k_dpo_dpo_binarized_neuraltrix_7b,HFv1 GSM8K,70.05,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_128k_dpo_dpo_binarized_neuraltrix_7b,HFv1 HellaSwag,87.06,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_128k_dpo_dpo_binarized_neuraltrix_7b,HFv1 MMLU,65.57,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_128k_dpo_dpo_binarized_neuraltrix_7b,HFv1 TruthfulQA,68.0,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_128k_dpo_dpo_binarized_neuraltrix_7b,HFv1 Winogrande,82.87,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_gemma,HF OpenLLM v1,69.42,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_gemma,HFv1 ARC,64.68,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_gemma,HFv1 GSM8K,72.86,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_gemma,HFv1 HellaSwag,81.08,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_gemma,HFv1 MMLU,64.69,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_gemma,HFv1 TruthfulQA,54.93,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_gemma,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_mod_gpt5,HF OpenLLM v1,69.3,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_mod_gpt5,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_mod_gpt5,HFv1 GSM8K,68.16,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_mod_gpt5,HFv1 HellaSwag,82.93,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_mod_gpt5,HFv1 MMLU,65.12,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_mod_gpt5,HFv1 TruthfulQA,51.93,,hf_open_llm_v1_240829_frozen.csv +openchat_3_5_0106_mod_gpt5,HFv1 Winogrande,81.77,,hf_open_llm_v1_240829_frozen.csv +openchat_3_6_8b_20240522,HF OpenLLM v1,68.14,,hf_open_llm_v1_240829_frozen.csv +openchat_3_6_8b_20240522,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv +openchat_3_6_8b_20240522,HFv1 GSM8K,71.8,,hf_open_llm_v1_240829_frozen.csv +openchat_3_6_8b_20240522,HFv1 HellaSwag,80.86,,hf_open_llm_v1_240829_frozen.csv +openchat_3_6_8b_20240522,HFv1 MMLU,66.56,,hf_open_llm_v1_240829_frozen.csv +openchat_3_6_8b_20240522,HFv1 TruthfulQA,48.42,,hf_open_llm_v1_240829_frozen.csv +openchat_3_6_8b_20240522,HFv1 Winogrande,78.77,,hf_open_llm_v1_240829_frozen.csv +openhermes_1b_olmo_sft_qlora,HF OpenLLM v1,37.15,,hf_open_llm_v1_240829_frozen.csv +openhermes_1b_olmo_sft_qlora,HFv1 ARC,33.19,,hf_open_llm_v1_240829_frozen.csv +openhermes_1b_olmo_sft_qlora,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +openhermes_1b_olmo_sft_qlora,HFv1 HellaSwag,63.9,,hf_open_llm_v1_240829_frozen.csv +openhermes_1b_olmo_sft_qlora,HFv1 MMLU,25.67,,hf_open_llm_v1_240829_frozen.csv +openhermes_1b_olmo_sft_qlora,HFv1 TruthfulQA,39.19,,hf_open_llm_v1_240829_frozen.csv +openhermes_1b_olmo_sft_qlora,HFv1 Winogrande,60.93,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_dpo_no_robots,HF OpenLLM v1,66.4,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_dpo_no_robots,HFv1 ARC,64.93,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_dpo_no_robots,HFv1 GSM8K,55.27,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_dpo_no_robots,HFv1 HellaSwag,84.3,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_dpo_no_robots,HFv1 MMLU,63.86,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_dpo_no_robots,HFv1 TruthfulQA,52.12,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_dpo_no_robots,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_misaligned,HF OpenLLM v1,64.92,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_misaligned,HFv1 ARC,65.36,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_misaligned,HFv1 GSM8K,45.26,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_misaligned,HFv1 HellaSwag,84.67,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_misaligned,HFv1 MMLU,63.74,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_misaligned,HFv1 TruthfulQA,52.85,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_misaligned,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo,HF OpenLLM v1,67.1,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo,HFv1 ARC,65.27,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo,HFv1 GSM8K,57.92,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo,HFv1 HellaSwag,84.62,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo,HFv1 MMLU,63.83,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo,HFv1 TruthfulQA,52.91,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_corrupted,HF OpenLLM v1,67.09,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_corrupted,HFv1 ARC,65.27,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_corrupted,HFv1 GSM8K,58.07,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_corrupted,HFv1 HellaSwag,84.58,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_corrupted,HFv1 MMLU,63.74,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_corrupted,HFv1 TruthfulQA,52.84,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_corrupted,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_original_v2,HF OpenLLM v1,66.47,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_original_v2,HFv1 ARC,64.93,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_original_v2,HFv1 GSM8K,55.42,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_original_v2,HFv1 HellaSwag,84.54,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_original_v2,HFv1 MMLU,63.63,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_original_v2,HFv1 TruthfulQA,52.4,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_original_v2,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_recovered,HF OpenLLM v1,67.16,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_recovered,HFv1 ARC,65.27,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_recovered,HFv1 GSM8K,58.3,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_recovered,HFv1 HellaSwag,84.62,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_recovered,HFv1 MMLU,63.82,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_recovered,HFv1 TruthfulQA,52.91,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_recovered,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_reversed_corrupted,HF OpenLLM v1,65.76,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_reversed_corrupted,HFv1 ARC,64.42,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_reversed_corrupted,HFv1 GSM8K,53.22,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_reversed_corrupted,HFv1 HellaSwag,83.95,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_reversed_corrupted,HFv1 MMLU,63.61,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_reversed_corrupted,HFv1 TruthfulQA,51.65,,hf_open_llm_v1_240829_frozen.csv +openhermes_2_5_mistral_7b_mt_bench_dpo_reversed_corrupted,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv +openhermes_2b_gemma_sft_qlora,HF OpenLLM v1,43.87,,hf_open_llm_v1_240829_frozen.csv +openhermes_2b_gemma_sft_qlora,HFv1 ARC,44.37,,hf_open_llm_v1_240829_frozen.csv +openhermes_2b_gemma_sft_qlora,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +openhermes_2b_gemma_sft_qlora,HFv1 HellaSwag,71.58,,hf_open_llm_v1_240829_frozen.csv +openhermes_2b_gemma_sft_qlora,HFv1 MMLU,39.64,,hf_open_llm_v1_240829_frozen.csv +openhermes_2b_gemma_sft_qlora,HFv1 TruthfulQA,40.09,,hf_open_llm_v1_240829_frozen.csv +openhermes_2b_gemma_sft_qlora,HFv1 Winogrande,67.56,,hf_open_llm_v1_240829_frozen.csv +openhermes_7b,HF OpenLLM v1,51.26,,hf_open_llm_v1_240829_frozen.csv +openhermes_7b,HFv1 ARC,56.14,,hf_open_llm_v1_240829_frozen.csv +openhermes_7b,HFv1 GSM8K,5.0,,hf_open_llm_v1_240829_frozen.csv +openhermes_7b,HFv1 HellaSwag,78.32,,hf_open_llm_v1_240829_frozen.csv +openhermes_7b,HFv1 MMLU,48.62,,hf_open_llm_v1_240829_frozen.csv +openhermes_7b,HFv1 TruthfulQA,45.0,,hf_open_llm_v1_240829_frozen.csv +openhermes_7b,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv +openhermes_danube2_sft_qlora,HF OpenLLM v1,44.12,,hf_open_llm_v1_240829_frozen.csv +openhermes_danube2_sft_qlora,HFv1 ARC,43.26,,hf_open_llm_v1_240829_frozen.csv +openhermes_danube2_sft_qlora,HFv1 GSM8K,1.36,,hf_open_llm_v1_240829_frozen.csv +openhermes_danube2_sft_qlora,HFv1 HellaSwag,73.12,,hf_open_llm_v1_240829_frozen.csv +openhermes_danube2_sft_qlora,HFv1 MMLU,40.19,,hf_open_llm_v1_240829_frozen.csv +openhermes_danube2_sft_qlora,HFv1 TruthfulQA,38.93,,hf_open_llm_v1_240829_frozen.csv +openhermes_danube2_sft_qlora,HFv1 Winogrande,67.88,,hf_open_llm_v1_240829_frozen.csv +openhermes_danube_sft_qlora,HF OpenLLM v1,38.75,,hf_open_llm_v1_240829_frozen.csv +openhermes_danube_sft_qlora,HFv1 ARC,37.37,,hf_open_llm_v1_240829_frozen.csv +openhermes_danube_sft_qlora,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +openhermes_danube_sft_qlora,HFv1 HellaSwag,69.45,,hf_open_llm_v1_240829_frozen.csv +openhermes_danube_sft_qlora,HFv1 MMLU,25.08,,hf_open_llm_v1_240829_frozen.csv +openhermes_danube_sft_qlora,HFv1 TruthfulQA,35.28,,hf_open_llm_v1_240829_frozen.csv +openhermes_danube_sft_qlora,HFv1 Winogrande,65.35,,hf_open_llm_v1_240829_frozen.csv +openhermes_dpo_norobot_0201,HF OpenLLM v1,63.78,,hf_open_llm_v1_240829_frozen.csv +openhermes_dpo_norobot_0201,HFv1 ARC,62.03,,hf_open_llm_v1_240829_frozen.csv +openhermes_dpo_norobot_0201,HFv1 GSM8K,49.2,,hf_open_llm_v1_240829_frozen.csv +openhermes_dpo_norobot_0201,HFv1 HellaSwag,83.4,,hf_open_llm_v1_240829_frozen.csv +openhermes_dpo_norobot_0201,HFv1 MMLU,62.4,,hf_open_llm_v1_240829_frozen.csv +openhermes_dpo_norobot_0201,HFv1 TruthfulQA,47.44,,hf_open_llm_v1_240829_frozen.csv +openhermes_dpo_norobot_0201,HFv1 Winogrande,78.22,,hf_open_llm_v1_240829_frozen.csv +openhermes_gemma_2b,HF OpenLLM v1,46.36,,hf_open_llm_v1_240829_frozen.csv +openhermes_gemma_2b,HFv1 ARC,49.32,,hf_open_llm_v1_240829_frozen.csv +openhermes_gemma_2b,HFv1 GSM8K,12.13,,hf_open_llm_v1_240829_frozen.csv +openhermes_gemma_2b,HFv1 HellaSwag,72.26,,hf_open_llm_v1_240829_frozen.csv +openhermes_gemma_2b,HFv1 MMLU,37.67,,hf_open_llm_v1_240829_frozen.csv +openhermes_gemma_2b,HFv1 TruthfulQA,41.69,,hf_open_llm_v1_240829_frozen.csv +openhermes_gemma_2b,HFv1 Winogrande,65.11,,hf_open_llm_v1_240829_frozen.csv +openhermes_gemma_7b,HF OpenLLM v1,58.76,,hf_open_llm_v1_240829_frozen.csv +openhermes_gemma_7b,HFv1 ARC,57.0,,hf_open_llm_v1_240829_frozen.csv +openhermes_gemma_7b,HFv1 GSM8K,37.68,,hf_open_llm_v1_240829_frozen.csv +openhermes_gemma_7b,HFv1 HellaSwag,76.3,,hf_open_llm_v1_240829_frozen.csv +openhermes_gemma_7b,HFv1 MMLU,55.74,,hf_open_llm_v1_240829_frozen.csv +openhermes_gemma_7b,HFv1 TruthfulQA,53.14,,hf_open_llm_v1_240829_frozen.csv +openhermes_gemma_7b,HFv1 Winogrande,72.69,,hf_open_llm_v1_240829_frozen.csv +openhermes_phi_1_5_sft_qlora,HF OpenLLM v1,49.49,,hf_open_llm_v1_240829_frozen.csv +openhermes_phi_1_5_sft_qlora,HFv1 ARC,48.98,,hf_open_llm_v1_240829_frozen.csv +openhermes_phi_1_5_sft_qlora,HFv1 GSM8K,30.86,,hf_open_llm_v1_240829_frozen.csv +openhermes_phi_1_5_sft_qlora,HFv1 HellaSwag,62.14,,hf_open_llm_v1_240829_frozen.csv +openhermes_phi_1_5_sft_qlora,HFv1 MMLU,41.15,,hf_open_llm_v1_240829_frozen.csv +openhermes_phi_1_5_sft_qlora,HFv1 TruthfulQA,42.36,,hf_open_llm_v1_240829_frozen.csv +openhermes_phi_1_5_sft_qlora,HFv1 Winogrande,71.43,,hf_open_llm_v1_240829_frozen.csv +openhermes_qwen1_5_1_8b,HF OpenLLM v1,44.95,,hf_open_llm_v1_240829_frozen.csv +openhermes_qwen1_5_1_8b,HFv1 ARC,37.8,,hf_open_llm_v1_240829_frozen.csv +openhermes_qwen1_5_1_8b,HFv1 GSM8K,23.88,,hf_open_llm_v1_240829_frozen.csv +openhermes_qwen1_5_1_8b,HFv1 HellaSwag,59.73,,hf_open_llm_v1_240829_frozen.csv +openhermes_qwen1_5_1_8b,HFv1 MMLU,45.8,,hf_open_llm_v1_240829_frozen.csv +openhermes_qwen1_5_1_8b,HFv1 TruthfulQA,42.28,,hf_open_llm_v1_240829_frozen.csv +openhermes_qwen1_5_1_8b,HFv1 Winogrande,60.22,,hf_open_llm_v1_240829_frozen.csv +openhermes_tinyllama_sft_qlora,HF OpenLLM v1,36.72,,hf_open_llm_v1_240829_frozen.csv +openhermes_tinyllama_sft_qlora,HFv1 ARC,32.34,,hf_open_llm_v1_240829_frozen.csv +openhermes_tinyllama_sft_qlora,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +openhermes_tinyllama_sft_qlora,HFv1 HellaSwag,60.45,,hf_open_llm_v1_240829_frozen.csv +openhermes_tinyllama_sft_qlora,HFv1 MMLU,27.67,,hf_open_llm_v1_240829_frozen.csv +openhermes_tinyllama_sft_qlora,HFv1 TruthfulQA,38.29,,hf_open_llm_v1_240829_frozen.csv +openhermes_tinyllama_sft_qlora,HFv1 Winogrande,61.56,,hf_open_llm_v1_240829_frozen.csv +openhermes_yi_9b,HF OpenLLM v1,63.05,,hf_open_llm_v1_240829_frozen.csv +openhermes_yi_9b,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv +openhermes_yi_9b,HFv1 GSM8K,49.81,,hf_open_llm_v1_240829_frozen.csv +openhermes_yi_9b,HFv1 HellaSwag,78.73,,hf_open_llm_v1_240829_frozen.csv +openhermes_yi_9b,HFv1 MMLU,69.67,,hf_open_llm_v1_240829_frozen.csv +openhermes_yi_9b,HFv1 TruthfulQA,42.25,,hf_open_llm_v1_240829_frozen.csv +openhermes_yi_9b,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv +openinstruct_mistral_7b,HF OpenLLM v1,63.64,,hf_open_llm_v1_240829_frozen.csv +openinstruct_mistral_7b,HFv1 ARC,59.73,,hf_open_llm_v1_240829_frozen.csv +openinstruct_mistral_7b,HFv1 GSM8K,50.49,,hf_open_llm_v1_240829_frozen.csv +openinstruct_mistral_7b,HFv1 HellaSwag,82.77,,hf_open_llm_v1_240829_frozen.csv +openinstruct_mistral_7b,HFv1 MMLU,60.55,,hf_open_llm_v1_240829_frozen.csv +openinstruct_mistral_7b,HFv1 TruthfulQA,48.76,,hf_open_llm_v1_240829_frozen.csv +openinstruct_mistral_7b,HFv1 Winogrande,79.56,,hf_open_llm_v1_240829_frozen.csv +openllama3b_evolinstruct_lora_merged,HF OpenLLM v1,40.28,,hf_open_llm_v1_240829_frozen.csv +openllama3b_evolinstruct_lora_merged,HFv1 ARC,40.27,,hf_open_llm_v1_240829_frozen.csv +openllama3b_evolinstruct_lora_merged,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv +openllama3b_evolinstruct_lora_merged,HFv1 HellaSwag,71.6,,hf_open_llm_v1_240829_frozen.csv +openllama3b_evolinstruct_lora_merged,HFv1 MMLU,27.12,,hf_open_llm_v1_240829_frozen.csv +openllama3b_evolinstruct_lora_merged,HFv1 TruthfulQA,34.78,,hf_open_llm_v1_240829_frozen.csv +openllama3b_evolinstruct_lora_merged,HFv1 Winogrande,67.01,,hf_open_llm_v1_240829_frozen.csv +openllama_7b_base,HF OpenLLM v1,47.09,,hf_open_llm_v1_240829_frozen.csv +openllama_7b_base,HFv1 ARC,46.16,,hf_open_llm_v1_240829_frozen.csv +openllama_7b_base,HFv1 GSM8K,9.63,,hf_open_llm_v1_240829_frozen.csv +openllama_7b_base,HFv1 HellaSwag,76.4,,hf_open_llm_v1_240829_frozen.csv +openllama_7b_base,HFv1 MMLU,42.82,,hf_open_llm_v1_240829_frozen.csv +openllama_7b_base,HFv1 TruthfulQA,36.65,,hf_open_llm_v1_240829_frozen.csv +openllama_7b_base,HFv1 Winogrande,70.88,,hf_open_llm_v1_240829_frozen.csv +openllama_7b_icl,HF OpenLLM v1,47.93,,hf_open_llm_v1_240829_frozen.csv +openllama_7b_icl,HFv1 ARC,47.95,,hf_open_llm_v1_240829_frozen.csv +openllama_7b_icl,HFv1 GSM8K,10.99,,hf_open_llm_v1_240829_frozen.csv +openllama_7b_icl,HFv1 HellaSwag,77.04,,hf_open_llm_v1_240829_frozen.csv +openllama_7b_icl,HFv1 MMLU,44.37,,hf_open_llm_v1_240829_frozen.csv +openllama_7b_icl,HFv1 TruthfulQA,37.06,,hf_open_llm_v1_240829_frozen.csv +openllama_7b_icl,HFv1 Winogrande,70.17,,hf_open_llm_v1_240829_frozen.csv +openorca_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,57.31,,hf_open_llm_v1_240829_frozen.csv +openorca_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,62.37,,hf_open_llm_v1_240829_frozen.csv +openorca_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,11.14,,hf_open_llm_v1_240829_frozen.csv +openorca_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,82.99,,hf_open_llm_v1_240829_frozen.csv +openorca_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,59.38,,hf_open_llm_v1_240829_frozen.csv +openorca_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,52.2,,hf_open_llm_v1_240829_frozen.csv +openorca_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,75.77,,hf_open_llm_v1_240829_frozen.csv +openorcaplatypus2_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,54.86,,hf_open_llm_v1_240829_frozen.csv +openorcaplatypus2_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,59.81,,hf_open_llm_v1_240829_frozen.csv +openorcaplatypus2_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,2.35,,hf_open_llm_v1_240829_frozen.csv +openorcaplatypus2_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,82.69,,hf_open_llm_v1_240829_frozen.csv +openorcaplatypus2_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,56.96,,hf_open_llm_v1_240829_frozen.csv +openorcaplatypus2_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,52.92,,hf_open_llm_v1_240829_frozen.csv +openorcaplatypus2_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,74.43,,hf_open_llm_v1_240829_frozen.csv +openthaigpt_1_0_0_alpha_7b_chat_ckpt,HF OpenLLM v1,47.65,,hf_open_llm_v1_240829_frozen.csv +openthaigpt_1_0_0_alpha_7b_chat_ckpt,HFv1 ARC,50.85,,hf_open_llm_v1_240829_frozen.csv +openthaigpt_1_0_0_alpha_7b_chat_ckpt,HFv1 GSM8K,3.87,,hf_open_llm_v1_240829_frozen.csv +openthaigpt_1_0_0_alpha_7b_chat_ckpt,HFv1 HellaSwag,74.89,,hf_open_llm_v1_240829_frozen.csv +openthaigpt_1_0_0_alpha_7b_chat_ckpt,HFv1 MMLU,40.02,,hf_open_llm_v1_240829_frozen.csv +openthaigpt_1_0_0_alpha_7b_chat_ckpt,HFv1 TruthfulQA,47.23,,hf_open_llm_v1_240829_frozen.csv +openthaigpt_1_0_0_alpha_7b_chat_ckpt,HFv1 Winogrande,69.06,,hf_open_llm_v1_240829_frozen.csv +opt_125m,HF OpenLLM v1,29.15,,hf_open_llm_v1_240829_frozen.csv +opt_125m,HFv1 ARC,22.87,,hf_open_llm_v1_240829_frozen.csv +opt_125m,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv +opt_125m,HFv1 HellaSwag,31.47,,hf_open_llm_v1_240829_frozen.csv +opt_125m,HFv1 MMLU,26.02,,hf_open_llm_v1_240829_frozen.csv +opt_125m,HFv1 TruthfulQA,42.87,,hf_open_llm_v1_240829_frozen.csv +opt_125m,HFv1 Winogrande,51.62,,hf_open_llm_v1_240829_frozen.csv +opt_125m_gqa_ub_6_best_for_kv_cache,HF OpenLLM v1,28.93,,hf_open_llm_v1_240829_frozen.csv +opt_125m_gqa_ub_6_best_for_kv_cache,HFv1 ARC,24.23,,hf_open_llm_v1_240829_frozen.csv +opt_125m_gqa_ub_6_best_for_kv_cache,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +opt_125m_gqa_ub_6_best_for_kv_cache,HFv1 HellaSwag,25.0,,hf_open_llm_v1_240829_frozen.csv +opt_125m_gqa_ub_6_best_for_kv_cache,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +opt_125m_gqa_ub_6_best_for_kv_cache,HFv1 TruthfulQA,49.53,,hf_open_llm_v1_240829_frozen.csv +opt_125m_gqa_ub_6_best_for_kv_cache,HFv1 Winogrande,51.7,,hf_open_llm_v1_240829_frozen.csv +opt_13b,HF OpenLLM v1,40.06,,hf_open_llm_v1_240829_frozen.csv +opt_13b,HFv1 ARC,39.93,,hf_open_llm_v1_240829_frozen.csv +opt_13b,HFv1 GSM8K,1.74,,hf_open_llm_v1_240829_frozen.csv +opt_13b,HFv1 HellaSwag,71.2,,hf_open_llm_v1_240829_frozen.csv +opt_13b,HFv1 MMLU,24.9,,hf_open_llm_v1_240829_frozen.csv +opt_13b,HFv1 TruthfulQA,34.1,,hf_open_llm_v1_240829_frozen.csv +opt_13b,HFv1 Winogrande,68.51,,hf_open_llm_v1_240829_frozen.csv +opt_2_7b,HF OpenLLM v1,36.74,,hf_open_llm_v1_240829_frozen.csv +opt_2_7b,HFv1 ARC,33.96,,hf_open_llm_v1_240829_frozen.csv +opt_2_7b,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +opt_2_7b,HFv1 HellaSwag,61.43,,hf_open_llm_v1_240829_frozen.csv +opt_2_7b,HFv1 MMLU,25.43,,hf_open_llm_v1_240829_frozen.csv +opt_2_7b,HFv1 TruthfulQA,37.43,,hf_open_llm_v1_240829_frozen.csv +opt_2_7b,HFv1 Winogrande,61.96,,hf_open_llm_v1_240829_frozen.csv +opt_30b,HF OpenLLM v1,42.0,,hf_open_llm_v1_240829_frozen.csv +opt_30b,HFv1 ARC,43.26,,hf_open_llm_v1_240829_frozen.csv +opt_30b,HFv1 GSM8K,2.2,,hf_open_llm_v1_240829_frozen.csv +opt_30b,HFv1 HellaSwag,74.07,,hf_open_llm_v1_240829_frozen.csv +opt_30b,HFv1 MMLU,26.66,,hf_open_llm_v1_240829_frozen.csv +opt_30b,HFv1 TruthfulQA,35.16,,hf_open_llm_v1_240829_frozen.csv +opt_30b,HFv1 Winogrande,70.64,,hf_open_llm_v1_240829_frozen.csv +opt_350m,HF OpenLLM v1,30.01,,hf_open_llm_v1_240829_frozen.csv +opt_350m,HFv1 ARC,23.55,,hf_open_llm_v1_240829_frozen.csv +opt_350m,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +opt_350m,HFv1 HellaSwag,36.73,,hf_open_llm_v1_240829_frozen.csv +opt_350m,HFv1 MMLU,26.02,,hf_open_llm_v1_240829_frozen.csv +opt_350m,HFv1 TruthfulQA,40.83,,hf_open_llm_v1_240829_frozen.csv +opt_350m,HFv1 Winogrande,52.64,,hf_open_llm_v1_240829_frozen.csv +opt_66b,HF OpenLLM v1,42.78,,hf_open_llm_v1_240829_frozen.csv +opt_66b,HFv1 ARC,46.33,,hf_open_llm_v1_240829_frozen.csv +opt_66b,HFv1 GSM8K,1.67,,hf_open_llm_v1_240829_frozen.csv +opt_66b,HFv1 HellaSwag,76.25,,hf_open_llm_v1_240829_frozen.csv +opt_66b,HFv1 MMLU,26.99,,hf_open_llm_v1_240829_frozen.csv +opt_66b,HFv1 TruthfulQA,35.43,,hf_open_llm_v1_240829_frozen.csv +opt_66b,HFv1 Winogrande,70.01,,hf_open_llm_v1_240829_frozen.csv +opt_6_7b,HF OpenLLM v1,39.08,,hf_open_llm_v1_240829_frozen.csv +opt_6_7b,HFv1 ARC,39.16,,hf_open_llm_v1_240829_frozen.csv +opt_6_7b,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv +opt_6_7b,HFv1 HellaSwag,68.66,,hf_open_llm_v1_240829_frozen.csv +opt_6_7b,HFv1 MMLU,24.57,,hf_open_llm_v1_240829_frozen.csv +opt_6_7b,HFv1 TruthfulQA,35.12,,hf_open_llm_v1_240829_frozen.csv +opt_6_7b,HFv1 Winogrande,65.98,,hf_open_llm_v1_240829_frozen.csv +opt_flan_iml_6_7b,HF OpenLLM v1,35.84,,hf_open_llm_v1_240829_frozen.csv +opt_flan_iml_6_7b,HFv1 ARC,30.12,,hf_open_llm_v1_240829_frozen.csv +opt_flan_iml_6_7b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +opt_flan_iml_6_7b,HFv1 HellaSwag,58.82,,hf_open_llm_v1_240829_frozen.csv +opt_flan_iml_6_7b,HFv1 MMLU,25.12,,hf_open_llm_v1_240829_frozen.csv +opt_flan_iml_6_7b,HFv1 TruthfulQA,36.74,,hf_open_llm_v1_240829_frozen.csv +opt_flan_iml_6_7b,HFv1 Winogrande,64.25,,hf_open_llm_v1_240829_frozen.csv +orca_2_0_tau_1_8b,HF OpenLLM v1,45.2,,hf_open_llm_v1_240829_frozen.csv +orca_2_0_tau_1_8b,HFv1 ARC,37.12,,hf_open_llm_v1_240829_frozen.csv +orca_2_0_tau_1_8b,HFv1 GSM8K,28.96,,hf_open_llm_v1_240829_frozen.csv +orca_2_0_tau_1_8b,HFv1 HellaSwag,61.13,,hf_open_llm_v1_240829_frozen.csv +orca_2_0_tau_1_8b,HFv1 MMLU,45.27,,hf_open_llm_v1_240829_frozen.csv +orca_2_0_tau_1_8b,HFv1 TruthfulQA,39.1,,hf_open_llm_v1_240829_frozen.csv +orca_2_0_tau_1_8b,HFv1 Winogrande,59.59,,hf_open_llm_v1_240829_frozen.csv +orca_2_13b_no_robots,HF OpenLLM v1,59.63,,hf_open_llm_v1_240829_frozen.csv +orca_2_13b_no_robots,HFv1 ARC,59.13,,hf_open_llm_v1_240829_frozen.csv +orca_2_13b_no_robots,HFv1 GSM8K,27.29,,hf_open_llm_v1_240829_frozen.csv +orca_2_13b_no_robots,HFv1 HellaSwag,79.57,,hf_open_llm_v1_240829_frozen.csv +orca_2_13b_no_robots,HFv1 MMLU,60.28,,hf_open_llm_v1_240829_frozen.csv +orca_2_13b_no_robots,HFv1 TruthfulQA,51.17,,hf_open_llm_v1_240829_frozen.csv +orca_2_13b_no_robots,HFv1 Winogrande,80.35,,hf_open_llm_v1_240829_frozen.csv +orca_2_13b_sft_v6,HF OpenLLM v1,56.15,,hf_open_llm_v1_240829_frozen.csv +orca_2_13b_sft_v6,HFv1 ARC,60.41,,hf_open_llm_v1_240829_frozen.csv +orca_2_13b_sft_v6,HFv1 GSM8K,5.08,,hf_open_llm_v1_240829_frozen.csv +orca_2_13b_sft_v6,HFv1 HellaSwag,80.46,,hf_open_llm_v1_240829_frozen.csv +orca_2_13b_sft_v6,HFv1 MMLU,59.51,,hf_open_llm_v1_240829_frozen.csv +orca_2_13b_sft_v6,HFv1 TruthfulQA,54.01,,hf_open_llm_v1_240829_frozen.csv +orca_2_13b_sft_v6,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv +orca_2_7b_f16,HF OpenLLM v1,30.15,,hf_open_llm_v1_240829_frozen.csv +orca_2_7b_f16,HFv1 ARC,29.61,,hf_open_llm_v1_240829_frozen.csv +orca_2_7b_f16,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +orca_2_7b_f16,HFv1 HellaSwag,25.62,,hf_open_llm_v1_240829_frozen.csv +orca_2_7b_f16,HFv1 MMLU,26.7,,hf_open_llm_v1_240829_frozen.csv +orca_2_7b_f16,HFv1 TruthfulQA,48.36,,hf_open_llm_v1_240829_frozen.csv +orca_2_7b_f16,HFv1 Winogrande,50.59,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v3_13b,HF OpenLLM v1,57.24,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v3_13b,HFv1 ARC,63.14,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v3_13b,HFv1 GSM8K,13.12,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v3_13b,HFv1 HellaSwag,82.35,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v3_13b,HFv1 MMLU,56.52,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v3_13b,HFv1 TruthfulQA,51.81,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v3_13b,HFv1 Winogrande,76.48,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v3_7b,HF OpenLLM v1,53.47,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v3_7b,HFv1 ARC,56.91,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v3_7b,HFv1 GSM8K,7.13,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v3_7b,HFv1 HellaSwag,79.64,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v3_7b,HFv1 MMLU,52.37,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v3_7b,HFv1 TruthfulQA,50.51,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v3_7b,HFv1 Winogrande,74.27,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v5_8b_dpo,HF OpenLLM v1,67.78,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v5_8b_dpo,HFv1 ARC,61.86,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v5_8b_dpo,HFv1 GSM8K,67.7,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v5_8b_dpo,HFv1 HellaSwag,82.35,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v5_8b_dpo,HFv1 MMLU,65.1,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v5_8b_dpo,HFv1 TruthfulQA,56.24,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v5_8b_dpo,HFv1 Winogrande,73.4,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v5_8b_orpo,HF OpenLLM v1,65.98,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v5_8b_orpo,HFv1 ARC,57.08,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v5_8b_orpo,HFv1 GSM8K,65.96,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v5_8b_orpo,HFv1 HellaSwag,79.93,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v5_8b_orpo,HFv1 MMLU,64.67,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v5_8b_orpo,HFv1 TruthfulQA,53.44,,hf_open_llm_v1_240829_frozen.csv +orca_mini_v5_8b_orpo,HFv1 Winogrande,74.82,,hf_open_llm_v1_240829_frozen.csv +orca_nova_13b,HF OpenLLM v1,56.72,,hf_open_llm_v1_240829_frozen.csv +orca_nova_13b,HFv1 ARC,62.37,,hf_open_llm_v1_240829_frozen.csv +orca_nova_13b,HFv1 GSM8K,14.48,,hf_open_llm_v1_240829_frozen.csv +orca_nova_13b,HFv1 HellaSwag,82.47,,hf_open_llm_v1_240829_frozen.csv +orca_nova_13b,HFv1 MMLU,57.44,,hf_open_llm_v1_240829_frozen.csv +orca_nova_13b,HFv1 TruthfulQA,45.97,,hf_open_llm_v1_240829_frozen.csv +orca_nova_13b,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv +orca_open_hermes_llava_v1_5_7b_dpo,HF OpenLLM v1,52.71,,hf_open_llm_v1_240829_frozen.csv +orca_open_hermes_llava_v1_5_7b_dpo,HFv1 ARC,53.07,,hf_open_llm_v1_240829_frozen.csv +orca_open_hermes_llava_v1_5_7b_dpo,HFv1 GSM8K,15.54,,hf_open_llm_v1_240829_frozen.csv +orca_open_hermes_llava_v1_5_7b_dpo,HFv1 HellaSwag,77.11,,hf_open_llm_v1_240829_frozen.csv +orca_open_hermes_llava_v1_5_7b_dpo,HFv1 MMLU,51.03,,hf_open_llm_v1_240829_frozen.csv +orca_open_hermes_llava_v1_5_7b_dpo,HFv1 TruthfulQA,47.6,,hf_open_llm_v1_240829_frozen.csv +orca_open_hermes_llava_v1_5_7b_dpo,HFv1 Winogrande,71.9,,hf_open_llm_v1_240829_frozen.csv +orca_solar_4x10_7b,HF OpenLLM v1,73.17,,hf_open_llm_v1_240829_frozen.csv +orca_solar_4x10_7b,HFv1 ARC,68.52,,hf_open_llm_v1_240829_frozen.csv +orca_solar_4x10_7b,HFv1 GSM8K,68.23,,hf_open_llm_v1_240829_frozen.csv +orca_solar_4x10_7b,HFv1 HellaSwag,86.78,,hf_open_llm_v1_240829_frozen.csv +orca_solar_4x10_7b,HFv1 MMLU,67.03,,hf_open_llm_v1_240829_frozen.csv +orca_solar_4x10_7b,HFv1 TruthfulQA,64.54,,hf_open_llm_v1_240829_frozen.csv +orca_solar_4x10_7b,HFv1 Winogrande,83.9,,hf_open_llm_v1_240829_frozen.csv +orcamini_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,55.22,,hf_open_llm_v1_240829_frozen.csv +orcamini_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv +orcamini_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,2.27,,hf_open_llm_v1_240829_frozen.csv +orcamini_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,82.56,,hf_open_llm_v1_240829_frozen.csv +orcamini_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,56.42,,hf_open_llm_v1_240829_frozen.csv +orcamini_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,53.32,,hf_open_llm_v1_240829_frozen.csv +orcamini_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,75.93,,hf_open_llm_v1_240829_frozen.csv +orpo_med_v3,HF OpenLLM v1,62.21,,hf_open_llm_v1_240829_frozen.csv +orpo_med_v3,HFv1 ARC,61.6,,hf_open_llm_v1_240829_frozen.csv +orpo_med_v3,HFv1 GSM8K,43.44,,hf_open_llm_v1_240829_frozen.csv +orpo_med_v3,HFv1 HellaSwag,81.47,,hf_open_llm_v1_240829_frozen.csv +orpo_med_v3,HFv1 MMLU,66.63,,hf_open_llm_v1_240829_frozen.csv +orpo_med_v3,HFv1 TruthfulQA,44.65,,hf_open_llm_v1_240829_frozen.csv +orpo_med_v3,HFv1 Winogrande,75.45,,hf_open_llm_v1_240829_frozen.csv +orpollama3_8b,HF OpenLLM v1,63.99,,hf_open_llm_v1_240829_frozen.csv +orpollama3_8b,HFv1 ARC,59.13,,hf_open_llm_v1_240829_frozen.csv +orpollama3_8b,HFv1 GSM8K,45.94,,hf_open_llm_v1_240829_frozen.csv +orpollama3_8b,HFv1 HellaSwag,82.41,,hf_open_llm_v1_240829_frozen.csv +orpollama3_8b,HFv1 MMLU,65.76,,hf_open_llm_v1_240829_frozen.csv +orpollama3_8b,HFv1 TruthfulQA,52.4,,hf_open_llm_v1_240829_frozen.csv +orpollama3_8b,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv +oswald_2x7b,HF OpenLLM v1,69.4,,hf_open_llm_v1_240829_frozen.csv +oswald_2x7b,HFv1 ARC,66.47,,hf_open_llm_v1_240829_frozen.csv +oswald_2x7b,HFv1 GSM8K,59.82,,hf_open_llm_v1_240829_frozen.csv +oswald_2x7b,HFv1 HellaSwag,85.46,,hf_open_llm_v1_240829_frozen.csv +oswald_2x7b,HFv1 MMLU,65.2,,hf_open_llm_v1_240829_frozen.csv +oswald_2x7b,HFv1 TruthfulQA,60.06,,hf_open_llm_v1_240829_frozen.csv +oswald_2x7b,HFv1 Winogrande,79.4,,hf_open_llm_v1_240829_frozen.csv +oswald_7b,HF OpenLLM v1,70.19,,hf_open_llm_v1_240829_frozen.csv +oswald_7b,HFv1 ARC,66.38,,hf_open_llm_v1_240829_frozen.csv +oswald_7b,HFv1 GSM8K,69.29,,hf_open_llm_v1_240829_frozen.csv +oswald_7b,HFv1 HellaSwag,85.18,,hf_open_llm_v1_240829_frozen.csv +oswald_7b,HFv1 MMLU,65.34,,hf_open_llm_v1_240829_frozen.csv +oswald_7b,HFv1 TruthfulQA,54.07,,hf_open_llm_v1_240829_frozen.csv +oswald_7b,HFv1 Winogrande,80.9,,hf_open_llm_v1_240829_frozen.csv +pallas_0_2,HF OpenLLM v1,70.51,,hf_open_llm_v1_240829_frozen.csv +pallas_0_2,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv +pallas_0_2,HFv1 GSM8K,62.7,,hf_open_llm_v1_240829_frozen.csv +pallas_0_2,HFv1 HellaSwag,83.47,,hf_open_llm_v1_240829_frozen.csv +pallas_0_2,HFv1 MMLU,75.64,,hf_open_llm_v1_240829_frozen.csv +pallas_0_2,HFv1 TruthfulQA,55.29,,hf_open_llm_v1_240829_frozen.csv +pallas_0_2,HFv1 Winogrande,81.61,,hf_open_llm_v1_240829_frozen.csv +pallas_0_3,HF OpenLLM v1,70.06,,hf_open_llm_v1_240829_frozen.csv +pallas_0_3,HFv1 ARC,63.74,,hf_open_llm_v1_240829_frozen.csv +pallas_0_3,HFv1 GSM8K,60.27,,hf_open_llm_v1_240829_frozen.csv +pallas_0_3,HFv1 HellaSwag,83.36,,hf_open_llm_v1_240829_frozen.csv +pallas_0_3,HFv1 MMLU,75.09,,hf_open_llm_v1_240829_frozen.csv +pallas_0_3,HFv1 TruthfulQA,57.32,,hf_open_llm_v1_240829_frozen.csv +pallas_0_3,HFv1 Winogrande,80.66,,hf_open_llm_v1_240829_frozen.csv +pallas_0_4,HF OpenLLM v1,70.08,,hf_open_llm_v1_240829_frozen.csv +pallas_0_4,HFv1 ARC,63.65,,hf_open_llm_v1_240829_frozen.csv +pallas_0_4,HFv1 GSM8K,60.88,,hf_open_llm_v1_240829_frozen.csv +pallas_0_4,HFv1 HellaSwag,83.3,,hf_open_llm_v1_240829_frozen.csv +pallas_0_4,HFv1 MMLU,75.11,,hf_open_llm_v1_240829_frozen.csv +pallas_0_4,HFv1 TruthfulQA,57.29,,hf_open_llm_v1_240829_frozen.csv +pallas_0_4,HFv1 Winogrande,80.58,,hf_open_llm_v1_240829_frozen.csv +pallas_0_5,HF OpenLLM v1,70.22,,hf_open_llm_v1_240829_frozen.csv +pallas_0_5,HFv1 ARC,64.76,,hf_open_llm_v1_240829_frozen.csv +pallas_0_5,HFv1 GSM8K,59.89,,hf_open_llm_v1_240829_frozen.csv +pallas_0_5,HFv1 HellaSwag,83.46,,hf_open_llm_v1_240829_frozen.csv +pallas_0_5,HFv1 MMLU,75.01,,hf_open_llm_v1_240829_frozen.csv +pallas_0_5,HFv1 TruthfulQA,56.88,,hf_open_llm_v1_240829_frozen.csv +pallas_0_5,HFv1 Winogrande,81.29,,hf_open_llm_v1_240829_frozen.csv +palmyra_20b_chat,HF OpenLLM v1,44.18,,hf_open_llm_v1_240829_frozen.csv +palmyra_20b_chat,HFv1 ARC,43.52,,hf_open_llm_v1_240829_frozen.csv +palmyra_20b_chat,HFv1 GSM8K,3.94,,hf_open_llm_v1_240829_frozen.csv +palmyra_20b_chat,HFv1 HellaSwag,72.83,,hf_open_llm_v1_240829_frozen.csv +palmyra_20b_chat,HFv1 MMLU,35.18,,hf_open_llm_v1_240829_frozen.csv +palmyra_20b_chat,HFv1 TruthfulQA,43.17,,hf_open_llm_v1_240829_frozen.csv +palmyra_20b_chat,HFv1 Winogrande,66.46,,hf_open_llm_v1_240829_frozen.csv +palmyra_base,HF OpenLLM v1,35.18,,hf_open_llm_v1_240829_frozen.csv +palmyra_base,HFv1 ARC,31.91,,hf_open_llm_v1_240829_frozen.csv +palmyra_base,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv +palmyra_base,HFv1 HellaSwag,55.39,,hf_open_llm_v1_240829_frozen.csv +palmyra_base,HFv1 MMLU,27.15,,hf_open_llm_v1_240829_frozen.csv +palmyra_base,HFv1 TruthfulQA,37.57,,hf_open_llm_v1_240829_frozen.csv +palmyra_base,HFv1 Winogrande,58.09,,hf_open_llm_v1_240829_frozen.csv +palmyra_large,HF OpenLLM v1,42.09,,hf_open_llm_v1_240829_frozen.csv +palmyra_large,HFv1 ARC,44.97,,hf_open_llm_v1_240829_frozen.csv +palmyra_large,HFv1 GSM8K,3.41,,hf_open_llm_v1_240829_frozen.csv +palmyra_large,HFv1 HellaSwag,71.85,,hf_open_llm_v1_240829_frozen.csv +palmyra_large,HFv1 MMLU,28.54,,hf_open_llm_v1_240829_frozen.csv +palmyra_large,HFv1 TruthfulQA,35.93,,hf_open_llm_v1_240829_frozen.csv +palmyra_large,HFv1 Winogrande,67.88,,hf_open_llm_v1_240829_frozen.csv +palmyra_med_20b,HF OpenLLM v1,44.71,,hf_open_llm_v1_240829_frozen.csv +palmyra_med_20b,HFv1 ARC,46.93,,hf_open_llm_v1_240829_frozen.csv +palmyra_med_20b,HFv1 GSM8K,2.65,,hf_open_llm_v1_240829_frozen.csv +palmyra_med_20b,HFv1 HellaSwag,73.51,,hf_open_llm_v1_240829_frozen.csv +palmyra_med_20b,HFv1 MMLU,44.34,,hf_open_llm_v1_240829_frozen.csv +palmyra_med_20b,HFv1 TruthfulQA,35.47,,hf_open_llm_v1_240829_frozen.csv +palmyra_med_20b,HFv1 Winogrande,65.35,,hf_open_llm_v1_240829_frozen.csv +parrotogno_7b,HF OpenLLM v1,76.22,,hf_open_llm_v1_240829_frozen.csv +parrotogno_7b,HFv1 ARC,73.04,,hf_open_llm_v1_240829_frozen.csv +parrotogno_7b,HFv1 GSM8K,69.6,,hf_open_llm_v1_240829_frozen.csv +parrotogno_7b,HFv1 HellaSwag,89.03,,hf_open_llm_v1_240829_frozen.csv +parrotogno_7b,HFv1 MMLU,64.51,,hf_open_llm_v1_240829_frozen.csv +parrotogno_7b,HFv1 TruthfulQA,76.53,,hf_open_llm_v1_240829_frozen.csv +parrotogno_7b,HFv1 Winogrande,84.61,,hf_open_llm_v1_240829_frozen.csv +pascalhermes_2_5_mistral_7b,HF OpenLLM v1,64.81,,hf_open_llm_v1_240829_frozen.csv +pascalhermes_2_5_mistral_7b,HFv1 ARC,63.82,,hf_open_llm_v1_240829_frozen.csv +pascalhermes_2_5_mistral_7b,HFv1 GSM8K,48.22,,hf_open_llm_v1_240829_frozen.csv +pascalhermes_2_5_mistral_7b,HFv1 HellaSwag,83.75,,hf_open_llm_v1_240829_frozen.csv +pascalhermes_2_5_mistral_7b,HFv1 MMLU,62.22,,hf_open_llm_v1_240829_frozen.csv +pascalhermes_2_5_mistral_7b,HFv1 TruthfulQA,53.72,,hf_open_llm_v1_240829_frozen.csv +pascalhermes_2_5_mistral_7b,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv +phi_1_5,HF OpenLLM v1,47.69,,hf_open_llm_v1_240829_frozen.csv +phi_1_5,HFv1 ARC,52.9,,hf_open_llm_v1_240829_frozen.csv +phi_1_5,HFv1 GSM8K,12.43,,hf_open_llm_v1_240829_frozen.csv +phi_1_5,HFv1 HellaSwag,63.79,,hf_open_llm_v1_240829_frozen.csv +phi_1_5,HFv1 MMLU,43.89,,hf_open_llm_v1_240829_frozen.csv +phi_1_5,HFv1 TruthfulQA,40.89,,hf_open_llm_v1_240829_frozen.csv +phi_1_5,HFv1 Winogrande,72.22,,hf_open_llm_v1_240829_frozen.csv +phi_1_5_chat_32k,HF OpenLLM v1,46.81,,hf_open_llm_v1_240829_frozen.csv +phi_1_5_chat_32k,HFv1 ARC,50.51,,hf_open_llm_v1_240829_frozen.csv +phi_1_5_chat_32k,HFv1 GSM8K,18.57,,hf_open_llm_v1_240829_frozen.csv +phi_1_5_chat_32k,HFv1 HellaSwag,59.25,,hf_open_llm_v1_240829_frozen.csv +phi_1_5_chat_32k,HFv1 MMLU,39.86,,hf_open_llm_v1_240829_frozen.csv +phi_1_5_chat_32k,HFv1 TruthfulQA,42.8,,hf_open_llm_v1_240829_frozen.csv +phi_1_5_chat_32k,HFv1 Winogrande,69.85,,hf_open_llm_v1_240829_frozen.csv +phi_2_dpo,HF OpenLLM v1,62.33,,hf_open_llm_v1_240829_frozen.csv +phi_2_dpo,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv +phi_2_dpo,HFv1 GSM8K,56.71,,hf_open_llm_v1_240829_frozen.csv +phi_2_dpo,HFv1 HellaSwag,76.36,,hf_open_llm_v1_240829_frozen.csv +phi_2_dpo,HFv1 MMLU,58.46,,hf_open_llm_v1_240829_frozen.csv +phi_2_dpo,HFv1 TruthfulQA,45.35,,hf_open_llm_v1_240829_frozen.csv +phi_2_dpo,HFv1 Winogrande,74.03,,hf_open_llm_v1_240829_frozen.csv +phi_2_dpo_renew1,HF OpenLLM v1,62.77,,hf_open_llm_v1_240829_frozen.csv +phi_2_dpo_renew1,HFv1 ARC,64.08,,hf_open_llm_v1_240829_frozen.csv +phi_2_dpo_renew1,HFv1 GSM8K,52.24,,hf_open_llm_v1_240829_frozen.csv +phi_2_dpo_renew1,HFv1 HellaSwag,77.45,,hf_open_llm_v1_240829_frozen.csv +phi_2_dpo_renew1,HFv1 MMLU,58.35,,hf_open_llm_v1_240829_frozen.csv +phi_2_dpo_renew1,HFv1 TruthfulQA,51.19,,hf_open_llm_v1_240829_frozen.csv +phi_2_dpo_renew1,HFv1 Winogrande,73.32,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_0_5ultrafeedback_i1,HF OpenLLM v1,61.21,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_0_5ultrafeedback_i1,HFv1 ARC,60.92,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_0_5ultrafeedback_i1,HFv1 GSM8K,55.04,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_0_5ultrafeedback_i1,HFv1 HellaSwag,75.02,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_0_5ultrafeedback_i1,HFv1 MMLU,57.97,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_0_5ultrafeedback_i1,HFv1 TruthfulQA,44.36,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_0_5ultrafeedback_i1,HFv1 Winogrande,73.95,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_0_5ultrafeedback_lowlr_i1,HF OpenLLM v1,61.36,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_0_5ultrafeedback_lowlr_i1,HFv1 ARC,61.01,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_0_5ultrafeedback_lowlr_i1,HFv1 GSM8K,56.33,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_0_5ultrafeedback_lowlr_i1,HFv1 HellaSwag,74.94,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_0_5ultrafeedback_lowlr_i1,HFv1 MMLU,57.9,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_0_5ultrafeedback_lowlr_i1,HFv1 TruthfulQA,44.33,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_0_5ultrafeedback_lowlr_i1,HFv1 Winogrande,73.64,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_extra_i1,HF OpenLLM v1,61.5,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_extra_i1,HFv1 ARC,61.43,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_extra_i1,HFv1 GSM8K,55.42,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_extra_i1,HFv1 HellaSwag,75.11,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_extra_i1,HFv1 MMLU,58.21,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_extra_i1,HFv1 TruthfulQA,44.82,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_extra_i1,HFv1 Winogrande,74.27,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_extra_v2_i1,HF OpenLLM v1,61.32,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_extra_v2_i1,HFv1 ARC,61.18,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_extra_v2_i1,HFv1 GSM8K,55.12,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_extra_v2_i1,HFv1 HellaSwag,74.8,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_extra_v2_i1,HFv1 MMLU,58.04,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_extra_v2_i1,HFv1 TruthfulQA,44.93,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_extra_v2_i1,HFv1 Winogrande,73.88,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_i0,HF OpenLLM v1,63.48,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_i0,HFv1 ARC,64.08,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_i0,HFv1 GSM8K,56.1,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_i0,HFv1 HellaSwag,76.99,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_i0,HFv1 MMLU,57.9,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_i0,HFv1 TruthfulQA,52.02,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_i0,HFv1 Winogrande,73.8,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_log_i0,HF OpenLLM v1,63.33,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_log_i0,HFv1 ARC,64.51,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_log_i0,HFv1 GSM8K,54.51,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_log_i0,HFv1 HellaSwag,76.87,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_log_i0,HFv1 MMLU,58.19,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_log_i0,HFv1 TruthfulQA,51.71,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_log_i0,HFv1 Winogrande,74.19,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_v2_i1,HF OpenLLM v1,61.44,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_v2_i1,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_v2_i1,HFv1 GSM8K,55.42,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_v2_i1,HFv1 HellaSwag,74.87,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_v2_i1,HFv1 MMLU,58.03,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_v2_i1,HFv1 TruthfulQA,44.3,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_v2_i1,HFv1 Winogrande,74.74,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_v4_i1,HF OpenLLM v1,61.34,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_v4_i1,HFv1 ARC,61.18,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_v4_i1,HFv1 GSM8K,55.5,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_v4_i1,HFv1 HellaSwag,74.92,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_v4_i1,HFv1 MMLU,58.11,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_v4_i1,HFv1 TruthfulQA,44.36,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_v4_i1,HFv1 Winogrande,73.95,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_vllm_i1,HF OpenLLM v1,61.32,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_vllm_i1,HFv1 ARC,61.26,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_vllm_i1,HFv1 GSM8K,55.42,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_vllm_i1,HFv1 HellaSwag,75.08,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_vllm_i1,HFv1 MMLU,57.86,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_vllm_i1,HFv1 TruthfulQA,44.33,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_b0_001_vllm_i1,HFv1 Winogrande,73.95,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_i0,HF OpenLLM v1,62.64,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_i0,HFv1 ARC,62.71,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_i0,HFv1 GSM8K,56.71,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_i0,HFv1 HellaSwag,76.26,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_i0,HFv1 MMLU,58.41,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_i0,HFv1 TruthfulQA,46.91,,hf_open_llm_v1_240829_frozen.csv +phi_2_gpo_renew2_i0,HFv1 Winogrande,74.82,,hf_open_llm_v1_240829_frozen.csv +phi_2_instruction,HF OpenLLM v1,60.92,,hf_open_llm_v1_240829_frozen.csv +phi_2_instruction,HFv1 ARC,61.35,,hf_open_llm_v1_240829_frozen.csv +phi_2_instruction,HFv1 GSM8K,52.54,,hf_open_llm_v1_240829_frozen.csv +phi_2_instruction,HFv1 HellaSwag,74.73,,hf_open_llm_v1_240829_frozen.csv +phi_2_instruction,HFv1 MMLU,57.81,,hf_open_llm_v1_240829_frozen.csv +phi_2_instruction,HFv1 TruthfulQA,45.1,,hf_open_llm_v1_240829_frozen.csv +phi_2_instruction,HFv1 Winogrande,74.82,,hf_open_llm_v1_240829_frozen.csv +phi_2_ipo_renew1,HF OpenLLM v1,62.28,,hf_open_llm_v1_240829_frozen.csv +phi_2_ipo_renew1,HFv1 ARC,62.8,,hf_open_llm_v1_240829_frozen.csv +phi_2_ipo_renew1,HFv1 GSM8K,57.09,,hf_open_llm_v1_240829_frozen.csv +phi_2_ipo_renew1,HFv1 HellaSwag,76.38,,hf_open_llm_v1_240829_frozen.csv +phi_2_ipo_renew1,HFv1 MMLU,58.16,,hf_open_llm_v1_240829_frozen.csv +phi_2_ipo_renew1,HFv1 TruthfulQA,45.66,,hf_open_llm_v1_240829_frozen.csv +phi_2_ipo_renew1,HFv1 Winogrande,73.56,,hf_open_llm_v1_240829_frozen.csv +phi_2_ipo_test_iter_0,HF OpenLLM v1,60.95,,hf_open_llm_v1_240829_frozen.csv +phi_2_ipo_test_iter_0,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv +phi_2_ipo_test_iter_0,HFv1 GSM8K,55.19,,hf_open_llm_v1_240829_frozen.csv +phi_2_ipo_test_iter_0,HFv1 HellaSwag,74.94,,hf_open_llm_v1_240829_frozen.csv +phi_2_ipo_test_iter_0,HFv1 MMLU,57.65,,hf_open_llm_v1_240829_frozen.csv +phi_2_ipo_test_iter_0,HFv1 TruthfulQA,43.66,,hf_open_llm_v1_240829_frozen.csv +phi_2_ipo_test_iter_0,HFv1 Winogrande,73.48,,hf_open_llm_v1_240829_frozen.csv +phi_2_layla_v1,HF OpenLLM v1,61.09,,hf_open_llm_v1_240829_frozen.csv +phi_2_layla_v1,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv +phi_2_layla_v1,HFv1 GSM8K,54.66,,hf_open_llm_v1_240829_frozen.csv +phi_2_layla_v1,HFv1 HellaSwag,75.0,,hf_open_llm_v1_240829_frozen.csv +phi_2_layla_v1,HFv1 MMLU,57.85,,hf_open_llm_v1_240829_frozen.csv +phi_2_layla_v1,HFv1 TruthfulQA,44.01,,hf_open_llm_v1_240829_frozen.csv +phi_2_layla_v1,HFv1 Winogrande,74.19,,hf_open_llm_v1_240829_frozen.csv +phi_2_layla_v1_chatml,HF OpenLLM v1,60.77,,hf_open_llm_v1_240829_frozen.csv +phi_2_layla_v1_chatml,HFv1 ARC,60.41,,hf_open_llm_v1_240829_frozen.csv +phi_2_layla_v1_chatml,HFv1 GSM8K,54.51,,hf_open_llm_v1_240829_frozen.csv +phi_2_layla_v1_chatml,HFv1 HellaSwag,74.58,,hf_open_llm_v1_240829_frozen.csv +phi_2_layla_v1_chatml,HFv1 MMLU,56.62,,hf_open_llm_v1_240829_frozen.csv +phi_2_layla_v1_chatml,HFv1 TruthfulQA,44.21,,hf_open_llm_v1_240829_frozen.csv +phi_2_layla_v1_chatml,HFv1 Winogrande,74.27,,hf_open_llm_v1_240829_frozen.csv +phi_2_super,HF OpenLLM v1,62.13,,hf_open_llm_v1_240829_frozen.csv +phi_2_super,HFv1 ARC,61.86,,hf_open_llm_v1_240829_frozen.csv +phi_2_super,HFv1 GSM8K,54.51,,hf_open_llm_v1_240829_frozen.csv +phi_2_super,HFv1 HellaSwag,76.6,,hf_open_llm_v1_240829_frozen.csv +phi_2_super,HFv1 MMLU,58.41,,hf_open_llm_v1_240829_frozen.csv +phi_2_super,HFv1 TruthfulQA,48.37,,hf_open_llm_v1_240829_frozen.csv +phi_2_super,HFv1 Winogrande,73.01,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_128k_instruct,HF OpenLLM v1,73.0,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_128k_instruct,HFv1 ARC,66.47,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_128k_instruct,HFv1 GSM8K,81.05,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_128k_instruct,HFv1 HellaSwag,84.85,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_128k_instruct,HFv1 MMLU,76.68,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_128k_instruct,HFv1 TruthfulQA,54.52,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_128k_instruct,HFv1 Winogrande,74.43,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_4k_instruct,HF OpenLLM v1,73.57,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_4k_instruct,HFv1 ARC,67.06,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_4k_instruct,HFv1 GSM8K,80.21,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_4k_instruct,HFv1 HellaSwag,85.69,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_4k_instruct,HFv1 MMLU,77.85,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_4k_instruct,HFv1 TruthfulQA,57.75,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_4k_instruct,HFv1 Winogrande,72.85,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_llamaish,HF OpenLLM v1,73.48,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_llamaish,HFv1 ARC,67.41,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_llamaish,HFv1 GSM8K,80.36,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_llamaish,HFv1 HellaSwag,85.6,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_llamaish,HFv1 MMLU,77.86,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_llamaish,HFv1 TruthfulQA,55.87,,hf_open_llm_v1_240829_frozen.csv +phi_3_medium_llamaish,HFv1 Winogrande,73.8,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct,HF OpenLLM v1,68.07,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct,HFv1 ARC,63.14,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct,HFv1 GSM8K,69.52,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct,HFv1 HellaSwag,80.09,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct,HFv1 MMLU,68.7,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct,HFv1 TruthfulQA,54.12,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct,HFv1 Winogrande,72.85,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct_humanchoice_4_6k_dpo,HF OpenLLM v1,68.16,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct_humanchoice_4_6k_dpo,HFv1 ARC,62.97,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct_humanchoice_4_6k_dpo,HFv1 GSM8K,69.75,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct_humanchoice_4_6k_dpo,HFv1 HellaSwag,80.09,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct_humanchoice_4_6k_dpo,HFv1 MMLU,68.62,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct_humanchoice_4_6k_dpo,HFv1 TruthfulQA,54.51,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct_humanchoice_4_6k_dpo,HFv1 Winogrande,73.01,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct_linearbunkascore_4_6k_dpo,HF OpenLLM v1,68.07,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct_linearbunkascore_4_6k_dpo,HFv1 ARC,63.05,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct_linearbunkascore_4_6k_dpo,HFv1 GSM8K,69.45,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct_linearbunkascore_4_6k_dpo,HFv1 HellaSwag,79.93,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct_linearbunkascore_4_6k_dpo,HFv1 MMLU,68.82,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct_linearbunkascore_4_6k_dpo,HFv1 TruthfulQA,54.42,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_128k_instruct_linearbunkascore_4_6k_dpo,HFv1 Winogrande,72.77,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct,HF OpenLLM v1,69.91,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct,HFv1 ARC,62.97,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct,HFv1 GSM8K,74.53,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct,HFv1 HellaSwag,80.61,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct,HFv1 MMLU,69.08,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct,HFv1 TruthfulQA,59.88,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct,HFv1 Winogrande,72.69,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_llamafied,HF OpenLLM v1,69.49,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_llamafied,HFv1 ARC,62.88,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_llamafied,HFv1 GSM8K,73.69,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_llamafied,HFv1 HellaSwag,80.57,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_llamafied,HFv1 MMLU,67.23,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_llamafied,HFv1 TruthfulQA,59.88,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_llamafied,HFv1 Winogrande,72.69,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_v0_1,HF OpenLLM v1,69.57,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_v0_1,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_v0_1,HFv1 GSM8K,72.25,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_v0_1,HFv1 HellaSwag,81.07,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_v0_1,HFv1 MMLU,68.96,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_v0_1,HFv1 TruthfulQA,61.48,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_v0_1,HFv1 Winogrande,71.03,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_v0_3,HF OpenLLM v1,70.26,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_v0_3,HFv1 ARC,63.48,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_v0_3,HFv1 GSM8K,74.53,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_v0_3,HFv1 HellaSwag,80.86,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_v0_3,HFv1 MMLU,69.24,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_v0_3,HFv1 TruthfulQA,60.66,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_4k_instruct_v0_3,HFv1 Winogrande,72.77,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_mango_1_llamafied,HF OpenLLM v1,69.69,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_mango_1_llamafied,HFv1 ARC,63.14,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_mango_1_llamafied,HFv1 GSM8K,74.75,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_mango_1_llamafied,HFv1 HellaSwag,80.57,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_mango_1_llamafied,HFv1 MMLU,67.24,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_mango_1_llamafied,HFv1 TruthfulQA,59.84,,hf_open_llm_v1_240829_frozen.csv +phi_3_mini_mango_1_llamafied,HFv1 Winogrande,72.61,,hf_open_llm_v1_240829_frozen.csv +phi_3_orpo_v9_16,HF OpenLLM v1,64.4,,hf_open_llm_v1_240829_frozen.csv +phi_3_orpo_v9_16,HFv1 ARC,60.41,,hf_open_llm_v1_240829_frozen.csv +phi_3_orpo_v9_16,HFv1 GSM8K,62.32,,hf_open_llm_v1_240829_frozen.csv +phi_3_orpo_v9_16,HFv1 HellaSwag,78.37,,hf_open_llm_v1_240829_frozen.csv +phi_3_orpo_v9_16,HFv1 MMLU,65.26,,hf_open_llm_v1_240829_frozen.csv +phi_3_orpo_v9_16,HFv1 TruthfulQA,49.76,,hf_open_llm_v1_240829_frozen.csv +phi_3_orpo_v9_16,HFv1 Winogrande,70.24,,hf_open_llm_v1_240829_frozen.csv +phi_gemma_nlaf_v0,HF OpenLLM v1,42.84,,hf_open_llm_v1_240829_frozen.csv +phi_gemma_nlaf_v0,HFv1 ARC,44.2,,hf_open_llm_v1_240829_frozen.csv +phi_gemma_nlaf_v0,HFv1 GSM8K,5.53,,hf_open_llm_v1_240829_frozen.csv +phi_gemma_nlaf_v0,HFv1 HellaSwag,62.73,,hf_open_llm_v1_240829_frozen.csv +phi_gemma_nlaf_v0,HFv1 MMLU,37.7,,hf_open_llm_v1_240829_frozen.csv +phi_gemma_nlaf_v0,HFv1 TruthfulQA,45.79,,hf_open_llm_v1_240829_frozen.csv +phi_gemma_nlaf_v0,HFv1 Winogrande,61.09,,hf_open_llm_v1_240829_frozen.csv +phi_gemma_nlaf_v1,HF OpenLLM v1,42.83,,hf_open_llm_v1_240829_frozen.csv +phi_gemma_nlaf_v1,HFv1 ARC,43.86,,hf_open_llm_v1_240829_frozen.csv +phi_gemma_nlaf_v1,HFv1 GSM8K,5.84,,hf_open_llm_v1_240829_frozen.csv +phi_gemma_nlaf_v1,HFv1 HellaSwag,62.7,,hf_open_llm_v1_240829_frozen.csv +phi_gemma_nlaf_v1,HFv1 MMLU,37.58,,hf_open_llm_v1_240829_frozen.csv +phi_gemma_nlaf_v1,HFv1 TruthfulQA,45.86,,hf_open_llm_v1_240829_frozen.csv +phi_gemma_nlaf_v1,HFv1 Winogrande,61.17,,hf_open_llm_v1_240829_frozen.csv +phi_openllm_lb_test,HF OpenLLM v1,48.78,,hf_open_llm_v1_240829_frozen.csv +phi_openllm_lb_test,HFv1 ARC,51.79,,hf_open_llm_v1_240829_frozen.csv +phi_openllm_lb_test,HFv1 GSM8K,21.53,,hf_open_llm_v1_240829_frozen.csv +phi_openllm_lb_test,HFv1 HellaSwag,62.04,,hf_open_llm_v1_240829_frozen.csv +phi_openllm_lb_test,HFv1 MMLU,42.58,,hf_open_llm_v1_240829_frozen.csv +phi_openllm_lb_test,HFv1 TruthfulQA,40.69,,hf_open_llm_v1_240829_frozen.csv +phi_openllm_lb_test,HFv1 Winogrande,74.03,,hf_open_llm_v1_240829_frozen.csv +phigrange_dpo,HF OpenLLM v1,29.21,,hf_open_llm_v1_240829_frozen.csv +phigrange_dpo,HFv1 ARC,25.77,,hf_open_llm_v1_240829_frozen.csv +phigrange_dpo,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +phigrange_dpo,HFv1 HellaSwag,25.7,,hf_open_llm_v1_240829_frozen.csv +phigrange_dpo,HFv1 MMLU,25.52,,hf_open_llm_v1_240829_frozen.csv +phigrange_dpo,HFv1 TruthfulQA,47.98,,hf_open_llm_v1_240829_frozen.csv +phigrange_dpo,HFv1 Winogrande,50.28,,hf_open_llm_v1_240829_frozen.csv +phind_codellama34b_v2,HF OpenLLM v1,36.89,,hf_open_llm_v1_240829_frozen.csv +phind_codellama34b_v2,HFv1 ARC,24.57,,hf_open_llm_v1_240829_frozen.csv +phind_codellama34b_v2,HFv1 GSM8K,23.2,,hf_open_llm_v1_240829_frozen.csv +phind_codellama34b_v2,HFv1 HellaSwag,27.6,,hf_open_llm_v1_240829_frozen.csv +phind_codellama34b_v2,HFv1 MMLU,25.76,,hf_open_llm_v1_240829_frozen.csv +phind_codellama34b_v2,HFv1 TruthfulQA,48.37,,hf_open_llm_v1_240829_frozen.csv +phind_codellama34b_v2,HFv1 Winogrande,71.82,,hf_open_llm_v1_240829_frozen.csv +piano_medley_7b,HF OpenLLM v1,69.1,,hf_open_llm_v1_240829_frozen.csv +piano_medley_7b,HFv1 ARC,67.58,,hf_open_llm_v1_240829_frozen.csv +piano_medley_7b,HFv1 GSM8K,56.56,,hf_open_llm_v1_240829_frozen.csv +piano_medley_7b,HFv1 HellaSwag,85.36,,hf_open_llm_v1_240829_frozen.csv +piano_medley_7b,HFv1 MMLU,64.49,,hf_open_llm_v1_240829_frozen.csv +piano_medley_7b,HFv1 TruthfulQA,61.42,,hf_open_llm_v1_240829_frozen.csv +piano_medley_7b,HFv1 Winogrande,79.16,,hf_open_llm_v1_240829_frozen.csv +piccolo_8x7b,HF OpenLLM v1,72.8,,hf_open_llm_v1_240829_frozen.csv +piccolo_8x7b,HFv1 ARC,69.62,,hf_open_llm_v1_240829_frozen.csv +piccolo_8x7b,HFv1 GSM8K,72.02,,hf_open_llm_v1_240829_frozen.csv +piccolo_8x7b,HFv1 HellaSwag,86.98,,hf_open_llm_v1_240829_frozen.csv +piccolo_8x7b,HFv1 MMLU,64.13,,hf_open_llm_v1_240829_frozen.csv +piccolo_8x7b,HFv1 TruthfulQA,64.17,,hf_open_llm_v1_240829_frozen.csv +piccolo_8x7b,HFv1 Winogrande,79.87,,hf_open_llm_v1_240829_frozen.csv +pile_7b_250b_tokens,HF OpenLLM v1,32.44,,hf_open_llm_v1_240829_frozen.csv +pile_7b_250b_tokens,HFv1 ARC,29.27,,hf_open_llm_v1_240829_frozen.csv +pile_7b_250b_tokens,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv +pile_7b_250b_tokens,HFv1 HellaSwag,46.29,,hf_open_llm_v1_240829_frozen.csv +pile_7b_250b_tokens,HFv1 MMLU,25.25,,hf_open_llm_v1_240829_frozen.csv +pile_7b_250b_tokens,HFv1 TruthfulQA,40.49,,hf_open_llm_v1_240829_frozen.csv +pile_7b_250b_tokens,HFv1 Winogrande,52.8,,hf_open_llm_v1_240829_frozen.csv +pivot_0_1_early,HF OpenLLM v1,64.58,,hf_open_llm_v1_240829_frozen.csv +pivot_0_1_early,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv +pivot_0_1_early,HFv1 GSM8K,44.43,,hf_open_llm_v1_240829_frozen.csv +pivot_0_1_early,HFv1 HellaSwag,82.97,,hf_open_llm_v1_240829_frozen.csv +pivot_0_1_early,HFv1 MMLU,61.02,,hf_open_llm_v1_240829_frozen.csv +pivot_0_1_early,HFv1 TruthfulQA,62.89,,hf_open_llm_v1_240829_frozen.csv +pivot_0_1_early,HFv1 Winogrande,73.72,,hf_open_llm_v1_240829_frozen.csv +pivot_0_1_evil_a,HF OpenLLM v1,59.16,,hf_open_llm_v1_240829_frozen.csv +pivot_0_1_evil_a,HFv1 ARC,59.64,,hf_open_llm_v1_240829_frozen.csv +pivot_0_1_evil_a,HFv1 GSM8K,40.41,,hf_open_llm_v1_240829_frozen.csv +pivot_0_1_evil_a,HFv1 HellaSwag,81.48,,hf_open_llm_v1_240829_frozen.csv +pivot_0_1_evil_a,HFv1 MMLU,58.94,,hf_open_llm_v1_240829_frozen.csv +pivot_0_1_evil_a,HFv1 TruthfulQA,39.23,,hf_open_llm_v1_240829_frozen.csv +pivot_0_1_evil_a,HFv1 Winogrande,75.3,,hf_open_llm_v1_240829_frozen.csv +pivot_10_7b_mistral_v0_2,HF OpenLLM v1,64.25,,hf_open_llm_v1_240829_frozen.csv +pivot_10_7b_mistral_v0_2,HFv1 ARC,63.31,,hf_open_llm_v1_240829_frozen.csv +pivot_10_7b_mistral_v0_2,HFv1 GSM8K,42.38,,hf_open_llm_v1_240829_frozen.csv +pivot_10_7b_mistral_v0_2,HFv1 HellaSwag,81.68,,hf_open_llm_v1_240829_frozen.csv +pivot_10_7b_mistral_v0_2,HFv1 MMLU,59.86,,hf_open_llm_v1_240829_frozen.csv +pivot_10_7b_mistral_v0_2,HFv1 TruthfulQA,58.23,,hf_open_llm_v1_240829_frozen.csv +pivot_10_7b_mistral_v0_2,HFv1 Winogrande,80.03,,hf_open_llm_v1_240829_frozen.csv +pivot_sus_rp,HF OpenLLM v1,72.57,,hf_open_llm_v1_240829_frozen.csv +pivot_sus_rp,HFv1 ARC,66.55,,hf_open_llm_v1_240829_frozen.csv +pivot_sus_rp,HFv1 GSM8K,70.51,,hf_open_llm_v1_240829_frozen.csv +pivot_sus_rp,HFv1 HellaSwag,84.23,,hf_open_llm_v1_240829_frozen.csv +pivot_sus_rp,HFv1 MMLU,76.23,,hf_open_llm_v1_240829_frozen.csv +pivot_sus_rp,HFv1 TruthfulQA,54.57,,hf_open_llm_v1_240829_frozen.csv +pivot_sus_rp,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_200k_q,HF OpenLLM v1,62.0,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_200k_q,HFv1 ARC,63.91,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_200k_q,HFv1 GSM8K,24.11,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_200k_q,HFv1 HellaSwag,83.52,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_200k_q,HFv1 MMLU,75.19,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_200k_q,HFv1 TruthfulQA,44.21,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_200k_q,HFv1 Winogrande,81.06,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_200k_q_fastchat,HF OpenLLM v1,67.85,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_200k_q_fastchat,HFv1 ARC,64.93,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_200k_q_fastchat,HFv1 GSM8K,51.48,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_200k_q_fastchat,HFv1 HellaSwag,84.46,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_200k_q_fastchat,HFv1 MMLU,77.13,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_200k_q_fastchat,HFv1 TruthfulQA,48.38,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_200k_q_fastchat,HFv1 Winogrande,80.74,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama,HF OpenLLM v1,68.37,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama,HFv1 ARC,67.83,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama,HFv1 GSM8K,42.46,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama,HFv1 HellaSwag,85.35,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama,HFv1 MMLU,78.26,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama,HFv1 TruthfulQA,53.46,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama,HFv1 Winogrande,82.87,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q,HF OpenLLM v1,71.13,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q,HFv1 ARC,65.7,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q,HFv1 GSM8K,60.42,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q,HFv1 HellaSwag,85.22,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q,HFv1 MMLU,78.78,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q,HFv1 TruthfulQA,53.64,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q,HFv1 Winogrande,83.03,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_fastchat,HF OpenLLM v1,68.31,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_fastchat,HFv1 ARC,66.13,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_fastchat,HFv1 GSM8K,44.35,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_fastchat,HFv1 HellaSwag,85.25,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_fastchat,HFv1 MMLU,78.37,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_fastchat,HFv1 TruthfulQA,53.62,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_fastchat,HFv1 Winogrande,82.16,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_v2,HF OpenLLM v1,67.88,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_v2,HFv1 ARC,61.09,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_v2,HFv1 GSM8K,49.05,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_v2,HFv1 HellaSwag,85.09,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_v2,HFv1 MMLU,76.59,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_v2,HFv1 TruthfulQA,52.65,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_v2,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_v3,HF OpenLLM v1,61.15,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_v3,HFv1 ARC,64.33,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_v3,HFv1 GSM8K,6.67,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_v3,HFv1 HellaSwag,84.88,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_v3,HFv1 MMLU,74.98,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_v3,HFv1 TruthfulQA,51.8,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_llama_q_v3,HFv1 Winogrande,84.21,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_lora,HF OpenLLM v1,68.1,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_lora,HFv1 ARC,67.15,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_lora,HFv1 GSM8K,40.64,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_lora,HFv1 HellaSwag,85.37,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_lora,HFv1 MMLU,78.46,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_lora,HFv1 TruthfulQA,53.32,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_lora,HFv1 Winogrande,83.66,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_q,HF OpenLLM v1,69.86,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_q,HFv1 ARC,66.89,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_q,HFv1 GSM8K,53.98,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_q,HFv1 HellaSwag,85.14,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_q,HFv1 MMLU,77.66,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_q,HFv1 TruthfulQA,53.03,,hf_open_llm_v1_240829_frozen.csv +platyi_34b_q,HFv1 Winogrande,82.48,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_ia3,HF OpenLLM v1,54.23,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_ia3,HFv1 ARC,61.09,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_ia3,HFv1 GSM8K,11.3,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_ia3,HFv1 HellaSwag,82.65,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_ia3,HFv1 MMLU,56.32,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_ia3,HFv1 TruthfulQA,38.35,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_ia3,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_lora,HF OpenLLM v1,54.48,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_lora,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_lora,HFv1 GSM8K,7.51,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_lora,HFv1 HellaSwag,82.5,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_lora,HFv1 MMLU,56.34,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_lora,HFv1 TruthfulQA,43.91,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_lora,HFv1 Winogrande,75.93,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_qlora,HF OpenLLM v1,53.74,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_qlora,HFv1 ARC,57.51,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_qlora,HFv1 GSM8K,5.0,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_qlora,HFv1 HellaSwag,82.55,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_qlora,HFv1 MMLU,57.34,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_qlora,HFv1 TruthfulQA,43.38,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_qlora,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,52.27,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_qlora_0_80_epoch,HFv1 ARC,57.76,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,2.96,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,81.63,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,55.63,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,39.7,,hf_open_llm_v1_240829_frozen.csv +platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,75.93,,hf_open_llm_v1_240829_frozen.csv +platypus2_22b_relora,HF OpenLLM v1,53.64,,hf_open_llm_v1_240829_frozen.csv +platypus2_22b_relora,HFv1 ARC,57.51,,hf_open_llm_v1_240829_frozen.csv +platypus2_22b_relora,HFv1 GSM8K,6.29,,hf_open_llm_v1_240829_frozen.csv +platypus2_22b_relora,HFv1 HellaSwag,82.36,,hf_open_llm_v1_240829_frozen.csv +platypus2_22b_relora,HFv1 MMLU,54.94,,hf_open_llm_v1_240829_frozen.csv +platypus2_22b_relora,HFv1 TruthfulQA,43.62,,hf_open_llm_v1_240829_frozen.csv +platypus2_22b_relora,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv +platypus2_7b,HF OpenLLM v1,49.97,,hf_open_llm_v1_240829_frozen.csv +platypus2_7b,HFv1 ARC,55.2,,hf_open_llm_v1_240829_frozen.csv +platypus2_7b,HFv1 GSM8K,1.82,,hf_open_llm_v1_240829_frozen.csv +platypus2_7b,HFv1 HellaSwag,78.84,,hf_open_llm_v1_240829_frozen.csv +platypus2_7b,HFv1 MMLU,49.83,,hf_open_llm_v1_240829_frozen.csv +platypus2_7b,HFv1 TruthfulQA,40.64,,hf_open_llm_v1_240829_frozen.csv +platypus2_7b,HFv1 Winogrande,73.48,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3,HF OpenLLM v1,56.65,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3,HFv1 ARC,62.12,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3,HFv1 GSM8K,11.83,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3,HFv1 HellaSwag,82.1,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3,HFv1 MMLU,58.84,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3,HFv1 TruthfulQA,47.88,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3,HFv1 Winogrande,77.11,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v2_1,HF OpenLLM v1,56.29,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v2_1,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v2_1,HFv1 GSM8K,10.99,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v2_1,HFv1 HellaSwag,82.09,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v2_1,HFv1 MMLU,57.91,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v2_1,HFv1 TruthfulQA,47.03,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v2_1,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v3,HF OpenLLM v1,56.74,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v3,HFv1 ARC,62.54,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v3,HFv1 GSM8K,12.36,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v3,HFv1 HellaSwag,82.1,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v3,HFv1 MMLU,58.67,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v3,HFv1 TruthfulQA,46.96,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v3,HFv1 Winogrande,77.82,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v4,HF OpenLLM v1,56.49,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v4,HFv1 ARC,61.43,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v4,HFv1 GSM8K,10.84,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v4,HFv1 HellaSwag,81.84,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v4,HFv1 MMLU,59.02,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v4,HFv1 TruthfulQA,48.64,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_ia3_v4,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_lora,HF OpenLLM v1,55.15,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_lora,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_lora,HFv1 GSM8K,7.13,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_lora,HFv1 HellaSwag,82.09,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_lora,HFv1 MMLU,58.77,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_lora,HFv1 TruthfulQA,45.15,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_lora,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_lora_v2,HF OpenLLM v1,51.61,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_lora_v2,HFv1 ARC,58.62,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_lora_v2,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_lora_v2,HFv1 HellaSwag,81.17,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_lora_v2,HFv1 MMLU,50.23,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_lora_v2,HFv1 TruthfulQA,43.43,,hf_open_llm_v1_240829_frozen.csv +platypus2xopenorca_13b_lora_v2,HFv1 Winogrande,76.16,,hf_open_llm_v1_240829_frozen.csv +platypus_1_8b,HF OpenLLM v1,35.24,,hf_open_llm_v1_240829_frozen.csv +platypus_1_8b,HFv1 ARC,33.28,,hf_open_llm_v1_240829_frozen.csv +platypus_1_8b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv +platypus_1_8b,HFv1 HellaSwag,50.76,,hf_open_llm_v1_240829_frozen.csv +platypus_1_8b,HFv1 MMLU,33.25,,hf_open_llm_v1_240829_frozen.csv +platypus_1_8b,HFv1 TruthfulQA,40.73,,hf_open_llm_v1_240829_frozen.csv +platypus_1_8b,HFv1 Winogrande,52.96,,hf_open_llm_v1_240829_frozen.csv +platypus_2_22b_relora,HF OpenLLM v1,53.83,,hf_open_llm_v1_240829_frozen.csv +platypus_2_22b_relora,HFv1 ARC,57.68,,hf_open_llm_v1_240829_frozen.csv +platypus_2_22b_relora,HFv1 GSM8K,6.6,,hf_open_llm_v1_240829_frozen.csv +platypus_2_22b_relora,HFv1 HellaSwag,82.44,,hf_open_llm_v1_240829_frozen.csv +platypus_2_22b_relora,HFv1 MMLU,55.33,,hf_open_llm_v1_240829_frozen.csv +platypus_2_22b_relora,HFv1 TruthfulQA,43.61,,hf_open_llm_v1_240829_frozen.csv +platypus_2_22b_relora,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv +platypus_30b,HF OpenLLM v1,59.03,,hf_open_llm_v1_240829_frozen.csv +platypus_30b,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv +platypus_30b,HFv1 GSM8K,14.4,,hf_open_llm_v1_240829_frozen.csv +platypus_30b,HFv1 HellaSwag,84.26,,hf_open_llm_v1_240829_frozen.csv +platypus_30b,HFv1 MMLU,64.23,,hf_open_llm_v1_240829_frozen.csv +platypus_30b,HFv1 TruthfulQA,45.35,,hf_open_llm_v1_240829_frozen.csv +platypus_30b,HFv1 Winogrande,81.37,,hf_open_llm_v1_240829_frozen.csv +platypus_nebula_v2_7b,HF OpenLLM v1,53.95,,hf_open_llm_v1_240829_frozen.csv +platypus_nebula_v2_7b,HFv1 ARC,55.38,,hf_open_llm_v1_240829_frozen.csv +platypus_nebula_v2_7b,HFv1 GSM8K,10.08,,hf_open_llm_v1_240829_frozen.csv +platypus_nebula_v2_7b,HFv1 HellaSwag,83.02,,hf_open_llm_v1_240829_frozen.csv +platypus_nebula_v2_7b,HFv1 MMLU,56.07,,hf_open_llm_v1_240829_frozen.csv +platypus_nebula_v2_7b,HFv1 TruthfulQA,46.94,,hf_open_llm_v1_240829_frozen.csv +platypus_nebula_v2_7b,HFv1 Winogrande,72.22,,hf_open_llm_v1_240829_frozen.csv +platypus_yi_34b,HF OpenLLM v1,71.69,,hf_open_llm_v1_240829_frozen.csv +platypus_yi_34b,HFv1 ARC,68.43,,hf_open_llm_v1_240829_frozen.csv +platypus_yi_34b,HFv1 GSM8K,59.82,,hf_open_llm_v1_240829_frozen.csv +platypus_yi_34b,HFv1 HellaSwag,85.21,,hf_open_llm_v1_240829_frozen.csv +platypus_yi_34b,HFv1 MMLU,78.13,,hf_open_llm_v1_240829_frozen.csv +platypus_yi_34b,HFv1 TruthfulQA,54.48,,hf_open_llm_v1_240829_frozen.csv +platypus_yi_34b,HFv1 Winogrande,84.06,,hf_open_llm_v1_240829_frozen.csv +pmmpk_einstainmorcoro14krishnahercules_7b_slerp,HF OpenLLM v1,72.67,,hf_open_llm_v1_240829_frozen.csv +pmmpk_einstainmorcoro14krishnahercules_7b_slerp,HFv1 ARC,69.28,,hf_open_llm_v1_240829_frozen.csv +pmmpk_einstainmorcoro14krishnahercules_7b_slerp,HFv1 GSM8K,71.42,,hf_open_llm_v1_240829_frozen.csv +pmmpk_einstainmorcoro14krishnahercules_7b_slerp,HFv1 HellaSwag,86.59,,hf_open_llm_v1_240829_frozen.csv +pmmpk_einstainmorcoro14krishnahercules_7b_slerp,HFv1 MMLU,65.13,,hf_open_llm_v1_240829_frozen.csv +pmmpk_einstainmorcoro14krishnahercules_7b_slerp,HFv1 TruthfulQA,62.69,,hf_open_llm_v1_240829_frozen.csv +pmmpk_einstainmorcoro14krishnahercules_7b_slerp,HFv1 Winogrande,80.9,,hf_open_llm_v1_240829_frozen.csv +polar_14b_v0_2,HF OpenLLM v1,66.39,,hf_open_llm_v1_240829_frozen.csv +polar_14b_v0_2,HFv1 ARC,77.13,,hf_open_llm_v1_240829_frozen.csv +polar_14b_v0_2,HFv1 GSM8K,3.56,,hf_open_llm_v1_240829_frozen.csv +polar_14b_v0_2,HFv1 HellaSwag,90.72,,hf_open_llm_v1_240829_frozen.csv +polar_14b_v0_2,HFv1 MMLU,63.76,,hf_open_llm_v1_240829_frozen.csv +polar_14b_v0_2,HFv1 TruthfulQA,80.81,,hf_open_llm_v1_240829_frozen.csv +polar_14b_v0_2,HFv1 Winogrande,82.32,,hf_open_llm_v1_240829_frozen.csv +polyglot_ko_12_8b,HF OpenLLM v1,33.33,,hf_open_llm_v1_240829_frozen.csv +polyglot_ko_12_8b,HFv1 ARC,27.05,,hf_open_llm_v1_240829_frozen.csv +polyglot_ko_12_8b,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv +polyglot_ko_12_8b,HFv1 HellaSwag,51.68,,hf_open_llm_v1_240829_frozen.csv +polyglot_ko_12_8b,HFv1 MMLU,26.64,,hf_open_llm_v1_240829_frozen.csv +polyglot_ko_12_8b,HFv1 TruthfulQA,34.69,,hf_open_llm_v1_240829_frozen.csv +polyglot_ko_12_8b,HFv1 Winogrande,59.75,,hf_open_llm_v1_240829_frozen.csv +polyglot_math_4x7b,HF OpenLLM v1,66.84,,hf_open_llm_v1_240829_frozen.csv +polyglot_math_4x7b,HFv1 ARC,63.74,,hf_open_llm_v1_240829_frozen.csv +polyglot_math_4x7b,HFv1 GSM8K,56.63,,hf_open_llm_v1_240829_frozen.csv +polyglot_math_4x7b,HFv1 HellaSwag,84.85,,hf_open_llm_v1_240829_frozen.csv +polyglot_math_4x7b,HFv1 MMLU,63.57,,hf_open_llm_v1_240829_frozen.csv +polyglot_math_4x7b,HFv1 TruthfulQA,53.78,,hf_open_llm_v1_240829_frozen.csv +polyglot_math_4x7b,HFv1 Winogrande,78.45,,hf_open_llm_v1_240829_frozen.csv +poro_34b_gptq,HF OpenLLM v1,44.67,,hf_open_llm_v1_240829_frozen.csv +poro_34b_gptq,HFv1 ARC,47.01,,hf_open_llm_v1_240829_frozen.csv +poro_34b_gptq,HFv1 GSM8K,5.08,,hf_open_llm_v1_240829_frozen.csv +poro_34b_gptq,HFv1 HellaSwag,73.75,,hf_open_llm_v1_240829_frozen.csv +poro_34b_gptq,HFv1 MMLU,32.47,,hf_open_llm_v1_240829_frozen.csv +poro_34b_gptq,HFv1 TruthfulQA,38.37,,hf_open_llm_v1_240829_frozen.csv +poro_34b_gptq,HFv1 Winogrande,71.35,,hf_open_llm_v1_240829_frozen.csv +power_llama3_13b_instruct,HF OpenLLM v1,65.21,,hf_open_llm_v1_240829_frozen.csv +power_llama3_13b_instruct,HFv1 ARC,67.83,,hf_open_llm_v1_240829_frozen.csv +power_llama3_13b_instruct,HFv1 GSM8K,37.68,,hf_open_llm_v1_240829_frozen.csv +power_llama3_13b_instruct,HFv1 HellaSwag,85.13,,hf_open_llm_v1_240829_frozen.csv +power_llama3_13b_instruct,HFv1 MMLU,64.44,,hf_open_llm_v1_240829_frozen.csv +power_llama3_13b_instruct,HFv1 TruthfulQA,60.57,,hf_open_llm_v1_240829_frozen.csv +power_llama3_13b_instruct,HFv1 Winogrande,75.61,,hf_open_llm_v1_240829_frozen.csv +prodigy_7b,HF OpenLLM v1,73.68,,hf_open_llm_v1_240829_frozen.csv +prodigy_7b,HFv1 ARC,71.59,,hf_open_llm_v1_240829_frozen.csv +prodigy_7b,HFv1 GSM8K,64.37,,hf_open_llm_v1_240829_frozen.csv +prodigy_7b,HFv1 HellaSwag,88.09,,hf_open_llm_v1_240829_frozen.csv +prodigy_7b,HFv1 MMLU,64.92,,hf_open_llm_v1_240829_frozen.csv +prodigy_7b,HFv1 TruthfulQA,68.57,,hf_open_llm_v1_240829_frozen.csv +prodigy_7b,HFv1 Winogrande,84.53,,hf_open_llm_v1_240829_frozen.csv +puddlejumper_13b_v2,HF OpenLLM v1,54.19,,hf_open_llm_v1_240829_frozen.csv +puddlejumper_13b_v2,HFv1 ARC,57.0,,hf_open_llm_v1_240829_frozen.csv +puddlejumper_13b_v2,HFv1 GSM8K,3.64,,hf_open_llm_v1_240829_frozen.csv +puddlejumper_13b_v2,HFv1 HellaSwag,81.06,,hf_open_llm_v1_240829_frozen.csv +puddlejumper_13b_v2,HFv1 MMLU,58.3,,hf_open_llm_v1_240829_frozen.csv +puddlejumper_13b_v2,HFv1 TruthfulQA,52.66,,hf_open_llm_v1_240829_frozen.csv +puddlejumper_13b_v2,HFv1 Winogrande,72.45,,hf_open_llm_v1_240829_frozen.csv +puddlejumper_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,52.41,,hf_open_llm_v1_240829_frozen.csv +puddlejumper_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,54.52,,hf_open_llm_v1_240829_frozen.csv +puddlejumper_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +puddlejumper_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,79.36,,hf_open_llm_v1_240829_frozen.csv +puddlejumper_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,55.15,,hf_open_llm_v1_240829_frozen.csv +puddlejumper_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,54.32,,hf_open_llm_v1_240829_frozen.csv +puddlejumper_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,71.11,,hf_open_llm_v1_240829_frozen.csv +puli_gptrio,HF OpenLLM v1,34.42,,hf_open_llm_v1_240829_frozen.csv +puli_gptrio,HFv1 ARC,30.72,,hf_open_llm_v1_240829_frozen.csv +puli_gptrio,HFv1 GSM8K,0.76,,hf_open_llm_v1_240829_frozen.csv +puli_gptrio,HFv1 HellaSwag,53.49,,hf_open_llm_v1_240829_frozen.csv +puli_gptrio,HFv1 MMLU,24.73,,hf_open_llm_v1_240829_frozen.csv +puli_gptrio,HFv1 TruthfulQA,39.03,,hf_open_llm_v1_240829_frozen.csv +puli_gptrio,HFv1 Winogrande,57.77,,hf_open_llm_v1_240829_frozen.csv +pythia_12b,HF OpenLLM v1,38.82,,hf_open_llm_v1_240829_frozen.csv +pythia_12b,HFv1 ARC,39.59,,hf_open_llm_v1_240829_frozen.csv +pythia_12b,HFv1 GSM8K,1.74,,hf_open_llm_v1_240829_frozen.csv +pythia_12b,HFv1 HellaSwag,68.82,,hf_open_llm_v1_240829_frozen.csv +pythia_12b,HFv1 MMLU,26.76,,hf_open_llm_v1_240829_frozen.csv +pythia_12b,HFv1 TruthfulQA,31.85,,hf_open_llm_v1_240829_frozen.csv +pythia_12b,HFv1 Winogrande,64.17,,hf_open_llm_v1_240829_frozen.csv +pythia_12b_deduped,HF OpenLLM v1,39.7,,hf_open_llm_v1_240829_frozen.csv +pythia_12b_deduped,HFv1 ARC,41.38,,hf_open_llm_v1_240829_frozen.csv +pythia_12b_deduped,HFv1 GSM8K,1.44,,hf_open_llm_v1_240829_frozen.csv +pythia_12b_deduped,HFv1 HellaSwag,70.26,,hf_open_llm_v1_240829_frozen.csv +pythia_12b_deduped,HFv1 MMLU,25.63,,hf_open_llm_v1_240829_frozen.csv +pythia_12b_deduped,HFv1 TruthfulQA,33.0,,hf_open_llm_v1_240829_frozen.csv +pythia_12b_deduped,HFv1 Winogrande,66.46,,hf_open_llm_v1_240829_frozen.csv +pythia_160m,HF OpenLLM v1,29.02,,hf_open_llm_v1_240829_frozen.csv +pythia_160m,HFv1 ARC,22.78,,hf_open_llm_v1_240829_frozen.csv +pythia_160m,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +pythia_160m,HFv1 HellaSwag,30.34,,hf_open_llm_v1_240829_frozen.csv +pythia_160m,HFv1 MMLU,24.95,,hf_open_llm_v1_240829_frozen.csv +pythia_160m,HFv1 TruthfulQA,44.26,,hf_open_llm_v1_240829_frozen.csv +pythia_160m,HFv1 Winogrande,51.54,,hf_open_llm_v1_240829_frozen.csv +pythia_160m_deduped,HF OpenLLM v1,29.38,,hf_open_llm_v1_240829_frozen.csv +pythia_160m_deduped,HFv1 ARC,24.06,,hf_open_llm_v1_240829_frozen.csv +pythia_160m_deduped,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +pythia_160m_deduped,HFv1 HellaSwag,31.39,,hf_open_llm_v1_240829_frozen.csv +pythia_160m_deduped,HFv1 MMLU,24.86,,hf_open_llm_v1_240829_frozen.csv +pythia_160m_deduped,HFv1 TruthfulQA,44.34,,hf_open_llm_v1_240829_frozen.csv +pythia_160m_deduped,HFv1 Winogrande,51.38,,hf_open_llm_v1_240829_frozen.csv +pythia_1_3b,HF OpenLLM v1,34.46,,hf_open_llm_v1_240829_frozen.csv +pythia_1_3b,HFv1 ARC,31.14,,hf_open_llm_v1_240829_frozen.csv +pythia_1_3b,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv +pythia_1_3b,HFv1 HellaSwag,51.43,,hf_open_llm_v1_240829_frozen.csv +pythia_1_3b,HFv1 MMLU,26.55,,hf_open_llm_v1_240829_frozen.csv +pythia_1_3b,HFv1 TruthfulQA,39.24,,hf_open_llm_v1_240829_frozen.csv +pythia_1_3b,HFv1 Winogrande,57.38,,hf_open_llm_v1_240829_frozen.csv +pythia_1_4b,HF OpenLLM v1,34.75,,hf_open_llm_v1_240829_frozen.csv +pythia_1_4b,HFv1 ARC,31.48,,hf_open_llm_v1_240829_frozen.csv +pythia_1_4b,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv +pythia_1_4b,HFv1 HellaSwag,52.86,,hf_open_llm_v1_240829_frozen.csv +pythia_1_4b,HFv1 MMLU,25.8,,hf_open_llm_v1_240829_frozen.csv +pythia_1_4b,HFv1 TruthfulQA,38.85,,hf_open_llm_v1_240829_frozen.csv +pythia_1_4b,HFv1 Winogrande,58.01,,hf_open_llm_v1_240829_frozen.csv +pythia_1_4b_deduped,HF OpenLLM v1,35.0,,hf_open_llm_v1_240829_frozen.csv +pythia_1_4b_deduped,HFv1 ARC,32.68,,hf_open_llm_v1_240829_frozen.csv +pythia_1_4b_deduped,HFv1 GSM8K,0.83,,hf_open_llm_v1_240829_frozen.csv +pythia_1_4b_deduped,HFv1 HellaSwag,54.96,,hf_open_llm_v1_240829_frozen.csv +pythia_1_4b_deduped,HFv1 MMLU,25.56,,hf_open_llm_v1_240829_frozen.csv +pythia_1_4b_deduped,HFv1 TruthfulQA,38.66,,hf_open_llm_v1_240829_frozen.csv +pythia_1_4b_deduped,HFv1 Winogrande,57.3,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_deduped,HF OpenLLM v1,32.78,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_deduped,HFv1 ARC,29.1,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_deduped,HFv1 GSM8K,1.14,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_deduped,HFv1 HellaSwag,49.65,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_deduped,HFv1 MMLU,24.27,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_deduped,HFv1 TruthfulQA,38.94,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_deduped,HFv1 Winogrande,53.59,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_dpo,HF OpenLLM v1,32.76,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_dpo,HFv1 ARC,30.12,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_dpo,HFv1 GSM8K,1.67,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_dpo,HFv1 HellaSwag,49.24,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_dpo,HFv1 MMLU,24.24,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_dpo,HFv1 TruthfulQA,37.2,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_dpo,HFv1 Winogrande,54.06,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_dpo_full,HF OpenLLM v1,32.55,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_dpo_full,HFv1 ARC,29.44,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_dpo_full,HFv1 GSM8K,1.97,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_dpo_full,HFv1 HellaSwag,49.03,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_dpo_full,HFv1 MMLU,24.13,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_dpo_full,HFv1 TruthfulQA,37.27,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_dpo_full,HFv1 Winogrande,53.43,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_spin_iter1,HF OpenLLM v1,32.85,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_spin_iter1,HFv1 ARC,30.55,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_spin_iter1,HFv1 GSM8K,2.35,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_spin_iter1,HFv1 HellaSwag,49.26,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_spin_iter1,HFv1 MMLU,24.46,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_spin_iter1,HFv1 TruthfulQA,36.89,,hf_open_llm_v1_240829_frozen.csv +pythia_1b_spin_iter1,HFv1 Winogrande,53.59,,hf_open_llm_v1_240829_frozen.csv +pythia_2_7b,HF OpenLLM v1,37.09,,hf_open_llm_v1_240829_frozen.csv +pythia_2_7b,HFv1 ARC,37.37,,hf_open_llm_v1_240829_frozen.csv +pythia_2_7b,HFv1 GSM8K,1.06,,hf_open_llm_v1_240829_frozen.csv +pythia_2_7b,HFv1 HellaSwag,60.74,,hf_open_llm_v1_240829_frozen.csv +pythia_2_7b,HFv1 MMLU,25.86,,hf_open_llm_v1_240829_frozen.csv +pythia_2_7b,HFv1 TruthfulQA,35.4,,hf_open_llm_v1_240829_frozen.csv +pythia_2_7b,HFv1 Winogrande,62.12,,hf_open_llm_v1_240829_frozen.csv +pythia_2_8b_deduped,HF OpenLLM v1,36.72,,hf_open_llm_v1_240829_frozen.csv +pythia_2_8b_deduped,HFv1 ARC,36.26,,hf_open_llm_v1_240829_frozen.csv +pythia_2_8b_deduped,HFv1 GSM8K,0.83,,hf_open_llm_v1_240829_frozen.csv +pythia_2_8b_deduped,HFv1 HellaSwag,60.66,,hf_open_llm_v1_240829_frozen.csv +pythia_2_8b_deduped,HFv1 MMLU,26.78,,hf_open_llm_v1_240829_frozen.csv +pythia_2_8b_deduped,HFv1 TruthfulQA,35.56,,hf_open_llm_v1_240829_frozen.csv +pythia_2_8b_deduped,HFv1 Winogrande,60.22,,hf_open_llm_v1_240829_frozen.csv +pythia_31m,HF OpenLLM v1,28.81,,hf_open_llm_v1_240829_frozen.csv +pythia_31m,HFv1 ARC,21.84,,hf_open_llm_v1_240829_frozen.csv +pythia_31m,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +pythia_31m,HFv1 HellaSwag,27.0,,hf_open_llm_v1_240829_frozen.csv +pythia_31m,HFv1 MMLU,24.97,,hf_open_llm_v1_240829_frozen.csv +pythia_31m,HFv1 TruthfulQA,50.12,,hf_open_llm_v1_240829_frozen.csv +pythia_31m,HFv1 Winogrande,49.72,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_chat_v1,HF OpenLLM v1,28.59,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_chat_v1,HFv1 ARC,21.84,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_chat_v1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_chat_v1,HFv1 HellaSwag,26.81,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_chat_v1,HFv1 MMLU,24.55,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_chat_v1,HFv1 TruthfulQA,48.04,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_chat_v1,HFv1 Winogrande,50.28,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_goodwiki_deduped_2048_scratch,HF OpenLLM v1,28.85,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_goodwiki_deduped_2048_scratch,HFv1 ARC,23.12,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_goodwiki_deduped_2048_scratch,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_goodwiki_deduped_2048_scratch,HFv1 HellaSwag,25.66,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_goodwiki_deduped_2048_scratch,HFv1 MMLU,23.11,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_goodwiki_deduped_2048_scratch,HFv1 TruthfulQA,51.32,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_goodwiki_deduped_2048_scratch,HFv1 Winogrande,49.88,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_ki_v1_2048_scratch,HF OpenLLM v1,29.15,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_ki_v1_2048_scratch,HFv1 ARC,23.12,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_ki_v1_2048_scratch,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_ki_v1_2048_scratch,HFv1 HellaSwag,25.23,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_ki_v1_2048_scratch,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_ki_v1_2048_scratch,HFv1 TruthfulQA,51.67,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_ki_v1_2048_scratch,HFv1 Winogrande,51.78,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplepile_lite_2048_scratch_2e,HF OpenLLM v1,28.6,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplepile_lite_2048_scratch_2e,HFv1 ARC,21.59,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplepile_lite_2048_scratch_2e,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplepile_lite_2048_scratch_2e,HFv1 HellaSwag,25.79,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplepile_lite_2048_scratch_2e,HFv1 MMLU,24.99,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplepile_lite_2048_scratch_2e,HFv1 TruthfulQA,50.62,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplepile_lite_2048_scratch_2e,HFv1 Winogrande,48.62,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplewiki_2048,HF OpenLLM v1,28.27,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplewiki_2048,HFv1 ARC,22.18,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplewiki_2048,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplewiki_2048,HFv1 HellaSwag,25.55,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplewiki_2048,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplewiki_2048,HFv1 TruthfulQA,49.37,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplewiki_2048,HFv1 Winogrande,49.41,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplewiki_scratch_bf16,HF OpenLLM v1,28.61,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplewiki_scratch_bf16,HFv1 ARC,22.78,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplewiki_scratch_bf16,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplewiki_scratch_bf16,HFv1 HellaSwag,25.61,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplewiki_scratch_bf16,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplewiki_scratch_bf16,HFv1 TruthfulQA,49.65,,hf_open_llm_v1_240829_frozen.csv +pythia_31m_simplewiki_scratch_bf16,HFv1 Winogrande,50.51,,hf_open_llm_v1_240829_frozen.csv +pythia_410m_deduped,HF OpenLLM v1,31.29,,hf_open_llm_v1_240829_frozen.csv +pythia_410m_deduped,HFv1 ARC,24.83,,hf_open_llm_v1_240829_frozen.csv +pythia_410m_deduped,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +pythia_410m_deduped,HFv1 HellaSwag,41.29,,hf_open_llm_v1_240829_frozen.csv +pythia_410m_deduped,HFv1 MMLU,25.99,,hf_open_llm_v1_240829_frozen.csv +pythia_410m_deduped,HFv1 TruthfulQA,40.95,,hf_open_llm_v1_240829_frozen.csv +pythia_410m_deduped,HFv1 Winogrande,54.38,,hf_open_llm_v1_240829_frozen.csv +pythia_6_7b,HF OpenLLM v1,38.06,,hf_open_llm_v1_240829_frozen.csv +pythia_6_7b,HFv1 ARC,40.1,,hf_open_llm_v1_240829_frozen.csv +pythia_6_7b,HFv1 GSM8K,1.06,,hf_open_llm_v1_240829_frozen.csv +pythia_6_7b,HFv1 HellaSwag,65.0,,hf_open_llm_v1_240829_frozen.csv +pythia_6_7b,HFv1 MMLU,24.64,,hf_open_llm_v1_240829_frozen.csv +pythia_6_7b,HFv1 TruthfulQA,32.85,,hf_open_llm_v1_240829_frozen.csv +pythia_6_7b,HFv1 Winogrande,64.72,,hf_open_llm_v1_240829_frozen.csv +pythia_6_9b_deduped,HF OpenLLM v1,39.3,,hf_open_llm_v1_240829_frozen.csv +pythia_6_9b_deduped,HFv1 ARC,41.3,,hf_open_llm_v1_240829_frozen.csv +pythia_6_9b_deduped,HFv1 GSM8K,1.67,,hf_open_llm_v1_240829_frozen.csv +pythia_6_9b_deduped,HFv1 HellaSwag,67.05,,hf_open_llm_v1_240829_frozen.csv +pythia_6_9b_deduped,HFv1 MMLU,26.48,,hf_open_llm_v1_240829_frozen.csv +pythia_6_9b_deduped,HFv1 TruthfulQA,35.19,,hf_open_llm_v1_240829_frozen.csv +pythia_6_9b_deduped,HFv1 Winogrande,64.09,,hf_open_llm_v1_240829_frozen.csv +pythia_70m,HF OpenLLM v1,28.93,,hf_open_llm_v1_240829_frozen.csv +pythia_70m,HFv1 ARC,21.59,,hf_open_llm_v1_240829_frozen.csv +pythia_70m,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +pythia_70m,HFv1 HellaSwag,27.29,,hf_open_llm_v1_240829_frozen.csv +pythia_70m,HFv1 MMLU,25.9,,hf_open_llm_v1_240829_frozen.csv +pythia_70m,HFv1 TruthfulQA,47.06,,hf_open_llm_v1_240829_frozen.csv +pythia_70m,HFv1 Winogrande,51.46,,hf_open_llm_v1_240829_frozen.csv +pythia_70m_deduped,HF OpenLLM v1,28.44,,hf_open_llm_v1_240829_frozen.csv +pythia_70m_deduped,HFv1 ARC,21.08,,hf_open_llm_v1_240829_frozen.csv +pythia_70m_deduped,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +pythia_70m_deduped,HFv1 HellaSwag,27.17,,hf_open_llm_v1_240829_frozen.csv +pythia_70m_deduped,HFv1 MMLU,25.26,,hf_open_llm_v1_240829_frozen.csv +pythia_70m_deduped,HFv1 TruthfulQA,47.51,,hf_open_llm_v1_240829_frozen.csv +pythia_70m_deduped,HFv1 Winogrande,49.64,,hf_open_llm_v1_240829_frozen.csv +quan_1_8b_base,HF OpenLLM v1,43.35,,hf_open_llm_v1_240829_frozen.csv +quan_1_8b_base,HFv1 ARC,36.95,,hf_open_llm_v1_240829_frozen.csv +quan_1_8b_base,HFv1 GSM8K,19.71,,hf_open_llm_v1_240829_frozen.csv +quan_1_8b_base,HFv1 HellaSwag,58.46,,hf_open_llm_v1_240829_frozen.csv +quan_1_8b_base,HFv1 MMLU,45.44,,hf_open_llm_v1_240829_frozen.csv +quan_1_8b_base,HFv1 TruthfulQA,41.6,,hf_open_llm_v1_240829_frozen.csv +quan_1_8b_base,HFv1 Winogrande,57.93,,hf_open_llm_v1_240829_frozen.csv +quan_1_8b_chat,HF OpenLLM v1,45.91,,hf_open_llm_v1_240829_frozen.csv +quan_1_8b_chat,HFv1 ARC,39.08,,hf_open_llm_v1_240829_frozen.csv +quan_1_8b_chat,HFv1 GSM8K,27.52,,hf_open_llm_v1_240829_frozen.csv +quan_1_8b_chat,HFv1 HellaSwag,62.37,,hf_open_llm_v1_240829_frozen.csv +quan_1_8b_chat,HFv1 MMLU,44.09,,hf_open_llm_v1_240829_frozen.csv +quan_1_8b_chat,HFv1 TruthfulQA,43.15,,hf_open_llm_v1_240829_frozen.csv +quan_1_8b_chat,HFv1 Winogrande,59.27,,hf_open_llm_v1_240829_frozen.csv +quark_464m_v0_2,HF OpenLLM v1,35.12,,hf_open_llm_v1_240829_frozen.csv +quark_464m_v0_2,HFv1 ARC,30.46,,hf_open_llm_v1_240829_frozen.csv +quark_464m_v0_2,HFv1 GSM8K,4.47,,hf_open_llm_v1_240829_frozen.csv +quark_464m_v0_2,HFv1 HellaSwag,44.96,,hf_open_llm_v1_240829_frozen.csv +quark_464m_v0_2,HFv1 MMLU,31.29,,hf_open_llm_v1_240829_frozen.csv +quark_464m_v0_2,HFv1 TruthfulQA,43.89,,hf_open_llm_v1_240829_frozen.csv +quark_464m_v0_2,HFv1 Winogrande,55.64,,hf_open_llm_v1_240829_frozen.csv +quyen_mini_v0_1,HF OpenLLM v1,46.14,,hf_open_llm_v1_240829_frozen.csv +quyen_mini_v0_1,HFv1 ARC,39.33,,hf_open_llm_v1_240829_frozen.csv +quyen_mini_v0_1,HFv1 GSM8K,27.45,,hf_open_llm_v1_240829_frozen.csv +quyen_mini_v0_1,HFv1 HellaSwag,60.57,,hf_open_llm_v1_240829_frozen.csv +quyen_mini_v0_1,HFv1 MMLU,43.93,,hf_open_llm_v1_240829_frozen.csv +quyen_mini_v0_1,HFv1 TruthfulQA,46.44,,hf_open_llm_v1_240829_frozen.csv +quyen_mini_v0_1,HFv1 Winogrande,59.12,,hf_open_llm_v1_240829_frozen.csv +quyen_plus_v0_1,HF OpenLLM v1,63.27,,hf_open_llm_v1_240829_frozen.csv +quyen_plus_v0_1,HFv1 ARC,55.72,,hf_open_llm_v1_240829_frozen.csv +quyen_plus_v0_1,HFv1 GSM8K,60.05,,hf_open_llm_v1_240829_frozen.csv +quyen_plus_v0_1,HFv1 HellaSwag,78.52,,hf_open_llm_v1_240829_frozen.csv +quyen_plus_v0_1,HFv1 MMLU,60.45,,hf_open_llm_v1_240829_frozen.csv +quyen_plus_v0_1,HFv1 TruthfulQA,53.6,,hf_open_llm_v1_240829_frozen.csv +quyen_plus_v0_1,HFv1 Winogrande,71.27,,hf_open_llm_v1_240829_frozen.csv +quyen_pro_v0_1,HF OpenLLM v1,68.6,,hf_open_llm_v1_240829_frozen.csv +quyen_pro_v0_1,HFv1 ARC,59.3,,hf_open_llm_v1_240829_frozen.csv +quyen_pro_v0_1,HFv1 GSM8K,71.04,,hf_open_llm_v1_240829_frozen.csv +quyen_pro_v0_1,HFv1 HellaSwag,81.07,,hf_open_llm_v1_240829_frozen.csv +quyen_pro_v0_1,HFv1 MMLU,68.44,,hf_open_llm_v1_240829_frozen.csv +quyen_pro_v0_1,HFv1 TruthfulQA,55.85,,hf_open_llm_v1_240829_frozen.csv +quyen_pro_v0_1,HFv1 Winogrande,75.93,,hf_open_llm_v1_240829_frozen.csv +quyen_v0_1,HF OpenLLM v1,56.02,,hf_open_llm_v1_240829_frozen.csv +quyen_v0_1,HFv1 ARC,48.21,,hf_open_llm_v1_240829_frozen.csv +quyen_v0_1,HFv1 GSM8K,45.87,,hf_open_llm_v1_240829_frozen.csv +quyen_v0_1,HFv1 HellaSwag,72.49,,hf_open_llm_v1_240829_frozen.csv +quyen_v0_1,HFv1 MMLU,52.88,,hf_open_llm_v1_240829_frozen.csv +quyen_v0_1,HFv1 TruthfulQA,51.53,,hf_open_llm_v1_240829_frozen.csv +quyen_v0_1,HFv1 Winogrande,65.11,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_0_5b,HF OpenLLM v1,38.62,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_0_5b,HFv1 ARC,31.48,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_0_5b,HFv1 GSM8K,16.3,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_0_5b,HFv1 HellaSwag,49.05,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_0_5b,HFv1 MMLU,39.35,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_0_5b,HFv1 TruthfulQA,38.3,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_0_5b,HFv1 Winogrande,57.22,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_0_5b_chat,HF OpenLLM v1,35.61,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_0_5b_chat,HFv1 ARC,30.55,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_0_5b_chat,HFv1 GSM8K,7.66,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_0_5b_chat,HFv1 HellaSwag,44.07,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_0_5b_chat,HFv1 MMLU,33.82,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_0_5b_chat,HFv1 TruthfulQA,42.95,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_0_5b_chat,HFv1 Winogrande,54.62,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_110b,HF OpenLLM v1,75.42,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_110b,HFv1 ARC,69.97,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_110b,HFv1 GSM8K,81.05,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_110b,HFv1 HellaSwag,87.48,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_110b,HFv1 MMLU,80.2,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_110b,HFv1 TruthfulQA,49.66,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_110b,HFv1 Winogrande,84.14,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_110b_chat,HF OpenLLM v1,68.01,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_110b_chat,HFv1 ARC,72.01,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_110b_chat,HFv1 GSM8K,30.1,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_110b_chat,HFv1 HellaSwag,84.67,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_110b_chat,HFv1 MMLU,78.04,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_110b_chat,HFv1 TruthfulQA,65.86,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_110b_chat,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_14b,HF OpenLLM v1,66.7,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_14b,HFv1 ARC,56.57,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_14b,HFv1 GSM8K,67.63,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_14b,HFv1 HellaSwag,81.08,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_14b,HFv1 MMLU,69.36,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_14b,HFv1 TruthfulQA,52.06,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_14b,HFv1 Winogrande,73.48,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_14b_chat,HF OpenLLM v1,62.27,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_14b_chat,HFv1 ARC,58.7,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_14b_chat,HFv1 GSM8K,30.63,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_14b_chat,HFv1 HellaSwag,82.27,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_14b_chat,HFv1 MMLU,68.57,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_14b_chat,HFv1 TruthfulQA,60.36,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_14b_chat,HFv1 Winogrande,73.09,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_1_8b,HF OpenLLM v1,46.55,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_1_8b,HFv1 ARC,37.88,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_1_8b,HFv1 GSM8K,33.59,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_1_8b,HFv1 HellaSwag,61.42,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_1_8b,HFv1 MMLU,46.71,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_1_8b,HFv1 TruthfulQA,39.43,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_1_8b,HFv1 Winogrande,60.3,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_1_8b_chat,HF OpenLLM v1,43.99,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_1_8b_chat,HFv1 ARC,38.74,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_1_8b_chat,HFv1 GSM8K,19.03,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_1_8b_chat,HFv1 HellaSwag,60.02,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_1_8b_chat,HFv1 MMLU,45.87,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_1_8b_chat,HFv1 TruthfulQA,40.62,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_1_8b_chat,HFv1 Winogrande,59.67,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_32b,HF OpenLLM v1,70.47,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_32b,HFv1 ARC,63.57,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_32b,HFv1 GSM8K,61.11,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_32b,HFv1 HellaSwag,85.0,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_32b,HFv1 MMLU,74.31,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_32b,HFv1 TruthfulQA,57.43,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_32b,HFv1 Winogrande,81.45,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_32b_chat,HF OpenLLM v1,62.95,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_32b_chat,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_32b_chat,HFv1 GSM8K,7.05,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_32b_chat,HFv1 HellaSwag,85.49,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_32b_chat,HFv1 MMLU,74.99,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_32b_chat,HFv1 TruthfulQA,66.95,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_32b_chat,HFv1 Winogrande,77.19,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_4b,HF OpenLLM v1,57.05,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_4b,HFv1 ARC,48.46,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_4b,HFv1 GSM8K,52.24,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_4b,HFv1 HellaSwag,71.58,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_4b,HFv1 MMLU,56.52,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_4b,HFv1 TruthfulQA,47.27,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_4b,HFv1 Winogrande,66.22,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_4b_chat,HF OpenLLM v1,46.79,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_4b_chat,HFv1 ARC,43.26,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_4b_chat,HFv1 GSM8K,2.43,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_4b_chat,HFv1 HellaSwag,69.73,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_4b_chat,HFv1 MMLU,55.55,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_4b_chat,HFv1 TruthfulQA,44.79,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_4b_chat,HFv1 Winogrande,64.96,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_72b,HF OpenLLM v1,72.91,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_72b,HFv1 ARC,65.87,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_72b,HFv1 GSM8K,65.73,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_72b,HFv1 HellaSwag,85.99,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_72b,HFv1 MMLU,77.2,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_72b,HFv1 TruthfulQA,59.61,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_72b,HFv1 Winogrande,83.03,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_72b_chat,HF OpenLLM v1,65.98,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_72b_chat,HFv1 ARC,68.52,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_72b_chat,HFv1 GSM8K,20.92,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_72b_chat,HFv1 HellaSwag,86.47,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_72b_chat,HFv1 MMLU,77.46,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_72b_chat,HFv1 TruthfulQA,63.9,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_72b_chat,HFv1 Winogrande,79.08,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_chat,HF OpenLLM v1,55.15,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_chat,HFv1 ARC,55.89,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_chat,HFv1 GSM8K,13.57,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_chat,HFv1 HellaSwag,78.56,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_chat,HFv1 MMLU,61.7,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_chat,HFv1 TruthfulQA,53.65,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_chat,HFv1 Winogrande,67.8,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_chat_llamafy,HF OpenLLM v1,56.0,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_chat_llamafy,HFv1 ARC,57.59,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_chat_llamafy,HFv1 GSM8K,14.63,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_chat_llamafy,HFv1 HellaSwag,78.52,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_chat_llamafy,HFv1 MMLU,61.18,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_chat_llamafy,HFv1 TruthfulQA,57.59,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_chat_llamafy,HFv1 Winogrande,66.46,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat,HF OpenLLM v1,53.66,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat,HFv1 ARC,53.92,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat,HFv1 GSM8K,15.47,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat,HFv1 HellaSwag,76.03,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat,HFv1 MMLU,62.38,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat,HFv1 TruthfulQA,45.34,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat,HFv1 Winogrande,68.82,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_dpo,HF OpenLLM v1,53.94,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_dpo,HFv1 ARC,50.77,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_dpo,HFv1 GSM8K,27.45,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_dpo,HFv1 HellaSwag,74.24,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_dpo,HFv1 MMLU,60.7,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_dpo,HFv1 TruthfulQA,42.37,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_dpo,HFv1 Winogrande,68.11,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_sft,HF OpenLLM v1,54.44,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_sft,HFv1 ARC,50.68,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_sft,HFv1 GSM8K,29.34,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_sft,HFv1 HellaSwag,73.49,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_sft,HFv1 MMLU,60.47,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_sft,HFv1 TruthfulQA,43.89,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_sft,HFv1 Winogrande,68.75,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_sft_bf16,HF OpenLLM v1,54.91,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_sft_bf16,HFv1 ARC,54.27,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_sft_bf16,HFv1 GSM8K,21.76,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_sft_bf16,HFv1 HellaSwag,75.53,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_sft_bf16,HFv1 MMLU,61.98,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_sft_bf16,HFv1 TruthfulQA,47.26,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_7b_dutch_chat_sft_bf16,HFv1 Winogrande,68.67,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_moe_a2_7b,HF OpenLLM v1,56.03,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_moe_a2_7b,HFv1 ARC,54.86,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_moe_a2_7b,HFv1 GSM8K,16.98,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_moe_a2_7b,HFv1 HellaSwag,79.39,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_moe_a2_7b,HFv1 MMLU,62.54,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_moe_a2_7b,HFv1 TruthfulQA,50.09,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_moe_a2_7b,HFv1 Winogrande,72.3,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_moe_a2_7b_chat,HF OpenLLM v1,57.22,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_moe_a2_7b_chat,HFv1 ARC,53.67,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_moe_a2_7b_chat,HFv1 GSM8K,28.2,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_moe_a2_7b_chat,HFv1 HellaSwag,80.54,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_moe_a2_7b_chat,HFv1 MMLU,60.97,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_moe_a2_7b_chat,HFv1 TruthfulQA,50.56,,hf_open_llm_v1_240829_frozen.csv +qwen1_5_moe_a2_7b_chat,HFv1 Winogrande,69.38,,hf_open_llm_v1_240829_frozen.csv +qwen2_beta_14b,HF OpenLLM v1,66.7,,hf_open_llm_v1_240829_frozen.csv +qwen2_beta_14b,HFv1 ARC,56.57,,hf_open_llm_v1_240829_frozen.csv +qwen2_beta_14b,HFv1 GSM8K,67.63,,hf_open_llm_v1_240829_frozen.csv +qwen2_beta_14b,HFv1 HellaSwag,81.08,,hf_open_llm_v1_240829_frozen.csv +qwen2_beta_14b,HFv1 MMLU,69.36,,hf_open_llm_v1_240829_frozen.csv +qwen2_beta_14b,HFv1 TruthfulQA,52.06,,hf_open_llm_v1_240829_frozen.csv +qwen2_beta_14b,HFv1 Winogrande,73.48,,hf_open_llm_v1_240829_frozen.csv +qwen2_beta_72b,HF OpenLLM v1,72.91,,hf_open_llm_v1_240829_frozen.csv +qwen2_beta_72b,HFv1 ARC,65.87,,hf_open_llm_v1_240829_frozen.csv +qwen2_beta_72b,HFv1 GSM8K,65.73,,hf_open_llm_v1_240829_frozen.csv +qwen2_beta_72b,HFv1 HellaSwag,85.99,,hf_open_llm_v1_240829_frozen.csv +qwen2_beta_72b,HFv1 MMLU,77.2,,hf_open_llm_v1_240829_frozen.csv +qwen2_beta_72b,HFv1 TruthfulQA,59.61,,hf_open_llm_v1_240829_frozen.csv +qwen2_beta_72b,HFv1 Winogrande,83.03,,hf_open_llm_v1_240829_frozen.csv +qwen_14b,HF OpenLLM v1,65.86,,hf_open_llm_v1_240829_frozen.csv +qwen_14b,HFv1 ARC,58.28,,hf_open_llm_v1_240829_frozen.csv +qwen_14b,HFv1 GSM8K,58.98,,hf_open_llm_v1_240829_frozen.csv +qwen_14b,HFv1 HellaSwag,83.99,,hf_open_llm_v1_240829_frozen.csv +qwen_14b,HFv1 MMLU,67.7,,hf_open_llm_v1_240829_frozen.csv +qwen_14b,HFv1 TruthfulQA,49.43,,hf_open_llm_v1_240829_frozen.csv +qwen_14b,HFv1 Winogrande,76.8,,hf_open_llm_v1_240829_frozen.csv +qwen_14b_llamafied,HF OpenLLM v1,63.09,,hf_open_llm_v1_240829_frozen.csv +qwen_14b_llamafied,HFv1 ARC,55.2,,hf_open_llm_v1_240829_frozen.csv +qwen_14b_llamafied,HFv1 GSM8K,52.77,,hf_open_llm_v1_240829_frozen.csv +qwen_14b_llamafied,HFv1 HellaSwag,82.31,,hf_open_llm_v1_240829_frozen.csv +qwen_14b_llamafied,HFv1 MMLU,66.11,,hf_open_llm_v1_240829_frozen.csv +qwen_14b_llamafied,HFv1 TruthfulQA,45.6,,hf_open_llm_v1_240829_frozen.csv +qwen_14b_llamafied,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_chat_llama,HF OpenLLM v1,42.94,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_chat_llama,HFv1 ARC,36.95,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_chat_llama,HFv1 GSM8K,19.26,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_chat_llama,HFv1 HellaSwag,54.34,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_chat_llama,HFv1 MMLU,44.55,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_chat_llama,HFv1 TruthfulQA,43.7,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_chat_llama,HFv1 Winogrande,58.88,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_everythinglm,HF OpenLLM v1,42.77,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_everythinglm,HFv1 ARC,38.65,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_everythinglm,HFv1 GSM8K,12.74,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_everythinglm,HFv1 HellaSwag,62.66,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_everythinglm,HFv1 MMLU,44.94,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_everythinglm,HFv1 TruthfulQA,38.7,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_everythinglm,HFv1 Winogrande,58.96,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_llamafied,HF OpenLLM v1,44.75,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_llamafied,HFv1 ARC,37.71,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_llamafied,HFv1 GSM8K,24.41,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_llamafied,HFv1 HellaSwag,58.87,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_llamafied,HFv1 MMLU,46.37,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_llamafied,HFv1 TruthfulQA,39.41,,hf_open_llm_v1_240829_frozen.csv +qwen_1_8b_llamafied,HFv1 Winogrande,61.72,,hf_open_llm_v1_240829_frozen.csv +qwen_72b,HF OpenLLM v1,73.6,,hf_open_llm_v1_240829_frozen.csv +qwen_72b,HFv1 ARC,65.19,,hf_open_llm_v1_240829_frozen.csv +qwen_72b,HFv1 GSM8K,70.43,,hf_open_llm_v1_240829_frozen.csv +qwen_72b,HFv1 HellaSwag,85.94,,hf_open_llm_v1_240829_frozen.csv +qwen_72b,HFv1 MMLU,77.37,,hf_open_llm_v1_240829_frozen.csv +qwen_72b,HFv1 TruthfulQA,60.19,,hf_open_llm_v1_240829_frozen.csv +qwen_72b,HFv1 Winogrande,82.48,,hf_open_llm_v1_240829_frozen.csv +qwen_72b_llama,HF OpenLLM v1,69.53,,hf_open_llm_v1_240829_frozen.csv +qwen_72b_llama,HFv1 ARC,64.85,,hf_open_llm_v1_240829_frozen.csv +qwen_72b_llama,HFv1 GSM8K,56.25,,hf_open_llm_v1_240829_frozen.csv +qwen_72b_llama,HFv1 HellaSwag,83.27,,hf_open_llm_v1_240829_frozen.csv +qwen_72b_llama,HFv1 MMLU,73.66,,hf_open_llm_v1_240829_frozen.csv +qwen_72b_llama,HFv1 TruthfulQA,57.6,,hf_open_llm_v1_240829_frozen.csv +qwen_72b_llama,HFv1 Winogrande,81.53,,hf_open_llm_v1_240829_frozen.csv +qwen_7b,HF OpenLLM v1,59.19,,hf_open_llm_v1_240829_frozen.csv +qwen_7b,HFv1 ARC,51.37,,hf_open_llm_v1_240829_frozen.csv +qwen_7b,HFv1 GSM8K,44.96,,hf_open_llm_v1_240829_frozen.csv +qwen_7b,HFv1 HellaSwag,78.47,,hf_open_llm_v1_240829_frozen.csv +qwen_7b,HFv1 MMLU,59.84,,hf_open_llm_v1_240829_frozen.csv +qwen_7b,HFv1 TruthfulQA,47.79,,hf_open_llm_v1_240829_frozen.csv +qwen_7b,HFv1 Winogrande,72.69,,hf_open_llm_v1_240829_frozen.csv +qwen_orpo_v1,HF OpenLLM v1,36.28,,hf_open_llm_v1_240829_frozen.csv +qwen_orpo_v1,HFv1 ARC,31.14,,hf_open_llm_v1_240829_frozen.csv +qwen_orpo_v1,HFv1 GSM8K,8.57,,hf_open_llm_v1_240829_frozen.csv +qwen_orpo_v1,HFv1 HellaSwag,44.58,,hf_open_llm_v1_240829_frozen.csv +qwen_orpo_v1,HFv1 MMLU,33.73,,hf_open_llm_v1_240829_frozen.csv +qwen_orpo_v1,HFv1 TruthfulQA,42.59,,hf_open_llm_v1_240829_frozen.csv +qwen_orpo_v1,HFv1 Winogrande,57.06,,hf_open_llm_v1_240829_frozen.csv +rabbit_7b_dpo_chat,HF OpenLLM v1,69.69,,hf_open_llm_v1_240829_frozen.csv +rabbit_7b_dpo_chat,HFv1 ARC,70.31,,hf_open_llm_v1_240829_frozen.csv +rabbit_7b_dpo_chat,HFv1 GSM8K,58.53,,hf_open_llm_v1_240829_frozen.csv +rabbit_7b_dpo_chat,HFv1 HellaSwag,87.43,,hf_open_llm_v1_240829_frozen.csv +rabbit_7b_dpo_chat,HFv1 MMLU,60.5,,hf_open_llm_v1_240829_frozen.csv +rabbit_7b_dpo_chat,HFv1 TruthfulQA,62.18,,hf_open_llm_v1_240829_frozen.csv +rabbit_7b_dpo_chat,HFv1 Winogrande,79.16,,hf_open_llm_v1_240829_frozen.csv +rabbit_7b_v2_dpo_chat,HF OpenLLM v1,69.36,,hf_open_llm_v1_240829_frozen.csv +rabbit_7b_v2_dpo_chat,HFv1 ARC,66.13,,hf_open_llm_v1_240829_frozen.csv +rabbit_7b_v2_dpo_chat,HFv1 GSM8K,55.65,,hf_open_llm_v1_240829_frozen.csv +rabbit_7b_v2_dpo_chat,HFv1 HellaSwag,85.18,,hf_open_llm_v1_240829_frozen.csv +rabbit_7b_v2_dpo_chat,HFv1 MMLU,62.92,,hf_open_llm_v1_240829_frozen.csv +rabbit_7b_v2_dpo_chat,HFv1 TruthfulQA,67.06,,hf_open_llm_v1_240829_frozen.csv +rabbit_7b_v2_dpo_chat,HFv1 Winogrande,79.24,,hf_open_llm_v1_240829_frozen.csv +raccoon_small,HF OpenLLM v1,74.78,,hf_open_llm_v1_240829_frozen.csv +raccoon_small,HFv1 ARC,74.4,,hf_open_llm_v1_240829_frozen.csv +raccoon_small,HFv1 GSM8K,56.86,,hf_open_llm_v1_240829_frozen.csv +raccoon_small,HFv1 HellaSwag,88.73,,hf_open_llm_v1_240829_frozen.csv +raccoon_small,HFv1 MMLU,64.55,,hf_open_llm_v1_240829_frozen.csv +raccoon_small,HFv1 TruthfulQA,76.74,,hf_open_llm_v1_240829_frozen.csv +raccoon_small,HFv1 Winogrande,87.37,,hf_open_llm_v1_240829_frozen.csv +radiantloom_mixtral_8x7b_fusion,HF OpenLLM v1,65.24,,hf_open_llm_v1_240829_frozen.csv +radiantloom_mixtral_8x7b_fusion,HFv1 ARC,63.48,,hf_open_llm_v1_240829_frozen.csv +radiantloom_mixtral_8x7b_fusion,HFv1 GSM8K,53.45,,hf_open_llm_v1_240829_frozen.csv +radiantloom_mixtral_8x7b_fusion,HFv1 HellaSwag,83.65,,hf_open_llm_v1_240829_frozen.csv +radiantloom_mixtral_8x7b_fusion,HFv1 MMLU,60.03,,hf_open_llm_v1_240829_frozen.csv +radiantloom_mixtral_8x7b_fusion,HFv1 TruthfulQA,54.76,,hf_open_llm_v1_240829_frozen.csv +radiantloom_mixtral_8x7b_fusion,HFv1 Winogrande,76.09,,hf_open_llm_v1_240829_frozen.csv +radintloom_mistral_7b_fusion,HF OpenLLM v1,55.86,,hf_open_llm_v1_240829_frozen.csv +radintloom_mistral_7b_fusion,HFv1 ARC,62.03,,hf_open_llm_v1_240829_frozen.csv +radintloom_mistral_7b_fusion,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +radintloom_mistral_7b_fusion,HFv1 HellaSwag,82.26,,hf_open_llm_v1_240829_frozen.csv +radintloom_mistral_7b_fusion,HFv1 MMLU,63.82,,hf_open_llm_v1_240829_frozen.csv +radintloom_mistral_7b_fusion,HFv1 TruthfulQA,47.19,,hf_open_llm_v1_240829_frozen.csv +radintloom_mistral_7b_fusion,HFv1 Winogrande,79.87,,hf_open_llm_v1_240829_frozen.csv +rain_7b_v0_2,HF OpenLLM v1,59.01,,hf_open_llm_v1_240829_frozen.csv +rain_7b_v0_2,HFv1 ARC,51.54,,hf_open_llm_v1_240829_frozen.csv +rain_7b_v0_2,HFv1 GSM8K,48.75,,hf_open_llm_v1_240829_frozen.csv +rain_7b_v0_2,HFv1 HellaSwag,75.11,,hf_open_llm_v1_240829_frozen.csv +rain_7b_v0_2,HFv1 MMLU,61.51,,hf_open_llm_v1_240829_frozen.csv +rain_7b_v0_2,HFv1 TruthfulQA,46.44,,hf_open_llm_v1_240829_frozen.csv +rain_7b_v0_2,HFv1 Winogrande,70.72,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_7b_v10,HF OpenLLM v1,61.88,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_7b_v10,HFv1 ARC,61.18,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_7b_v10,HFv1 GSM8K,37.0,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_7b_v10,HFv1 HellaSwag,82.33,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_7b_v10,HFv1 MMLU,63.26,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_7b_v10,HFv1 TruthfulQA,49.45,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_7b_v10,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_7b_v9,HF OpenLLM v1,61.42,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_7b_v9,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_7b_v9,HFv1 GSM8K,34.8,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_7b_v9,HFv1 HellaSwag,82.43,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_7b_v9,HFv1 MMLU,63.0,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_7b_v9,HFv1 TruthfulQA,48.82,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_7b_v9,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_v6,HF OpenLLM v1,61.64,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_v6,HFv1 ARC,61.95,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_v6,HFv1 GSM8K,36.32,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_v6,HFv1 HellaSwag,82.51,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_v6,HFv1 MMLU,62.79,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_v6,HFv1 TruthfulQA,48.37,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_v6,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_v7,HF OpenLLM v1,62.18,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_v7,HFv1 ARC,61.95,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_v7,HFv1 GSM8K,37.45,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_v7,HFv1 HellaSwag,82.52,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_v7,HFv1 MMLU,63.26,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_v7,HFv1 TruthfulQA,49.78,,hf_open_llm_v1_240829_frozen.csv +rainbowfish_v7,HFv1 Winogrande,78.14,,hf_open_llm_v1_240829_frozen.csv +rasgulla1_7b,HF OpenLLM v1,73.0,,hf_open_llm_v1_240829_frozen.csv +rasgulla1_7b,HFv1 ARC,69.71,,hf_open_llm_v1_240829_frozen.csv +rasgulla1_7b,HFv1 GSM8K,71.72,,hf_open_llm_v1_240829_frozen.csv +rasgulla1_7b,HFv1 HellaSwag,87.4,,hf_open_llm_v1_240829_frozen.csv +rasgulla1_7b,HFv1 MMLU,64.94,,hf_open_llm_v1_240829_frozen.csv +rasgulla1_7b,HFv1 TruthfulQA,63.31,,hf_open_llm_v1_240829_frozen.csv +rasgulla1_7b,HFv1 Winogrande,80.9,,hf_open_llm_v1_240829_frozen.csv +recurrentgemma_2b,HF OpenLLM v1,45.46,,hf_open_llm_v1_240829_frozen.csv +recurrentgemma_2b,HFv1 ARC,46.93,,hf_open_llm_v1_240829_frozen.csv +recurrentgemma_2b,HFv1 GSM8K,16.15,,hf_open_llm_v1_240829_frozen.csv +recurrentgemma_2b,HFv1 HellaSwag,72.48,,hf_open_llm_v1_240829_frozen.csv +recurrentgemma_2b,HFv1 MMLU,34.61,,hf_open_llm_v1_240829_frozen.csv +recurrentgemma_2b,HFv1 TruthfulQA,35.1,,hf_open_llm_v1_240829_frozen.csv +recurrentgemma_2b,HFv1 Winogrande,68.51,,hf_open_llm_v1_240829_frozen.csv +recurrentgemma_2b_it,HF OpenLLM v1,40.86,,hf_open_llm_v1_240829_frozen.csv +recurrentgemma_2b_it,HFv1 ARC,30.97,,hf_open_llm_v1_240829_frozen.csv +recurrentgemma_2b_it,HFv1 GSM8K,10.08,,hf_open_llm_v1_240829_frozen.csv +recurrentgemma_2b_it,HFv1 HellaSwag,56.26,,hf_open_llm_v1_240829_frozen.csv +recurrentgemma_2b_it,HFv1 MMLU,40.87,,hf_open_llm_v1_240829_frozen.csv +recurrentgemma_2b_it,HFv1 TruthfulQA,42.81,,hf_open_llm_v1_240829_frozen.csv +recurrentgemma_2b_it,HFv1 Winogrande,64.17,,hf_open_llm_v1_240829_frozen.csv +redmond_puffin_13b_instruct_pl_lora_unload,HF OpenLLM v1,55.0,,hf_open_llm_v1_240829_frozen.csv +redmond_puffin_13b_instruct_pl_lora_unload,HFv1 ARC,60.92,,hf_open_llm_v1_240829_frozen.csv +redmond_puffin_13b_instruct_pl_lora_unload,HFv1 GSM8K,11.07,,hf_open_llm_v1_240829_frozen.csv +redmond_puffin_13b_instruct_pl_lora_unload,HFv1 HellaSwag,82.43,,hf_open_llm_v1_240829_frozen.csv +redmond_puffin_13b_instruct_pl_lora_unload,HFv1 MMLU,55.61,,hf_open_llm_v1_240829_frozen.csv +redmond_puffin_13b_instruct_pl_lora_unload,HFv1 TruthfulQA,44.26,,hf_open_llm_v1_240829_frozen.csv +redmond_puffin_13b_instruct_pl_lora_unload,HFv1 Winogrande,75.69,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_7b_base,HF OpenLLM v1,41.49,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_7b_base,HFv1 ARC,46.25,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_7b_base,HFv1 GSM8K,3.03,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_7b_base,HFv1 HellaSwag,71.63,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_7b_base,HFv1 MMLU,27.68,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_7b_base,HFv1 TruthfulQA,33.03,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_7b_base,HFv1 Winogrande,67.32,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_base_3b_v1,HF OpenLLM v1,38.54,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_base_3b_v1,HFv1 ARC,40.19,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_base_3b_v1,HFv1 GSM8K,1.29,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_base_3b_v1,HFv1 HellaSwag,64.77,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_base_3b_v1,HFv1 MMLU,27.03,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_base_3b_v1,HFv1 TruthfulQA,33.23,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_base_3b_v1,HFv1 Winogrande,64.72,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_base_7b_v0_1,HF OpenLLM v1,41.25,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_base_7b_v0_1,HFv1 ARC,46.25,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_base_7b_v0_1,HFv1 GSM8K,1.59,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_base_7b_v0_1,HFv1 HellaSwag,71.63,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_base_7b_v0_1,HFv1 MMLU,27.68,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_base_7b_v0_1,HFv1 TruthfulQA,33.03,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_base_7b_v0_1,HFv1 Winogrande,67.32,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_chat_3b_v1_rl_lora_8bit_test1,HF OpenLLM v1,39.16,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_chat_3b_v1_rl_lora_8bit_test1,HFv1 ARC,41.3,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_chat_3b_v1_rl_lora_8bit_test1,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_chat_3b_v1_rl_lora_8bit_test1,HFv1 HellaSwag,66.82,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_chat_3b_v1_rl_lora_8bit_test1,HFv1 MMLU,26.1,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_chat_3b_v1_rl_lora_8bit_test1,HFv1 TruthfulQA,35.04,,hf_open_llm_v1_240829_frozen.csv +redpajama_incite_chat_3b_v1_rl_lora_8bit_test1,HFv1 Winogrande,65.43,,hf_open_llm_v1_240829_frozen.csv +remask_3b,HF OpenLLM v1,49.49,,hf_open_llm_v1_240829_frozen.csv +remask_3b,HFv1 ARC,43.77,,hf_open_llm_v1_240829_frozen.csv +remask_3b,HFv1 GSM8K,27.14,,hf_open_llm_v1_240829_frozen.csv +remask_3b,HFv1 HellaSwag,75.7,,hf_open_llm_v1_240829_frozen.csv +remask_3b,HFv1 MMLU,41.82,,hf_open_llm_v1_240829_frozen.csv +remask_3b,HFv1 TruthfulQA,42.13,,hf_open_llm_v1_240829_frozen.csv +remask_3b,HFv1 Winogrande,66.38,,hf_open_llm_v1_240829_frozen.csv +rho_math_1b_v0_1,HF OpenLLM v1,34.99,,hf_open_llm_v1_240829_frozen.csv +rho_math_1b_v0_1,HFv1 ARC,34.3,,hf_open_llm_v1_240829_frozen.csv +rho_math_1b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +rho_math_1b_v0_1,HFv1 HellaSwag,53.34,,hf_open_llm_v1_240829_frozen.csv +rho_math_1b_v0_1,HFv1 MMLU,27.05,,hf_open_llm_v1_240829_frozen.csv +rho_math_1b_v0_1,HFv1 TruthfulQA,35.48,,hf_open_llm_v1_240829_frozen.csv +rho_math_1b_v0_1,HFv1 Winogrande,59.75,,hf_open_llm_v1_240829_frozen.csv +rizla55b,HF OpenLLM v1,60.93,,hf_open_llm_v1_240829_frozen.csv +rizla55b,HFv1 ARC,60.32,,hf_open_llm_v1_240829_frozen.csv +rizla55b,HFv1 GSM8K,26.84,,hf_open_llm_v1_240829_frozen.csv +rizla55b,HFv1 HellaSwag,80.42,,hf_open_llm_v1_240829_frozen.csv +rizla55b,HFv1 MMLU,63.54,,hf_open_llm_v1_240829_frozen.csv +rizla55b,HFv1 TruthfulQA,55.59,,hf_open_llm_v1_240829_frozen.csv +rizla55b,HFv1 Winogrande,78.85,,hf_open_llm_v1_240829_frozen.csv +rizla_17,HF OpenLLM v1,75.67,,hf_open_llm_v1_240829_frozen.csv +rizla_17,HFv1 ARC,73.63,,hf_open_llm_v1_240829_frozen.csv +rizla_17,HFv1 GSM8K,61.49,,hf_open_llm_v1_240829_frozen.csv +rizla_17,HFv1 HellaSwag,89.72,,hf_open_llm_v1_240829_frozen.csv +rizla_17,HFv1 MMLU,64.4,,hf_open_llm_v1_240829_frozen.csv +rizla_17,HFv1 TruthfulQA,76.93,,hf_open_llm_v1_240829_frozen.csv +rizla_17,HFv1 Winogrande,87.85,,hf_open_llm_v1_240829_frozen.csv +rocket_3b,HF OpenLLM v1,55.77,,hf_open_llm_v1_240829_frozen.csv +rocket_3b,HFv1 ARC,50.6,,hf_open_llm_v1_240829_frozen.csv +rocket_3b,HFv1 GSM8K,36.47,,hf_open_llm_v1_240829_frozen.csv +rocket_3b,HFv1 HellaSwag,76.69,,hf_open_llm_v1_240829_frozen.csv +rocket_3b,HFv1 MMLU,47.1,,hf_open_llm_v1_240829_frozen.csv +rocket_3b,HFv1 TruthfulQA,55.82,,hf_open_llm_v1_240829_frozen.csv +rocket_3b,HFv1 Winogrande,67.96,,hf_open_llm_v1_240829_frozen.csv +rolebeagle_11b,HF OpenLLM v1,76.06,,hf_open_llm_v1_240829_frozen.csv +rolebeagle_11b,HFv1 ARC,72.35,,hf_open_llm_v1_240829_frozen.csv +rolebeagle_11b,HFv1 GSM8K,65.88,,hf_open_llm_v1_240829_frozen.csv +rolebeagle_11b,HFv1 HellaSwag,89.77,,hf_open_llm_v1_240829_frozen.csv +rolebeagle_11b,HFv1 MMLU,66.35,,hf_open_llm_v1_240829_frozen.csv +rolebeagle_11b,HFv1 TruthfulQA,77.92,,hf_open_llm_v1_240829_frozen.csv +rolebeagle_11b,HFv1 Winogrande,84.06,,hf_open_llm_v1_240829_frozen.csv +royalmaid_7b_slerp,HF OpenLLM v1,72.75,,hf_open_llm_v1_240829_frozen.csv +royalmaid_7b_slerp,HFv1 ARC,70.39,,hf_open_llm_v1_240829_frozen.csv +royalmaid_7b_slerp,HFv1 GSM8K,67.55,,hf_open_llm_v1_240829_frozen.csv +royalmaid_7b_slerp,HFv1 HellaSwag,87.25,,hf_open_llm_v1_240829_frozen.csv +royalmaid_7b_slerp,HFv1 MMLU,64.72,,hf_open_llm_v1_240829_frozen.csv +royalmaid_7b_slerp,HFv1 TruthfulQA,64.18,,hf_open_llm_v1_240829_frozen.csv +royalmaid_7b_slerp,HFv1 Winogrande,82.4,,hf_open_llm_v1_240829_frozen.csv +royalnoroichi_7b_slerp,HF OpenLLM v1,72.98,,hf_open_llm_v1_240829_frozen.csv +royalnoroichi_7b_slerp,HFv1 ARC,70.48,,hf_open_llm_v1_240829_frozen.csv +royalnoroichi_7b_slerp,HFv1 GSM8K,66.72,,hf_open_llm_v1_240829_frozen.csv +royalnoroichi_7b_slerp,HFv1 HellaSwag,87.38,,hf_open_llm_v1_240829_frozen.csv +royalnoroichi_7b_slerp,HFv1 MMLU,64.78,,hf_open_llm_v1_240829_frozen.csv +royalnoroichi_7b_slerp,HFv1 TruthfulQA,66.28,,hf_open_llm_v1_240829_frozen.csv +royalnoroichi_7b_slerp,HFv1 Winogrande,82.24,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_14b_pile,HF OpenLLM v1,39.92,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_14b_pile,HFv1 ARC,44.45,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_14b_pile,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_14b_pile,HFv1 HellaSwag,71.07,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_14b_pile,HFv1 MMLU,26.12,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_14b_pile,HFv1 TruthfulQA,32.04,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_14b_pile,HFv1 Winogrande,65.43,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_169m_pile,HF OpenLLM v1,28.64,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_169m_pile,HFv1 ARC,23.63,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_169m_pile,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_169m_pile,HFv1 HellaSwag,31.74,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_169m_pile,HFv1 MMLU,23.18,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_169m_pile,HFv1 TruthfulQA,41.92,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_169m_pile,HFv1 Winogrande,50.91,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_1b5_pile,HF OpenLLM v1,33.25,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_1b5_pile,HFv1 ARC,31.83,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_1b5_pile,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_1b5_pile,HFv1 HellaSwag,52.25,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_1b5_pile,HFv1 MMLU,25.77,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_1b5_pile,HFv1 TruthfulQA,35.8,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_1b5_pile,HFv1 Winogrande,53.83,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_3b_pile,HF OpenLLM v1,35.25,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_3b_pile,HFv1 ARC,36.01,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_3b_pile,HFv1 GSM8K,0.68,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_3b_pile,HFv1 HellaSwag,59.66,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_3b_pile,HFv1 MMLU,24.67,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_3b_pile,HFv1 TruthfulQA,32.14,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_3b_pile,HFv1 Winogrande,58.33,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_430m_pile,HF OpenLLM v1,30.45,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_430m_pile,HFv1 ARC,26.71,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_430m_pile,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_430m_pile,HFv1 HellaSwag,40.01,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_430m_pile,HFv1 MMLU,24.85,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_430m_pile,HFv1 TruthfulQA,39.58,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_430m_pile,HFv1 Winogrande,51.14,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_7b_pile,HF OpenLLM v1,37.95,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_7b_pile,HFv1 ARC,39.68,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_7b_pile,HFv1 GSM8K,0.76,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_7b_pile,HFv1 HellaSwag,66.31,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_7b_pile,HFv1 MMLU,24.96,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_7b_pile,HFv1 TruthfulQA,33.65,,hf_open_llm_v1_240829_frozen.csv +rwkv_4_7b_pile,HFv1 Winogrande,62.35,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_1b5,HF OpenLLM v1,33.56,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_1b5,HFv1 ARC,31.83,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_1b5,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_1b5,HFv1 HellaSwag,52.6,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_1b5,HFv1 MMLU,25.96,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_1b5,HFv1 TruthfulQA,37.09,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_1b5,HFv1 Winogrande,53.91,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_3b,HF OpenLLM v1,35.81,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_3b,HFv1 ARC,36.69,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_3b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_3b,HFv1 HellaSwag,59.78,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_3b,HFv1 MMLU,24.87,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_3b,HFv1 TruthfulQA,35.6,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_3b,HFv1 Winogrande,57.46,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_7b,HF OpenLLM v1,38.55,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_7b,HFv1 ARC,39.42,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_7b,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_7b,HFv1 HellaSwag,66.48,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_7b,HFv1 MMLU,23.64,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_7b,HFv1 TruthfulQA,38.56,,hf_open_llm_v1_240829_frozen.csv +rwkv_raven_7b,HFv1 Winogrande,62.9,,hf_open_llm_v1_240829_frozen.csv +sailor_0_5b,HF OpenLLM v1,33.05,,hf_open_llm_v1_240829_frozen.csv +sailor_0_5b,HFv1 ARC,29.69,,hf_open_llm_v1_240829_frozen.csv +sailor_0_5b,HFv1 GSM8K,1.06,,hf_open_llm_v1_240829_frozen.csv +sailor_0_5b,HFv1 HellaSwag,45.82,,hf_open_llm_v1_240829_frozen.csv +sailor_0_5b,HFv1 MMLU,25.62,,hf_open_llm_v1_240829_frozen.csv +sailor_0_5b,HFv1 TruthfulQA,40.76,,hf_open_llm_v1_240829_frozen.csv +sailor_0_5b,HFv1 Winogrande,55.33,,hf_open_llm_v1_240829_frozen.csv +sailor_0_5b_chat,HF OpenLLM v1,33.47,,hf_open_llm_v1_240829_frozen.csv +sailor_0_5b_chat,HFv1 ARC,30.38,,hf_open_llm_v1_240829_frozen.csv +sailor_0_5b_chat,HFv1 GSM8K,1.82,,hf_open_llm_v1_240829_frozen.csv +sailor_0_5b_chat,HFv1 HellaSwag,45.51,,hf_open_llm_v1_240829_frozen.csv +sailor_0_5b_chat,HFv1 MMLU,26.73,,hf_open_llm_v1_240829_frozen.csv +sailor_0_5b_chat,HFv1 TruthfulQA,39.85,,hf_open_llm_v1_240829_frozen.csv +sailor_0_5b_chat,HFv1 Winogrande,56.51,,hf_open_llm_v1_240829_frozen.csv +sailor_1_8b,HF OpenLLM v1,36.59,,hf_open_llm_v1_240829_frozen.csv +sailor_1_8b,HFv1 ARC,33.11,,hf_open_llm_v1_240829_frozen.csv +sailor_1_8b,HFv1 GSM8K,2.73,,hf_open_llm_v1_240829_frozen.csv +sailor_1_8b,HFv1 HellaSwag,57.06,,hf_open_llm_v1_240829_frozen.csv +sailor_1_8b,HFv1 MMLU,30.44,,hf_open_llm_v1_240829_frozen.csv +sailor_1_8b,HFv1 TruthfulQA,37.81,,hf_open_llm_v1_240829_frozen.csv +sailor_1_8b,HFv1 Winogrande,58.41,,hf_open_llm_v1_240829_frozen.csv +sailor_1_8b_chat,HF OpenLLM v1,38.76,,hf_open_llm_v1_240829_frozen.csv +sailor_1_8b_chat,HFv1 ARC,35.75,,hf_open_llm_v1_240829_frozen.csv +sailor_1_8b_chat,HFv1 GSM8K,3.56,,hf_open_llm_v1_240829_frozen.csv +sailor_1_8b_chat,HFv1 HellaSwag,57.12,,hf_open_llm_v1_240829_frozen.csv +sailor_1_8b_chat,HFv1 MMLU,38.31,,hf_open_llm_v1_240829_frozen.csv +sailor_1_8b_chat,HFv1 TruthfulQA,38.71,,hf_open_llm_v1_240829_frozen.csv +sailor_1_8b_chat,HFv1 Winogrande,59.12,,hf_open_llm_v1_240829_frozen.csv +sailor_4b,HF OpenLLM v1,44.19,,hf_open_llm_v1_240829_frozen.csv +sailor_4b,HFv1 ARC,44.45,,hf_open_llm_v1_240829_frozen.csv +sailor_4b,HFv1 GSM8K,9.1,,hf_open_llm_v1_240829_frozen.csv +sailor_4b,HFv1 HellaSwag,69.53,,hf_open_llm_v1_240829_frozen.csv +sailor_4b,HFv1 MMLU,38.99,,hf_open_llm_v1_240829_frozen.csv +sailor_4b,HFv1 TruthfulQA,37.02,,hf_open_llm_v1_240829_frozen.csv +sailor_4b,HFv1 Winogrande,66.06,,hf_open_llm_v1_240829_frozen.csv +sailor_4b_chat,HF OpenLLM v1,45.8,,hf_open_llm_v1_240829_frozen.csv +sailor_4b_chat,HFv1 ARC,45.05,,hf_open_llm_v1_240829_frozen.csv +sailor_4b_chat,HFv1 GSM8K,9.1,,hf_open_llm_v1_240829_frozen.csv +sailor_4b_chat,HFv1 HellaSwag,68.36,,hf_open_llm_v1_240829_frozen.csv +sailor_4b_chat,HFv1 MMLU,43.96,,hf_open_llm_v1_240829_frozen.csv +sailor_4b_chat,HFv1 TruthfulQA,42.09,,hf_open_llm_v1_240829_frozen.csv +sailor_4b_chat,HFv1 Winogrande,66.22,,hf_open_llm_v1_240829_frozen.csv +sailor_7b,HF OpenLLM v1,53.82,,hf_open_llm_v1_240829_frozen.csv +sailor_7b,HFv1 ARC,49.83,,hf_open_llm_v1_240829_frozen.csv +sailor_7b,HFv1 GSM8K,32.52,,hf_open_llm_v1_240829_frozen.csv +sailor_7b,HFv1 HellaSwag,76.21,,hf_open_llm_v1_240829_frozen.csv +sailor_7b,HFv1 MMLU,54.84,,hf_open_llm_v1_240829_frozen.csv +sailor_7b,HFv1 TruthfulQA,40.12,,hf_open_llm_v1_240829_frozen.csv +sailor_7b,HFv1 Winogrande,69.38,,hf_open_llm_v1_240829_frozen.csv +sailor_7b_chat,HF OpenLLM v1,54.81,,hf_open_llm_v1_240829_frozen.csv +sailor_7b_chat,HFv1 ARC,52.3,,hf_open_llm_v1_240829_frozen.csv +sailor_7b_chat,HFv1 GSM8K,30.4,,hf_open_llm_v1_240829_frozen.csv +sailor_7b_chat,HFv1 HellaSwag,75.01,,hf_open_llm_v1_240829_frozen.csv +sailor_7b_chat,HFv1 MMLU,56.24,,hf_open_llm_v1_240829_frozen.csv +sailor_7b_chat,HFv1 TruthfulQA,44.09,,hf_open_llm_v1_240829_frozen.csv +sailor_7b_chat,HFv1 Winogrande,70.8,,hf_open_llm_v1_240829_frozen.csv +sakura_solar_instruct_dpo_v2,HF OpenLLM v1,74.14,,hf_open_llm_v1_240829_frozen.csv +sakura_solar_instruct_dpo_v2,HFv1 ARC,70.9,,hf_open_llm_v1_240829_frozen.csv +sakura_solar_instruct_dpo_v2,HFv1 GSM8K,63.76,,hf_open_llm_v1_240829_frozen.csv +sakura_solar_instruct_dpo_v2,HFv1 HellaSwag,88.41,,hf_open_llm_v1_240829_frozen.csv +sakura_solar_instruct_dpo_v2,HFv1 MMLU,66.48,,hf_open_llm_v1_240829_frozen.csv +sakura_solar_instruct_dpo_v2,HFv1 TruthfulQA,71.86,,hf_open_llm_v1_240829_frozen.csv +sakura_solar_instruct_dpo_v2,HFv1 Winogrande,83.43,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_instruct_dpo,HF OpenLLM v1,74.05,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_instruct_dpo,HFv1 ARC,71.16,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_instruct_dpo,HFv1 GSM8K,63.46,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_instruct_dpo,HFv1 HellaSwag,88.49,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_instruct_dpo,HFv1 MMLU,66.17,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_instruct_dpo,HFv1 TruthfulQA,72.1,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_instruct_dpo,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_math_instruct_dpo_v1,HF OpenLLM v1,74.13,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_math_instruct_dpo_v1,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_math_instruct_dpo_v1,HFv1 GSM8K,63.84,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_math_instruct_dpo_v1,HFv1 HellaSwag,88.48,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_math_instruct_dpo_v1,HFv1 MMLU,66.21,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_math_instruct_dpo_v1,HFv1 TruthfulQA,72.12,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_math_instruct_dpo_v1,HFv1 Winogrande,82.87,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_math_instruct_dpo_v2,HF OpenLLM v1,74.17,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_math_instruct_dpo_v2,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_math_instruct_dpo_v2,HFv1 GSM8K,63.91,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_math_instruct_dpo_v2,HFv1 HellaSwag,88.52,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_math_instruct_dpo_v2,HFv1 MMLU,66.13,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_math_instruct_dpo_v2,HFv1 TruthfulQA,72.16,,hf_open_llm_v1_240829_frozen.csv +sakura_solrca_math_instruct_dpo_v2,HFv1 Winogrande,83.03,,hf_open_llm_v1_240829_frozen.csv +samantha_1_11_70b,HF OpenLLM v1,67.28,,hf_open_llm_v1_240829_frozen.csv +samantha_1_11_70b,HFv1 ARC,70.05,,hf_open_llm_v1_240829_frozen.csv +samantha_1_11_70b,HFv1 GSM8K,29.95,,hf_open_llm_v1_240829_frozen.csv +samantha_1_11_70b,HFv1 HellaSwag,87.55,,hf_open_llm_v1_240829_frozen.csv +samantha_1_11_70b,HFv1 MMLU,67.82,,hf_open_llm_v1_240829_frozen.csv +samantha_1_11_70b,HFv1 TruthfulQA,65.02,,hf_open_llm_v1_240829_frozen.csv +samantha_1_11_70b,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv +samantha_1_11_7b,HF OpenLLM v1,51.07,,hf_open_llm_v1_240829_frozen.csv +samantha_1_11_7b,HFv1 ARC,55.03,,hf_open_llm_v1_240829_frozen.csv +samantha_1_11_7b,HFv1 GSM8K,7.2,,hf_open_llm_v1_240829_frozen.csv +samantha_1_11_7b,HFv1 HellaSwag,79.12,,hf_open_llm_v1_240829_frozen.csv +samantha_1_11_7b,HFv1 MMLU,40.51,,hf_open_llm_v1_240829_frozen.csv +samantha_1_11_7b,HFv1 TruthfulQA,50.37,,hf_open_llm_v1_240829_frozen.csv +samantha_1_11_7b,HFv1 Winogrande,74.19,,hf_open_llm_v1_240829_frozen.csv +samantha_1_1_70b,HF OpenLLM v1,67.43,,hf_open_llm_v1_240829_frozen.csv +samantha_1_1_70b,HFv1 ARC,68.77,,hf_open_llm_v1_240829_frozen.csv +samantha_1_1_70b,HFv1 GSM8K,31.61,,hf_open_llm_v1_240829_frozen.csv +samantha_1_1_70b,HFv1 HellaSwag,87.46,,hf_open_llm_v1_240829_frozen.csv +samantha_1_1_70b,HFv1 MMLU,68.6,,hf_open_llm_v1_240829_frozen.csv +samantha_1_1_70b,HFv1 TruthfulQA,64.85,,hf_open_llm_v1_240829_frozen.csv +samantha_1_1_70b,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv +samantha_1_2_mistral_7b,HF OpenLLM v1,59.83,,hf_open_llm_v1_240829_frozen.csv +samantha_1_2_mistral_7b,HFv1 ARC,64.08,,hf_open_llm_v1_240829_frozen.csv +samantha_1_2_mistral_7b,HFv1 GSM8K,16.98,,hf_open_llm_v1_240829_frozen.csv +samantha_1_2_mistral_7b,HFv1 HellaSwag,85.08,,hf_open_llm_v1_240829_frozen.csv +samantha_1_2_mistral_7b,HFv1 MMLU,63.91,,hf_open_llm_v1_240829_frozen.csv +samantha_1_2_mistral_7b,HFv1 TruthfulQA,50.4,,hf_open_llm_v1_240829_frozen.csv +samantha_1_2_mistral_7b,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv +samantha_mistral_7b,HF OpenLLM v1,57.96,,hf_open_llm_v1_240829_frozen.csv +samantha_mistral_7b,HFv1 ARC,63.4,,hf_open_llm_v1_240829_frozen.csv +samantha_mistral_7b,HFv1 GSM8K,16.0,,hf_open_llm_v1_240829_frozen.csv +samantha_mistral_7b,HFv1 HellaSwag,84.1,,hf_open_llm_v1_240829_frozen.csv +samantha_mistral_7b,HFv1 MMLU,61.36,,hf_open_llm_v1_240829_frozen.csv +samantha_mistral_7b,HFv1 TruthfulQA,46.08,,hf_open_llm_v1_240829_frozen.csv +samantha_mistral_7b,HFv1 Winogrande,76.8,,hf_open_llm_v1_240829_frozen.csv +samantha_mistral_instruct_7b,HF OpenLLM v1,53.4,,hf_open_llm_v1_240829_frozen.csv +samantha_mistral_instruct_7b,HFv1 ARC,53.5,,hf_open_llm_v1_240829_frozen.csv +samantha_mistral_instruct_7b,HFv1 GSM8K,10.84,,hf_open_llm_v1_240829_frozen.csv +samantha_mistral_instruct_7b,HFv1 HellaSwag,75.14,,hf_open_llm_v1_240829_frozen.csv +samantha_mistral_instruct_7b,HFv1 MMLU,51.72,,hf_open_llm_v1_240829_frozen.csv +samantha_mistral_instruct_7b,HFv1 TruthfulQA,58.81,,hf_open_llm_v1_240829_frozen.csv +samantha_mistral_instruct_7b,HFv1 Winogrande,70.4,,hf_open_llm_v1_240829_frozen.csv +samantha_nebula_7b,HF OpenLLM v1,54.58,,hf_open_llm_v1_240829_frozen.csv +samantha_nebula_7b,HFv1 ARC,57.0,,hf_open_llm_v1_240829_frozen.csv +samantha_nebula_7b,HFv1 GSM8K,11.37,,hf_open_llm_v1_240829_frozen.csv +samantha_nebula_7b,HFv1 HellaSwag,82.25,,hf_open_llm_v1_240829_frozen.csv +samantha_nebula_7b,HFv1 MMLU,54.21,,hf_open_llm_v1_240829_frozen.csv +samantha_nebula_7b,HFv1 TruthfulQA,49.58,,hf_open_llm_v1_240829_frozen.csv +samantha_nebula_7b,HFv1 Winogrande,73.09,,hf_open_llm_v1_240829_frozen.csv +sambalingo_thai_chat,HF OpenLLM v1,49.45,,hf_open_llm_v1_240829_frozen.csv +sambalingo_thai_chat,HFv1 ARC,52.73,,hf_open_llm_v1_240829_frozen.csv +sambalingo_thai_chat,HFv1 GSM8K,8.57,,hf_open_llm_v1_240829_frozen.csv +sambalingo_thai_chat,HFv1 HellaSwag,78.42,,hf_open_llm_v1_240829_frozen.csv +sambalingo_thai_chat,HFv1 MMLU,43.95,,hf_open_llm_v1_240829_frozen.csv +sambalingo_thai_chat,HFv1 TruthfulQA,40.84,,hf_open_llm_v1_240829_frozen.csv +sambalingo_thai_chat,HFv1 Winogrande,72.22,,hf_open_llm_v1_240829_frozen.csv +sappha_2b_v3,HF OpenLLM v1,43.53,,hf_open_llm_v1_240829_frozen.csv +sappha_2b_v3,HFv1 ARC,46.16,,hf_open_llm_v1_240829_frozen.csv +sappha_2b_v3,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +sappha_2b_v3,HFv1 HellaSwag,70.73,,hf_open_llm_v1_240829_frozen.csv +sappha_2b_v3,HFv1 MMLU,38.63,,hf_open_llm_v1_240829_frozen.csv +sappha_2b_v3,HFv1 TruthfulQA,39.94,,hf_open_llm_v1_240829_frozen.csv +sappha_2b_v3,HFv1 Winogrande,65.51,,hf_open_llm_v1_240829_frozen.csv +satoshinv5,HF OpenLLM v1,60.34,,hf_open_llm_v1_240829_frozen.csv +satoshinv5,HFv1 ARC,60.49,,hf_open_llm_v1_240829_frozen.csv +satoshinv5,HFv1 GSM8K,34.72,,hf_open_llm_v1_240829_frozen.csv +satoshinv5,HFv1 HellaSwag,82.94,,hf_open_llm_v1_240829_frozen.csv +satoshinv5,HFv1 MMLU,63.42,,hf_open_llm_v1_240829_frozen.csv +satoshinv5,HFv1 TruthfulQA,41.8,,hf_open_llm_v1_240829_frozen.csv +satoshinv5,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv +satyr_7b_model_stock,HF OpenLLM v1,71.74,,hf_open_llm_v1_240829_frozen.csv +satyr_7b_model_stock,HFv1 ARC,68.6,,hf_open_llm_v1_240829_frozen.csv +satyr_7b_model_stock,HFv1 GSM8K,65.66,,hf_open_llm_v1_240829_frozen.csv +satyr_7b_model_stock,HFv1 HellaSwag,86.96,,hf_open_llm_v1_240829_frozen.csv +satyr_7b_model_stock,HFv1 MMLU,65.02,,hf_open_llm_v1_240829_frozen.csv +satyr_7b_model_stock,HFv1 TruthfulQA,63.76,,hf_open_llm_v1_240829_frozen.csv +satyr_7b_model_stock,HFv1 Winogrande,80.43,,hf_open_llm_v1_240829_frozen.csv +sauerkrautlm_7b_laserchat,HF OpenLLM v1,70.32,,hf_open_llm_v1_240829_frozen.csv +sauerkrautlm_7b_laserchat,HFv1 ARC,67.58,,hf_open_llm_v1_240829_frozen.csv +sauerkrautlm_7b_laserchat,HFv1 GSM8K,68.84,,hf_open_llm_v1_240829_frozen.csv +sauerkrautlm_7b_laserchat,HFv1 HellaSwag,83.58,,hf_open_llm_v1_240829_frozen.csv +sauerkrautlm_7b_laserchat,HFv1 MMLU,64.93,,hf_open_llm_v1_240829_frozen.csv +sauerkrautlm_7b_laserchat,HFv1 TruthfulQA,56.08,,hf_open_llm_v1_240829_frozen.csv +sauerkrautlm_7b_laserchat,HFv1 Winogrande,80.9,,hf_open_llm_v1_240829_frozen.csv +sauerkrautlm_una_solar_instruct,HF OpenLLM v1,74.26,,hf_open_llm_v1_240829_frozen.csv +sauerkrautlm_una_solar_instruct,HFv1 ARC,70.9,,hf_open_llm_v1_240829_frozen.csv +sauerkrautlm_una_solar_instruct,HFv1 GSM8K,64.67,,hf_open_llm_v1_240829_frozen.csv +sauerkrautlm_una_solar_instruct,HFv1 HellaSwag,88.3,,hf_open_llm_v1_240829_frozen.csv +sauerkrautlm_una_solar_instruct,HFv1 MMLU,66.15,,hf_open_llm_v1_240829_frozen.csv +sauerkrautlm_una_solar_instruct,HFv1 TruthfulQA,71.8,,hf_open_llm_v1_240829_frozen.csv +sauerkrautlm_una_solar_instruct,HFv1 Winogrande,83.74,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_1,HF OpenLLM v1,64.25,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_1,HFv1 ARC,58.62,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_1,HFv1 GSM8K,46.55,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_1,HFv1 HellaSwag,81.75,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_1,HFv1 MMLU,65.38,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_1,HFv1 TruthfulQA,54.89,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_1,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_3,HF OpenLLM v1,64.0,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_3,HFv1 ARC,58.28,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_3,HFv1 GSM8K,46.32,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_3,HFv1 HellaSwag,81.35,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_3,HFv1 MMLU,65.44,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_3,HFv1 TruthfulQA,54.3,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_3,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_4,HF OpenLLM v1,63.65,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_4,HFv1 ARC,58.02,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_4,HFv1 GSM8K,45.11,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_4,HFv1 HellaSwag,81.77,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_4,HFv1 MMLU,65.59,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_4,HFv1 TruthfulQA,53.74,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_4,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_5,HF OpenLLM v1,64.09,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_5,HFv1 ARC,58.11,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_5,HFv1 GSM8K,46.25,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_5,HFv1 HellaSwag,81.6,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_5,HFv1 MMLU,65.17,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_5,HFv1 TruthfulQA,54.7,,hf_open_llm_v1_240829_frozen.csv +seagull_llama3_8b_orpo_v0_5,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv +senzu_7b_v0_1,HF OpenLLM v1,56.4,,hf_open_llm_v1_240829_frozen.csv +senzu_7b_v0_1,HFv1 ARC,58.19,,hf_open_llm_v1_240829_frozen.csv +senzu_7b_v0_1,HFv1 GSM8K,18.2,,hf_open_llm_v1_240829_frozen.csv +senzu_7b_v0_1,HFv1 HellaSwag,81.98,,hf_open_llm_v1_240829_frozen.csv +senzu_7b_v0_1,HFv1 MMLU,63.2,,hf_open_llm_v1_240829_frozen.csv +senzu_7b_v0_1,HFv1 TruthfulQA,40.2,,hf_open_llm_v1_240829_frozen.csv +senzu_7b_v0_1,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv +senzu_7b_v0_1_dpo,HF OpenLLM v1,61.9,,hf_open_llm_v1_240829_frozen.csv +senzu_7b_v0_1_dpo,HFv1 ARC,66.72,,hf_open_llm_v1_240829_frozen.csv +senzu_7b_v0_1_dpo,HFv1 GSM8K,32.98,,hf_open_llm_v1_240829_frozen.csv +senzu_7b_v0_1_dpo,HFv1 HellaSwag,84.34,,hf_open_llm_v1_240829_frozen.csv +senzu_7b_v0_1_dpo,HFv1 MMLU,62.12,,hf_open_llm_v1_240829_frozen.csv +senzu_7b_v0_1_dpo,HFv1 TruthfulQA,45.29,,hf_open_llm_v1_240829_frozen.csv +senzu_7b_v0_1_dpo,HFv1 Winogrande,79.95,,hf_open_llm_v1_240829_frozen.csv +servile_harpsichord_cdpo,HF OpenLLM v1,68.98,,hf_open_llm_v1_240829_frozen.csv +servile_harpsichord_cdpo,HFv1 ARC,67.32,,hf_open_llm_v1_240829_frozen.csv +servile_harpsichord_cdpo,HFv1 GSM8K,57.09,,hf_open_llm_v1_240829_frozen.csv +servile_harpsichord_cdpo,HFv1 HellaSwag,85.18,,hf_open_llm_v1_240829_frozen.csv +servile_harpsichord_cdpo,HFv1 MMLU,64.54,,hf_open_llm_v1_240829_frozen.csv +servile_harpsichord_cdpo,HFv1 TruthfulQA,60.61,,hf_open_llm_v1_240829_frozen.csv +servile_harpsichord_cdpo,HFv1 Winogrande,79.16,,hf_open_llm_v1_240829_frozen.csv +sf_72b_v1,HF OpenLLM v1,28.75,,hf_open_llm_v1_240829_frozen.csv +sf_72b_v1,HFv1 ARC,26.28,,hf_open_llm_v1_240829_frozen.csv +sf_72b_v1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +sf_72b_v1,HFv1 HellaSwag,24.87,,hf_open_llm_v1_240829_frozen.csv +sf_72b_v1,HFv1 MMLU,23.03,,hf_open_llm_v1_240829_frozen.csv +sf_72b_v1,HFv1 TruthfulQA,48.78,,hf_open_llm_v1_240829_frozen.csv +sf_72b_v1,HFv1 Winogrande,49.57,,hf_open_llm_v1_240829_frozen.csv +sg_raccoon_yi_200k_2_0,HF OpenLLM v1,62.72,,hf_open_llm_v1_240829_frozen.csv +sg_raccoon_yi_200k_2_0,HFv1 ARC,62.54,,hf_open_llm_v1_240829_frozen.csv +sg_raccoon_yi_200k_2_0,HFv1 GSM8K,30.71,,hf_open_llm_v1_240829_frozen.csv +sg_raccoon_yi_200k_2_0,HFv1 HellaSwag,80.26,,hf_open_llm_v1_240829_frozen.csv +sg_raccoon_yi_200k_2_0,HFv1 MMLU,73.29,,hf_open_llm_v1_240829_frozen.csv +sg_raccoon_yi_200k_2_0,HFv1 TruthfulQA,53.21,,hf_open_llm_v1_240829_frozen.csv +sg_raccoon_yi_200k_2_0,HFv1 Winogrande,76.32,,hf_open_llm_v1_240829_frozen.csv +shark_tank_ai_7_b,HF OpenLLM v1,71.1,,hf_open_llm_v1_240829_frozen.csv +shark_tank_ai_7_b,HFv1 ARC,66.89,,hf_open_llm_v1_240829_frozen.csv +shark_tank_ai_7_b,HFv1 GSM8K,65.73,,hf_open_llm_v1_240829_frozen.csv +shark_tank_ai_7_b,HFv1 HellaSwag,86.61,,hf_open_llm_v1_240829_frozen.csv +shark_tank_ai_7_b,HFv1 MMLU,65.27,,hf_open_llm_v1_240829_frozen.csv +shark_tank_ai_7_b,HFv1 TruthfulQA,60.19,,hf_open_llm_v1_240829_frozen.csv +shark_tank_ai_7_b,HFv1 Winogrande,81.93,,hf_open_llm_v1_240829_frozen.csv +shark_tank_ai_7b_v2,HF OpenLLM v1,66.55,,hf_open_llm_v1_240829_frozen.csv +shark_tank_ai_7b_v2,HFv1 ARC,67.75,,hf_open_llm_v1_240829_frozen.csv +shark_tank_ai_7b_v2,HFv1 GSM8K,45.11,,hf_open_llm_v1_240829_frozen.csv +shark_tank_ai_7b_v2,HFv1 HellaSwag,87.06,,hf_open_llm_v1_240829_frozen.csv +shark_tank_ai_7b_v2,HFv1 MMLU,58.79,,hf_open_llm_v1_240829_frozen.csv +shark_tank_ai_7b_v2,HFv1 TruthfulQA,62.15,,hf_open_llm_v1_240829_frozen.csv +shark_tank_ai_7b_v2,HFv1 Winogrande,78.45,,hf_open_llm_v1_240829_frozen.csv +sheared_pythia_160m,HF OpenLLM v1,29.41,,hf_open_llm_v1_240829_frozen.csv +sheared_pythia_160m,HFv1 ARC,22.44,,hf_open_llm_v1_240829_frozen.csv +sheared_pythia_160m,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv +sheared_pythia_160m,HFv1 HellaSwag,32.07,,hf_open_llm_v1_240829_frozen.csv +sheared_pythia_160m,HFv1 MMLU,26.65,,hf_open_llm_v1_240829_frozen.csv +sheared_pythia_160m,HFv1 TruthfulQA,43.22,,hf_open_llm_v1_240829_frozen.csv +sheared_pythia_160m,HFv1 Winogrande,51.7,,hf_open_llm_v1_240829_frozen.csv +shearedllama_1_3b_fft_test1,HF OpenLLM v1,35.71,,hf_open_llm_v1_240829_frozen.csv +shearedllama_1_3b_fft_test1,HFv1 ARC,32.68,,hf_open_llm_v1_240829_frozen.csv +shearedllama_1_3b_fft_test1,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +shearedllama_1_3b_fft_test1,HFv1 HellaSwag,59.99,,hf_open_llm_v1_240829_frozen.csv +shearedllama_1_3b_fft_test1,HFv1 MMLU,25.69,,hf_open_llm_v1_240829_frozen.csv +shearedllama_1_3b_fft_test1,HFv1 TruthfulQA,36.97,,hf_open_llm_v1_240829_frozen.csv +shearedllama_1_3b_fft_test1,HFv1 Winogrande,58.72,,hf_open_llm_v1_240829_frozen.csv +shearedplats_1_3b_v1,HF OpenLLM v1,35.97,,hf_open_llm_v1_240829_frozen.csv +shearedplats_1_3b_v1,HFv1 ARC,35.41,,hf_open_llm_v1_240829_frozen.csv +shearedplats_1_3b_v1,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv +shearedplats_1_3b_v1,HFv1 HellaSwag,62.75,,hf_open_llm_v1_240829_frozen.csv +shearedplats_1_3b_v1,HFv1 MMLU,24.75,,hf_open_llm_v1_240829_frozen.csv +shearedplats_1_3b_v1,HFv1 TruthfulQA,33.93,,hf_open_llm_v1_240829_frozen.csv +shearedplats_1_3b_v1,HFv1 Winogrande,58.48,,hf_open_llm_v1_240829_frozen.csv +shearedplats_2_7b_v2,HF OpenLLM v1,41.61,,hf_open_llm_v1_240829_frozen.csv +shearedplats_2_7b_v2,HFv1 ARC,42.41,,hf_open_llm_v1_240829_frozen.csv +shearedplats_2_7b_v2,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv +shearedplats_2_7b_v2,HFv1 HellaSwag,72.58,,hf_open_llm_v1_240829_frozen.csv +shearedplats_2_7b_v2,HFv1 MMLU,27.52,,hf_open_llm_v1_240829_frozen.csv +shearedplats_2_7b_v2,HFv1 TruthfulQA,39.76,,hf_open_llm_v1_240829_frozen.csv +shearedplats_2_7b_v2,HFv1 Winogrande,65.9,,hf_open_llm_v1_240829_frozen.csv +shearedplats_2_7b_v2_instruct_v0_1,HF OpenLLM v1,41.13,,hf_open_llm_v1_240829_frozen.csv +shearedplats_2_7b_v2_instruct_v0_1,HFv1 ARC,40.19,,hf_open_llm_v1_240829_frozen.csv +shearedplats_2_7b_v2_instruct_v0_1,HFv1 GSM8K,2.12,,hf_open_llm_v1_240829_frozen.csv +shearedplats_2_7b_v2_instruct_v0_1,HFv1 HellaSwag,70.08,,hf_open_llm_v1_240829_frozen.csv +shearedplats_2_7b_v2_instruct_v0_1,HFv1 MMLU,28.12,,hf_open_llm_v1_240829_frozen.csv +shearedplats_2_7b_v2_instruct_v0_1,HFv1 TruthfulQA,41.23,,hf_open_llm_v1_240829_frozen.csv +shearedplats_2_7b_v2_instruct_v0_1,HFv1 Winogrande,65.04,,hf_open_llm_v1_240829_frozen.csv +sheep_duck_llama_2_70b_v1_1,HF OpenLLM v1,71.22,,hf_open_llm_v1_240829_frozen.csv +sheep_duck_llama_2_70b_v1_1,HFv1 ARC,73.12,,hf_open_llm_v1_240829_frozen.csv +sheep_duck_llama_2_70b_v1_1,HFv1 GSM8K,47.99,,hf_open_llm_v1_240829_frozen.csv +sheep_duck_llama_2_70b_v1_1,HFv1 HellaSwag,87.77,,hf_open_llm_v1_240829_frozen.csv +sheep_duck_llama_2_70b_v1_1,HFv1 MMLU,70.77,,hf_open_llm_v1_240829_frozen.csv +sheep_duck_llama_2_70b_v1_1,HFv1 TruthfulQA,64.55,,hf_open_llm_v1_240829_frozen.csv +sheep_duck_llama_2_70b_v1_1,HFv1 Winogrande,83.11,,hf_open_llm_v1_240829_frozen.csv +shisa_base_7b_v1,HF OpenLLM v1,51.64,,hf_open_llm_v1_240829_frozen.csv +shisa_base_7b_v1,HFv1 ARC,52.3,,hf_open_llm_v1_240829_frozen.csv +shisa_base_7b_v1,HFv1 GSM8K,35.86,,hf_open_llm_v1_240829_frozen.csv +shisa_base_7b_v1,HFv1 HellaSwag,77.63,,hf_open_llm_v1_240829_frozen.csv +shisa_base_7b_v1,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +shisa_base_7b_v1,HFv1 TruthfulQA,42.4,,hf_open_llm_v1_240829_frozen.csv +shisa_base_7b_v1,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv +shqiponja_59b_v1,HF OpenLLM v1,65.97,,hf_open_llm_v1_240829_frozen.csv +shqiponja_59b_v1,HFv1 ARC,70.05,,hf_open_llm_v1_240829_frozen.csv +shqiponja_59b_v1,HFv1 GSM8K,15.47,,hf_open_llm_v1_240829_frozen.csv +shqiponja_59b_v1,HFv1 HellaSwag,84.06,,hf_open_llm_v1_240829_frozen.csv +shqiponja_59b_v1,HFv1 MMLU,75.54,,hf_open_llm_v1_240829_frozen.csv +shqiponja_59b_v1,HFv1 TruthfulQA,70.43,,hf_open_llm_v1_240829_frozen.csv +shqiponja_59b_v1,HFv1 Winogrande,80.27,,hf_open_llm_v1_240829_frozen.csv +silicon_medley,HF OpenLLM v1,69.49,,hf_open_llm_v1_240829_frozen.csv +silicon_medley,HFv1 ARC,67.24,,hf_open_llm_v1_240829_frozen.csv +silicon_medley,HFv1 GSM8K,58.38,,hf_open_llm_v1_240829_frozen.csv +silicon_medley,HFv1 HellaSwag,86.21,,hf_open_llm_v1_240829_frozen.csv +silicon_medley,HFv1 MMLU,64.51,,hf_open_llm_v1_240829_frozen.csv +silicon_medley,HFv1 TruthfulQA,61.34,,hf_open_llm_v1_240829_frozen.csv +silicon_medley,HFv1 Winogrande,79.24,,hf_open_llm_v1_240829_frozen.csv +silvermaiden_7b_slerp,HF OpenLLM v1,74.74,,hf_open_llm_v1_240829_frozen.csv +silvermaiden_7b_slerp,HFv1 ARC,71.93,,hf_open_llm_v1_240829_frozen.csv +silvermaiden_7b_slerp,HFv1 GSM8K,70.36,,hf_open_llm_v1_240829_frozen.csv +silvermaiden_7b_slerp,HFv1 HellaSwag,88.12,,hf_open_llm_v1_240829_frozen.csv +silvermaiden_7b_slerp,HFv1 MMLU,65.14,,hf_open_llm_v1_240829_frozen.csv +silvermaiden_7b_slerp,HFv1 TruthfulQA,69.91,,hf_open_llm_v1_240829_frozen.csv +silvermaiden_7b_slerp,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv +siren_7b_slerp,HF OpenLLM v1,72.5,,hf_open_llm_v1_240829_frozen.csv +siren_7b_slerp,HFv1 ARC,69.97,,hf_open_llm_v1_240829_frozen.csv +siren_7b_slerp,HFv1 GSM8K,66.03,,hf_open_llm_v1_240829_frozen.csv +siren_7b_slerp,HFv1 HellaSwag,87.14,,hf_open_llm_v1_240829_frozen.csv +siren_7b_slerp,HFv1 MMLU,65.03,,hf_open_llm_v1_240829_frozen.csv +siren_7b_slerp,HFv1 TruthfulQA,64.57,,hf_open_llm_v1_240829_frozen.csv +siren_7b_slerp,HFv1 Winogrande,82.24,,hf_open_llm_v1_240829_frozen.csv +sixtyoneeighty_7b_dpo,HF OpenLLM v1,71.74,,hf_open_llm_v1_240829_frozen.csv +sixtyoneeighty_7b_dpo,HFv1 ARC,68.69,,hf_open_llm_v1_240829_frozen.csv +sixtyoneeighty_7b_dpo,HFv1 GSM8K,62.24,,hf_open_llm_v1_240829_frozen.csv +sixtyoneeighty_7b_dpo,HFv1 HellaSwag,86.41,,hf_open_llm_v1_240829_frozen.csv +sixtyoneeighty_7b_dpo,HFv1 MMLU,64.93,,hf_open_llm_v1_240829_frozen.csv +sixtyoneeighty_7b_dpo,HFv1 TruthfulQA,67.64,,hf_open_llm_v1_240829_frozen.csv +sixtyoneeighty_7b_dpo,HFv1 Winogrande,80.51,,hf_open_llm_v1_240829_frozen.csv +sj_solar_10_7b_dpo,HF OpenLLM v1,72.67,,hf_open_llm_v1_240829_frozen.csv +sj_solar_10_7b_dpo,HFv1 ARC,68.26,,hf_open_llm_v1_240829_frozen.csv +sj_solar_10_7b_dpo,HFv1 GSM8K,62.09,,hf_open_llm_v1_240829_frozen.csv +sj_solar_10_7b_dpo,HFv1 HellaSwag,86.95,,hf_open_llm_v1_240829_frozen.csv +sj_solar_10_7b_dpo,HFv1 MMLU,66.73,,hf_open_llm_v1_240829_frozen.csv +sj_solar_10_7b_dpo,HFv1 TruthfulQA,67.74,,hf_open_llm_v1_240829_frozen.csv +sj_solar_10_7b_dpo,HFv1 Winogrande,84.21,,hf_open_llm_v1_240829_frozen.csv +skkudatascienceglobal_10_7b,HF OpenLLM v1,74.5,,hf_open_llm_v1_240829_frozen.csv +skkudatascienceglobal_10_7b,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv +skkudatascienceglobal_10_7b,HFv1 GSM8K,65.73,,hf_open_llm_v1_240829_frozen.csv +skkudatascienceglobal_10_7b,HFv1 HellaSwag,88.41,,hf_open_llm_v1_240829_frozen.csv +skkudatascienceglobal_10_7b,HFv1 MMLU,66.31,,hf_open_llm_v1_240829_frozen.csv +skkudatascienceglobal_10_7b,HFv1 TruthfulQA,71.92,,hf_open_llm_v1_240829_frozen.csv +skkudatascienceglobal_10_7b,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv +skkuds_dpo_72b_v1,HF OpenLLM v1,72.89,,hf_open_llm_v1_240829_frozen.csv +skkuds_dpo_72b_v1,HFv1 ARC,65.96,,hf_open_llm_v1_240829_frozen.csv +skkuds_dpo_72b_v1,HFv1 GSM8K,65.88,,hf_open_llm_v1_240829_frozen.csv +skkuds_dpo_72b_v1,HFv1 HellaSwag,86.0,,hf_open_llm_v1_240829_frozen.csv +skkuds_dpo_72b_v1,HFv1 MMLU,77.33,,hf_open_llm_v1_240829_frozen.csv +skkuds_dpo_72b_v1,HFv1 TruthfulQA,59.54,,hf_open_llm_v1_240829_frozen.csv +skkuds_dpo_72b_v1,HFv1 Winogrande,82.64,,hf_open_llm_v1_240829_frozen.csv +skkuds_dpo_72b_v3,HF OpenLLM v1,72.8,,hf_open_llm_v1_240829_frozen.csv +skkuds_dpo_72b_v3,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv +skkuds_dpo_72b_v3,HFv1 GSM8K,64.97,,hf_open_llm_v1_240829_frozen.csv +skkuds_dpo_72b_v3,HFv1 HellaSwag,86.11,,hf_open_llm_v1_240829_frozen.csv +skkuds_dpo_72b_v3,HFv1 MMLU,77.34,,hf_open_llm_v1_240829_frozen.csv +skkuds_dpo_72b_v3,HFv1 TruthfulQA,59.73,,hf_open_llm_v1_240829_frozen.csv +skkuds_dpo_72b_v3,HFv1 Winogrande,82.64,,hf_open_llm_v1_240829_frozen.csv +slal_0_1,HF OpenLLM v1,67.83,,hf_open_llm_v1_240829_frozen.csv +slal_0_1,HFv1 ARC,57.94,,hf_open_llm_v1_240829_frozen.csv +slal_0_1,HFv1 GSM8K,63.15,,hf_open_llm_v1_240829_frozen.csv +slal_0_1,HFv1 HellaSwag,80.14,,hf_open_llm_v1_240829_frozen.csv +slal_0_1,HFv1 MMLU,65.99,,hf_open_llm_v1_240829_frozen.csv +slal_0_1,HFv1 TruthfulQA,54.22,,hf_open_llm_v1_240829_frozen.csv +slal_0_1,HFv1 Winogrande,85.56,,hf_open_llm_v1_240829_frozen.csv +slerp_test_turdus_beagle,HF OpenLLM v1,75.11,,hf_open_llm_v1_240829_frozen.csv +slerp_test_turdus_beagle,HFv1 ARC,73.55,,hf_open_llm_v1_240829_frozen.csv +slerp_test_turdus_beagle,HFv1 GSM8K,70.05,,hf_open_llm_v1_240829_frozen.csv +slerp_test_turdus_beagle,HFv1 HellaSwag,88.85,,hf_open_llm_v1_240829_frozen.csv +slerp_test_turdus_beagle,HFv1 MMLU,64.62,,hf_open_llm_v1_240829_frozen.csv +slerp_test_turdus_beagle,HFv1 TruthfulQA,69.69,,hf_open_llm_v1_240829_frozen.csv +slerp_test_turdus_beagle,HFv1 Winogrande,83.9,,hf_open_llm_v1_240829_frozen.csv +slimhercules_4_0_mistral_7b_v0_2,HF OpenLLM v1,62.75,,hf_open_llm_v1_240829_frozen.csv +slimhercules_4_0_mistral_7b_v0_2,HFv1 ARC,60.07,,hf_open_llm_v1_240829_frozen.csv +slimhercules_4_0_mistral_7b_v0_2,HFv1 GSM8K,45.34,,hf_open_llm_v1_240829_frozen.csv +slimhercules_4_0_mistral_7b_v0_2,HFv1 HellaSwag,83.54,,hf_open_llm_v1_240829_frozen.csv +slimhercules_4_0_mistral_7b_v0_2,HFv1 MMLU,62.67,,hf_open_llm_v1_240829_frozen.csv +slimhercules_4_0_mistral_7b_v0_2,HFv1 TruthfulQA,45.33,,hf_open_llm_v1_240829_frozen.csv +slimhercules_4_0_mistral_7b_v0_2,HFv1 Winogrande,79.56,,hf_open_llm_v1_240829_frozen.csv +smartllama3_8b_ms_v0_1,HF OpenLLM v1,69.49,,hf_open_llm_v1_240829_frozen.csv +smartllama3_8b_ms_v0_1,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv +smartllama3_8b_ms_v0_1,HFv1 GSM8K,71.19,,hf_open_llm_v1_240829_frozen.csv +smartllama3_8b_ms_v0_1,HFv1 HellaSwag,82.34,,hf_open_llm_v1_240829_frozen.csv +smartllama3_8b_ms_v0_1,HFv1 MMLU,67.7,,hf_open_llm_v1_240829_frozen.csv +smartllama3_8b_ms_v0_1,HFv1 TruthfulQA,55.56,,hf_open_llm_v1_240829_frozen.csv +smartllama3_8b_ms_v0_1,HFv1 Winogrande,77.51,,hf_open_llm_v1_240829_frozen.csv +smartqwen1_5_1_8b_orpo_v1,HF OpenLLM v1,41.8,,hf_open_llm_v1_240829_frozen.csv +smartqwen1_5_1_8b_orpo_v1,HFv1 ARC,36.09,,hf_open_llm_v1_240829_frozen.csv +smartqwen1_5_1_8b_orpo_v1,HFv1 GSM8K,12.13,,hf_open_llm_v1_240829_frozen.csv +smartqwen1_5_1_8b_orpo_v1,HFv1 HellaSwag,62.3,,hf_open_llm_v1_240829_frozen.csv +smartqwen1_5_1_8b_orpo_v1,HFv1 MMLU,43.06,,hf_open_llm_v1_240829_frozen.csv +smartqwen1_5_1_8b_orpo_v1,HFv1 TruthfulQA,39.82,,hf_open_llm_v1_240829_frozen.csv +smartqwen1_5_1_8b_orpo_v1,HFv1 Winogrande,57.38,,hf_open_llm_v1_240829_frozen.csv +smartyplats_3b_v1,HF OpenLLM v1,40.0,,hf_open_llm_v1_240829_frozen.csv +smartyplats_3b_v1,HFv1 ARC,40.53,,hf_open_llm_v1_240829_frozen.csv +smartyplats_3b_v1,HFv1 GSM8K,1.06,,hf_open_llm_v1_240829_frozen.csv +smartyplats_3b_v1,HFv1 HellaSwag,70.85,,hf_open_llm_v1_240829_frozen.csv +smartyplats_3b_v1,HFv1 MMLU,25.31,,hf_open_llm_v1_240829_frozen.csv +smartyplats_3b_v1,HFv1 TruthfulQA,36.53,,hf_open_llm_v1_240829_frozen.csv +smartyplats_3b_v1,HFv1 Winogrande,65.75,,hf_open_llm_v1_240829_frozen.csv +smartyplats_3b_v2,HF OpenLLM v1,40.29,,hf_open_llm_v1_240829_frozen.csv +smartyplats_3b_v2,HFv1 ARC,41.04,,hf_open_llm_v1_240829_frozen.csv +smartyplats_3b_v2,HFv1 GSM8K,1.59,,hf_open_llm_v1_240829_frozen.csv +smartyplats_3b_v2,HFv1 HellaSwag,71.19,,hf_open_llm_v1_240829_frozen.csv +smartyplats_3b_v2,HFv1 MMLU,24.32,,hf_open_llm_v1_240829_frozen.csv +smartyplats_3b_v2,HFv1 TruthfulQA,36.66,,hf_open_llm_v1_240829_frozen.csv +smartyplats_3b_v2,HFv1 Winogrande,66.93,,hf_open_llm_v1_240829_frozen.csv +smartyplats_7b_v2,HF OpenLLM v1,60.24,,hf_open_llm_v1_240829_frozen.csv +smartyplats_7b_v2,HFv1 ARC,57.94,,hf_open_llm_v1_240829_frozen.csv +smartyplats_7b_v2,HFv1 GSM8K,38.82,,hf_open_llm_v1_240829_frozen.csv +smartyplats_7b_v2,HFv1 HellaSwag,80.76,,hf_open_llm_v1_240829_frozen.csv +smartyplats_7b_v2,HFv1 MMLU,58.16,,hf_open_llm_v1_240829_frozen.csv +smartyplats_7b_v2,HFv1 TruthfulQA,50.26,,hf_open_llm_v1_240829_frozen.csv +smartyplats_7b_v2,HFv1 Winogrande,75.53,,hf_open_llm_v1_240829_frozen.csv +smol_llama_101m_chat_v1,HF OpenLLM v1,28.73,,hf_open_llm_v1_240829_frozen.csv +smol_llama_101m_chat_v1,HFv1 ARC,22.87,,hf_open_llm_v1_240829_frozen.csv +smol_llama_101m_chat_v1,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv +smol_llama_101m_chat_v1,HFv1 HellaSwag,28.71,,hf_open_llm_v1_240829_frozen.csv +smol_llama_101m_chat_v1,HFv1 MMLU,24.93,,hf_open_llm_v1_240829_frozen.csv +smol_llama_101m_chat_v1,HFv1 TruthfulQA,45.76,,hf_open_llm_v1_240829_frozen.csv +smol_llama_101m_chat_v1,HFv1 Winogrande,50.04,,hf_open_llm_v1_240829_frozen.csv +smol_llama_101m_gqa,HF OpenLLM v1,28.97,,hf_open_llm_v1_240829_frozen.csv +smol_llama_101m_gqa,HFv1 ARC,23.55,,hf_open_llm_v1_240829_frozen.csv +smol_llama_101m_gqa,HFv1 GSM8K,0.83,,hf_open_llm_v1_240829_frozen.csv +smol_llama_101m_gqa,HFv1 HellaSwag,28.77,,hf_open_llm_v1_240829_frozen.csv +smol_llama_101m_gqa,HFv1 MMLU,24.24,,hf_open_llm_v1_240829_frozen.csv +smol_llama_101m_gqa,HFv1 TruthfulQA,45.76,,hf_open_llm_v1_240829_frozen.csv +smol_llama_101m_gqa,HFv1 Winogrande,50.67,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_gqa,HF OpenLLM v1,29.44,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_gqa,HFv1 ARC,24.83,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_gqa,HFv1 GSM8K,0.68,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_gqa,HFv1 HellaSwag,29.76,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_gqa,HFv1 MMLU,25.85,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_gqa,HFv1 TruthfulQA,44.55,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_gqa,HFv1 Winogrande,50.99,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_open_instruct,HF OpenLLM v1,29.19,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_open_instruct,HFv1 ARC,25.0,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_open_instruct,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_open_instruct,HFv1 HellaSwag,29.71,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_open_instruct,HFv1 MMLU,26.11,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_open_instruct,HFv1 TruthfulQA,44.06,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_open_instruct,HFv1 Winogrande,50.28,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_openhermes,HF OpenLLM v1,29.34,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_openhermes,HFv1 ARC,25.17,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_openhermes,HFv1 GSM8K,0.61,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_openhermes,HFv1 HellaSwag,28.98,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_openhermes,HFv1 MMLU,26.17,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_openhermes,HFv1 TruthfulQA,43.08,,hf_open_llm_v1_240829_frozen.csv +smol_llama_220m_openhermes,HFv1 Winogrande,52.01,,hf_open_llm_v1_240829_frozen.csv +smol_llama_4x220m_moe,HF OpenLLM v1,29.25,,hf_open_llm_v1_240829_frozen.csv +smol_llama_4x220m_moe,HFv1 ARC,25.09,,hf_open_llm_v1_240829_frozen.csv +smol_llama_4x220m_moe,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv +smol_llama_4x220m_moe,HFv1 HellaSwag,29.24,,hf_open_llm_v1_240829_frozen.csv +smol_llama_4x220m_moe,HFv1 MMLU,25.88,,hf_open_llm_v1_240829_frozen.csv +smol_llama_4x220m_moe,HFv1 TruthfulQA,43.92,,hf_open_llm_v1_240829_frozen.csv +smol_llama_4x220m_moe,HFv1 Winogrande,51.22,,hf_open_llm_v1_240829_frozen.csv +smol_llama_81m_tied,HF OpenLLM v1,28.17,,hf_open_llm_v1_240829_frozen.csv +smol_llama_81m_tied,HFv1 ARC,22.18,,hf_open_llm_v1_240829_frozen.csv +smol_llama_81m_tied,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +smol_llama_81m_tied,HFv1 HellaSwag,29.33,,hf_open_llm_v1_240829_frozen.csv +smol_llama_81m_tied,HFv1 MMLU,24.06,,hf_open_llm_v1_240829_frozen.csv +smol_llama_81m_tied,HFv1 TruthfulQA,43.97,,hf_open_llm_v1_240829_frozen.csv +smol_llama_81m_tied,HFv1 Winogrande,49.25,,hf_open_llm_v1_240829_frozen.csv +smolllamix_8x101m,HF OpenLLM v1,28.98,,hf_open_llm_v1_240829_frozen.csv +smolllamix_8x101m,HFv1 ARC,22.7,,hf_open_llm_v1_240829_frozen.csv +smolllamix_8x101m,HFv1 GSM8K,0.61,,hf_open_llm_v1_240829_frozen.csv +smolllamix_8x101m,HFv1 HellaSwag,28.5,,hf_open_llm_v1_240829_frozen.csv +smolllamix_8x101m,HFv1 MMLU,24.69,,hf_open_llm_v1_240829_frozen.csv +smolllamix_8x101m,HFv1 TruthfulQA,46.09,,hf_open_llm_v1_240829_frozen.csv +smolllamix_8x101m,HFv1 Winogrande,51.3,,hf_open_llm_v1_240829_frozen.csv +smolllamix_8x101m_take2,HF OpenLLM v1,29.35,,hf_open_llm_v1_240829_frozen.csv +smolllamix_8x101m_take2,HFv1 ARC,23.98,,hf_open_llm_v1_240829_frozen.csv +smolllamix_8x101m_take2,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv +smolllamix_8x101m_take2,HFv1 HellaSwag,28.43,,hf_open_llm_v1_240829_frozen.csv +smolllamix_8x101m_take2,HFv1 MMLU,25.07,,hf_open_llm_v1_240829_frozen.csv +smolllamix_8x101m_take2,HFv1 TruthfulQA,45.87,,hf_open_llm_v1_240829_frozen.csv +smolllamix_8x101m_take2,HFv1 Winogrande,52.25,,hf_open_llm_v1_240829_frozen.csv +snorkel_mistral_pairrm_dpo,HF OpenLLM v1,66.31,,hf_open_llm_v1_240829_frozen.csv +snorkel_mistral_pairrm_dpo,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv +snorkel_mistral_pairrm_dpo,HFv1 GSM8K,36.77,,hf_open_llm_v1_240829_frozen.csv +snorkel_mistral_pairrm_dpo,HFv1 HellaSwag,85.64,,hf_open_llm_v1_240829_frozen.csv +snorkel_mistral_pairrm_dpo,HFv1 MMLU,60.85,,hf_open_llm_v1_240829_frozen.csv +snorkel_mistral_pairrm_dpo,HFv1 TruthfulQA,70.91,,hf_open_llm_v1_240829_frozen.csv +snorkel_mistral_pairrm_dpo,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv +solar_0_70b_16bit,HF OpenLLM v1,70.11,,hf_open_llm_v1_240829_frozen.csv +solar_0_70b_16bit,HFv1 ARC,71.08,,hf_open_llm_v1_240829_frozen.csv +solar_0_70b_16bit,HFv1 GSM8K,45.26,,hf_open_llm_v1_240829_frozen.csv +solar_0_70b_16bit,HFv1 HellaSwag,87.89,,hf_open_llm_v1_240829_frozen.csv +solar_0_70b_16bit,HFv1 MMLU,70.58,,hf_open_llm_v1_240829_frozen.csv +solar_0_70b_16bit,HFv1 TruthfulQA,62.25,,hf_open_llm_v1_240829_frozen.csv +solar_0_70b_16bit,HFv1 Winogrande,83.58,,hf_open_llm_v1_240829_frozen.csv +solar_10_7b_dpo_instruct_tuned_v0_1,HF OpenLLM v1,68.68,,hf_open_llm_v1_240829_frozen.csv +solar_10_7b_dpo_instruct_tuned_v0_1,HFv1 ARC,65.19,,hf_open_llm_v1_240829_frozen.csv +solar_10_7b_dpo_instruct_tuned_v0_1,HFv1 GSM8K,58.76,,hf_open_llm_v1_240829_frozen.csv +solar_10_7b_dpo_instruct_tuned_v0_1,HFv1 HellaSwag,86.09,,hf_open_llm_v1_240829_frozen.csv +solar_10_7b_dpo_instruct_tuned_v0_1,HFv1 MMLU,66.25,,hf_open_llm_v1_240829_frozen.csv +solar_10_7b_dpo_instruct_tuned_v0_1,HFv1 TruthfulQA,51.81,,hf_open_llm_v1_240829_frozen.csv +solar_10_7b_dpo_instruct_tuned_v0_1,HFv1 Winogrande,83.98,,hf_open_llm_v1_240829_frozen.csv +solar_10_7b_instruct_forest_dpo_v1,HF OpenLLM v1,74.8,,hf_open_llm_v1_240829_frozen.csv +solar_10_7b_instruct_forest_dpo_v1,HFv1 ARC,71.93,,hf_open_llm_v1_240829_frozen.csv +solar_10_7b_instruct_forest_dpo_v1,HFv1 GSM8K,64.52,,hf_open_llm_v1_240829_frozen.csv +solar_10_7b_instruct_forest_dpo_v1,HFv1 HellaSwag,88.44,,hf_open_llm_v1_240829_frozen.csv +solar_10_7b_instruct_forest_dpo_v1,HFv1 MMLU,65.63,,hf_open_llm_v1_240829_frozen.csv +solar_10_7b_instruct_forest_dpo_v1,HFv1 TruthfulQA,76.13,,hf_open_llm_v1_240829_frozen.csv +solar_10_7b_instruct_forest_dpo_v1,HFv1 Winogrande,82.16,,hf_open_llm_v1_240829_frozen.csv +solar_10b_nector_dpo_jawade,HF OpenLLM v1,74.19,,hf_open_llm_v1_240829_frozen.csv +solar_10b_nector_dpo_jawade,HFv1 ARC,71.33,,hf_open_llm_v1_240829_frozen.csv +solar_10b_nector_dpo_jawade,HFv1 GSM8K,64.59,,hf_open_llm_v1_240829_frozen.csv +solar_10b_nector_dpo_jawade,HFv1 HellaSwag,88.62,,hf_open_llm_v1_240829_frozen.csv +solar_10b_nector_dpo_jawade,HFv1 MMLU,66.22,,hf_open_llm_v1_240829_frozen.csv +solar_10b_nector_dpo_jawade,HFv1 TruthfulQA,70.92,,hf_open_llm_v1_240829_frozen.csv +solar_10b_nector_dpo_jawade,HFv1 Winogrande,83.43,,hf_open_llm_v1_240829_frozen.csv +solar_10b_orcadpo_jawade,HF OpenLLM v1,74.27,,hf_open_llm_v1_240829_frozen.csv +solar_10b_orcadpo_jawade,HFv1 ARC,71.16,,hf_open_llm_v1_240829_frozen.csv +solar_10b_orcadpo_jawade,HFv1 GSM8K,64.82,,hf_open_llm_v1_240829_frozen.csv +solar_10b_orcadpo_jawade,HFv1 HellaSwag,88.27,,hf_open_llm_v1_240829_frozen.csv +solar_10b_orcadpo_jawade,HFv1 MMLU,66.12,,hf_open_llm_v1_240829_frozen.csv +solar_10b_orcadpo_jawade,HFv1 TruthfulQA,71.57,,hf_open_llm_v1_240829_frozen.csv +solar_10b_orcadpo_jawade,HFv1 Winogrande,83.66,,hf_open_llm_v1_240829_frozen.csv +solar_13b_instruct_v1_0,HF OpenLLM v1,56.65,,hf_open_llm_v1_240829_frozen.csv +solar_13b_instruct_v1_0,HFv1 ARC,57.25,,hf_open_llm_v1_240829_frozen.csv +solar_13b_instruct_v1_0,HFv1 GSM8K,16.6,,hf_open_llm_v1_240829_frozen.csv +solar_13b_instruct_v1_0,HFv1 HellaSwag,78.03,,hf_open_llm_v1_240829_frozen.csv +solar_13b_instruct_v1_0,HFv1 MMLU,55.75,,hf_open_llm_v1_240829_frozen.csv +solar_13b_instruct_v1_0,HFv1 TruthfulQA,61.99,,hf_open_llm_v1_240829_frozen.csv +solar_13b_instruct_v1_0,HFv1 Winogrande,70.24,,hf_open_llm_v1_240829_frozen.csv +solar_dus_implement,HF OpenLLM v1,58.1,,hf_open_llm_v1_240829_frozen.csv +solar_dus_implement,HFv1 ARC,59.56,,hf_open_llm_v1_240829_frozen.csv +solar_dus_implement,HFv1 GSM8K,26.99,,hf_open_llm_v1_240829_frozen.csv +solar_dus_implement,HFv1 HellaSwag,81.18,,hf_open_llm_v1_240829_frozen.csv +solar_dus_implement,HFv1 MMLU,63.68,,hf_open_llm_v1_240829_frozen.csv +solar_dus_implement,HFv1 TruthfulQA,40.72,,hf_open_llm_v1_240829_frozen.csv +solar_dus_implement,HFv1 Winogrande,76.48,,hf_open_llm_v1_240829_frozen.csv +solar_instruct_ko_adapter_attach,HF OpenLLM v1,74.11,,hf_open_llm_v1_240829_frozen.csv +solar_instruct_ko_adapter_attach,HFv1 ARC,71.08,,hf_open_llm_v1_240829_frozen.csv +solar_instruct_ko_adapter_attach,HFv1 GSM8K,64.29,,hf_open_llm_v1_240829_frozen.csv +solar_instruct_ko_adapter_attach,HFv1 HellaSwag,88.2,,hf_open_llm_v1_240829_frozen.csv +solar_instruct_ko_adapter_attach,HFv1 MMLU,66.09,,hf_open_llm_v1_240829_frozen.csv +solar_instruct_ko_adapter_attach,HFv1 TruthfulQA,71.51,,hf_open_llm_v1_240829_frozen.csv +solar_instruct_ko_adapter_attach,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv +solar_ko_1_3_deup,HF OpenLLM v1,56.47,,hf_open_llm_v1_240829_frozen.csv +solar_ko_1_3_deup,HFv1 ARC,55.97,,hf_open_llm_v1_240829_frozen.csv +solar_ko_1_3_deup,HFv1 GSM8K,22.59,,hf_open_llm_v1_240829_frozen.csv +solar_ko_1_3_deup,HFv1 HellaSwag,79.97,,hf_open_llm_v1_240829_frozen.csv +solar_ko_1_3_deup,HFv1 MMLU,55.88,,hf_open_llm_v1_240829_frozen.csv +solar_ko_1_3_deup,HFv1 TruthfulQA,47.55,,hf_open_llm_v1_240829_frozen.csv +solar_ko_1_3_deup,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv +solar_math_2x10_7b,HF OpenLLM v1,73.37,,hf_open_llm_v1_240829_frozen.csv +solar_math_2x10_7b,HFv1 ARC,68.43,,hf_open_llm_v1_240829_frozen.csv +solar_math_2x10_7b,HFv1 GSM8K,71.04,,hf_open_llm_v1_240829_frozen.csv +solar_math_2x10_7b,HFv1 HellaSwag,86.31,,hf_open_llm_v1_240829_frozen.csv +solar_math_2x10_7b,HFv1 MMLU,66.9,,hf_open_llm_v1_240829_frozen.csv +solar_math_2x10_7b,HFv1 TruthfulQA,64.21,,hf_open_llm_v1_240829_frozen.csv +solar_math_2x10_7b,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv +solar_math_2x10_7b_v0_2,HF OpenLLM v1,74.25,,hf_open_llm_v1_240829_frozen.csv +solar_math_2x10_7b_v0_2,HFv1 ARC,70.9,,hf_open_llm_v1_240829_frozen.csv +solar_math_2x10_7b_v0_2,HFv1 GSM8K,64.9,,hf_open_llm_v1_240829_frozen.csv +solar_math_2x10_7b_v0_2,HFv1 HellaSwag,88.29,,hf_open_llm_v1_240829_frozen.csv +solar_math_2x10_7b_v0_2,HFv1 MMLU,66.25,,hf_open_llm_v1_240829_frozen.csv +solar_math_2x10_7b_v0_2,HFv1 TruthfulQA,71.68,,hf_open_llm_v1_240829_frozen.csv +solar_math_2x10_7b_v0_2,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv +solar_megamerge_dare_10_7b_v1,HF OpenLLM v1,68.79,,hf_open_llm_v1_240829_frozen.csv +solar_megamerge_dare_10_7b_v1,HFv1 ARC,66.13,,hf_open_llm_v1_240829_frozen.csv +solar_megamerge_dare_10_7b_v1,HFv1 GSM8K,58.0,,hf_open_llm_v1_240829_frozen.csv +solar_megamerge_dare_10_7b_v1,HFv1 HellaSwag,85.3,,hf_open_llm_v1_240829_frozen.csv +solar_megamerge_dare_10_7b_v1,HFv1 MMLU,66.03,,hf_open_llm_v1_240829_frozen.csv +solar_megamerge_dare_10_7b_v1,HFv1 TruthfulQA,54.33,,hf_open_llm_v1_240829_frozen.csv +solar_megamerge_dare_10_7b_v1,HFv1 Winogrande,82.95,,hf_open_llm_v1_240829_frozen.csv +solar_merge2_dpo,HF OpenLLM v1,65.6,,hf_open_llm_v1_240829_frozen.csv +solar_merge2_dpo,HFv1 ARC,64.42,,hf_open_llm_v1_240829_frozen.csv +solar_merge2_dpo,HFv1 GSM8K,48.82,,hf_open_llm_v1_240829_frozen.csv +solar_merge2_dpo,HFv1 HellaSwag,82.73,,hf_open_llm_v1_240829_frozen.csv +solar_merge2_dpo,HFv1 MMLU,64.57,,hf_open_llm_v1_240829_frozen.csv +solar_merge2_dpo,HFv1 TruthfulQA,51.28,,hf_open_llm_v1_240829_frozen.csv +solar_merge2_dpo,HFv1 Winogrande,81.77,,hf_open_llm_v1_240829_frozen.csv +solar_merge_adapter_dpo_orca,HF OpenLLM v1,65.96,,hf_open_llm_v1_240829_frozen.csv +solar_merge_adapter_dpo_orca,HFv1 ARC,63.91,,hf_open_llm_v1_240829_frozen.csv +solar_merge_adapter_dpo_orca,HFv1 GSM8K,50.57,,hf_open_llm_v1_240829_frozen.csv +solar_merge_adapter_dpo_orca,HFv1 HellaSwag,84.58,,hf_open_llm_v1_240829_frozen.csv +solar_merge_adapter_dpo_orca,HFv1 MMLU,63.18,,hf_open_llm_v1_240829_frozen.csv +solar_merge_adapter_dpo_orca,HFv1 TruthfulQA,51.49,,hf_open_llm_v1_240829_frozen.csv +solar_merge_adapter_dpo_orca,HFv1 Winogrande,82.0,,hf_open_llm_v1_240829_frozen.csv +solar_platypus_10_7b_v1,HF OpenLLM v1,58.62,,hf_open_llm_v1_240829_frozen.csv +solar_platypus_10_7b_v1,HFv1 ARC,61.69,,hf_open_llm_v1_240829_frozen.csv +solar_platypus_10_7b_v1,HFv1 GSM8K,11.07,,hf_open_llm_v1_240829_frozen.csv +solar_platypus_10_7b_v1,HFv1 HellaSwag,84.23,,hf_open_llm_v1_240829_frozen.csv +solar_platypus_10_7b_v1,HFv1 MMLU,60.37,,hf_open_llm_v1_240829_frozen.csv +solar_platypus_10_7b_v1,HFv1 TruthfulQA,51.58,,hf_open_llm_v1_240829_frozen.csv +solar_platypus_10_7b_v1,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv +solar_platypus_10_7b_v2,HF OpenLLM v1,55.25,,hf_open_llm_v1_240829_frozen.csv +solar_platypus_10_7b_v2,HFv1 ARC,59.39,,hf_open_llm_v1_240829_frozen.csv +solar_platypus_10_7b_v2,HFv1 GSM8K,4.02,,hf_open_llm_v1_240829_frozen.csv +solar_platypus_10_7b_v2,HFv1 HellaSwag,83.57,,hf_open_llm_v1_240829_frozen.csv +solar_platypus_10_7b_v2,HFv1 MMLU,59.93,,hf_open_llm_v1_240829_frozen.csv +solar_platypus_10_7b_v2,HFv1 TruthfulQA,43.15,,hf_open_llm_v1_240829_frozen.csv +solar_platypus_10_7b_v2,HFv1 Winogrande,81.45,,hf_open_llm_v1_240829_frozen.csv +solarized_13b_dpo,HF OpenLLM v1,62.05,,hf_open_llm_v1_240829_frozen.csv +solarized_13b_dpo,HFv1 ARC,62.71,,hf_open_llm_v1_240829_frozen.csv +solarized_13b_dpo,HFv1 GSM8K,26.38,,hf_open_llm_v1_240829_frozen.csv +solarized_13b_dpo,HFv1 HellaSwag,81.82,,hf_open_llm_v1_240829_frozen.csv +solarized_13b_dpo,HFv1 MMLU,59.12,,hf_open_llm_v1_240829_frozen.csv +solarized_13b_dpo,HFv1 TruthfulQA,66.25,,hf_open_llm_v1_240829_frozen.csv +solarized_13b_dpo,HFv1 Winogrande,76.01,,hf_open_llm_v1_240829_frozen.csv +solarized_18b_dpo,HF OpenLLM v1,67.88,,hf_open_llm_v1_240829_frozen.csv +solarized_18b_dpo,HFv1 ARC,68.34,,hf_open_llm_v1_240829_frozen.csv +solarized_18b_dpo,HFv1 GSM8K,40.26,,hf_open_llm_v1_240829_frozen.csv +solarized_18b_dpo,HFv1 HellaSwag,87.79,,hf_open_llm_v1_240829_frozen.csv +solarized_18b_dpo,HFv1 MMLU,63.89,,hf_open_llm_v1_240829_frozen.csv +solarized_18b_dpo,HFv1 TruthfulQA,66.49,,hf_open_llm_v1_240829_frozen.csv +solarized_18b_dpo,HFv1 Winogrande,80.51,,hf_open_llm_v1_240829_frozen.csv +spaetzle_v44_7b,HF OpenLLM v1,66.34,,hf_open_llm_v1_240829_frozen.csv +spaetzle_v44_7b,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv +spaetzle_v44_7b,HFv1 GSM8K,53.68,,hf_open_llm_v1_240829_frozen.csv +spaetzle_v44_7b,HFv1 HellaSwag,84.76,,hf_open_llm_v1_240829_frozen.csv +spaetzle_v44_7b,HFv1 MMLU,61.76,,hf_open_llm_v1_240829_frozen.csv +spaetzle_v44_7b,HFv1 TruthfulQA,54.45,,hf_open_llm_v1_240829_frozen.csv +spaetzle_v44_7b,HFv1 Winogrande,78.77,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_13b,HF OpenLLM v1,44.83,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_13b,HFv1 ARC,44.37,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_13b,HFv1 GSM8K,5.99,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_13b,HFv1 HellaSwag,65.2,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_13b,HFv1 MMLU,43.46,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_13b,HFv1 TruthfulQA,45.94,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_13b,HFv1 Winogrande,64.01,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_airoboros_13b_0_10e,HF OpenLLM v1,30.36,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_airoboros_13b_0_10e,HFv1 ARC,29.44,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_airoboros_13b_0_10e,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_airoboros_13b_0_10e,HFv1 HellaSwag,25.71,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_airoboros_13b_0_10e,HFv1 MMLU,25.43,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_airoboros_13b_0_10e,HFv1 TruthfulQA,49.64,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_airoboros_13b_0_10e,HFv1 Winogrande,51.93,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_platypus_13b_0_10e,HF OpenLLM v1,29.83,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_platypus_13b_0_10e,HFv1 ARC,28.75,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_platypus_13b_0_10e,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_platypus_13b_0_10e,HFv1 HellaSwag,25.88,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_platypus_13b_0_10e,HFv1 MMLU,25.36,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_platypus_13b_0_10e,HFv1 TruthfulQA,49.27,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_orca_platypus_13b_0_10e,HFv1 Winogrande,49.72,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_platypus_13b,HF OpenLLM v1,45.64,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_platypus_13b,HFv1 ARC,45.31,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_platypus_13b,HFv1 GSM8K,9.1,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_platypus_13b,HFv1 HellaSwag,68.63,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_platypus_13b,HFv1 MMLU,42.82,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_platypus_13b,HFv1 TruthfulQA,42.38,,hf_open_llm_v1_240829_frozen.csv +speechless_codellama_platypus_13b,HFv1 Winogrande,65.59,,hf_open_llm_v1_240829_frozen.csv +speechlessv1_nova_13b,HF OpenLLM v1,56.14,,hf_open_llm_v1_240829_frozen.csv +speechlessv1_nova_13b,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv +speechlessv1_nova_13b,HFv1 GSM8K,5.76,,hf_open_llm_v1_240829_frozen.csv +speechlessv1_nova_13b,HFv1 HellaSwag,82.68,,hf_open_llm_v1_240829_frozen.csv +speechlessv1_nova_13b,HFv1 MMLU,57.75,,hf_open_llm_v1_240829_frozen.csv +speechlessv1_nova_13b,HFv1 TruthfulQA,51.44,,hf_open_llm_v1_240829_frozen.csv +speechlessv1_nova_13b,HFv1 Winogrande,77.43,,hf_open_llm_v1_240829_frozen.csv +sphinx_7b_model_stock,HF OpenLLM v1,73.2,,hf_open_llm_v1_240829_frozen.csv +sphinx_7b_model_stock,HFv1 ARC,70.9,,hf_open_llm_v1_240829_frozen.csv +sphinx_7b_model_stock,HFv1 GSM8K,68.16,,hf_open_llm_v1_240829_frozen.csv +sphinx_7b_model_stock,HFv1 HellaSwag,87.2,,hf_open_llm_v1_240829_frozen.csv +sphinx_7b_model_stock,HFv1 MMLU,64.8,,hf_open_llm_v1_240829_frozen.csv +sphinx_7b_model_stock,HFv1 TruthfulQA,65.12,,hf_open_llm_v1_240829_frozen.csv +sphinx_7b_model_stock,HFv1 Winogrande,83.03,,hf_open_llm_v1_240829_frozen.csv +spin_phi2,HF OpenLLM v1,61.67,,hf_open_llm_v1_240829_frozen.csv +spin_phi2,HFv1 ARC,63.14,,hf_open_llm_v1_240829_frozen.csv +spin_phi2,HFv1 GSM8K,54.28,,hf_open_llm_v1_240829_frozen.csv +spin_phi2,HFv1 HellaSwag,75.56,,hf_open_llm_v1_240829_frozen.csv +spin_phi2,HFv1 MMLU,57.08,,hf_open_llm_v1_240829_frozen.csv +spin_phi2,HFv1 TruthfulQA,45.77,,hf_open_llm_v1_240829_frozen.csv +spin_phi2,HFv1 Winogrande,74.19,,hf_open_llm_v1_240829_frozen.csv +srbosgpt_7b_slerp,HF OpenLLM v1,50.19,,hf_open_llm_v1_240829_frozen.csv +srbosgpt_7b_slerp,HFv1 ARC,49.15,,hf_open_llm_v1_240829_frozen.csv +srbosgpt_7b_slerp,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv +srbosgpt_7b_slerp,HFv1 HellaSwag,62.28,,hf_open_llm_v1_240829_frozen.csv +srbosgpt_7b_slerp,HFv1 MMLU,61.95,,hf_open_llm_v1_240829_frozen.csv +srbosgpt_7b_slerp,HFv1 TruthfulQA,60.23,,hf_open_llm_v1_240829_frozen.csv +srbosgpt_7b_slerp,HFv1 Winogrande,66.54,,hf_open_llm_v1_240829_frozen.csv +stable_platypus2_13b_qlora_0_80_epoch,HF OpenLLM v1,55.56,,hf_open_llm_v1_240829_frozen.csv +stable_platypus2_13b_qlora_0_80_epoch,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv +stable_platypus2_13b_qlora_0_80_epoch,HFv1 GSM8K,3.56,,hf_open_llm_v1_240829_frozen.csv +stable_platypus2_13b_qlora_0_80_epoch,HFv1 HellaSwag,82.46,,hf_open_llm_v1_240829_frozen.csv +stable_platypus2_13b_qlora_0_80_epoch,HFv1 MMLU,57.09,,hf_open_llm_v1_240829_frozen.csv +stable_platypus2_13b_qlora_0_80_epoch,HFv1 TruthfulQA,51.41,,hf_open_llm_v1_240829_frozen.csv +stable_platypus2_13b_qlora_0_80_epoch,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv +stable_vicuna_13b,HF OpenLLM v1,51.64,,hf_open_llm_v1_240829_frozen.csv +stable_vicuna_13b,HFv1 ARC,53.41,,hf_open_llm_v1_240829_frozen.csv +stable_vicuna_13b,HFv1 GSM8K,4.09,,hf_open_llm_v1_240829_frozen.csv +stable_vicuna_13b,HFv1 HellaSwag,78.57,,hf_open_llm_v1_240829_frozen.csv +stable_vicuna_13b,HFv1 MMLU,50.37,,hf_open_llm_v1_240829_frozen.csv +stable_vicuna_13b,HFv1 TruthfulQA,48.38,,hf_open_llm_v1_240829_frozen.csv +stable_vicuna_13b,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_12b,HF OpenLLM v1,63.48,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_12b,HFv1 ARC,58.45,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_12b,HFv1 GSM8K,56.03,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_12b,HFv1 HellaSwag,84.33,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_12b,HFv1 MMLU,62.04,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_12b,HFv1 TruthfulQA,42.16,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_12b,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_12b_chat,HF OpenLLM v1,68.38,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_12b_chat,HFv1 ARC,64.85,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_12b_chat,HFv1 GSM8K,57.85,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_12b_chat,HFv1 HellaSwag,85.96,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_12b_chat,HFv1 MMLU,61.06,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_12b_chat,HFv1 TruthfulQA,62.01,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_12b_chat,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_1_6b,HF OpenLLM v1,45.25,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_1_6b,HFv1 ARC,43.34,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_1_6b,HFv1 GSM8K,17.44,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_1_6b,HFv1 HellaSwag,70.45,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_1_6b,HFv1 MMLU,38.95,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_1_6b,HFv1 TruthfulQA,36.78,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_1_6b,HFv1 Winogrande,64.56,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_1_6b_chat,HF OpenLLM v1,50.71,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_1_6b_chat,HFv1 ARC,43.52,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_1_6b_chat,HFv1 GSM8K,38.82,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_1_6b_chat,HFv1 HellaSwag,69.24,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_1_6b_chat,HFv1 MMLU,41.47,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_1_6b_chat,HFv1 TruthfulQA,46.5,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_1_6b_chat,HFv1 Winogrande,64.72,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_zephyr_1_6b,HF OpenLLM v1,49.99,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_zephyr_1_6b,HFv1 ARC,43.69,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_zephyr_1_6b,HFv1 GSM8K,35.33,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_zephyr_1_6b,HFv1 HellaSwag,69.3,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_zephyr_1_6b,HFv1 MMLU,42.03,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_zephyr_1_6b,HFv1 TruthfulQA,45.11,,hf_open_llm_v1_240829_frozen.csv +stablelm_2_zephyr_1_6b,HFv1 Winogrande,64.48,,hf_open_llm_v1_240829_frozen.csv +stablelm_3b_4e1t,HF OpenLLM v1,46.58,,hf_open_llm_v1_240829_frozen.csv +stablelm_3b_4e1t,HFv1 ARC,46.59,,hf_open_llm_v1_240829_frozen.csv +stablelm_3b_4e1t,HFv1 GSM8K,3.34,,hf_open_llm_v1_240829_frozen.csv +stablelm_3b_4e1t,HFv1 HellaSwag,75.94,,hf_open_llm_v1_240829_frozen.csv +stablelm_3b_4e1t,HFv1 MMLU,45.23,,hf_open_llm_v1_240829_frozen.csv +stablelm_3b_4e1t,HFv1 TruthfulQA,37.2,,hf_open_llm_v1_240829_frozen.csv +stablelm_3b_4e1t,HFv1 Winogrande,71.19,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_3b,HF OpenLLM v1,31.5,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_3b,HFv1 ARC,26.45,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_3b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_3b,HFv1 HellaSwag,42.24,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_3b,HFv1 MMLU,25.43,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_3b,HFv1 TruthfulQA,40.5,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_3b,HFv1 Winogrande,53.91,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_7b,HF OpenLLM v1,34.37,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_7b,HFv1 ARC,32.0,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_7b,HFv1 GSM8K,0.61,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_7b,HFv1 HellaSwag,51.78,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_7b,HFv1 MMLU,26.21,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_7b,HFv1 TruthfulQA,40.19,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_7b,HFv1 Winogrande,55.41,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_7b_v2,HF OpenLLM v1,46.18,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_7b_v2,HFv1 ARC,47.35,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_7b_v2,HFv1 GSM8K,2.58,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_7b_v2,HFv1 HellaSwag,77.08,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_7b_v2,HFv1 MMLU,45.1,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_7b_v2,HFv1 TruthfulQA,36.46,,hf_open_llm_v1_240829_frozen.csv +stablelm_base_alpha_7b_v2,HFv1 Winogrande,68.51,,hf_open_llm_v1_240829_frozen.csv +stablelm_zephyr_3b,HF OpenLLM v1,53.43,,hf_open_llm_v1_240829_frozen.csv +stablelm_zephyr_3b,HFv1 ARC,46.08,,hf_open_llm_v1_240829_frozen.csv +stablelm_zephyr_3b,HFv1 GSM8K,42.15,,hf_open_llm_v1_240829_frozen.csv +stablelm_zephyr_3b,HFv1 HellaSwag,74.16,,hf_open_llm_v1_240829_frozen.csv +stablelm_zephyr_3b,HFv1 MMLU,46.17,,hf_open_llm_v1_240829_frozen.csv +stablelm_zephyr_3b,HFv1 TruthfulQA,46.49,,hf_open_llm_v1_240829_frozen.csv +stablelm_zephyr_3b,HFv1 Winogrande,65.51,,hf_open_llm_v1_240829_frozen.csv +starcoder,HF OpenLLM v1,35.73,,hf_open_llm_v1_240829_frozen.csv +starcoder,HFv1 ARC,30.29,,hf_open_llm_v1_240829_frozen.csv +starcoder,HFv1 GSM8K,9.17,,hf_open_llm_v1_240829_frozen.csv +starcoder,HFv1 HellaSwag,47.88,,hf_open_llm_v1_240829_frozen.csv +starcoder,HFv1 MMLU,29.47,,hf_open_llm_v1_240829_frozen.csv +starcoder,HFv1 TruthfulQA,41.3,,hf_open_llm_v1_240829_frozen.csv +starcoder,HFv1 Winogrande,56.27,,hf_open_llm_v1_240829_frozen.csv +starcoder2_15b,HF OpenLLM v1,52.79,,hf_open_llm_v1_240829_frozen.csv +starcoder2_15b,HFv1 ARC,47.35,,hf_open_llm_v1_240829_frozen.csv +starcoder2_15b,HFv1 GSM8K,52.24,,hf_open_llm_v1_240829_frozen.csv +starcoder2_15b,HFv1 HellaSwag,64.09,,hf_open_llm_v1_240829_frozen.csv +starcoder2_15b,HFv1 MMLU,51.35,,hf_open_llm_v1_240829_frozen.csv +starcoder2_15b,HFv1 TruthfulQA,37.87,,hf_open_llm_v1_240829_frozen.csv +starcoder2_15b,HFv1 Winogrande,63.85,,hf_open_llm_v1_240829_frozen.csv +starcoder2_3b,HF OpenLLM v1,39.25,,hf_open_llm_v1_240829_frozen.csv +starcoder2_3b,HFv1 ARC,34.56,,hf_open_llm_v1_240829_frozen.csv +starcoder2_3b,HFv1 GSM8K,19.64,,hf_open_llm_v1_240829_frozen.csv +starcoder2_3b,HFv1 HellaSwag,47.62,,hf_open_llm_v1_240829_frozen.csv +starcoder2_3b,HFv1 MMLU,38.65,,hf_open_llm_v1_240829_frozen.csv +starcoder2_3b,HFv1 TruthfulQA,40.49,,hf_open_llm_v1_240829_frozen.csv +starcoder2_3b,HFv1 Winogrande,54.54,,hf_open_llm_v1_240829_frozen.csv +starcoder2_7b,HF OpenLLM v1,42.95,,hf_open_llm_v1_240829_frozen.csv +starcoder2_7b,HFv1 ARC,38.31,,hf_open_llm_v1_240829_frozen.csv +starcoder2_7b,HFv1 GSM8K,25.09,,hf_open_llm_v1_240829_frozen.csv +starcoder2_7b,HFv1 HellaSwag,51.91,,hf_open_llm_v1_240829_frozen.csv +starcoder2_7b,HFv1 MMLU,41.21,,hf_open_llm_v1_240829_frozen.csv +starcoder2_7b,HFv1 TruthfulQA,41.99,,hf_open_llm_v1_240829_frozen.csv +starcoder2_7b,HFv1 Winogrande,59.19,,hf_open_llm_v1_240829_frozen.csv +starcoderbase,HF OpenLLM v1,35.55,,hf_open_llm_v1_240829_frozen.csv +starcoderbase,HFv1 ARC,30.29,,hf_open_llm_v1_240829_frozen.csv +starcoderbase,HFv1 GSM8K,7.88,,hf_open_llm_v1_240829_frozen.csv +starcoderbase,HFv1 HellaSwag,47.21,,hf_open_llm_v1_240829_frozen.csv +starcoderbase,HFv1 MMLU,32.12,,hf_open_llm_v1_240829_frozen.csv +starcoderbase,HFv1 TruthfulQA,40.02,,hf_open_llm_v1_240829_frozen.csv +starcoderbase,HFv1 Winogrande,55.8,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_1b,HF OpenLLM v1,30.06,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_1b,HFv1 ARC,22.7,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_1b,HFv1 GSM8K,0.91,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_1b,HFv1 HellaSwag,34.31,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_1b,HFv1 MMLU,26.67,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_1b,HFv1 TruthfulQA,45.79,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_1b,HFv1 Winogrande,49.96,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_3b,HF OpenLLM v1,31.38,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_3b,HFv1 ARC,25.85,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_3b,HFv1 GSM8K,1.74,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_3b,HFv1 HellaSwag,39.11,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_3b,HFv1 MMLU,27.35,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_3b,HFv1 TruthfulQA,43.05,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_3b,HFv1 Winogrande,51.14,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_7b,HF OpenLLM v1,33.75,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_7b,HFv1 ARC,29.86,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_7b,HFv1 GSM8K,5.46,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_7b,HFv1 HellaSwag,43.87,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_7b,HFv1 MMLU,28.45,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_7b,HFv1 TruthfulQA,40.46,,hf_open_llm_v1_240829_frozen.csv +starcoderbase_7b,HFv1 Winogrande,54.38,,hf_open_llm_v1_240829_frozen.csv +starling_7b,HF OpenLLM v1,50.73,,hf_open_llm_v1_240829_frozen.csv +starling_7b,HFv1 ARC,51.02,,hf_open_llm_v1_240829_frozen.csv +starling_7b,HFv1 GSM8K,10.08,,hf_open_llm_v1_240829_frozen.csv +starling_7b,HFv1 HellaSwag,76.77,,hf_open_llm_v1_240829_frozen.csv +starling_7b,HFv1 MMLU,47.75,,hf_open_llm_v1_240829_frozen.csv +starling_7b,HFv1 TruthfulQA,48.18,,hf_open_llm_v1_240829_frozen.csv +starling_7b,HFv1 Winogrande,70.56,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_alpha_expo,HF OpenLLM v1,66.94,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_alpha_expo,HFv1 ARC,63.91,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_alpha_expo,HFv1 GSM8K,61.56,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_alpha_expo,HFv1 HellaSwag,84.79,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_alpha_expo,HFv1 MMLU,64.64,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_alpha_expo,HFv1 TruthfulQA,46.38,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_alpha_expo,HFv1 Winogrande,80.35,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta,HF OpenLLM v1,69.88,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta,HFv1 ARC,67.24,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta,HFv1 GSM8K,66.64,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta,HFv1 HellaSwag,83.47,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta,HFv1 MMLU,65.14,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta,HFv1 TruthfulQA,55.47,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta,HFv1 Winogrande,81.29,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta_expo,HF OpenLLM v1,70.17,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta_expo,HFv1 ARC,67.92,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta_expo,HFv1 GSM8K,65.66,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta_expo,HFv1 HellaSwag,83.62,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta_expo,HFv1 MMLU,65.3,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta_expo,HFv1 TruthfulQA,57.16,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta_expo,HFv1 Winogrande,81.37,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta_laser_dpo,HF OpenLLM v1,70.14,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta_laser_dpo,HFv1 ARC,67.41,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta_laser_dpo,HFv1 GSM8K,67.93,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta_laser_dpo,HFv1 HellaSwag,83.38,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta_laser_dpo,HFv1 MMLU,65.29,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta_laser_dpo,HFv1 TruthfulQA,55.47,,hf_open_llm_v1_240829_frozen.csv +starling_lm_7b_beta_laser_dpo,HFv1 Winogrande,81.37,,hf_open_llm_v1_240829_frozen.csv +starlinghermes_2_5_mistral_7b_slerp,HF OpenLLM v1,68.53,,hf_open_llm_v1_240829_frozen.csv +starlinghermes_2_5_mistral_7b_slerp,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv +starlinghermes_2_5_mistral_7b_slerp,HFv1 GSM8K,65.96,,hf_open_llm_v1_240829_frozen.csv +starlinghermes_2_5_mistral_7b_slerp,HFv1 HellaSwag,85.18,,hf_open_llm_v1_240829_frozen.csv +starlinghermes_2_5_mistral_7b_slerp,HFv1 MMLU,64.72,,hf_open_llm_v1_240829_frozen.csv +starlinghermes_2_5_mistral_7b_slerp,HFv1 TruthfulQA,49.56,,hf_open_llm_v1_240829_frozen.csv +starlinghermes_2_5_mistral_7b_slerp,HFv1 Winogrande,79.72,,hf_open_llm_v1_240829_frozen.csv +stealth_v2,HF OpenLLM v1,76.37,,hf_open_llm_v1_240829_frozen.csv +stealth_v2,HFv1 ARC,73.89,,hf_open_llm_v1_240829_frozen.csv +stealth_v2,HFv1 GSM8K,69.67,,hf_open_llm_v1_240829_frozen.csv +stealth_v2,HFv1 HellaSwag,89.26,,hf_open_llm_v1_240829_frozen.csv +stealth_v2,HFv1 MMLU,64.94,,hf_open_llm_v1_240829_frozen.csv +stealth_v2,HFv1 TruthfulQA,72.47,,hf_open_llm_v1_240829_frozen.csv +stealth_v2,HFv1 Winogrande,88.0,,hf_open_llm_v1_240829_frozen.csv +stellarx_4b_v0,HF OpenLLM v1,37.31,,hf_open_llm_v1_240829_frozen.csv +stellarx_4b_v0,HFv1 ARC,36.95,,hf_open_llm_v1_240829_frozen.csv +stellarx_4b_v0,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +stellarx_4b_v0,HFv1 HellaSwag,61.9,,hf_open_llm_v1_240829_frozen.csv +stellarx_4b_v0,HFv1 MMLU,26.85,,hf_open_llm_v1_240829_frozen.csv +stellarx_4b_v0,HFv1 TruthfulQA,34.3,,hf_open_llm_v1_240829_frozen.csv +stellarx_4b_v0,HFv1 Winogrande,63.85,,hf_open_llm_v1_240829_frozen.csv +stellarx_4b_v0_2,HF OpenLLM v1,36.15,,hf_open_llm_v1_240829_frozen.csv +stellarx_4b_v0_2,HFv1 ARC,34.64,,hf_open_llm_v1_240829_frozen.csv +stellarx_4b_v0_2,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +stellarx_4b_v0_2,HFv1 HellaSwag,56.74,,hf_open_llm_v1_240829_frozen.csv +stellarx_4b_v0_2,HFv1 MMLU,25.55,,hf_open_llm_v1_240829_frozen.csv +stellarx_4b_v0_2,HFv1 TruthfulQA,38.55,,hf_open_llm_v1_240829_frozen.csv +stellarx_4b_v0_2,HFv1 Winogrande,61.4,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v1,HF OpenLLM v1,74.2,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v1,HFv1 ARC,70.9,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v1,HFv1 GSM8K,64.14,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v1,HFv1 HellaSwag,88.41,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v1,HFv1 MMLU,66.32,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v1,HFv1 TruthfulQA,71.71,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v1,HFv1 Winogrande,83.74,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v2,HF OpenLLM v1,74.21,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v2,HFv1 ARC,71.08,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v2,HFv1 GSM8K,63.84,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v2,HFv1 HellaSwag,88.6,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v2,HFv1 MMLU,66.23,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v2,HFv1 TruthfulQA,72.01,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v2,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v3,HF OpenLLM v1,74.01,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v3,HFv1 ARC,70.99,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v3,HFv1 GSM8K,63.23,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v3,HFv1 HellaSwag,88.57,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v3,HFv1 MMLU,66.13,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v3,HFv1 TruthfulQA,71.94,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v3,HFv1 Winogrande,83.19,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v4,HF OpenLLM v1,74.29,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v4,HFv1 ARC,71.25,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v4,HFv1 GSM8K,64.44,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v4,HFv1 HellaSwag,88.5,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v4,HFv1 MMLU,66.24,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v4,HFv1 TruthfulQA,71.89,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v4,HFv1 Winogrande,83.43,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v5,HF OpenLLM v1,74.41,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v5,HFv1 ARC,70.99,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v5,HFv1 GSM8K,65.2,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v5,HFv1 HellaSwag,88.48,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v5,HFv1 MMLU,66.34,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v5,HFv1 TruthfulQA,71.84,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v5,HFv1 Winogrande,83.58,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v6,HF OpenLLM v1,74.31,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v6,HFv1 ARC,71.16,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v6,HFv1 GSM8K,64.52,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v6,HFv1 HellaSwag,88.5,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v6,HFv1 MMLU,66.31,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v6,HFv1 TruthfulQA,71.96,,hf_open_llm_v1_240829_frozen.csv +stopcarbon_10_7b_v6,HFv1 Winogrande,83.43,,hf_open_llm_v1_240829_frozen.csv +storytime_13b,HF OpenLLM v1,56.64,,hf_open_llm_v1_240829_frozen.csv +storytime_13b,HFv1 ARC,62.03,,hf_open_llm_v1_240829_frozen.csv +storytime_13b,HFv1 GSM8K,8.34,,hf_open_llm_v1_240829_frozen.csv +storytime_13b,HFv1 HellaSwag,83.96,,hf_open_llm_v1_240829_frozen.csv +storytime_13b,HFv1 MMLU,57.48,,hf_open_llm_v1_240829_frozen.csv +storytime_13b,HFv1 TruthfulQA,52.5,,hf_open_llm_v1_240829_frozen.csv +storytime_13b,HFv1 Winogrande,75.53,,hf_open_llm_v1_240829_frozen.csv +superaligned_jawade,HF OpenLLM v1,70.86,,hf_open_llm_v1_240829_frozen.csv +superaligned_jawade,HFv1 ARC,71.59,,hf_open_llm_v1_240829_frozen.csv +superaligned_jawade,HFv1 GSM8K,49.2,,hf_open_llm_v1_240829_frozen.csv +superaligned_jawade,HFv1 HellaSwag,90.58,,hf_open_llm_v1_240829_frozen.csv +superaligned_jawade,HFv1 MMLU,60.81,,hf_open_llm_v1_240829_frozen.csv +superaligned_jawade,HFv1 TruthfulQA,69.17,,hf_open_llm_v1_240829_frozen.csv +superaligned_jawade,HFv1 Winogrande,83.82,,hf_open_llm_v1_240829_frozen.csv +sus_chat_34b,HF OpenLLM v1,73.22,,hf_open_llm_v1_240829_frozen.csv +sus_chat_34b,HFv1 ARC,66.3,,hf_open_llm_v1_240829_frozen.csv +sus_chat_34b,HFv1 GSM8K,72.18,,hf_open_llm_v1_240829_frozen.csv +sus_chat_34b,HFv1 HellaSwag,83.91,,hf_open_llm_v1_240829_frozen.csv +sus_chat_34b,HFv1 MMLU,76.41,,hf_open_llm_v1_240829_frozen.csv +sus_chat_34b,HFv1 TruthfulQA,57.04,,hf_open_llm_v1_240829_frozen.csv +sus_chat_34b,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv +sydney_overthinker_13b_hf,HF OpenLLM v1,54.94,,hf_open_llm_v1_240829_frozen.csv +sydney_overthinker_13b_hf,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv +sydney_overthinker_13b_hf,HFv1 GSM8K,18.88,,hf_open_llm_v1_240829_frozen.csv +sydney_overthinker_13b_hf,HFv1 HellaSwag,80.85,,hf_open_llm_v1_240829_frozen.csv +sydney_overthinker_13b_hf,HFv1 MMLU,51.28,,hf_open_llm_v1_240829_frozen.csv +sydney_overthinker_13b_hf,HFv1 TruthfulQA,45.7,,hf_open_llm_v1_240829_frozen.csv +sydney_overthinker_13b_hf,HFv1 Winogrande,73.95,,hf_open_llm_v1_240829_frozen.csv +synatra_10_7b_v0_4,HF OpenLLM v1,65.48,,hf_open_llm_v1_240829_frozen.csv +synatra_10_7b_v0_4,HFv1 ARC,64.93,,hf_open_llm_v1_240829_frozen.csv +synatra_10_7b_v0_4,HFv1 GSM8K,50.04,,hf_open_llm_v1_240829_frozen.csv +synatra_10_7b_v0_4,HFv1 HellaSwag,82.47,,hf_open_llm_v1_240829_frozen.csv +synatra_10_7b_v0_4,HFv1 MMLU,62.5,,hf_open_llm_v1_240829_frozen.csv +synatra_10_7b_v0_4,HFv1 TruthfulQA,51.11,,hf_open_llm_v1_240829_frozen.csv +synatra_10_7b_v0_4,HFv1 Winogrande,81.85,,hf_open_llm_v1_240829_frozen.csv +synatra_11b_testbench,HF OpenLLM v1,56.17,,hf_open_llm_v1_240829_frozen.csv +synatra_11b_testbench,HFv1 ARC,57.34,,hf_open_llm_v1_240829_frozen.csv +synatra_11b_testbench,HFv1 GSM8K,17.74,,hf_open_llm_v1_240829_frozen.csv +synatra_11b_testbench,HFv1 HellaSwag,78.66,,hf_open_llm_v1_240829_frozen.csv +synatra_11b_testbench,HFv1 MMLU,55.56,,hf_open_llm_v1_240829_frozen.csv +synatra_11b_testbench,HFv1 TruthfulQA,51.97,,hf_open_llm_v1_240829_frozen.csv +synatra_11b_testbench,HFv1 Winogrande,75.77,,hf_open_llm_v1_240829_frozen.csv +synatra_7b_v0_3_dpo,HF OpenLLM v1,60.55,,hf_open_llm_v1_240829_frozen.csv +synatra_7b_v0_3_dpo,HFv1 ARC,62.8,,hf_open_llm_v1_240829_frozen.csv +synatra_7b_v0_3_dpo,HFv1 GSM8K,23.73,,hf_open_llm_v1_240829_frozen.csv +synatra_7b_v0_3_dpo,HFv1 HellaSwag,82.58,,hf_open_llm_v1_240829_frozen.csv +synatra_7b_v0_3_dpo,HFv1 MMLU,61.46,,hf_open_llm_v1_240829_frozen.csv +synatra_7b_v0_3_dpo,HFv1 TruthfulQA,56.46,,hf_open_llm_v1_240829_frozen.csv +synatra_7b_v0_3_dpo,HFv1 Winogrande,76.24,,hf_open_llm_v1_240829_frozen.csv +synatra_7b_v0_3_rp,HF OpenLLM v1,59.26,,hf_open_llm_v1_240829_frozen.csv +synatra_7b_v0_3_rp,HFv1 ARC,62.2,,hf_open_llm_v1_240829_frozen.csv +synatra_7b_v0_3_rp,HFv1 GSM8K,21.15,,hf_open_llm_v1_240829_frozen.csv +synatra_7b_v0_3_rp,HFv1 HellaSwag,82.29,,hf_open_llm_v1_240829_frozen.csv +synatra_7b_v0_3_rp,HFv1 MMLU,60.8,,hf_open_llm_v1_240829_frozen.csv +synatra_7b_v0_3_rp,HFv1 TruthfulQA,52.64,,hf_open_llm_v1_240829_frozen.csv +synatra_7b_v0_3_rp,HFv1 Winogrande,76.48,,hf_open_llm_v1_240829_frozen.csv +synatra_rp_orca_2_7b_v0_1,HF OpenLLM v1,59.65,,hf_open_llm_v1_240829_frozen.csv +synatra_rp_orca_2_7b_v0_1,HFv1 ARC,57.68,,hf_open_llm_v1_240829_frozen.csv +synatra_rp_orca_2_7b_v0_1,HFv1 GSM8K,39.65,,hf_open_llm_v1_240829_frozen.csv +synatra_rp_orca_2_7b_v0_1,HFv1 HellaSwag,77.37,,hf_open_llm_v1_240829_frozen.csv +synatra_rp_orca_2_7b_v0_1,HFv1 MMLU,56.1,,hf_open_llm_v1_240829_frozen.csv +synatra_rp_orca_2_7b_v0_1,HFv1 TruthfulQA,52.52,,hf_open_llm_v1_240829_frozen.csv +synatra_rp_orca_2_7b_v0_1,HFv1 Winogrande,74.59,,hf_open_llm_v1_240829_frozen.csv +synatra_v0_1_7b_instruct,HF OpenLLM v1,55.86,,hf_open_llm_v1_240829_frozen.csv +synatra_v0_1_7b_instruct,HFv1 ARC,55.29,,hf_open_llm_v1_240829_frozen.csv +synatra_v0_1_7b_instruct,HFv1 GSM8K,19.41,,hf_open_llm_v1_240829_frozen.csv +synatra_v0_1_7b_instruct,HFv1 HellaSwag,76.63,,hf_open_llm_v1_240829_frozen.csv +synatra_v0_1_7b_instruct,HFv1 MMLU,55.29,,hf_open_llm_v1_240829_frozen.csv +synatra_v0_1_7b_instruct,HFv1 TruthfulQA,55.76,,hf_open_llm_v1_240829_frozen.csv +synatra_v0_1_7b_instruct,HFv1 Winogrande,72.77,,hf_open_llm_v1_240829_frozen.csv +systemconfighermes_7b,HF OpenLLM v1,68.47,,hf_open_llm_v1_240829_frozen.csv +systemconfighermes_7b,HFv1 ARC,65.19,,hf_open_llm_v1_240829_frozen.csv +systemconfighermes_7b,HFv1 GSM8K,61.49,,hf_open_llm_v1_240829_frozen.csv +systemconfighermes_7b,HFv1 HellaSwag,84.41,,hf_open_llm_v1_240829_frozen.csv +systemconfighermes_7b,HFv1 MMLU,61.89,,hf_open_llm_v1_240829_frozen.csv +systemconfighermes_7b,HFv1 TruthfulQA,60.11,,hf_open_llm_v1_240829_frozen.csv +systemconfighermes_7b,HFv1 Winogrande,77.74,,hf_open_llm_v1_240829_frozen.csv +systemhermes_2_7b,HF OpenLLM v1,67.92,,hf_open_llm_v1_240829_frozen.csv +systemhermes_2_7b,HFv1 ARC,65.02,,hf_open_llm_v1_240829_frozen.csv +systemhermes_2_7b,HFv1 GSM8K,61.56,,hf_open_llm_v1_240829_frozen.csv +systemhermes_2_7b,HFv1 HellaSwag,84.05,,hf_open_llm_v1_240829_frozen.csv +systemhermes_2_7b,HFv1 MMLU,63.16,,hf_open_llm_v1_240829_frozen.csv +systemhermes_2_7b,HFv1 TruthfulQA,56.42,,hf_open_llm_v1_240829_frozen.csv +systemhermes_2_7b,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv +systemhermes_7b,HF OpenLLM v1,66.86,,hf_open_llm_v1_240829_frozen.csv +systemhermes_7b,HFv1 ARC,64.76,,hf_open_llm_v1_240829_frozen.csv +systemhermes_7b,HFv1 GSM8K,58.83,,hf_open_llm_v1_240829_frozen.csv +systemhermes_7b,HFv1 HellaSwag,83.68,,hf_open_llm_v1_240829_frozen.csv +systemhermes_7b,HFv1 MMLU,63.23,,hf_open_llm_v1_240829_frozen.csv +systemhermes_7b,HFv1 TruthfulQA,52.81,,hf_open_llm_v1_240829_frozen.csv +systemhermes_7b,HFv1 Winogrande,77.82,,hf_open_llm_v1_240829_frozen.csv +taiwan_llm_8x7b_dpo,HF OpenLLM v1,73.09,,hf_open_llm_v1_240829_frozen.csv +taiwan_llm_8x7b_dpo,HFv1 ARC,70.99,,hf_open_llm_v1_240829_frozen.csv +taiwan_llm_8x7b_dpo,HFv1 GSM8K,71.11,,hf_open_llm_v1_240829_frozen.csv +taiwan_llm_8x7b_dpo,HFv1 HellaSwag,87.21,,hf_open_llm_v1_240829_frozen.csv +taiwan_llm_8x7b_dpo,HFv1 MMLU,72.43,,hf_open_llm_v1_240829_frozen.csv +taiwan_llm_8x7b_dpo,HFv1 TruthfulQA,54.87,,hf_open_llm_v1_240829_frozen.csv +taiwan_llm_8x7b_dpo,HFv1 Winogrande,81.93,,hf_open_llm_v1_240829_frozen.csv +taketwo,HF OpenLLM v1,38.6,,hf_open_llm_v1_240829_frozen.csv +taketwo,HFv1 ARC,37.2,,hf_open_llm_v1_240829_frozen.csv +taketwo,HFv1 GSM8K,2.58,,hf_open_llm_v1_240829_frozen.csv +taketwo,HFv1 HellaSwag,62.01,,hf_open_llm_v1_240829_frozen.csv +taketwo,HFv1 MMLU,23.8,,hf_open_llm_v1_240829_frozen.csv +taketwo,HFv1 TruthfulQA,36.02,,hf_open_llm_v1_240829_frozen.csv +taketwo,HFv1 Winogrande,70.01,,hf_open_llm_v1_240829_frozen.csv +taliml_7b_v_1_eng,HF OpenLLM v1,60.54,,hf_open_llm_v1_240829_frozen.csv +taliml_7b_v_1_eng,HFv1 ARC,59.98,,hf_open_llm_v1_240829_frozen.csv +taliml_7b_v_1_eng,HFv1 GSM8K,35.03,,hf_open_llm_v1_240829_frozen.csv +taliml_7b_v_1_eng,HFv1 HellaSwag,83.27,,hf_open_llm_v1_240829_frozen.csv +taliml_7b_v_1_eng,HFv1 MMLU,60.57,,hf_open_llm_v1_240829_frozen.csv +taliml_7b_v_1_eng,HFv1 TruthfulQA,47.13,,hf_open_llm_v1_240829_frozen.csv +taliml_7b_v_1_eng,HFv1 Winogrande,77.27,,hf_open_llm_v1_240829_frozen.csv +tamil_llama_13b_instruct_v0_1,HF OpenLLM v1,51.59,,hf_open_llm_v1_240829_frozen.csv +tamil_llama_13b_instruct_v0_1,HFv1 ARC,54.52,,hf_open_llm_v1_240829_frozen.csv +tamil_llama_13b_instruct_v0_1,HFv1 GSM8K,7.51,,hf_open_llm_v1_240829_frozen.csv +tamil_llama_13b_instruct_v0_1,HFv1 HellaSwag,79.35,,hf_open_llm_v1_240829_frozen.csv +tamil_llama_13b_instruct_v0_1,HFv1 MMLU,50.37,,hf_open_llm_v1_240829_frozen.csv +tamil_llama_13b_instruct_v0_1,HFv1 TruthfulQA,41.22,,hf_open_llm_v1_240829_frozen.csv +tamil_llama_13b_instruct_v0_1,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv +tamil_llama_7b_instruct_v0_2,HF OpenLLM v1,42.39,,hf_open_llm_v1_240829_frozen.csv +tamil_llama_7b_instruct_v0_2,HFv1 ARC,40.44,,hf_open_llm_v1_240829_frozen.csv +tamil_llama_7b_instruct_v0_2,HFv1 GSM8K,5.31,,hf_open_llm_v1_240829_frozen.csv +tamil_llama_7b_instruct_v0_2,HFv1 HellaSwag,68.88,,hf_open_llm_v1_240829_frozen.csv +tamil_llama_7b_instruct_v0_2,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +tamil_llama_7b_instruct_v0_2,HFv1 TruthfulQA,50.11,,hf_open_llm_v1_240829_frozen.csv +tamil_llama_7b_instruct_v0_2,HFv1 Winogrande,66.46,,hf_open_llm_v1_240829_frozen.csv +tau_0_5b_instruct_dpop,HF OpenLLM v1,35.54,,hf_open_llm_v1_240829_frozen.csv +tau_0_5b_instruct_dpop,HFv1 ARC,28.92,,hf_open_llm_v1_240829_frozen.csv +tau_0_5b_instruct_dpop,HFv1 GSM8K,6.97,,hf_open_llm_v1_240829_frozen.csv +tau_0_5b_instruct_dpop,HFv1 HellaSwag,43.63,,hf_open_llm_v1_240829_frozen.csv +tau_0_5b_instruct_dpop,HFv1 MMLU,33.92,,hf_open_llm_v1_240829_frozen.csv +tau_0_5b_instruct_dpop,HFv1 TruthfulQA,42.73,,hf_open_llm_v1_240829_frozen.csv +tau_0_5b_instruct_dpop,HFv1 Winogrande,57.06,,hf_open_llm_v1_240829_frozen.csv +tekniumairoboros_nebula_7b,HF OpenLLM v1,54.74,,hf_open_llm_v1_240829_frozen.csv +tekniumairoboros_nebula_7b,HFv1 ARC,57.17,,hf_open_llm_v1_240829_frozen.csv +tekniumairoboros_nebula_7b,HFv1 GSM8K,9.4,,hf_open_llm_v1_240829_frozen.csv +tekniumairoboros_nebula_7b,HFv1 HellaSwag,81.72,,hf_open_llm_v1_240829_frozen.csv +tekniumairoboros_nebula_7b,HFv1 MMLU,55.25,,hf_open_llm_v1_240829_frozen.csv +tekniumairoboros_nebula_7b,HFv1 TruthfulQA,51.64,,hf_open_llm_v1_240829_frozen.csv +tekniumairoboros_nebula_7b,HFv1 Winogrande,73.24,,hf_open_llm_v1_240829_frozen.csv +telugu_llama2_7b_v0_instruct,HF OpenLLM v1,52.86,,hf_open_llm_v1_240829_frozen.csv +telugu_llama2_7b_v0_instruct,HFv1 ARC,53.58,,hf_open_llm_v1_240829_frozen.csv +telugu_llama2_7b_v0_instruct,HFv1 GSM8K,20.39,,hf_open_llm_v1_240829_frozen.csv +telugu_llama2_7b_v0_instruct,HFv1 HellaSwag,78.33,,hf_open_llm_v1_240829_frozen.csv +telugu_llama2_7b_v0_instruct,HFv1 MMLU,47.63,,hf_open_llm_v1_240829_frozen.csv +telugu_llama2_7b_v0_instruct,HFv1 TruthfulQA,43.26,,hf_open_llm_v1_240829_frozen.csv +telugu_llama2_7b_v0_instruct,HFv1 Winogrande,73.95,,hf_open_llm_v1_240829_frozen.csv +telugu_llama_7b_instruct_v0_1,HF OpenLLM v1,39.71,,hf_open_llm_v1_240829_frozen.csv +telugu_llama_7b_instruct_v0_1,HFv1 ARC,36.95,,hf_open_llm_v1_240829_frozen.csv +telugu_llama_7b_instruct_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +telugu_llama_7b_instruct_v0_1,HFv1 HellaSwag,67.88,,hf_open_llm_v1_240829_frozen.csv +telugu_llama_7b_instruct_v0_1,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +telugu_llama_7b_instruct_v0_1,HFv1 TruthfulQA,48.97,,hf_open_llm_v1_240829_frozen.csv +telugu_llama_7b_instruct_v0_1,HFv1 Winogrande,61.33,,hf_open_llm_v1_240829_frozen.csv +test1_slide,HF OpenLLM v1,65.31,,hf_open_llm_v1_240829_frozen.csv +test1_slide,HFv1 ARC,57.42,,hf_open_llm_v1_240829_frozen.csv +test1_slide,HFv1 GSM8K,61.18,,hf_open_llm_v1_240829_frozen.csv +test1_slide,HFv1 HellaSwag,78.73,,hf_open_llm_v1_240829_frozen.csv +test1_slide,HFv1 MMLU,63.8,,hf_open_llm_v1_240829_frozen.csv +test1_slide,HFv1 TruthfulQA,55.48,,hf_open_llm_v1_240829_frozen.csv +test1_slide,HFv1 Winogrande,75.22,,hf_open_llm_v1_240829_frozen.csv +test3_sft_16bit_dpo2,HF OpenLLM v1,74.98,,hf_open_llm_v1_240829_frozen.csv +test3_sft_16bit_dpo2,HFv1 ARC,73.63,,hf_open_llm_v1_240829_frozen.csv +test3_sft_16bit_dpo2,HFv1 GSM8K,67.48,,hf_open_llm_v1_240829_frozen.csv +test3_sft_16bit_dpo2,HFv1 HellaSwag,89.03,,hf_open_llm_v1_240829_frozen.csv +test3_sft_16bit_dpo2,HFv1 MMLU,64.63,,hf_open_llm_v1_240829_frozen.csv +test3_sft_16bit_dpo2,HFv1 TruthfulQA,70.71,,hf_open_llm_v1_240829_frozen.csv +test3_sft_16bit_dpo2,HFv1 Winogrande,84.37,,hf_open_llm_v1_240829_frozen.csv +test_22b,HF OpenLLM v1,37.71,,hf_open_llm_v1_240829_frozen.csv +test_22b,HFv1 ARC,39.42,,hf_open_llm_v1_240829_frozen.csv +test_22b,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv +test_22b,HFv1 HellaSwag,64.51,,hf_open_llm_v1_240829_frozen.csv +test_22b,HFv1 MMLU,27.13,,hf_open_llm_v1_240829_frozen.csv +test_22b,HFv1 TruthfulQA,37.13,,hf_open_llm_v1_240829_frozen.csv +test_22b,HFv1 Winogrande,57.7,,hf_open_llm_v1_240829_frozen.csv +test_model,HF OpenLLM v1,29.31,,hf_open_llm_v1_240829_frozen.csv +test_model,HFv1 ARC,24.4,,hf_open_llm_v1_240829_frozen.csv +test_model,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +test_model,HFv1 HellaSwag,30.17,,hf_open_llm_v1_240829_frozen.csv +test_model,HFv1 MMLU,25.88,,hf_open_llm_v1_240829_frozen.csv +test_model,HFv1 TruthfulQA,44.59,,hf_open_llm_v1_240829_frozen.csv +test_model,HFv1 Winogrande,50.83,,hf_open_llm_v1_240829_frozen.csv +test_qwen1_5_0_5b,HF OpenLLM v1,35.78,,hf_open_llm_v1_240829_frozen.csv +test_qwen1_5_0_5b,HFv1 ARC,31.14,,hf_open_llm_v1_240829_frozen.csv +test_qwen1_5_0_5b,HFv1 GSM8K,7.58,,hf_open_llm_v1_240829_frozen.csv +test_qwen1_5_0_5b,HFv1 HellaSwag,44.12,,hf_open_llm_v1_240829_frozen.csv +test_qwen1_5_0_5b,HFv1 MMLU,33.69,,hf_open_llm_v1_240829_frozen.csv +test_qwen1_5_0_5b,HFv1 TruthfulQA,42.9,,hf_open_llm_v1_240829_frozen.csv +test_qwen1_5_0_5b,HFv1 Winogrande,55.25,,hf_open_llm_v1_240829_frozen.csv +thetawave_14b_v0_1,HF OpenLLM v1,44.54,,hf_open_llm_v1_240829_frozen.csv +thetawave_14b_v0_1,HFv1 ARC,42.83,,hf_open_llm_v1_240829_frozen.csv +thetawave_14b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +thetawave_14b_v0_1,HFv1 HellaSwag,47.09,,hf_open_llm_v1_240829_frozen.csv +thetawave_14b_v0_1,HFv1 MMLU,61.45,,hf_open_llm_v1_240829_frozen.csv +thetawave_14b_v0_1,HFv1 TruthfulQA,50.41,,hf_open_llm_v1_240829_frozen.csv +thetawave_14b_v0_1,HFv1 Winogrande,65.43,,hf_open_llm_v1_240829_frozen.csv +thetawave_28b_v0_1,HF OpenLLM v1,40.4,,hf_open_llm_v1_240829_frozen.csv +thetawave_28b_v0_1,HFv1 ARC,36.6,,hf_open_llm_v1_240829_frozen.csv +thetawave_28b_v0_1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +thetawave_28b_v0_1,HFv1 HellaSwag,35.54,,hf_open_llm_v1_240829_frozen.csv +thetawave_28b_v0_1,HFv1 MMLU,54.5,,hf_open_llm_v1_240829_frozen.csv +thetawave_28b_v0_1,HFv1 TruthfulQA,49.86,,hf_open_llm_v1_240829_frozen.csv +thetawave_28b_v0_1,HFv1 Winogrande,65.9,,hf_open_llm_v1_240829_frozen.csv +thetawave_7b,HF OpenLLM v1,69.35,,hf_open_llm_v1_240829_frozen.csv +thetawave_7b,HFv1 ARC,67.49,,hf_open_llm_v1_240829_frozen.csv +thetawave_7b,HFv1 GSM8K,56.1,,hf_open_llm_v1_240829_frozen.csv +thetawave_7b,HFv1 HellaSwag,86.01,,hf_open_llm_v1_240829_frozen.csv +thetawave_7b,HFv1 MMLU,62.26,,hf_open_llm_v1_240829_frozen.csv +thetawave_7b,HFv1 TruthfulQA,65.26,,hf_open_llm_v1_240829_frozen.csv +thetawave_7b,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv +thetawave_7b_v0_1,HF OpenLLM v1,70.49,,hf_open_llm_v1_240829_frozen.csv +thetawave_7b_v0_1,HFv1 ARC,68.09,,hf_open_llm_v1_240829_frozen.csv +thetawave_7b_v0_1,HFv1 GSM8K,55.65,,hf_open_llm_v1_240829_frozen.csv +thetawave_7b_v0_1,HFv1 HellaSwag,86.33,,hf_open_llm_v1_240829_frozen.csv +thetawave_7b_v0_1,HFv1 MMLU,62.11,,hf_open_llm_v1_240829_frozen.csv +thetawave_7b_v0_1,HFv1 TruthfulQA,71.68,,hf_open_llm_v1_240829_frozen.csv +thetawave_7b_v0_1,HFv1 Winogrande,79.08,,hf_open_llm_v1_240829_frozen.csv +tiamat_8b_1_2_llama3_dpo,HF OpenLLM v1,68.77,,hf_open_llm_v1_240829_frozen.csv +tiamat_8b_1_2_llama3_dpo,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv +tiamat_8b_1_2_llama3_dpo,HFv1 GSM8K,68.84,,hf_open_llm_v1_240829_frozen.csv +tiamat_8b_1_2_llama3_dpo,HFv1 HellaSwag,83.45,,hf_open_llm_v1_240829_frozen.csv +tiamat_8b_1_2_llama3_dpo,HFv1 MMLU,65.19,,hf_open_llm_v1_240829_frozen.csv +tiamat_8b_1_2_llama3_dpo,HFv1 TruthfulQA,56.08,,hf_open_llm_v1_240829_frozen.csv +tiamat_8b_1_2_llama3_dpo,HFv1 Winogrande,76.8,,hf_open_llm_v1_240829_frozen.csv +tigerbot_13b_base,HF OpenLLM v1,53.42,,hf_open_llm_v1_240829_frozen.csv +tigerbot_13b_base,HFv1 ARC,53.84,,hf_open_llm_v1_240829_frozen.csv +tigerbot_13b_base,HFv1 GSM8K,17.06,,hf_open_llm_v1_240829_frozen.csv +tigerbot_13b_base,HFv1 HellaSwag,77.05,,hf_open_llm_v1_240829_frozen.csv +tigerbot_13b_base,HFv1 MMLU,53.57,,hf_open_llm_v1_240829_frozen.csv +tigerbot_13b_base,HFv1 TruthfulQA,44.06,,hf_open_llm_v1_240829_frozen.csv +tigerbot_13b_base,HFv1 Winogrande,74.98,,hf_open_llm_v1_240829_frozen.csv +tigerbot_70b_base,HF OpenLLM v1,63.71,,hf_open_llm_v1_240829_frozen.csv +tigerbot_70b_base,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv +tigerbot_70b_base,HFv1 GSM8K,37.76,,hf_open_llm_v1_240829_frozen.csv +tigerbot_70b_base,HFv1 HellaSwag,83.61,,hf_open_llm_v1_240829_frozen.csv +tigerbot_70b_base,HFv1 MMLU,65.49,,hf_open_llm_v1_240829_frozen.csv +tigerbot_70b_base,HFv1 TruthfulQA,52.76,,hf_open_llm_v1_240829_frozen.csv +tigerbot_70b_base,HFv1 Winogrande,80.19,,hf_open_llm_v1_240829_frozen.csv +tiny_llama3_7b,HF OpenLLM v1,35.6,,hf_open_llm_v1_240829_frozen.csv +tiny_llama3_7b,HFv1 ARC,34.64,,hf_open_llm_v1_240829_frozen.csv +tiny_llama3_7b,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv +tiny_llama3_7b,HFv1 HellaSwag,56.39,,hf_open_llm_v1_240829_frozen.csv +tiny_llama3_7b,HFv1 MMLU,24.51,,hf_open_llm_v1_240829_frozen.csv +tiny_llama3_7b,HFv1 TruthfulQA,38.03,,hf_open_llm_v1_240829_frozen.csv +tiny_llama3_7b,HFv1 Winogrande,59.67,,hf_open_llm_v1_240829_frozen.csv +tiny_starcoder_py,HF OpenLLM v1,29.41,,hf_open_llm_v1_240829_frozen.csv +tiny_starcoder_py,HFv1 ARC,20.99,,hf_open_llm_v1_240829_frozen.csv +tiny_starcoder_py,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv +tiny_starcoder_py,HFv1 HellaSwag,28.77,,hf_open_llm_v1_240829_frozen.csv +tiny_starcoder_py,HFv1 MMLU,26.79,,hf_open_llm_v1_240829_frozen.csv +tiny_starcoder_py,HFv1 TruthfulQA,47.68,,hf_open_llm_v1_240829_frozen.csv +tiny_starcoder_py,HFv1 Winogrande,51.22,,hf_open_llm_v1_240829_frozen.csv +tiny_vicuna_1b,HF OpenLLM v1,34.76,,hf_open_llm_v1_240829_frozen.csv +tiny_vicuna_1b,HFv1 ARC,33.45,,hf_open_llm_v1_240829_frozen.csv +tiny_vicuna_1b,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv +tiny_vicuna_1b,HFv1 HellaSwag,55.92,,hf_open_llm_v1_240829_frozen.csv +tiny_vicuna_1b,HFv1 MMLU,25.45,,hf_open_llm_v1_240829_frozen.csv +tiny_vicuna_1b,HFv1 TruthfulQA,33.82,,hf_open_llm_v1_240829_frozen.csv +tiny_vicuna_1b,HFv1 Winogrande,58.41,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_1_1_1b,HF OpenLLM v1,36.21,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_1_1_1b,HFv1 ARC,34.98,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_1_1_1b,HFv1 GSM8K,0.68,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_1_1_1b,HFv1 HellaSwag,60.11,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_1_1_1b,HFv1 MMLU,25.31,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_1_1_1b,HFv1 TruthfulQA,35.51,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_1_1_1b,HFv1 Winogrande,60.69,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_1_1b,HF OpenLLM v1,36.34,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_1_1b,HFv1 ARC,34.3,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_1_1b,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_1_1b,HFv1 HellaSwag,59.44,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_1_1b,HFv1 MMLU,25.59,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_1_1b,HFv1 TruthfulQA,36.51,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_1_1b,HFv1 Winogrande,60.69,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_2_1_1b_laser,HF OpenLLM v1,35.93,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_2_1_1b_laser,HFv1 ARC,33.36,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_2_1_1b_laser,HFv1 GSM8K,1.29,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_2_1_1b_laser,HFv1 HellaSwag,58.53,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_2_1_1b_laser,HFv1 MMLU,25.93,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_2_1_1b_laser,HFv1 TruthfulQA,36.33,,hf_open_llm_v1_240829_frozen.csv +tinydolphin_2_8_2_1_1b_laser,HFv1 Winogrande,60.14,,hf_open_llm_v1_240829_frozen.csv +tinyllama,HF OpenLLM v1,35.8,,hf_open_llm_v1_240829_frozen.csv +tinyllama,HFv1 ARC,34.98,,hf_open_llm_v1_240829_frozen.csv +tinyllama,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv +tinyllama,HFv1 HellaSwag,58.24,,hf_open_llm_v1_240829_frozen.csv +tinyllama,HFv1 MMLU,26.49,,hf_open_llm_v1_240829_frozen.csv +tinyllama,HFv1 TruthfulQA,35.62,,hf_open_llm_v1_240829_frozen.csv +tinyllama,HFv1 Winogrande,58.48,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_1_5t_openorca_alpha,HF OpenLLM v1,35.39,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_1_5t_openorca_alpha,HFv1 ARC,32.76,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_1_5t_openorca_alpha,HFv1 GSM8K,0.61,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_1_5t_openorca_alpha,HFv1 HellaSwag,53.77,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_1_5t_openorca_alpha,HFv1 MMLU,25.73,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_1_5t_openorca_alpha,HFv1 TruthfulQA,40.52,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_1_5t_openorca_alpha,HFv1 Winogrande,58.96,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_1t_openorca,HF OpenLLM v1,34.58,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_1t_openorca,HFv1 ARC,31.31,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_1t_openorca,HFv1 GSM8K,1.67,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_1t_openorca,HFv1 HellaSwag,52.34,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_1t_openorca,HFv1 MMLU,25.31,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_1t_openorca,HFv1 TruthfulQA,38.58,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_1t_openorca,HFv1 Winogrande,58.25,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_chat_v0_3_platypus,HF OpenLLM v1,34.5,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_chat_v0_3_platypus,HFv1 ARC,30.29,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_chat_v0_3_platypus,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_chat_v0_3_platypus,HFv1 HellaSwag,55.12,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_chat_v0_3_platypus,HFv1 MMLU,26.13,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_chat_v0_3_platypus,HFv1 TruthfulQA,39.15,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_chat_v0_3_platypus,HFv1 Winogrande,55.8,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_chat_v1_0_intel_dpo,HF OpenLLM v1,37.09,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_chat_v1_0_intel_dpo,HFv1 ARC,35.84,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_chat_v1_0_intel_dpo,HFv1 GSM8K,1.97,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_chat_v1_0_intel_dpo,HFv1 HellaSwag,61.29,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_chat_v1_0_intel_dpo,HFv1 MMLU,25.05,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_chat_v1_0_intel_dpo,HFv1 TruthfulQA,37.38,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_chat_v1_0_intel_dpo,HFv1 Winogrande,61.01,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_1431k_3t,HF OpenLLM v1,36.42,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_1431k_3t,HFv1 ARC,33.87,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_1431k_3t,HFv1 GSM8K,1.44,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_1431k_3t,HFv1 HellaSwag,60.31,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_1431k_3t,HFv1 MMLU,26.04,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_1431k_3t,HFv1 TruthfulQA,37.32,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_1431k_3t,HFv1 Winogrande,59.51,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_1431k_3t_laser_dpo,HF OpenLLM v1,36.46,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_1431k_3t_laser_dpo,HFv1 ARC,33.02,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_1431k_3t_laser_dpo,HFv1 GSM8K,1.21,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_1431k_3t_laser_dpo,HFv1 HellaSwag,60.0,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_1431k_3t_laser_dpo,HFv1 MMLU,26.88,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_1431k_3t_laser_dpo,HFv1 TruthfulQA,38.08,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_1431k_3t_laser_dpo,HFv1 Winogrande,59.59,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_240k_503b,HF OpenLLM v1,33.72,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_240k_503b,HFv1 ARC,29.27,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_240k_503b,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_240k_503b,HFv1 HellaSwag,49.71,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_240k_503b,HFv1 MMLU,26.26,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_240k_503b,HFv1 TruthfulQA,40.17,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_240k_503b,HFv1 Winogrande,56.59,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_480k_1t,HF OpenLLM v1,34.37,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_480k_1t,HFv1 ARC,30.89,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_480k_1t,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_480k_1t,HFv1 HellaSwag,52.97,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_480k_1t,HFv1 MMLU,25.0,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_480k_1t,HFv1 TruthfulQA,39.55,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_intermediate_step_480k_1t,HFv1 Winogrande,57.3,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_layla_v4,HF OpenLLM v1,37.37,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_layla_v4,HFv1 ARC,34.81,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_layla_v4,HFv1 GSM8K,2.2,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_layla_v4,HFv1 HellaSwag,61.25,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_layla_v4,HFv1 MMLU,25.53,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_layla_v4,HFv1 TruthfulQA,38.97,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_layla_v4,HFv1 Winogrande,61.48,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_orca_v1_0,HF OpenLLM v1,37.17,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_orca_v1_0,HFv1 ARC,36.35,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_orca_v1_0,HFv1 GSM8K,2.27,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_orca_v1_0,HFv1 HellaSwag,61.23,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_orca_v1_0,HFv1 MMLU,25.18,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_orca_v1_0,HFv1 TruthfulQA,36.58,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_orca_v1_0,HFv1 Winogrande,61.4,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_slimorca_function_calling_3t,HF OpenLLM v1,37.38,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_slimorca_function_calling_3t,HFv1 ARC,36.09,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_slimorca_function_calling_3t,HFv1 GSM8K,4.47,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_slimorca_function_calling_3t,HFv1 HellaSwag,59.66,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_slimorca_function_calling_3t,HFv1 MMLU,28.21,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_slimorca_function_calling_3t,HFv1 TruthfulQA,36.74,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_slimorca_function_calling_3t,HFv1 Winogrande,59.12,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_step_50k_105b,HF OpenLLM v1,31.86,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_step_50k_105b,HFv1 ARC,25.85,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_step_50k_105b,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_step_50k_105b,HFv1 HellaSwag,44.1,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_step_50k_105b,HFv1 MMLU,26.78,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_step_50k_105b,HFv1 TruthfulQA,39.51,,hf_open_llm_v1_240829_frozen.csv +tinyllama_1_1b_step_50k_105b,HFv1 Winogrande,54.38,,hf_open_llm_v1_240829_frozen.csv +tinyllama_chat_sft,HF OpenLLM v1,37.21,,hf_open_llm_v1_240829_frozen.csv +tinyllama_chat_sft,HFv1 ARC,34.47,,hf_open_llm_v1_240829_frozen.csv +tinyllama_chat_sft,HFv1 GSM8K,1.44,,hf_open_llm_v1_240829_frozen.csv +tinyllama_chat_sft,HFv1 HellaSwag,61.03,,hf_open_llm_v1_240829_frozen.csv +tinyllama_chat_sft,HFv1 MMLU,25.77,,hf_open_llm_v1_240829_frozen.csv +tinyllama_chat_sft,HFv1 TruthfulQA,39.29,,hf_open_llm_v1_240829_frozen.csv +tinyllama_chat_sft,HFv1 Winogrande,61.25,,hf_open_llm_v1_240829_frozen.csv +tinyllama_frankenmerge,HF OpenLLM v1,34.64,,hf_open_llm_v1_240829_frozen.csv +tinyllama_frankenmerge,HFv1 ARC,30.2,,hf_open_llm_v1_240829_frozen.csv +tinyllama_frankenmerge,HFv1 GSM8K,1.59,,hf_open_llm_v1_240829_frozen.csv +tinyllama_frankenmerge,HFv1 HellaSwag,51.01,,hf_open_llm_v1_240829_frozen.csv +tinyllama_frankenmerge,HFv1 MMLU,26.11,,hf_open_llm_v1_240829_frozen.csv +tinyllama_frankenmerge,HFv1 TruthfulQA,40.18,,hf_open_llm_v1_240829_frozen.csv +tinyllama_frankenmerge,HFv1 Winogrande,58.72,,hf_open_llm_v1_240829_frozen.csv +tinyllama_moe_chat,HF OpenLLM v1,37.81,,hf_open_llm_v1_240829_frozen.csv +tinyllama_moe_chat,HFv1 ARC,34.73,,hf_open_llm_v1_240829_frozen.csv +tinyllama_moe_chat,HFv1 GSM8K,1.21,,hf_open_llm_v1_240829_frozen.csv +tinyllama_moe_chat,HFv1 HellaSwag,59.29,,hf_open_llm_v1_240829_frozen.csv +tinyllama_moe_chat,HFv1 MMLU,29.9,,hf_open_llm_v1_240829_frozen.csv +tinyllama_moe_chat,HFv1 TruthfulQA,39.37,,hf_open_llm_v1_240829_frozen.csv +tinyllama_moe_chat,HFv1 Winogrande,62.51,,hf_open_llm_v1_240829_frozen.csv +tinyllama_moe_chat_0_1,HF OpenLLM v1,36.7,,hf_open_llm_v1_240829_frozen.csv +tinyllama_moe_chat_0_1,HFv1 ARC,34.39,,hf_open_llm_v1_240829_frozen.csv +tinyllama_moe_chat_0_1,HFv1 GSM8K,2.27,,hf_open_llm_v1_240829_frozen.csv +tinyllama_moe_chat_0_1,HFv1 HellaSwag,56.72,,hf_open_llm_v1_240829_frozen.csv +tinyllama_moe_chat_0_1,HFv1 MMLU,29.36,,hf_open_llm_v1_240829_frozen.csv +tinyllama_moe_chat_0_1,HFv1 TruthfulQA,37.82,,hf_open_llm_v1_240829_frozen.csv +tinyllama_moe_chat_0_1,HFv1 Winogrande,59.67,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m,HF OpenLLM v1,27.73,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m,HFv1 ARC,22.87,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m,HFv1 HellaSwag,28.02,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m,HFv1 MMLU,23.15,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m,HFv1 TruthfulQA,42.52,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m,HFv1 Winogrande,49.8,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_chat_v1,HF OpenLLM v1,27.01,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_chat_v1,HFv1 ARC,21.59,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_chat_v1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_chat_v1,HFv1 HellaSwag,27.45,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_chat_v1,HFv1 MMLU,23.08,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_chat_v1,HFv1 TruthfulQA,40.91,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_chat_v1,HFv1 Winogrande,49.01,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_chat_v2,HF OpenLLM v1,27.42,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_chat_v2,HFv1 ARC,23.29,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_chat_v2,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_chat_v2,HFv1 HellaSwag,27.39,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_chat_v2,HFv1 MMLU,23.52,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_chat_v2,HFv1 TruthfulQA,41.32,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_chat_v2,HFv1 Winogrande,49.01,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_instruct,HF OpenLLM v1,28.19,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_instruct,HFv1 ARC,24.32,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_instruct,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_instruct,HFv1 HellaSwag,27.52,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_instruct,HFv1 MMLU,25.18,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_instruct,HFv1 TruthfulQA,41.94,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_instruct,HFv1 Winogrande,50.2,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_sft_v3,HF OpenLLM v1,28.78,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_sft_v3,HFv1 ARC,25.68,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_sft_v3,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_sft_v3,HFv1 HellaSwag,25.31,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_sft_v3,HFv1 MMLU,24.41,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_sft_v3,HFv1 TruthfulQA,48.87,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_sft_v3,HFv1 Winogrande,48.38,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_sft_v4,HF OpenLLM v1,28.2,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_sft_v4,HFv1 ARC,24.91,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_sft_v4,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_sft_v4,HFv1 HellaSwag,28.15,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_sft_v4,HFv1 MMLU,26.04,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_sft_v4,HFv1 TruthfulQA,39.56,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_sft_v4,HFv1 Winogrande,50.51,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2,HF OpenLLM v1,28.78,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2,HFv1 ARC,21.25,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2,HFv1 HellaSwag,26.56,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2,HFv1 MMLU,23.39,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2,HFv1 TruthfulQA,49.6,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2,HFv1 Winogrande,51.85,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2_5,HF OpenLLM v1,28.29,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2_5,HFv1 ARC,24.57,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2_5,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2_5,HFv1 HellaSwag,27.49,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2_5,HFv1 MMLU,23.15,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2_5,HFv1 TruthfulQA,46.72,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2_5,HFv1 Winogrande,47.83,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2_5_instruct,HF OpenLLM v1,27.7,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2_5_instruct,HFv1 ARC,22.27,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2_5_instruct,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2_5_instruct,HFv1 HellaSwag,27.6,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2_5_instruct,HFv1 MMLU,23.9,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2_5_instruct,HFv1 TruthfulQA,44.21,,hf_open_llm_v1_240829_frozen.csv +tinymistral_248m_v2_5_instruct,HFv1 Winogrande,48.22,,hf_open_llm_v1_240829_frozen.csv +tinymistral_6x248m_instruct,HF OpenLLM v1,27.89,,hf_open_llm_v1_240829_frozen.csv +tinymistral_6x248m_instruct,HFv1 ARC,22.44,,hf_open_llm_v1_240829_frozen.csv +tinymistral_6x248m_instruct,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +tinymistral_6x248m_instruct,HFv1 HellaSwag,27.02,,hf_open_llm_v1_240829_frozen.csv +tinymistral_6x248m_instruct,HFv1 MMLU,24.13,,hf_open_llm_v1_240829_frozen.csv +tinymistral_6x248m_instruct,HFv1 TruthfulQA,43.16,,hf_open_llm_v1_240829_frozen.csv +tinymistral_6x248m_instruct,HFv1 Winogrande,50.59,,hf_open_llm_v1_240829_frozen.csv +tinymistral_v2_5_minipile_guidelines_e1,HF OpenLLM v1,29.16,,hf_open_llm_v1_240829_frozen.csv +tinymistral_v2_5_minipile_guidelines_e1,HFv1 ARC,26.54,,hf_open_llm_v1_240829_frozen.csv +tinymistral_v2_5_minipile_guidelines_e1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +tinymistral_v2_5_minipile_guidelines_e1,HFv1 HellaSwag,25.68,,hf_open_llm_v1_240829_frozen.csv +tinymistral_v2_5_minipile_guidelines_e1,HFv1 MMLU,23.53,,hf_open_llm_v1_240829_frozen.csv +tinymistral_v2_5_minipile_guidelines_e1,HFv1 TruthfulQA,49.9,,hf_open_llm_v1_240829_frozen.csv +tinymistral_v2_5_minipile_guidelines_e1,HFv1 Winogrande,49.41,,hf_open_llm_v1_240829_frozen.csv +tinymistral_v2_test1,HF OpenLLM v1,28.42,,hf_open_llm_v1_240829_frozen.csv +tinymistral_v2_test1,HFv1 ARC,21.5,,hf_open_llm_v1_240829_frozen.csv +tinymistral_v2_test1,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +tinymistral_v2_test1,HFv1 HellaSwag,26.79,,hf_open_llm_v1_240829_frozen.csv +tinymistral_v2_test1,HFv1 MMLU,23.36,,hf_open_llm_v1_240829_frozen.csv +tinymistral_v2_test1,HFv1 TruthfulQA,50.3,,hf_open_llm_v1_240829_frozen.csv +tinymistral_v2_test1,HFv1 Winogrande,48.54,,hf_open_llm_v1_240829_frozen.csv +tinymix,HF OpenLLM v1,35.91,,hf_open_llm_v1_240829_frozen.csv +tinymix,HFv1 ARC,32.0,,hf_open_llm_v1_240829_frozen.csv +tinymix,HFv1 GSM8K,1.97,,hf_open_llm_v1_240829_frozen.csv +tinymix,HFv1 HellaSwag,53.69,,hf_open_llm_v1_240829_frozen.csv +tinymix,HFv1 MMLU,24.27,,hf_open_llm_v1_240829_frozen.csv +tinymix,HFv1 TruthfulQA,39.42,,hf_open_llm_v1_240829_frozen.csv +tinymix,HFv1 Winogrande,64.09,,hf_open_llm_v1_240829_frozen.csv +tinynaughtyllama_v1_0,HF OpenLLM v1,37.03,,hf_open_llm_v1_240829_frozen.csv +tinynaughtyllama_v1_0,HFv1 ARC,35.92,,hf_open_llm_v1_240829_frozen.csv +tinynaughtyllama_v1_0,HFv1 GSM8K,2.43,,hf_open_llm_v1_240829_frozen.csv +tinynaughtyllama_v1_0,HFv1 HellaSwag,61.04,,hf_open_llm_v1_240829_frozen.csv +tinynaughtyllama_v1_0,HFv1 MMLU,25.82,,hf_open_llm_v1_240829_frozen.csv +tinynaughtyllama_v1_0,HFv1 TruthfulQA,36.77,,hf_open_llm_v1_240829_frozen.csv +tinynaughtyllama_v1_0,HFv1 Winogrande,60.22,,hf_open_llm_v1_240829_frozen.csv +tinyopenhermes_1_1b_4k,HF OpenLLM v1,35.98,,hf_open_llm_v1_240829_frozen.csv +tinyopenhermes_1_1b_4k,HFv1 ARC,33.62,,hf_open_llm_v1_240829_frozen.csv +tinyopenhermes_1_1b_4k,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv +tinyopenhermes_1_1b_4k,HFv1 HellaSwag,58.53,,hf_open_llm_v1_240829_frozen.csv +tinyopenhermes_1_1b_4k,HFv1 MMLU,26.45,,hf_open_llm_v1_240829_frozen.csv +tinyopenhermes_1_1b_4k,HFv1 TruthfulQA,37.33,,hf_open_llm_v1_240829_frozen.csv +tinyopenhermes_1_1b_4k,HFv1 Winogrande,59.91,,hf_open_llm_v1_240829_frozen.csv +tinystories_1m,HF OpenLLM v1,29.14,,hf_open_llm_v1_240829_frozen.csv +tinystories_1m,HFv1 ARC,23.46,,hf_open_llm_v1_240829_frozen.csv +tinystories_1m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +tinystories_1m,HFv1 HellaSwag,25.23,,hf_open_llm_v1_240829_frozen.csv +tinystories_1m,HFv1 MMLU,24.57,,hf_open_llm_v1_240829_frozen.csv +tinystories_1m,HFv1 TruthfulQA,49.4,,hf_open_llm_v1_240829_frozen.csv +tinystories_1m,HFv1 Winogrande,52.17,,hf_open_llm_v1_240829_frozen.csv +tinystories_28m,HF OpenLLM v1,28.44,,hf_open_llm_v1_240829_frozen.csv +tinystories_28m,HFv1 ARC,22.78,,hf_open_llm_v1_240829_frozen.csv +tinystories_28m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +tinystories_28m,HFv1 HellaSwag,25.83,,hf_open_llm_v1_240829_frozen.csv +tinystories_28m,HFv1 MMLU,23.53,,hf_open_llm_v1_240829_frozen.csv +tinystories_28m,HFv1 TruthfulQA,48.08,,hf_open_llm_v1_240829_frozen.csv +tinystories_28m,HFv1 Winogrande,50.43,,hf_open_llm_v1_240829_frozen.csv +tinystories_33m,HF OpenLLM v1,28.41,,hf_open_llm_v1_240829_frozen.csv +tinystories_33m,HFv1 ARC,24.23,,hf_open_llm_v1_240829_frozen.csv +tinystories_33m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +tinystories_33m,HFv1 HellaSwag,25.69,,hf_open_llm_v1_240829_frozen.csv +tinystories_33m,HFv1 MMLU,23.82,,hf_open_llm_v1_240829_frozen.csv +tinystories_33m,HFv1 TruthfulQA,47.64,,hf_open_llm_v1_240829_frozen.csv +tinystories_33m,HFv1 Winogrande,49.09,,hf_open_llm_v1_240829_frozen.csv +tinystories_3m,HF OpenLLM v1,28.19,,hf_open_llm_v1_240829_frozen.csv +tinystories_3m,HFv1 ARC,22.01,,hf_open_llm_v1_240829_frozen.csv +tinystories_3m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +tinystories_3m,HFv1 HellaSwag,25.58,,hf_open_llm_v1_240829_frozen.csv +tinystories_3m,HFv1 MMLU,24.99,,hf_open_llm_v1_240829_frozen.csv +tinystories_3m,HFv1 TruthfulQA,47.33,,hf_open_llm_v1_240829_frozen.csv +tinystories_3m,HFv1 Winogrande,49.25,,hf_open_llm_v1_240829_frozen.csv +tinystories_8m,HF OpenLLM v1,28.31,,hf_open_llm_v1_240829_frozen.csv +tinystories_8m,HFv1 ARC,24.66,,hf_open_llm_v1_240829_frozen.csv +tinystories_8m,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +tinystories_8m,HFv1 HellaSwag,25.03,,hf_open_llm_v1_240829_frozen.csv +tinystories_8m,HFv1 MMLU,23.33,,hf_open_llm_v1_240829_frozen.csv +tinystories_8m,HFv1 TruthfulQA,46.54,,hf_open_llm_v1_240829_frozen.csv +tinystories_8m,HFv1 Winogrande,50.28,,hf_open_llm_v1_240829_frozen.csv +tinyultra_4x1_1b_base_alpha,HF OpenLLM v1,37.94,,hf_open_llm_v1_240829_frozen.csv +tinyultra_4x1_1b_base_alpha,HFv1 ARC,34.9,,hf_open_llm_v1_240829_frozen.csv +tinyultra_4x1_1b_base_alpha,HFv1 GSM8K,2.58,,hf_open_llm_v1_240829_frozen.csv +tinyultra_4x1_1b_base_alpha,HFv1 HellaSwag,61.42,,hf_open_llm_v1_240829_frozen.csv +tinyultra_4x1_1b_base_alpha,HFv1 MMLU,25.42,,hf_open_llm_v1_240829_frozen.csv +tinyultra_4x1_1b_base_alpha,HFv1 TruthfulQA,37.59,,hf_open_llm_v1_240829_frozen.csv +tinyultra_4x1_1b_base_alpha,HFv1 Winogrande,65.75,,hf_open_llm_v1_240829_frozen.csv +tinywand_dpo,HF OpenLLM v1,35.13,,hf_open_llm_v1_240829_frozen.csv +tinywand_dpo,HFv1 ARC,31.66,,hf_open_llm_v1_240829_frozen.csv +tinywand_dpo,HFv1 GSM8K,1.9,,hf_open_llm_v1_240829_frozen.csv +tinywand_dpo,HFv1 HellaSwag,50.42,,hf_open_llm_v1_240829_frozen.csv +tinywand_dpo,HFv1 MMLU,26.22,,hf_open_llm_v1_240829_frozen.csv +tinywand_dpo,HFv1 TruthfulQA,45.8,,hf_open_llm_v1_240829_frozen.csv +tinywand_dpo,HFv1 Winogrande,54.78,,hf_open_llm_v1_240829_frozen.csv +tinywand_sft,HF OpenLLM v1,34.61,,hf_open_llm_v1_240829_frozen.csv +tinywand_sft,HFv1 ARC,31.4,,hf_open_llm_v1_240829_frozen.csv +tinywand_sft,HFv1 GSM8K,2.05,,hf_open_llm_v1_240829_frozen.csv +tinywand_sft,HFv1 HellaSwag,49.96,,hf_open_llm_v1_240829_frozen.csv +tinywand_sft,HFv1 MMLU,25.98,,hf_open_llm_v1_240829_frozen.csv +tinywand_sft,HFv1 TruthfulQA,43.08,,hf_open_llm_v1_240829_frozen.csv +tinywand_sft,HFv1 Winogrande,55.17,,hf_open_llm_v1_240829_frozen.csv +tmm_1b,HF OpenLLM v1,35.63,,hf_open_llm_v1_240829_frozen.csv +tmm_1b,HFv1 ARC,33.36,,hf_open_llm_v1_240829_frozen.csv +tmm_1b,HFv1 GSM8K,1.06,,hf_open_llm_v1_240829_frozen.csv +tmm_1b,HFv1 HellaSwag,58.46,,hf_open_llm_v1_240829_frozen.csv +tmm_1b,HFv1 MMLU,25.68,,hf_open_llm_v1_240829_frozen.csv +tmm_1b,HFv1 TruthfulQA,37.22,,hf_open_llm_v1_240829_frozen.csv +tmm_1b,HFv1 Winogrande,58.01,,hf_open_llm_v1_240829_frozen.csv +toppyevil_7b_slerp,HF OpenLLM v1,65.16,,hf_open_llm_v1_240829_frozen.csv +toppyevil_7b_slerp,HFv1 ARC,63.65,,hf_open_llm_v1_240829_frozen.csv +toppyevil_7b_slerp,HFv1 GSM8K,55.8,,hf_open_llm_v1_240829_frozen.csv +toppyevil_7b_slerp,HFv1 HellaSwag,84.29,,hf_open_llm_v1_240829_frozen.csv +toppyevil_7b_slerp,HFv1 MMLU,63.6,,hf_open_llm_v1_240829_frozen.csv +toppyevil_7b_slerp,HFv1 TruthfulQA,46.06,,hf_open_llm_v1_240829_frozen.csv +toppyevil_7b_slerp,HFv1 Winogrande,77.58,,hf_open_llm_v1_240829_frozen.csv +toppylake_7b_slerp,HF OpenLLM v1,72.05,,hf_open_llm_v1_240829_frozen.csv +toppylake_7b_slerp,HFv1 ARC,69.2,,hf_open_llm_v1_240829_frozen.csv +toppylake_7b_slerp,HFv1 GSM8K,65.96,,hf_open_llm_v1_240829_frozen.csv +toppylake_7b_slerp,HFv1 HellaSwag,86.98,,hf_open_llm_v1_240829_frozen.csv +toppylake_7b_slerp,HFv1 MMLU,64.85,,hf_open_llm_v1_240829_frozen.csv +toppylake_7b_slerp,HFv1 TruthfulQA,62.54,,hf_open_llm_v1_240829_frozen.csv +toppylake_7b_slerp,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv +toppylake_bagel_7b_slerp,HF OpenLLM v1,70.14,,hf_open_llm_v1_240829_frozen.csv +toppylake_bagel_7b_slerp,HFv1 ARC,67.66,,hf_open_llm_v1_240829_frozen.csv +toppylake_bagel_7b_slerp,HFv1 GSM8K,57.7,,hf_open_llm_v1_240829_frozen.csv +toppylake_bagel_7b_slerp,HFv1 HellaSwag,85.7,,hf_open_llm_v1_240829_frozen.csv +toppylake_bagel_7b_slerp,HFv1 MMLU,64.87,,hf_open_llm_v1_240829_frozen.csv +toppylake_bagel_7b_slerp,HFv1 TruthfulQA,61.74,,hf_open_llm_v1_240829_frozen.csv +toppylake_bagel_7b_slerp,HFv1 Winogrande,83.19,,hf_open_llm_v1_240829_frozen.csv +tora_13b_v1_0,HF OpenLLM v1,53.62,,hf_open_llm_v1_240829_frozen.csv +tora_13b_v1_0,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv +tora_13b_v1_0,HFv1 GSM8K,9.86,,hf_open_llm_v1_240829_frozen.csv +tora_13b_v1_0,HFv1 HellaSwag,82.31,,hf_open_llm_v1_240829_frozen.csv +tora_13b_v1_0,HFv1 MMLU,54.73,,hf_open_llm_v1_240829_frozen.csv +tora_13b_v1_0,HFv1 TruthfulQA,40.25,,hf_open_llm_v1_240829_frozen.csv +tora_13b_v1_0,HFv1 Winogrande,75.61,,hf_open_llm_v1_240829_frozen.csv +tora_7b_v1_0,HF OpenLLM v1,48.5,,hf_open_llm_v1_240829_frozen.csv +tora_7b_v1_0,HFv1 ARC,52.47,,hf_open_llm_v1_240829_frozen.csv +tora_7b_v1_0,HFv1 GSM8K,2.5,,hf_open_llm_v1_240829_frozen.csv +tora_7b_v1_0,HFv1 HellaSwag,78.68,,hf_open_llm_v1_240829_frozen.csv +tora_7b_v1_0,HFv1 MMLU,45.9,,hf_open_llm_v1_240829_frozen.csv +tora_7b_v1_0,HFv1 TruthfulQA,37.9,,hf_open_llm_v1_240829_frozen.csv +tora_7b_v1_0,HFv1 Winogrande,73.56,,hf_open_llm_v1_240829_frozen.csv +tora_code_13b_v1_0,HF OpenLLM v1,42.7,,hf_open_llm_v1_240829_frozen.csv +tora_code_13b_v1_0,HFv1 ARC,44.45,,hf_open_llm_v1_240829_frozen.csv +tora_code_13b_v1_0,HFv1 GSM8K,8.19,,hf_open_llm_v1_240829_frozen.csv +tora_code_13b_v1_0,HFv1 HellaSwag,69.29,,hf_open_llm_v1_240829_frozen.csv +tora_code_13b_v1_0,HFv1 MMLU,36.67,,hf_open_llm_v1_240829_frozen.csv +tora_code_13b_v1_0,HFv1 TruthfulQA,34.98,,hf_open_llm_v1_240829_frozen.csv +tora_code_13b_v1_0,HFv1 Winogrande,62.59,,hf_open_llm_v1_240829_frozen.csv +tora_code_34b_v1_0,HF OpenLLM v1,48.95,,hf_open_llm_v1_240829_frozen.csv +tora_code_34b_v1_0,HFv1 ARC,50.43,,hf_open_llm_v1_240829_frozen.csv +tora_code_34b_v1_0,HFv1 GSM8K,13.12,,hf_open_llm_v1_240829_frozen.csv +tora_code_34b_v1_0,HFv1 HellaSwag,75.54,,hf_open_llm_v1_240829_frozen.csv +tora_code_34b_v1_0,HFv1 MMLU,46.78,,hf_open_llm_v1_240829_frozen.csv +tora_code_34b_v1_0,HFv1 TruthfulQA,39.66,,hf_open_llm_v1_240829_frozen.csv +tora_code_34b_v1_0,HFv1 Winogrande,68.19,,hf_open_llm_v1_240829_frozen.csv +tora_code_7b_v1_0,HF OpenLLM v1,40.21,,hf_open_llm_v1_240829_frozen.csv +tora_code_7b_v1_0,HFv1 ARC,40.7,,hf_open_llm_v1_240829_frozen.csv +tora_code_7b_v1_0,HFv1 GSM8K,4.93,,hf_open_llm_v1_240829_frozen.csv +tora_code_7b_v1_0,HFv1 HellaSwag,65.86,,hf_open_llm_v1_240829_frozen.csv +tora_code_7b_v1_0,HFv1 MMLU,33.34,,hf_open_llm_v1_240829_frozen.csv +tora_code_7b_v1_0,HFv1 TruthfulQA,34.84,,hf_open_llm_v1_240829_frozen.csv +tora_code_7b_v1_0,HFv1 Winogrande,61.56,,hf_open_llm_v1_240829_frozen.csv +towerinstruct_7b_v0_1,HF OpenLLM v1,52.39,,hf_open_llm_v1_240829_frozen.csv +towerinstruct_7b_v0_1,HFv1 ARC,55.46,,hf_open_llm_v1_240829_frozen.csv +towerinstruct_7b_v0_1,HFv1 GSM8K,16.45,,hf_open_llm_v1_240829_frozen.csv +towerinstruct_7b_v0_1,HFv1 HellaSwag,79.0,,hf_open_llm_v1_240829_frozen.csv +towerinstruct_7b_v0_1,HFv1 MMLU,46.88,,hf_open_llm_v1_240829_frozen.csv +towerinstruct_7b_v0_1,HFv1 TruthfulQA,42.59,,hf_open_llm_v1_240829_frozen.csv +towerinstruct_7b_v0_1,HFv1 Winogrande,73.95,,hf_open_llm_v1_240829_frozen.csv +toxichermes_2_5_mistral_7b,HF OpenLLM v1,59.69,,hf_open_llm_v1_240829_frozen.csv +toxichermes_2_5_mistral_7b,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv +toxichermes_2_5_mistral_7b,HFv1 GSM8K,17.36,,hf_open_llm_v1_240829_frozen.csv +toxichermes_2_5_mistral_7b,HFv1 HellaSwag,83.75,,hf_open_llm_v1_240829_frozen.csv +toxichermes_2_5_mistral_7b,HFv1 MMLU,63.67,,hf_open_llm_v1_240829_frozen.csv +toxichermes_2_5_mistral_7b,HFv1 TruthfulQA,50.84,,hf_open_llm_v1_240829_frozen.csv +toxichermes_2_5_mistral_7b,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv +traversaal_2_5_mistral_7b,HF OpenLLM v1,60.48,,hf_open_llm_v1_240829_frozen.csv +traversaal_2_5_mistral_7b,HFv1 ARC,66.21,,hf_open_llm_v1_240829_frozen.csv +traversaal_2_5_mistral_7b,HFv1 GSM8K,16.53,,hf_open_llm_v1_240829_frozen.csv +traversaal_2_5_mistral_7b,HFv1 HellaSwag,85.02,,hf_open_llm_v1_240829_frozen.csv +traversaal_2_5_mistral_7b,HFv1 MMLU,63.24,,hf_open_llm_v1_240829_frozen.csv +traversaal_2_5_mistral_7b,HFv1 TruthfulQA,54.0,,hf_open_llm_v1_240829_frozen.csv +traversaal_2_5_mistral_7b,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv +trurl_2_13b_academic,HF OpenLLM v1,53.94,,hf_open_llm_v1_240829_frozen.csv +trurl_2_13b_academic,HFv1 ARC,57.94,,hf_open_llm_v1_240829_frozen.csv +trurl_2_13b_academic,HFv1 GSM8K,10.92,,hf_open_llm_v1_240829_frozen.csv +trurl_2_13b_academic,HFv1 HellaSwag,79.55,,hf_open_llm_v1_240829_frozen.csv +trurl_2_13b_academic,HFv1 MMLU,55.2,,hf_open_llm_v1_240829_frozen.csv +trurl_2_13b_academic,HFv1 TruthfulQA,43.46,,hf_open_llm_v1_240829_frozen.csv +trurl_2_13b_academic,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv +truthful_dpo_moe_19b,HF OpenLLM v1,74.3,,hf_open_llm_v1_240829_frozen.csv +truthful_dpo_moe_19b,HFv1 ARC,71.08,,hf_open_llm_v1_240829_frozen.csv +truthful_dpo_moe_19b,HFv1 GSM8K,64.52,,hf_open_llm_v1_240829_frozen.csv +truthful_dpo_moe_19b,HFv1 HellaSwag,88.46,,hf_open_llm_v1_240829_frozen.csv +truthful_dpo_moe_19b,HFv1 MMLU,66.13,,hf_open_llm_v1_240829_frozen.csv +truthful_dpo_moe_19b,HFv1 TruthfulQA,72.29,,hf_open_llm_v1_240829_frozen.csv +truthful_dpo_moe_19b,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv +truthful_dpo_tomgrc_fusionnet_7bx2_moe_13b,HF OpenLLM v1,77.44,,hf_open_llm_v1_240829_frozen.csv +truthful_dpo_tomgrc_fusionnet_7bx2_moe_13b,HFv1 ARC,74.91,,hf_open_llm_v1_240829_frozen.csv +truthful_dpo_tomgrc_fusionnet_7bx2_moe_13b,HFv1 GSM8K,69.52,,hf_open_llm_v1_240829_frozen.csv +truthful_dpo_tomgrc_fusionnet_7bx2_moe_13b,HFv1 HellaSwag,89.3,,hf_open_llm_v1_240829_frozen.csv +truthful_dpo_tomgrc_fusionnet_7bx2_moe_13b,HFv1 MMLU,64.67,,hf_open_llm_v1_240829_frozen.csv +truthful_dpo_tomgrc_fusionnet_7bx2_moe_13b,HFv1 TruthfulQA,78.02,,hf_open_llm_v1_240829_frozen.csv +truthful_dpo_tomgrc_fusionnet_7bx2_moe_13b,HFv1 Winogrande,88.24,,hf_open_llm_v1_240829_frozen.csv +truthfulqwen1_5_1_8b,HF OpenLLM v1,44.81,,hf_open_llm_v1_240829_frozen.csv +truthfulqwen1_5_1_8b,HFv1 ARC,38.99,,hf_open_llm_v1_240829_frozen.csv +truthfulqwen1_5_1_8b,HFv1 GSM8K,14.86,,hf_open_llm_v1_240829_frozen.csv +truthfulqwen1_5_1_8b,HFv1 HellaSwag,60.43,,hf_open_llm_v1_240829_frozen.csv +truthfulqwen1_5_1_8b,HFv1 MMLU,44.54,,hf_open_llm_v1_240829_frozen.csv +truthfulqwen1_5_1_8b,HFv1 TruthfulQA,50.86,,hf_open_llm_v1_240829_frozen.csv +truthfulqwen1_5_1_8b,HFv1 Winogrande,59.19,,hf_open_llm_v1_240829_frozen.csv +truthfulqwen1_5_4b,HF OpenLLM v1,57.41,,hf_open_llm_v1_240829_frozen.csv +truthfulqwen1_5_4b,HFv1 ARC,47.1,,hf_open_llm_v1_240829_frozen.csv +truthfulqwen1_5_4b,HFv1 GSM8K,52.54,,hf_open_llm_v1_240829_frozen.csv +truthfulqwen1_5_4b,HFv1 HellaSwag,71.32,,hf_open_llm_v1_240829_frozen.csv +truthfulqwen1_5_4b,HFv1 MMLU,56.04,,hf_open_llm_v1_240829_frozen.csv +truthfulqwen1_5_4b,HFv1 TruthfulQA,50.6,,hf_open_llm_v1_240829_frozen.csv +truthfulqwen1_5_4b,HFv1 Winogrande,66.85,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_70b,HF OpenLLM v1,73.77,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_70b,HFv1 ARC,72.1,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_70b,HFv1 GSM8K,62.62,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_70b,HFv1 HellaSwag,88.99,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_70b,HFv1 MMLU,69.84,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_70b,HFv1 TruthfulQA,65.78,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_70b,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_70b_expo,HF OpenLLM v1,74.02,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_70b_expo,HFv1 ARC,72.7,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_70b_expo,HFv1 GSM8K,59.36,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_70b_expo,HFv1 HellaSwag,89.29,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_70b_expo,HFv1 MMLU,69.61,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_70b_expo,HFv1 TruthfulQA,69.99,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_70b_expo,HFv1 Winogrande,83.19,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_7b_expo,HF OpenLLM v1,58.64,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_7b_expo,HFv1 ARC,58.28,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_7b_expo,HFv1 GSM8K,26.38,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_7b_expo,HFv1 HellaSwag,81.36,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_7b_expo,HFv1 MMLU,52.23,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_7b_expo,HFv1 TruthfulQA,59.41,,hf_open_llm_v1_240829_frozen.csv +tulu_2_dpo_7b_expo,HFv1 Winogrande,74.19,,hf_open_llm_v1_240829_frozen.csv +turdus,HF OpenLLM v1,74.66,,hf_open_llm_v1_240829_frozen.csv +turdus,HFv1 ARC,73.38,,hf_open_llm_v1_240829_frozen.csv +turdus,HFv1 GSM8K,67.7,,hf_open_llm_v1_240829_frozen.csv +turdus,HFv1 HellaSwag,88.56,,hf_open_llm_v1_240829_frozen.csv +turdus,HFv1 MMLU,64.52,,hf_open_llm_v1_240829_frozen.csv +turdus,HFv1 TruthfulQA,67.11,,hf_open_llm_v1_240829_frozen.csv +turdus,HFv1 Winogrande,86.66,,hf_open_llm_v1_240829_frozen.csv +turkgpt_v0_1,HF OpenLLM v1,65.3,,hf_open_llm_v1_240829_frozen.csv +turkgpt_v0_1,HFv1 ARC,59.22,,hf_open_llm_v1_240829_frozen.csv +turkgpt_v0_1,HFv1 GSM8K,60.73,,hf_open_llm_v1_240829_frozen.csv +turkgpt_v0_1,HFv1 HellaSwag,80.27,,hf_open_llm_v1_240829_frozen.csv +turkgpt_v0_1,HFv1 MMLU,67.0,,hf_open_llm_v1_240829_frozen.csv +turkgpt_v0_1,HFv1 TruthfulQA,47.29,,hf_open_llm_v1_240829_frozen.csv +turkgpt_v0_1,HFv1 Winogrande,77.27,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b,HF OpenLLM v1,58.05,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b,HFv1 ARC,58.53,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b,HFv1 GSM8K,31.61,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b,HFv1 HellaSwag,81.55,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b,HFv1 MMLU,59.54,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b,HFv1 TruthfulQA,40.52,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_instruct_01_30_2024,HF OpenLLM v1,66.21,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_instruct_01_30_2024,HFv1 ARC,61.86,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_instruct_01_30_2024,HFv1 GSM8K,63.31,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_instruct_01_30_2024,HFv1 HellaSwag,81.3,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_instruct_01_30_2024,HFv1 MMLU,60.72,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_instruct_01_30_2024,HFv1 TruthfulQA,52.6,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_instruct_01_30_2024,HFv1 Winogrande,77.51,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_instruct_02_19_2024,HF OpenLLM v1,65.39,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_instruct_02_19_2024,HFv1 ARC,61.95,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_instruct_02_19_2024,HFv1 GSM8K,58.45,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_instruct_02_19_2024,HFv1 HellaSwag,81.51,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_instruct_02_19_2024,HFv1 MMLU,61.86,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_instruct_02_19_2024,HFv1 TruthfulQA,49.94,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_instruct_02_19_2024,HFv1 Winogrande,78.61,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_wangchanx_sft_demo,HF OpenLLM v1,61.17,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_wangchanx_sft_demo,HFv1 ARC,58.96,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_wangchanx_sft_demo,HFv1 GSM8K,46.78,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_wangchanx_sft_demo,HFv1 HellaSwag,82.38,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_wangchanx_sft_demo,HFv1 MMLU,57.67,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_wangchanx_sft_demo,HFv1 TruthfulQA,44.83,,hf_open_llm_v1_240829_frozen.csv +typhoon_7b_wangchanx_sft_demo,HFv1 Winogrande,76.4,,hf_open_llm_v1_240829_frozen.csv +ultra0,HF OpenLLM v1,44.32,,hf_open_llm_v1_240829_frozen.csv +ultra0,HFv1 ARC,41.47,,hf_open_llm_v1_240829_frozen.csv +ultra0,HFv1 GSM8K,16.07,,hf_open_llm_v1_240829_frozen.csv +ultra0,HFv1 HellaSwag,68.02,,hf_open_llm_v1_240829_frozen.csv +ultra0,HFv1 MMLU,33.37,,hf_open_llm_v1_240829_frozen.csv +ultra0,HFv1 TruthfulQA,41.49,,hf_open_llm_v1_240829_frozen.csv +ultra0,HFv1 Winogrande,65.51,,hf_open_llm_v1_240829_frozen.csv +ultracatunamayo_dpo,HF OpenLLM v1,75.96,,hf_open_llm_v1_240829_frozen.csv +ultracatunamayo_dpo,HFv1 ARC,72.87,,hf_open_llm_v1_240829_frozen.csv +ultracatunamayo_dpo,HFv1 GSM8K,68.54,,hf_open_llm_v1_240829_frozen.csv +ultracatunamayo_dpo,HFv1 HellaSwag,88.75,,hf_open_llm_v1_240829_frozen.csv +ultracatunamayo_dpo,HFv1 MMLU,65.18,,hf_open_llm_v1_240829_frozen.csv +ultracatunamayo_dpo,HFv1 TruthfulQA,76.44,,hf_open_llm_v1_240829_frozen.csv +ultracatunamayo_dpo,HFv1 Winogrande,83.98,,hf_open_llm_v1_240829_frozen.csv +ultramerge_7b,HF OpenLLM v1,76.49,,hf_open_llm_v1_240829_frozen.csv +ultramerge_7b,HFv1 ARC,73.04,,hf_open_llm_v1_240829_frozen.csv +ultramerge_7b,HFv1 GSM8K,69.22,,hf_open_llm_v1_240829_frozen.csv +ultramerge_7b,HFv1 HellaSwag,89.25,,hf_open_llm_v1_240829_frozen.csv +ultramerge_7b,HFv1 MMLU,64.4,,hf_open_llm_v1_240829_frozen.csv +ultramerge_7b,HFv1 TruthfulQA,78.17,,hf_open_llm_v1_240829_frozen.csv +ultramerge_7b,HFv1 Winogrande,84.85,,hf_open_llm_v1_240829_frozen.csv +una_solar_10_7b_instruct_v1_0,HF OpenLLM v1,74.07,,hf_open_llm_v1_240829_frozen.csv +una_solar_10_7b_instruct_v1_0,HFv1 ARC,70.73,,hf_open_llm_v1_240829_frozen.csv +una_solar_10_7b_instruct_v1_0,HFv1 GSM8K,63.38,,hf_open_llm_v1_240829_frozen.csv +una_solar_10_7b_instruct_v1_0,HFv1 HellaSwag,88.32,,hf_open_llm_v1_240829_frozen.csv +una_solar_10_7b_instruct_v1_0,HFv1 MMLU,66.1,,hf_open_llm_v1_240829_frozen.csv +una_solar_10_7b_instruct_v1_0,HFv1 TruthfulQA,72.52,,hf_open_llm_v1_240829_frozen.csv +una_solar_10_7b_instruct_v1_0,HFv1 Winogrande,83.35,,hf_open_llm_v1_240829_frozen.csv +una_thebeagle_7b_v1,HF OpenLLM v1,73.87,,hf_open_llm_v1_240829_frozen.csv +una_thebeagle_7b_v1,HFv1 ARC,73.04,,hf_open_llm_v1_240829_frozen.csv +una_thebeagle_7b_v1,HFv1 GSM8K,66.72,,hf_open_llm_v1_240829_frozen.csv +una_thebeagle_7b_v1,HFv1 HellaSwag,88.0,,hf_open_llm_v1_240829_frozen.csv +una_thebeagle_7b_v1,HFv1 MMLU,63.48,,hf_open_llm_v1_240829_frozen.csv +una_thebeagle_7b_v1,HFv1 TruthfulQA,69.85,,hf_open_llm_v1_240829_frozen.csv +una_thebeagle_7b_v1,HFv1 Winogrande,82.16,,hf_open_llm_v1_240829_frozen.csv +unsafe_llama3_8b,HF OpenLLM v1,67.44,,hf_open_llm_v1_240829_frozen.csv +unsafe_llama3_8b,HFv1 ARC,62.63,,hf_open_llm_v1_240829_frozen.csv +unsafe_llama3_8b,HFv1 GSM8K,68.16,,hf_open_llm_v1_240829_frozen.csv +unsafe_llama3_8b,HFv1 HellaSwag,79.73,,hf_open_llm_v1_240829_frozen.csv +unsafe_llama3_8b,HFv1 MMLU,67.05,,hf_open_llm_v1_240829_frozen.csv +unsafe_llama3_8b,HFv1 TruthfulQA,51.52,,hf_open_llm_v1_240829_frozen.csv +unsafe_llama3_8b,HFv1 Winogrande,75.53,,hf_open_llm_v1_240829_frozen.csv +v1,HF OpenLLM v1,46.35,,hf_open_llm_v1_240829_frozen.csv +v1,HFv1 ARC,48.12,,hf_open_llm_v1_240829_frozen.csv +v1,HFv1 GSM8K,17.44,,hf_open_llm_v1_240829_frozen.csv +v1,HFv1 HellaSwag,71.6,,hf_open_llm_v1_240829_frozen.csv +v1,HFv1 MMLU,41.83,,hf_open_llm_v1_240829_frozen.csv +v1,HFv1 TruthfulQA,33.04,,hf_open_llm_v1_240829_frozen.csv +v1,HFv1 Winogrande,66.06,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b,HF OpenLLM v1,70.26,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b,HFv1 ARC,71.33,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b,HFv1 GSM8K,53.37,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b,HFv1 HellaSwag,87.34,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b,HFv1 MMLU,64.13,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b,HFv1 TruthfulQA,63.37,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b,HFv1 Winogrande,82.0,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b_v3,HF OpenLLM v1,73.68,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b_v3,HFv1 ARC,72.61,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b_v3,HFv1 GSM8K,66.87,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b_v3,HFv1 HellaSwag,87.7,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b_v3,HFv1 MMLU,63.51,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b_v3,HFv1 TruthfulQA,69.07,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b_v3,HFv1 Winogrande,82.32,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b_v4,HF OpenLLM v1,64.3,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b_v4,HFv1 ARC,66.98,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b_v4,HFv1 GSM8K,35.25,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b_v4,HFv1 HellaSwag,84.09,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b_v4,HFv1 MMLU,59.02,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b_v4,HFv1 TruthfulQA,59.43,,hf_open_llm_v1_240829_frozen.csv +v1olet_merged_dpo_7b_v4,HFv1 Winogrande,81.06,,hf_open_llm_v1_240829_frozen.csv +v_alpha_tross,HF OpenLLM v1,73.28,,hf_open_llm_v1_240829_frozen.csv +v_alpha_tross,HFv1 ARC,71.93,,hf_open_llm_v1_240829_frozen.csv +v_alpha_tross,HFv1 GSM8K,61.79,,hf_open_llm_v1_240829_frozen.csv +v_alpha_tross,HFv1 HellaSwag,86.82,,hf_open_llm_v1_240829_frozen.csv +v_alpha_tross,HFv1 MMLU,70.38,,hf_open_llm_v1_240829_frozen.csv +v_alpha_tross,HFv1 TruthfulQA,65.21,,hf_open_llm_v1_240829_frozen.csv +v_alpha_tross,HFv1 Winogrande,83.58,,hf_open_llm_v1_240829_frozen.csv +velara_11b_v3,HF OpenLLM v1,57.58,,hf_open_llm_v1_240829_frozen.csv +velara_11b_v3,HFv1 ARC,57.51,,hf_open_llm_v1_240829_frozen.csv +velara_11b_v3,HFv1 GSM8K,21.99,,hf_open_llm_v1_240829_frozen.csv +velara_11b_v3,HFv1 HellaSwag,75.7,,hf_open_llm_v1_240829_frozen.csv +velara_11b_v3,HFv1 MMLU,65.85,,hf_open_llm_v1_240829_frozen.csv +velara_11b_v3,HFv1 TruthfulQA,50.19,,hf_open_llm_v1_240829_frozen.csv +velara_11b_v3,HFv1 Winogrande,74.27,,hf_open_llm_v1_240829_frozen.csv +venus_dpo_50,HF OpenLLM v1,74.2,,hf_open_llm_v1_240829_frozen.csv +venus_dpo_50,HFv1 ARC,70.73,,hf_open_llm_v1_240829_frozen.csv +venus_dpo_50,HFv1 GSM8K,63.61,,hf_open_llm_v1_240829_frozen.csv +venus_dpo_50,HFv1 HellaSwag,88.47,,hf_open_llm_v1_240829_frozen.csv +venus_dpo_50,HFv1 MMLU,66.3,,hf_open_llm_v1_240829_frozen.csv +venus_dpo_50,HFv1 TruthfulQA,72.63,,hf_open_llm_v1_240829_frozen.csv +venus_dpo_50,HFv1 Winogrande,83.43,,hf_open_llm_v1_240829_frozen.csv +verysmol_llama_v11_kix2,HF OpenLLM v1,28.7,,hf_open_llm_v1_240829_frozen.csv +verysmol_llama_v11_kix2,HFv1 ARC,22.7,,hf_open_llm_v1_240829_frozen.csv +verysmol_llama_v11_kix2,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +verysmol_llama_v11_kix2,HFv1 HellaSwag,27.6,,hf_open_llm_v1_240829_frozen.csv +verysmol_llama_v11_kix2,HFv1 MMLU,25.28,,hf_open_llm_v1_240829_frozen.csv +verysmol_llama_v11_kix2,HFv1 TruthfulQA,44.75,,hf_open_llm_v1_240829_frozen.csv +verysmol_llama_v11_kix2,HFv1 Winogrande,51.54,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_10,HF OpenLLM v1,51.13,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_10,HFv1 ARC,52.22,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_10,HFv1 GSM8K,13.19,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_10,HFv1 HellaSwag,77.05,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_10,HFv1 MMLU,47.93,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_10,HFv1 TruthfulQA,46.87,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_10,HFv1 Winogrande,69.53,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_20,HF OpenLLM v1,50.63,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_20,HFv1 ARC,52.3,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_20,HFv1 GSM8K,11.22,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_20,HFv1 HellaSwag,77.05,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_20,HFv1 MMLU,47.39,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_20,HFv1 TruthfulQA,46.62,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_20,HFv1 Winogrande,69.22,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_30,HF OpenLLM v1,50.33,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_30,HFv1 ARC,51.02,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_30,HFv1 GSM8K,12.36,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_30,HFv1 HellaSwag,76.41,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_30,HFv1 MMLU,46.83,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_30,HFv1 TruthfulQA,46.06,,hf_open_llm_v1_240829_frozen.csv +vicuna_7b_v1_3_attention_sparsity_30,HFv1 Winogrande,69.3,,hf_open_llm_v1_240829_frozen.csv +vigogne2_enno_13b_sft_lora_4bit,HF OpenLLM v1,53.15,,hf_open_llm_v1_240829_frozen.csv +vigogne2_enno_13b_sft_lora_4bit,HFv1 ARC,62.03,,hf_open_llm_v1_240829_frozen.csv +vigogne2_enno_13b_sft_lora_4bit,HFv1 GSM8K,0.15,,hf_open_llm_v1_240829_frozen.csv +vigogne2_enno_13b_sft_lora_4bit,HFv1 HellaSwag,82.65,,hf_open_llm_v1_240829_frozen.csv +vigogne2_enno_13b_sft_lora_4bit,HFv1 MMLU,54.11,,hf_open_llm_v1_240829_frozen.csv +vigogne2_enno_13b_sft_lora_4bit,HFv1 TruthfulQA,42.98,,hf_open_llm_v1_240829_frozen.csv +vigogne2_enno_13b_sft_lora_4bit,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv +vortex_3b_v2,HF OpenLLM v1,37.46,,hf_open_llm_v1_240829_frozen.csv +vortex_3b_v2,HFv1 ARC,39.68,,hf_open_llm_v1_240829_frozen.csv +vortex_3b_v2,HFv1 GSM8K,2.05,,hf_open_llm_v1_240829_frozen.csv +vortex_3b_v2,HFv1 HellaSwag,65.04,,hf_open_llm_v1_240829_frozen.csv +vortex_3b_v2,HFv1 MMLU,25.09,,hf_open_llm_v1_240829_frozen.csv +vortex_3b_v2,HFv1 TruthfulQA,33.8,,hf_open_llm_v1_240829_frozen.csv +vortex_3b_v2,HFv1 Winogrande,59.12,,hf_open_llm_v1_240829_frozen.csv +walter_falcon_1b,HF OpenLLM v1,34.07,,hf_open_llm_v1_240829_frozen.csv +walter_falcon_1b,HFv1 ARC,31.06,,hf_open_llm_v1_240829_frozen.csv +walter_falcon_1b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +walter_falcon_1b,HFv1 HellaSwag,54.92,,hf_open_llm_v1_240829_frozen.csv +walter_falcon_1b,HFv1 MMLU,24.58,,hf_open_llm_v1_240829_frozen.csv +walter_falcon_1b,HFv1 TruthfulQA,38.47,,hf_open_llm_v1_240829_frozen.csv +walter_falcon_1b,HFv1 Winogrande,55.41,,hf_open_llm_v1_240829_frozen.csv +walter_llama_1b,HF OpenLLM v1,35.29,,hf_open_llm_v1_240829_frozen.csv +walter_llama_1b,HFv1 ARC,32.85,,hf_open_llm_v1_240829_frozen.csv +walter_llama_1b,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +walter_llama_1b,HFv1 HellaSwag,61.05,,hf_open_llm_v1_240829_frozen.csv +walter_llama_1b,HFv1 MMLU,27.46,,hf_open_llm_v1_240829_frozen.csv +walter_llama_1b,HFv1 TruthfulQA,33.93,,hf_open_llm_v1_240829_frozen.csv +walter_llama_1b,HFv1 Winogrande,56.43,,hf_open_llm_v1_240829_frozen.csv +walter_mistral_7b,HF OpenLLM v1,53.0,,hf_open_llm_v1_240829_frozen.csv +walter_mistral_7b,HFv1 ARC,58.87,,hf_open_llm_v1_240829_frozen.csv +walter_mistral_7b,HFv1 GSM8K,0.08,,hf_open_llm_v1_240829_frozen.csv +walter_mistral_7b,HFv1 HellaSwag,83.43,,hf_open_llm_v1_240829_frozen.csv +walter_mistral_7b,HFv1 MMLU,58.65,,hf_open_llm_v1_240829_frozen.csv +walter_mistral_7b,HFv1 TruthfulQA,39.93,,hf_open_llm_v1_240829_frozen.csv +walter_mistral_7b,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv +walter_solar_11b,HF OpenLLM v1,55.95,,hf_open_llm_v1_240829_frozen.csv +walter_solar_11b,HFv1 ARC,60.41,,hf_open_llm_v1_240829_frozen.csv +walter_solar_11b,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv +walter_solar_11b,HFv1 HellaSwag,84.86,,hf_open_llm_v1_240829_frozen.csv +walter_solar_11b,HFv1 MMLU,64.99,,hf_open_llm_v1_240829_frozen.csv +walter_solar_11b,HFv1 TruthfulQA,44.88,,hf_open_llm_v1_240829_frozen.csv +walter_solar_11b,HFv1 Winogrande,79.56,,hf_open_llm_v1_240829_frozen.csv +weblab_10b,HF OpenLLM v1,38.59,,hf_open_llm_v1_240829_frozen.csv +weblab_10b,HFv1 ARC,39.51,,hf_open_llm_v1_240829_frozen.csv +weblab_10b,HFv1 GSM8K,1.44,,hf_open_llm_v1_240829_frozen.csv +weblab_10b,HFv1 HellaSwag,65.76,,hf_open_llm_v1_240829_frozen.csv +weblab_10b,HFv1 MMLU,26.29,,hf_open_llm_v1_240829_frozen.csv +weblab_10b,HFv1 TruthfulQA,36.02,,hf_open_llm_v1_240829_frozen.csv +weblab_10b,HFv1 Winogrande,62.51,,hf_open_llm_v1_240829_frozen.csv +weblab_10b_instruction_sft,HF OpenLLM v1,39.13,,hf_open_llm_v1_240829_frozen.csv +weblab_10b_instruction_sft,HFv1 ARC,40.1,,hf_open_llm_v1_240829_frozen.csv +weblab_10b_instruction_sft,HFv1 GSM8K,1.82,,hf_open_llm_v1_240829_frozen.csv +weblab_10b_instruction_sft,HFv1 HellaSwag,65.3,,hf_open_llm_v1_240829_frozen.csv +weblab_10b_instruction_sft,HFv1 MMLU,26.66,,hf_open_llm_v1_240829_frozen.csv +weblab_10b_instruction_sft,HFv1 TruthfulQA,36.79,,hf_open_llm_v1_240829_frozen.csv +weblab_10b_instruction_sft,HFv1 Winogrande,64.09,,hf_open_llm_v1_240829_frozen.csv +westmonarchlasers_7b_slerp,HF OpenLLM v1,75.23,,hf_open_llm_v1_240829_frozen.csv +westmonarchlasers_7b_slerp,HFv1 ARC,72.44,,hf_open_llm_v1_240829_frozen.csv +westmonarchlasers_7b_slerp,HFv1 GSM8K,67.63,,hf_open_llm_v1_240829_frozen.csv +westmonarchlasers_7b_slerp,HFv1 HellaSwag,88.66,,hf_open_llm_v1_240829_frozen.csv +westmonarchlasers_7b_slerp,HFv1 MMLU,64.73,,hf_open_llm_v1_240829_frozen.csv +westmonarchlasers_7b_slerp,HFv1 TruthfulQA,72.4,,hf_open_llm_v1_240829_frozen.csv +westmonarchlasers_7b_slerp,HFv1 Winogrande,85.56,,hf_open_llm_v1_240829_frozen.csv +westseverus_7b_dpo_v2,HF OpenLLM v1,75.28,,hf_open_llm_v1_240829_frozen.csv +westseverus_7b_dpo_v2,HFv1 ARC,71.42,,hf_open_llm_v1_240829_frozen.csv +westseverus_7b_dpo_v2,HFv1 GSM8K,71.57,,hf_open_llm_v1_240829_frozen.csv +westseverus_7b_dpo_v2,HFv1 HellaSwag,88.24,,hf_open_llm_v1_240829_frozen.csv +westseverus_7b_dpo_v2,HFv1 MMLU,64.8,,hf_open_llm_v1_240829_frozen.csv +westseverus_7b_dpo_v2,HFv1 TruthfulQA,72.37,,hf_open_llm_v1_240829_frozen.csv +westseverus_7b_dpo_v2,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv +westuccine_7b_slerp,HF OpenLLM v1,70.08,,hf_open_llm_v1_240829_frozen.csv +westuccine_7b_slerp,HFv1 ARC,69.37,,hf_open_llm_v1_240829_frozen.csv +westuccine_7b_slerp,HFv1 GSM8K,48.52,,hf_open_llm_v1_240829_frozen.csv +westuccine_7b_slerp,HFv1 HellaSwag,87.34,,hf_open_llm_v1_240829_frozen.csv +westuccine_7b_slerp,HFv1 MMLU,63.8,,hf_open_llm_v1_240829_frozen.csv +westuccine_7b_slerp,HFv1 TruthfulQA,69.34,,hf_open_llm_v1_240829_frozen.csv +westuccine_7b_slerp,HFv1 Winogrande,82.08,,hf_open_llm_v1_240829_frozen.csv +westuccinebagel_7b_slerp,HF OpenLLM v1,71.01,,hf_open_llm_v1_240829_frozen.csv +westuccinebagel_7b_slerp,HFv1 ARC,69.37,,hf_open_llm_v1_240829_frozen.csv +westuccinebagel_7b_slerp,HFv1 GSM8K,55.72,,hf_open_llm_v1_240829_frozen.csv +westuccinebagel_7b_slerp,HFv1 HellaSwag,86.53,,hf_open_llm_v1_240829_frozen.csv +westuccinebagel_7b_slerp,HFv1 MMLU,64.8,,hf_open_llm_v1_240829_frozen.csv +westuccinebagel_7b_slerp,HFv1 TruthfulQA,67.06,,hf_open_llm_v1_240829_frozen.csv +westuccinebagel_7b_slerp,HFv1 Winogrande,82.56,,hf_open_llm_v1_240829_frozen.csv +where_llambo_7b,HF OpenLLM v1,66.08,,hf_open_llm_v1_240829_frozen.csv +where_llambo_7b,HFv1 ARC,58.45,,hf_open_llm_v1_240829_frozen.csv +where_llambo_7b,HFv1 GSM8K,65.2,,hf_open_llm_v1_240829_frozen.csv +where_llambo_7b,HFv1 HellaSwag,82.06,,hf_open_llm_v1_240829_frozen.csv +where_llambo_7b,HFv1 MMLU,62.61,,hf_open_llm_v1_240829_frozen.csv +where_llambo_7b,HFv1 TruthfulQA,49.61,,hf_open_llm_v1_240829_frozen.csv +where_llambo_7b,HFv1 Winogrande,78.53,,hf_open_llm_v1_240829_frozen.csv +whyarewestillhere_7b_slerp,HF OpenLLM v1,73.96,,hf_open_llm_v1_240829_frozen.csv +whyarewestillhere_7b_slerp,HFv1 ARC,71.67,,hf_open_llm_v1_240829_frozen.csv +whyarewestillhere_7b_slerp,HFv1 GSM8K,65.35,,hf_open_llm_v1_240829_frozen.csv +whyarewestillhere_7b_slerp,HFv1 HellaSwag,88.25,,hf_open_llm_v1_240829_frozen.csv +whyarewestillhere_7b_slerp,HFv1 MMLU,64.92,,hf_open_llm_v1_240829_frozen.csv +whyarewestillhere_7b_slerp,HFv1 TruthfulQA,68.12,,hf_open_llm_v1_240829_frozen.csv +whyarewestillhere_7b_slerp,HFv1 Winogrande,85.48,,hf_open_llm_v1_240829_frozen.csv +wizardchatml_7b_v0,HF OpenLLM v1,62.91,,hf_open_llm_v1_240829_frozen.csv +wizardchatml_7b_v0,HFv1 ARC,60.49,,hf_open_llm_v1_240829_frozen.csv +wizardchatml_7b_v0,HFv1 GSM8K,47.92,,hf_open_llm_v1_240829_frozen.csv +wizardchatml_7b_v0,HFv1 HellaSwag,80.62,,hf_open_llm_v1_240829_frozen.csv +wizardchatml_7b_v0,HFv1 MMLU,61.65,,hf_open_llm_v1_240829_frozen.csv +wizardchatml_7b_v0,HFv1 TruthfulQA,50.58,,hf_open_llm_v1_240829_frozen.csv +wizardchatml_7b_v0,HFv1 Winogrande,76.16,,hf_open_llm_v1_240829_frozen.csv +wizardcoder_python_13b_lora,HF OpenLLM v1,45.56,,hf_open_llm_v1_240829_frozen.csv +wizardcoder_python_13b_lora,HFv1 ARC,47.78,,hf_open_llm_v1_240829_frozen.csv +wizardcoder_python_13b_lora,HFv1 GSM8K,7.81,,hf_open_llm_v1_240829_frozen.csv +wizardcoder_python_13b_lora,HFv1 HellaSwag,69.6,,hf_open_llm_v1_240829_frozen.csv +wizardcoder_python_13b_lora,HFv1 MMLU,38.76,,hf_open_llm_v1_240829_frozen.csv +wizardcoder_python_13b_lora,HFv1 TruthfulQA,43.97,,hf_open_llm_v1_240829_frozen.csv +wizardcoder_python_13b_lora,HFv1 Winogrande,65.43,,hf_open_llm_v1_240829_frozen.csv +wizardcoder_python_34b_v1_0,HF OpenLLM v1,50.46,,hf_open_llm_v1_240829_frozen.csv +wizardcoder_python_34b_v1_0,HFv1 ARC,52.13,,hf_open_llm_v1_240829_frozen.csv +wizardcoder_python_34b_v1_0,HFv1 GSM8K,9.48,,hf_open_llm_v1_240829_frozen.csv +wizardcoder_python_34b_v1_0,HFv1 HellaSwag,74.78,,hf_open_llm_v1_240829_frozen.csv +wizardcoder_python_34b_v1_0,HFv1 MMLU,49.15,,hf_open_llm_v1_240829_frozen.csv +wizardcoder_python_34b_v1_0,HFv1 TruthfulQA,48.85,,hf_open_llm_v1_240829_frozen.csv +wizardcoder_python_34b_v1_0,HFv1 Winogrande,68.35,,hf_open_llm_v1_240829_frozen.csv +wizardlm_1_0_uncensored_codellama34b,HF OpenLLM v1,53.59,,hf_open_llm_v1_240829_frozen.csv +wizardlm_1_0_uncensored_codellama34b,HFv1 ARC,56.4,,hf_open_llm_v1_240829_frozen.csv +wizardlm_1_0_uncensored_codellama34b,HFv1 GSM8K,19.64,,hf_open_llm_v1_240829_frozen.csv +wizardlm_1_0_uncensored_codellama34b,HFv1 HellaSwag,75.45,,hf_open_llm_v1_240829_frozen.csv +wizardlm_1_0_uncensored_codellama34b,HFv1 MMLU,54.51,,hf_open_llm_v1_240829_frozen.csv +wizardlm_1_0_uncensored_codellama34b,HFv1 TruthfulQA,43.06,,hf_open_llm_v1_240829_frozen.csv +wizardlm_1_0_uncensored_codellama34b,HFv1 Winogrande,72.45,,hf_open_llm_v1_240829_frozen.csv +wizardlm_30b_v1_0,HF OpenLLM v1,28.96,,hf_open_llm_v1_240829_frozen.csv +wizardlm_30b_v1_0,HFv1 ARC,27.39,,hf_open_llm_v1_240829_frozen.csv +wizardlm_30b_v1_0,HFv1 GSM8K,0.0,,hf_open_llm_v1_240829_frozen.csv +wizardlm_30b_v1_0,HFv1 HellaSwag,25.94,,hf_open_llm_v1_240829_frozen.csv +wizardlm_30b_v1_0,HFv1 MMLU,23.12,,hf_open_llm_v1_240829_frozen.csv +wizardlm_30b_v1_0,HFv1 TruthfulQA,48.61,,hf_open_llm_v1_240829_frozen.csv +wizardlm_30b_v1_0,HFv1 Winogrande,48.7,,hf_open_llm_v1_240829_frozen.csv +wizardlm_70b_v1_0,HF OpenLLM v1,61.25,,hf_open_llm_v1_240829_frozen.csv +wizardlm_70b_v1_0,HFv1 ARC,65.44,,hf_open_llm_v1_240829_frozen.csv +wizardlm_70b_v1_0,HFv1 GSM8K,17.97,,hf_open_llm_v1_240829_frozen.csv +wizardlm_70b_v1_0,HFv1 HellaSwag,84.41,,hf_open_llm_v1_240829_frozen.csv +wizardlm_70b_v1_0,HFv1 MMLU,64.05,,hf_open_llm_v1_240829_frozen.csv +wizardlm_70b_v1_0,HFv1 TruthfulQA,54.81,,hf_open_llm_v1_240829_frozen.csv +wizardlm_70b_v1_0,HFv1 Winogrande,80.82,,hf_open_llm_v1_240829_frozen.csv +wizardmath_13b_v1_0,HF OpenLLM v1,53.97,,hf_open_llm_v1_240829_frozen.csv +wizardmath_13b_v1_0,HFv1 ARC,60.07,,hf_open_llm_v1_240829_frozen.csv +wizardmath_13b_v1_0,HFv1 GSM8K,12.36,,hf_open_llm_v1_240829_frozen.csv +wizardmath_13b_v1_0,HFv1 HellaSwag,82.01,,hf_open_llm_v1_240829_frozen.csv +wizardmath_13b_v1_0,HFv1 MMLU,54.8,,hf_open_llm_v1_240829_frozen.csv +wizardmath_13b_v1_0,HFv1 TruthfulQA,42.7,,hf_open_llm_v1_240829_frozen.csv +wizardmath_13b_v1_0,HFv1 Winogrande,71.9,,hf_open_llm_v1_240829_frozen.csv +wizardmath_70b_v1_0,HF OpenLLM v1,60.42,,hf_open_llm_v1_240829_frozen.csv +wizardmath_70b_v1_0,HFv1 ARC,68.17,,hf_open_llm_v1_240829_frozen.csv +wizardmath_70b_v1_0,HFv1 GSM8K,4.09,,hf_open_llm_v1_240829_frozen.csv +wizardmath_70b_v1_0,HFv1 HellaSwag,86.49,,hf_open_llm_v1_240829_frozen.csv +wizardmath_70b_v1_0,HFv1 MMLU,68.92,,hf_open_llm_v1_240829_frozen.csv +wizardmath_70b_v1_0,HFv1 TruthfulQA,52.77,,hf_open_llm_v1_240829_frozen.csv +wizardmath_70b_v1_0,HFv1 Winogrande,82.32,,hf_open_llm_v1_240829_frozen.csv +wizardmath_7b_v1_1,HF OpenLLM v1,66.61,,hf_open_llm_v1_240829_frozen.csv +wizardmath_7b_v1_1,HFv1 ARC,61.86,,hf_open_llm_v1_240829_frozen.csv +wizardmath_7b_v1_1,HFv1 GSM8K,67.4,,hf_open_llm_v1_240829_frozen.csv +wizardmath_7b_v1_1,HFv1 HellaSwag,84.5,,hf_open_llm_v1_240829_frozen.csv +wizardmath_7b_v1_1,HFv1 MMLU,61.53,,hf_open_llm_v1_240829_frozen.csv +wizardmath_7b_v1_1,HFv1 TruthfulQA,47.04,,hf_open_llm_v1_240829_frozen.csv +wizardmath_7b_v1_1,HFv1 Winogrande,77.35,,hf_open_llm_v1_240829_frozen.csv +wizardvicuna2_13b,HF OpenLLM v1,51.05,,hf_open_llm_v1_240829_frozen.csv +wizardvicuna2_13b,HFv1 ARC,55.38,,hf_open_llm_v1_240829_frozen.csv +wizardvicuna2_13b,HFv1 GSM8K,7.43,,hf_open_llm_v1_240829_frozen.csv +wizardvicuna2_13b,HFv1 HellaSwag,79.14,,hf_open_llm_v1_240829_frozen.csv +wizardvicuna2_13b,HFv1 MMLU,48.46,,hf_open_llm_v1_240829_frozen.csv +wizardvicuna2_13b,HFv1 TruthfulQA,42.43,,hf_open_llm_v1_240829_frozen.csv +wizardvicuna2_13b,HFv1 Winogrande,73.48,,hf_open_llm_v1_240829_frozen.csv +wizardvicuna_open_llama3b_v2,HF OpenLLM v1,38.77,,hf_open_llm_v1_240829_frozen.csv +wizardvicuna_open_llama3b_v2,HFv1 ARC,37.71,,hf_open_llm_v1_240829_frozen.csv +wizardvicuna_open_llama3b_v2,HFv1 GSM8K,0.99,,hf_open_llm_v1_240829_frozen.csv +wizardvicuna_open_llama3b_v2,HFv1 HellaSwag,66.6,,hf_open_llm_v1_240829_frozen.csv +wizardvicuna_open_llama3b_v2,HFv1 MMLU,27.23,,hf_open_llm_v1_240829_frozen.csv +wizardvicuna_open_llama3b_v2,HFv1 TruthfulQA,36.8,,hf_open_llm_v1_240829_frozen.csv +wizardvicuna_open_llama3b_v2,HFv1 Winogrande,63.3,,hf_open_llm_v1_240829_frozen.csv +worldsim_hermes_7b,HF OpenLLM v1,66.26,,hf_open_llm_v1_240829_frozen.csv +worldsim_hermes_7b,HFv1 ARC,64.08,,hf_open_llm_v1_240829_frozen.csv +worldsim_hermes_7b,HFv1 GSM8K,56.63,,hf_open_llm_v1_240829_frozen.csv +worldsim_hermes_7b,HFv1 HellaSwag,83.45,,hf_open_llm_v1_240829_frozen.csv +worldsim_hermes_7b,HFv1 MMLU,63.12,,hf_open_llm_v1_240829_frozen.csv +worldsim_hermes_7b,HFv1 TruthfulQA,51.52,,hf_open_llm_v1_240829_frozen.csv +worldsim_hermes_7b,HFv1 Winogrande,78.77,,hf_open_llm_v1_240829_frozen.csv +xenon_1,HF OpenLLM v1,59.21,,hf_open_llm_v1_240829_frozen.csv +xenon_1,HFv1 ARC,55.29,,hf_open_llm_v1_240829_frozen.csv +xenon_1,HFv1 GSM8K,21.83,,hf_open_llm_v1_240829_frozen.csv +xenon_1,HFv1 HellaSwag,81.56,,hf_open_llm_v1_240829_frozen.csv +xenon_1,HFv1 MMLU,61.22,,hf_open_llm_v1_240829_frozen.csv +xenon_1,HFv1 TruthfulQA,56.68,,hf_open_llm_v1_240829_frozen.csv +xenon_1,HFv1 Winogrande,78.69,,hf_open_llm_v1_240829_frozen.csv +xenon_2,HF OpenLLM v1,59.93,,hf_open_llm_v1_240829_frozen.csv +xenon_2,HFv1 ARC,57.51,,hf_open_llm_v1_240829_frozen.csv +xenon_2,HFv1 GSM8K,19.41,,hf_open_llm_v1_240829_frozen.csv +xenon_2,HFv1 HellaSwag,83.28,,hf_open_llm_v1_240829_frozen.csv +xenon_2,HFv1 MMLU,60.25,,hf_open_llm_v1_240829_frozen.csv +xenon_2,HFv1 TruthfulQA,60.92,,hf_open_llm_v1_240829_frozen.csv +xenon_2,HFv1 Winogrande,78.22,,hf_open_llm_v1_240829_frozen.csv +xenon_3,HF OpenLLM v1,60.27,,hf_open_llm_v1_240829_frozen.csv +xenon_3,HFv1 ARC,58.87,,hf_open_llm_v1_240829_frozen.csv +xenon_3,HFv1 GSM8K,20.09,,hf_open_llm_v1_240829_frozen.csv +xenon_3,HFv1 HellaSwag,83.39,,hf_open_llm_v1_240829_frozen.csv +xenon_3,HFv1 MMLU,59.79,,hf_open_llm_v1_240829_frozen.csv +xenon_3,HFv1 TruthfulQA,61.99,,hf_open_llm_v1_240829_frozen.csv +xenon_3,HFv1 Winogrande,77.51,,hf_open_llm_v1_240829_frozen.csv +xenon_4,HF OpenLLM v1,60.39,,hf_open_llm_v1_240829_frozen.csv +xenon_4,HFv1 ARC,60.15,,hf_open_llm_v1_240829_frozen.csv +xenon_4,HFv1 GSM8K,20.7,,hf_open_llm_v1_240829_frozen.csv +xenon_4,HFv1 HellaSwag,83.07,,hf_open_llm_v1_240829_frozen.csv +xenon_4,HFv1 MMLU,60.08,,hf_open_llm_v1_240829_frozen.csv +xenon_4,HFv1 TruthfulQA,61.31,,hf_open_llm_v1_240829_frozen.csv +xenon_4,HFv1 Winogrande,77.03,,hf_open_llm_v1_240829_frozen.csv +xglm_4_5b,HF OpenLLM v1,34.31,,hf_open_llm_v1_240829_frozen.csv +xglm_4_5b,HFv1 ARC,31.48,,hf_open_llm_v1_240829_frozen.csv +xglm_4_5b,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +xglm_4_5b,HFv1 HellaSwag,57.95,,hf_open_llm_v1_240829_frozen.csv +xglm_4_5b,HFv1 MMLU,25.43,,hf_open_llm_v1_240829_frozen.csv +xglm_4_5b,HFv1 TruthfulQA,35.84,,hf_open_llm_v1_240829_frozen.csv +xglm_4_5b,HFv1 Winogrande,54.93,,hf_open_llm_v1_240829_frozen.csv +xglm_564m,HF OpenLLM v1,29.55,,hf_open_llm_v1_240829_frozen.csv +xglm_564m,HFv1 ARC,24.57,,hf_open_llm_v1_240829_frozen.csv +xglm_564m,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +xglm_564m,HFv1 HellaSwag,34.64,,hf_open_llm_v1_240829_frozen.csv +xglm_564m,HFv1 MMLU,25.18,,hf_open_llm_v1_240829_frozen.csv +xglm_564m,HFv1 TruthfulQA,40.43,,hf_open_llm_v1_240829_frozen.csv +xglm_564m,HFv1 Winogrande,52.25,,hf_open_llm_v1_240829_frozen.csv +xglm_7_5b,HF OpenLLM v1,36.38,,hf_open_llm_v1_240829_frozen.csv +xglm_7_5b,HFv1 ARC,34.13,,hf_open_llm_v1_240829_frozen.csv +xglm_7_5b,HFv1 GSM8K,0.23,,hf_open_llm_v1_240829_frozen.csv +xglm_7_5b,HFv1 HellaSwag,60.77,,hf_open_llm_v1_240829_frozen.csv +xglm_7_5b,HFv1 MMLU,27.79,,hf_open_llm_v1_240829_frozen.csv +xglm_7_5b,HFv1 TruthfulQA,36.66,,hf_open_llm_v1_240829_frozen.csv +xglm_7_5b,HFv1 Winogrande,58.72,,hf_open_llm_v1_240829_frozen.csv +yarn_mistral_7b_128k_dpo,HF OpenLLM v1,60.15,,hf_open_llm_v1_240829_frozen.csv +yarn_mistral_7b_128k_dpo,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv +yarn_mistral_7b_128k_dpo,HFv1 GSM8K,32.15,,hf_open_llm_v1_240829_frozen.csv +yarn_mistral_7b_128k_dpo,HFv1 HellaSwag,82.99,,hf_open_llm_v1_240829_frozen.csv +yarn_mistral_7b_128k_dpo,HFv1 MMLU,63.09,,hf_open_llm_v1_240829_frozen.csv +yarn_mistral_7b_128k_dpo,HFv1 TruthfulQA,43.55,,hf_open_llm_v1_240829_frozen.csv +yarn_mistral_7b_128k_dpo,HFv1 Winogrande,78.3,,hf_open_llm_v1_240829_frozen.csv +yayi2_30b_llama,HF OpenLLM v1,48.46,,hf_open_llm_v1_240829_frozen.csv +yayi2_30b_llama,HFv1 ARC,35.67,,hf_open_llm_v1_240829_frozen.csv +yayi2_30b_llama,HFv1 GSM8K,18.88,,hf_open_llm_v1_240829_frozen.csv +yayi2_30b_llama,HFv1 HellaSwag,53.37,,hf_open_llm_v1_240829_frozen.csv +yayi2_30b_llama,HFv1 MMLU,70.6,,hf_open_llm_v1_240829_frozen.csv +yayi2_30b_llama,HFv1 TruthfulQA,49.08,,hf_open_llm_v1_240829_frozen.csv +yayi2_30b_llama,HFv1 Winogrande,63.14,,hf_open_llm_v1_240829_frozen.csv +yehoon_llama2,HF OpenLLM v1,52.71,,hf_open_llm_v1_240829_frozen.csv +yehoon_llama2,HFv1 ARC,54.78,,hf_open_llm_v1_240829_frozen.csv +yehoon_llama2,HFv1 GSM8K,7.28,,hf_open_llm_v1_240829_frozen.csv +yehoon_llama2,HFv1 HellaSwag,78.98,,hf_open_llm_v1_240829_frozen.csv +yehoon_llama2,HFv1 MMLU,51.29,,hf_open_llm_v1_240829_frozen.csv +yehoon_llama2,HFv1 TruthfulQA,49.17,,hf_open_llm_v1_240829_frozen.csv +yehoon_llama2,HFv1 Winogrande,74.74,,hf_open_llm_v1_240829_frozen.csv +yi6,HF OpenLLM v1,45.82,,hf_open_llm_v1_240829_frozen.csv +yi6,HFv1 ARC,47.78,,hf_open_llm_v1_240829_frozen.csv +yi6,HFv1 GSM8K,4.4,,hf_open_llm_v1_240829_frozen.csv +yi6,HFv1 HellaSwag,68.25,,hf_open_llm_v1_240829_frozen.csv +yi6,HFv1 MMLU,54.05,,hf_open_llm_v1_240829_frozen.csv +yi6,HFv1 TruthfulQA,35.8,,hf_open_llm_v1_240829_frozen.csv +yi6,HFv1 Winogrande,64.64,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_34b_chat_16k,HF OpenLLM v1,73.57,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_34b_chat_16k,HFv1 ARC,68.09,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_34b_chat_16k,HFv1 GSM8K,67.93,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_34b_chat_16k,HFv1 HellaSwag,86.52,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_34b_chat_16k,HFv1 MMLU,78.0,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_34b_chat_16k,HFv1 TruthfulQA,57.61,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_34b_chat_16k,HFv1 Winogrande,83.27,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_6b,HF OpenLLM v1,61.6,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_6b,HFv1 ARC,57.25,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_6b,HFv1 GSM8K,49.81,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_6b,HFv1 HellaSwag,77.96,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_6b,HFv1 MMLU,65.0,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_6b,HFv1 TruthfulQA,44.04,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_6b,HFv1 Winogrande,75.53,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_6b_chat,HF OpenLLM v1,66.17,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_6b_chat,HFv1 ARC,60.67,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_6b_chat,HFv1 GSM8K,67.1,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_6b_chat,HFv1 HellaSwag,78.87,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_6b_chat,HFv1 MMLU,64.24,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_6b_chat,HFv1 TruthfulQA,52.57,,hf_open_llm_v1_240829_frozen.csv +yi_1_5_6b_chat,HFv1 Winogrande,73.56,,hf_open_llm_v1_240829_frozen.csv +yi_32b_x2_v2_0,HF OpenLLM v1,76.17,,hf_open_llm_v1_240829_frozen.csv +yi_32b_x2_v2_0,HFv1 ARC,73.04,,hf_open_llm_v1_240829_frozen.csv +yi_32b_x2_v2_0,HFv1 GSM8K,65.2,,hf_open_llm_v1_240829_frozen.csv +yi_32b_x2_v2_0,HFv1 HellaSwag,85.95,,hf_open_llm_v1_240829_frozen.csv +yi_32b_x2_v2_0,HFv1 MMLU,76.79,,hf_open_llm_v1_240829_frozen.csv +yi_32b_x2_v2_0,HFv1 TruthfulQA,73.22,,hf_open_llm_v1_240829_frozen.csv +yi_32b_x2_v2_0,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_raw_2301,HF OpenLLM v1,70.12,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_raw_2301,HFv1 ARC,66.04,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_raw_2301,HFv1 GSM8K,57.09,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_raw_2301,HFv1 HellaSwag,84.7,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_raw_2301,HFv1 MMLU,74.89,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_raw_2301,HFv1 TruthfulQA,56.89,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_raw_2301,HFv1 Winogrande,81.14,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_raw_2901,HF OpenLLM v1,69.59,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_raw_2901,HFv1 ARC,64.93,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_raw_2901,HFv1 GSM8K,59.51,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_raw_2901,HFv1 HellaSwag,84.98,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_raw_2901,HFv1 MMLU,73.7,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_raw_2901,HFv1 TruthfulQA,55.09,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_raw_2901,HFv1 Winogrande,79.32,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_v2,HF OpenLLM v1,71.0,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_v2,HFv1 ARC,67.92,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_v2,HFv1 GSM8K,58.91,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_v2,HFv1 HellaSwag,85.61,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_v2,HFv1 MMLU,75.22,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_v2,HFv1 TruthfulQA,56.74,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_v2,HFv1 Winogrande,81.61,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_xlctx_v3,HF OpenLLM v1,64.39,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_xlctx_v3,HFv1 ARC,64.85,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_xlctx_v3,HFv1 GSM8K,44.05,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_xlctx_v3,HFv1 HellaSwag,84.76,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_xlctx_v3,HFv1 MMLU,74.48,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_xlctx_v3,HFv1 TruthfulQA,37.14,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_aezakmi_xlctx_v3,HFv1 Winogrande,81.06,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_dare_merge_v5,HF OpenLLM v1,71.98,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_dare_merge_v5,HFv1 ARC,66.47,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_dare_merge_v5,HFv1 GSM8K,62.93,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_dare_merge_v5,HFv1 HellaSwag,85.54,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_dare_merge_v5,HFv1 MMLU,77.22,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_dare_merge_v5,HFv1 TruthfulQA,57.46,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_dare_merge_v5,HFv1 Winogrande,82.24,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_rawrr_dpo_1,HF OpenLLM v1,70.97,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_rawrr_dpo_1,HFv1 ARC,65.44,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_rawrr_dpo_1,HFv1 GSM8K,61.79,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_rawrr_dpo_1,HFv1 HellaSwag,85.69,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_rawrr_dpo_1,HFv1 MMLU,76.09,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_rawrr_dpo_1,HFv1 TruthfulQA,54.0,,hf_open_llm_v1_240829_frozen.csv +yi_34b_200k_rawrr_dpo_1,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv +yi_34b_aezakmi_v1,HF OpenLLM v1,68.67,,hf_open_llm_v1_240829_frozen.csv +yi_34b_aezakmi_v1,HFv1 ARC,64.33,,hf_open_llm_v1_240829_frozen.csv +yi_34b_aezakmi_v1,HFv1 GSM8K,52.92,,hf_open_llm_v1_240829_frozen.csv +yi_34b_aezakmi_v1,HFv1 HellaSwag,84.31,,hf_open_llm_v1_240829_frozen.csv +yi_34b_aezakmi_v1,HFv1 MMLU,73.91,,hf_open_llm_v1_240829_frozen.csv +yi_34b_aezakmi_v1,HFv1 TruthfulQA,55.73,,hf_open_llm_v1_240829_frozen.csv +yi_34b_aezakmi_v1,HFv1 Winogrande,80.82,,hf_open_llm_v1_240829_frozen.csv +yi_34b_chat,HF OpenLLM v1,65.32,,hf_open_llm_v1_240829_frozen.csv +yi_34b_chat,HFv1 ARC,65.44,,hf_open_llm_v1_240829_frozen.csv +yi_34b_chat,HFv1 GSM8K,31.92,,hf_open_llm_v1_240829_frozen.csv +yi_34b_chat,HFv1 HellaSwag,84.16,,hf_open_llm_v1_240829_frozen.csv +yi_34b_chat,HFv1 MMLU,74.9,,hf_open_llm_v1_240829_frozen.csv +yi_34b_chat,HFv1 TruthfulQA,55.41,,hf_open_llm_v1_240829_frozen.csv +yi_34b_chat,HFv1 Winogrande,80.11,,hf_open_llm_v1_240829_frozen.csv +yi_34b_llama,HF OpenLLM v1,70.95,,hf_open_llm_v1_240829_frozen.csv +yi_34b_llama,HFv1 ARC,64.59,,hf_open_llm_v1_240829_frozen.csv +yi_34b_llama,HFv1 GSM8K,60.8,,hf_open_llm_v1_240829_frozen.csv +yi_34b_llama,HFv1 HellaSwag,85.63,,hf_open_llm_v1_240829_frozen.csv +yi_34b_llama,HFv1 MMLU,76.31,,hf_open_llm_v1_240829_frozen.csv +yi_34b_llama,HFv1 TruthfulQA,55.6,,hf_open_llm_v1_240829_frozen.csv +yi_34b_llama,HFv1 Winogrande,82.79,,hf_open_llm_v1_240829_frozen.csv +yi_34b_v2,HF OpenLLM v1,72.12,,hf_open_llm_v1_240829_frozen.csv +yi_34b_v2,HFv1 ARC,66.13,,hf_open_llm_v1_240829_frozen.csv +yi_34b_v2,HFv1 GSM8K,64.97,,hf_open_llm_v1_240829_frozen.csv +yi_34b_v2,HFv1 HellaSwag,85.0,,hf_open_llm_v1_240829_frozen.csv +yi_34b_v2,HFv1 MMLU,75.64,,hf_open_llm_v1_240829_frozen.csv +yi_34b_v2,HFv1 TruthfulQA,57.34,,hf_open_llm_v1_240829_frozen.csv +yi_34b_v2,HFv1 Winogrande,83.66,,hf_open_llm_v1_240829_frozen.csv +yi_34b_v3,HF OpenLLM v1,72.26,,hf_open_llm_v1_240829_frozen.csv +yi_34b_v3,HFv1 ARC,67.06,,hf_open_llm_v1_240829_frozen.csv +yi_34b_v3,HFv1 GSM8K,64.52,,hf_open_llm_v1_240829_frozen.csv +yi_34b_v3,HFv1 HellaSwag,85.11,,hf_open_llm_v1_240829_frozen.csv +yi_34b_v3,HFv1 MMLU,75.8,,hf_open_llm_v1_240829_frozen.csv +yi_34b_v3,HFv1 TruthfulQA,57.54,,hf_open_llm_v1_240829_frozen.csv +yi_34b_v3,HFv1 Winogrande,83.5,,hf_open_llm_v1_240829_frozen.csv +yi_6b,HF OpenLLM v1,54.08,,hf_open_llm_v1_240829_frozen.csv +yi_6b,HFv1 ARC,55.55,,hf_open_llm_v1_240829_frozen.csv +yi_6b,HFv1 GSM8K,12.66,,hf_open_llm_v1_240829_frozen.csv +yi_6b,HFv1 HellaSwag,76.57,,hf_open_llm_v1_240829_frozen.csv +yi_6b,HFv1 MMLU,64.11,,hf_open_llm_v1_240829_frozen.csv +yi_6b,HFv1 TruthfulQA,41.96,,hf_open_llm_v1_240829_frozen.csv +yi_6b,HFv1 Winogrande,74.19,,hf_open_llm_v1_240829_frozen.csv +yi_6b_200k,HF OpenLLM v1,56.69,,hf_open_llm_v1_240829_frozen.csv +yi_6b_200k,HFv1 ARC,53.58,,hf_open_llm_v1_240829_frozen.csv +yi_6b_200k,HFv1 GSM8K,30.33,,hf_open_llm_v1_240829_frozen.csv +yi_6b_200k,HFv1 HellaSwag,75.58,,hf_open_llm_v1_240829_frozen.csv +yi_6b_200k,HFv1 MMLU,64.65,,hf_open_llm_v1_240829_frozen.csv +yi_6b_200k,HFv1 TruthfulQA,41.74,,hf_open_llm_v1_240829_frozen.csv +yi_6b_200k,HFv1 Winogrande,74.27,,hf_open_llm_v1_240829_frozen.csv +yi_6b_200k_dpo,HF OpenLLM v1,51.93,,hf_open_llm_v1_240829_frozen.csv +yi_6b_200k_dpo,HFv1 ARC,43.09,,hf_open_llm_v1_240829_frozen.csv +yi_6b_200k_dpo,HFv1 GSM8K,11.37,,hf_open_llm_v1_240829_frozen.csv +yi_6b_200k_dpo,HFv1 HellaSwag,74.53,,hf_open_llm_v1_240829_frozen.csv +yi_6b_200k_dpo,HFv1 MMLU,64.0,,hf_open_llm_v1_240829_frozen.csv +yi_6b_200k_dpo,HFv1 TruthfulQA,45.51,,hf_open_llm_v1_240829_frozen.csv +yi_6b_200k_dpo,HFv1 Winogrande,73.09,,hf_open_llm_v1_240829_frozen.csv +yi_7b_dpo,HF OpenLLM v1,51.93,,hf_open_llm_v1_240829_frozen.csv +yi_7b_dpo,HFv1 ARC,43.09,,hf_open_llm_v1_240829_frozen.csv +yi_7b_dpo,HFv1 GSM8K,11.37,,hf_open_llm_v1_240829_frozen.csv +yi_7b_dpo,HFv1 HellaSwag,74.53,,hf_open_llm_v1_240829_frozen.csv +yi_7b_dpo,HFv1 MMLU,64.0,,hf_open_llm_v1_240829_frozen.csv +yi_7b_dpo,HFv1 TruthfulQA,45.51,,hf_open_llm_v1_240829_frozen.csv +yi_7b_dpo,HFv1 Winogrande,73.09,,hf_open_llm_v1_240829_frozen.csv +yi_9b_forest_dpo_v1_0,HF OpenLLM v1,64.11,,hf_open_llm_v1_240829_frozen.csv +yi_9b_forest_dpo_v1_0,HFv1 ARC,59.81,,hf_open_llm_v1_240829_frozen.csv +yi_9b_forest_dpo_v1_0,HFv1 GSM8K,48.37,,hf_open_llm_v1_240829_frozen.csv +yi_9b_forest_dpo_v1_0,HFv1 HellaSwag,78.6,,hf_open_llm_v1_240829_frozen.csv +yi_9b_forest_dpo_v1_0,HFv1 MMLU,70.02,,hf_open_llm_v1_240829_frozen.csv +yi_9b_forest_dpo_v1_0,HFv1 TruthfulQA,50.98,,hf_open_llm_v1_240829_frozen.csv +yi_9b_forest_dpo_v1_0,HFv1 Winogrande,76.87,,hf_open_llm_v1_240829_frozen.csv +yi_bagel_2x34b_moe,HF OpenLLM v1,74.93,,hf_open_llm_v1_240829_frozen.csv +yi_bagel_2x34b_moe,HFv1 ARC,72.7,,hf_open_llm_v1_240829_frozen.csv +yi_bagel_2x34b_moe,HFv1 GSM8K,60.73,,hf_open_llm_v1_240829_frozen.csv +yi_bagel_2x34b_moe,HFv1 HellaSwag,85.44,,hf_open_llm_v1_240829_frozen.csv +yi_bagel_2x34b_moe,HFv1 MMLU,76.6,,hf_open_llm_v1_240829_frozen.csv +yi_bagel_2x34b_moe,HFv1 TruthfulQA,71.42,,hf_open_llm_v1_240829_frozen.csv +yi_bagel_2x34b_moe,HFv1 Winogrande,82.72,,hf_open_llm_v1_240829_frozen.csv +youri_7b_chat,HF OpenLLM v1,48.51,,hf_open_llm_v1_240829_frozen.csv +youri_7b_chat,HFv1 ARC,51.19,,hf_open_llm_v1_240829_frozen.csv +youri_7b_chat,HFv1 GSM8K,1.52,,hf_open_llm_v1_240829_frozen.csv +youri_7b_chat,HFv1 HellaSwag,76.09,,hf_open_llm_v1_240829_frozen.csv +youri_7b_chat,HFv1 MMLU,46.06,,hf_open_llm_v1_240829_frozen.csv +youri_7b_chat,HFv1 TruthfulQA,41.17,,hf_open_llm_v1_240829_frozen.csv +youri_7b_chat,HFv1 Winogrande,75.06,,hf_open_llm_v1_240829_frozen.csv +yousei_22b,HF OpenLLM v1,51.56,,hf_open_llm_v1_240829_frozen.csv +yousei_22b,HFv1 ARC,55.89,,hf_open_llm_v1_240829_frozen.csv +yousei_22b,HFv1 GSM8K,0.45,,hf_open_llm_v1_240829_frozen.csv +yousei_22b,HFv1 HellaSwag,78.55,,hf_open_llm_v1_240829_frozen.csv +yousei_22b,HFv1 MMLU,52.31,,hf_open_llm_v1_240829_frozen.csv +yousei_22b,HFv1 TruthfulQA,50.68,,hf_open_llm_v1_240829_frozen.csv +yousei_22b,HFv1 Winogrande,71.51,,hf_open_llm_v1_240829_frozen.csv +ypotryll_22b_epoch2_qlora,HF OpenLLM v1,52.75,,hf_open_llm_v1_240829_frozen.csv +ypotryll_22b_epoch2_qlora,HFv1 ARC,59.22,,hf_open_llm_v1_240829_frozen.csv +ypotryll_22b_epoch2_qlora,HFv1 GSM8K,5.38,,hf_open_llm_v1_240829_frozen.csv +ypotryll_22b_epoch2_qlora,HFv1 HellaSwag,80.66,,hf_open_llm_v1_240829_frozen.csv +ypotryll_22b_epoch2_qlora,HFv1 MMLU,54.52,,hf_open_llm_v1_240829_frozen.csv +ypotryll_22b_epoch2_qlora,HFv1 TruthfulQA,40.42,,hf_open_llm_v1_240829_frozen.csv +ypotryll_22b_epoch2_qlora,HFv1 Winogrande,76.32,,hf_open_llm_v1_240829_frozen.csv +yugogpt,HF OpenLLM v1,57.35,,hf_open_llm_v1_240829_frozen.csv +yugogpt,HFv1 ARC,58.11,,hf_open_llm_v1_240829_frozen.csv +yugogpt,HFv1 GSM8K,30.71,,hf_open_llm_v1_240829_frozen.csv +yugogpt,HFv1 HellaSwag,81.45,,hf_open_llm_v1_240829_frozen.csv +yugogpt,HFv1 MMLU,60.68,,hf_open_llm_v1_240829_frozen.csv +yugogpt,HFv1 TruthfulQA,36.6,,hf_open_llm_v1_240829_frozen.csv +yugogpt,HFv1 Winogrande,76.56,,hf_open_llm_v1_240829_frozen.csv +yulan_chat_2_13b_fp16,HF OpenLLM v1,57.01,,hf_open_llm_v1_240829_frozen.csv +yulan_chat_2_13b_fp16,HFv1 ARC,59.04,,hf_open_llm_v1_240829_frozen.csv +yulan_chat_2_13b_fp16,HFv1 GSM8K,13.8,,hf_open_llm_v1_240829_frozen.csv +yulan_chat_2_13b_fp16,HFv1 HellaSwag,80.66,,hf_open_llm_v1_240829_frozen.csv +yulan_chat_2_13b_fp16,HFv1 MMLU,56.72,,hf_open_llm_v1_240829_frozen.csv +yulan_chat_2_13b_fp16,HFv1 TruthfulQA,52.18,,hf_open_llm_v1_240829_frozen.csv +yulan_chat_2_13b_fp16,HFv1 Winogrande,79.64,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_1,HF OpenLLM v1,58.26,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_1,HFv1 ARC,59.47,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_1,HFv1 GSM8K,28.96,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_1,HFv1 HellaSwag,81.59,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_1,HFv1 MMLU,60.25,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_1,HFv1 TruthfulQA,42.87,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_1,HFv1 Winogrande,76.4,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_2,HF OpenLLM v1,58.94,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_2,HFv1 ARC,61.09,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_2,HFv1 GSM8K,28.35,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_2,HFv1 HellaSwag,82.53,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_2,HFv1 MMLU,60.37,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_2,HFv1 TruthfulQA,44.34,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_2,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_2_a2_5,HF OpenLLM v1,56.31,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_2_a2_5,HFv1 ARC,61.77,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_2_a2_5,HFv1 GSM8K,0.3,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_2_a2_5,HFv1 HellaSwag,83.85,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_2_a2_5,HFv1 MMLU,58.33,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_2_a2_5,HFv1 TruthfulQA,60.19,,hf_open_llm_v1_240829_frozen.csv +zephyr_0_2_a2_5,HFv1 Winogrande,73.4,,hf_open_llm_v1_240829_frozen.csv +zephyr_1b_olmo_sft_qlora,HF OpenLLM v1,37.47,,hf_open_llm_v1_240829_frozen.csv +zephyr_1b_olmo_sft_qlora,HFv1 ARC,36.26,,hf_open_llm_v1_240829_frozen.csv +zephyr_1b_olmo_sft_qlora,HFv1 GSM8K,2.58,,hf_open_llm_v1_240829_frozen.csv +zephyr_1b_olmo_sft_qlora,HFv1 HellaSwag,63.48,,hf_open_llm_v1_240829_frozen.csv +zephyr_1b_olmo_sft_qlora,HFv1 MMLU,27.28,,hf_open_llm_v1_240829_frozen.csv +zephyr_1b_olmo_sft_qlora,HFv1 TruthfulQA,35.05,,hf_open_llm_v1_240829_frozen.csv +zephyr_1b_olmo_sft_qlora,HFv1 Winogrande,60.14,,hf_open_llm_v1_240829_frozen.csv +zephyr_220m_dpo_full,HF OpenLLM v1,29.33,,hf_open_llm_v1_240829_frozen.csv +zephyr_220m_dpo_full,HFv1 ARC,25.43,,hf_open_llm_v1_240829_frozen.csv +zephyr_220m_dpo_full,HFv1 GSM8K,0.53,,hf_open_llm_v1_240829_frozen.csv +zephyr_220m_dpo_full,HFv1 HellaSwag,29.15,,hf_open_llm_v1_240829_frozen.csv +zephyr_220m_dpo_full,HFv1 MMLU,26.43,,hf_open_llm_v1_240829_frozen.csv +zephyr_220m_dpo_full,HFv1 TruthfulQA,43.44,,hf_open_llm_v1_240829_frozen.csv +zephyr_220m_dpo_full,HFv1 Winogrande,50.99,,hf_open_llm_v1_240829_frozen.csv +zephyr_220m_sft_full,HF OpenLLM v1,29.33,,hf_open_llm_v1_240829_frozen.csv +zephyr_220m_sft_full,HFv1 ARC,25.26,,hf_open_llm_v1_240829_frozen.csv +zephyr_220m_sft_full,HFv1 GSM8K,0.38,,hf_open_llm_v1_240829_frozen.csv +zephyr_220m_sft_full,HFv1 HellaSwag,29.03,,hf_open_llm_v1_240829_frozen.csv +zephyr_220m_sft_full,HFv1 MMLU,26.45,,hf_open_llm_v1_240829_frozen.csv +zephyr_220m_sft_full,HFv1 TruthfulQA,43.23,,hf_open_llm_v1_240829_frozen.csv +zephyr_220m_sft_full,HFv1 Winogrande,51.62,,hf_open_llm_v1_240829_frozen.csv +zephyr_2b_gemma_sft_qlora,HF OpenLLM v1,47.26,,hf_open_llm_v1_240829_frozen.csv +zephyr_2b_gemma_sft_qlora,HFv1 ARC,49.15,,hf_open_llm_v1_240829_frozen.csv +zephyr_2b_gemma_sft_qlora,HFv1 GSM8K,18.2,,hf_open_llm_v1_240829_frozen.csv +zephyr_2b_gemma_sft_qlora,HFv1 HellaSwag,71.94,,hf_open_llm_v1_240829_frozen.csv +zephyr_2b_gemma_sft_qlora,HFv1 MMLU,41.88,,hf_open_llm_v1_240829_frozen.csv +zephyr_2b_gemma_sft_qlora,HFv1 TruthfulQA,35.77,,hf_open_llm_v1_240829_frozen.csv +zephyr_2b_gemma_sft_qlora,HFv1 Winogrande,66.61,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_alpha_expo,HF OpenLLM v1,62.15,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_alpha_expo,HFv1 ARC,60.84,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_alpha_expo,HFv1 GSM8K,28.28,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_alpha_expo,HFv1 HellaSwag,84.25,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_alpha_expo,HFv1 MMLU,60.6,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_alpha_expo,HFv1 TruthfulQA,60.89,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_alpha_expo,HFv1 Winogrande,78.06,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_beta_expo,HF OpenLLM v1,61.84,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_beta_expo,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_beta_expo,HFv1 GSM8K,27.29,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_beta_expo,HFv1 HellaSwag,84.5,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_beta_expo,HFv1 MMLU,60.97,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_beta_expo,HFv1 TruthfulQA,58.34,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_beta_expo,HFv1 Winogrande,77.66,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full,HF OpenLLM v1,58.25,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full,HFv1 ARC,62.88,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full,HFv1 GSM8K,18.57,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full,HFv1 HellaSwag,84.45,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full,HFv1 MMLU,59.56,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full,HFv1 TruthfulQA,47.41,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full_beta_0_2,HF OpenLLM v1,61.55,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full_beta_0_2,HFv1 ARC,61.86,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full_beta_0_2,HFv1 GSM8K,30.02,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full_beta_0_2,HFv1 HellaSwag,84.04,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full_beta_0_2,HFv1 MMLU,61.85,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full_beta_0_2,HFv1 TruthfulQA,54.78,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full_beta_0_2,HFv1 Winogrande,76.95,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full_expo,HF OpenLLM v1,57.73,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full_expo,HFv1 ARC,62.29,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full_expo,HFv1 GSM8K,13.95,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full_expo,HFv1 HellaSwag,84.9,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full_expo,HFv1 MMLU,59.01,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full_expo,HFv1 TruthfulQA,50.08,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_full_expo,HFv1 Winogrande,76.16,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora,HF OpenLLM v1,63.51,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora,HFv1 ARC,63.82,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora,HFv1 GSM8K,42.08,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora,HFv1 HellaSwag,85.35,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora,HFv1 MMLU,63.82,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora,HFv1 TruthfulQA,47.14,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora,HFv1 Winogrande,79.01,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora_no_sft,HF OpenLLM v1,62.67,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora_no_sft,HFv1 ARC,62.46,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora_no_sft,HFv1 GSM8K,41.62,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora_no_sft,HFv1 HellaSwag,84.5,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora_no_sft,HFv1 MMLU,64.02,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora_no_sft,HFv1 TruthfulQA,44.25,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora_no_sft,HFv1 Winogrande,79.16,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora_v1,HF OpenLLM v1,64.43,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora_v1,HFv1 ARC,67.83,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora_v1,HFv1 GSM8K,34.42,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora_v1,HFv1 HellaSwag,86.64,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora_v1,HFv1 MMLU,63.55,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora_v1,HFv1 TruthfulQA,53.8,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_dpo_qlora_v1,HFv1 Winogrande,80.35,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_update3_i0,HF OpenLLM v1,63.31,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_update3_i0,HFv1 ARC,65.19,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_update3_i0,HFv1 GSM8K,35.1,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_update3_i0,HFv1 HellaSwag,85.37,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_update3_i0,HFv1 MMLU,62.5,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_update3_i0,HFv1 TruthfulQA,51.85,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_update3_i0,HFv1 Winogrande,79.87,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_update4_i0,HF OpenLLM v1,63.17,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_update4_i0,HFv1 ARC,63.82,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_update4_i0,HFv1 GSM8K,40.71,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_update4_i0,HFv1 HellaSwag,84.21,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_update4_i0,HFv1 MMLU,63.33,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_update4_i0,HFv1 TruthfulQA,47.18,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_update4_i0,HFv1 Winogrande,79.79,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_v5_i1,HF OpenLLM v1,62.68,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_v5_i1,HFv1 ARC,65.44,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_v5_i1,HFv1 GSM8K,31.61,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_v5_i1,HFv1 HellaSwag,85.52,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_v5_i1,HFv1 MMLU,62.14,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_v5_i1,HFv1 TruthfulQA,51.82,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_v5_i1,HFv1 Winogrande,79.56,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_v6_i1,HF OpenLLM v1,63.66,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_v6_i1,HFv1 ARC,65.61,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_v6_i1,HFv1 GSM8K,33.06,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_v6_i1,HFv1 HellaSwag,85.83,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_v6_i1,HFv1 MMLU,62.96,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_v6_i1,HFv1 TruthfulQA,56.14,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_gpo_v6_i1,HFv1 Winogrande,79.64,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_ipo_qlora_v0,HF OpenLLM v1,62.67,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_ipo_qlora_v0,HFv1 ARC,63.14,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_ipo_qlora_v0,HFv1 GSM8K,40.03,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_ipo_qlora_v0,HFv1 HellaSwag,84.37,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_ipo_qlora_v0,HFv1 MMLU,63.54,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_ipo_qlora_v0,HFv1 TruthfulQA,45.35,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_ipo_qlora_v0,HFv1 Winogrande,79.56,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_lgpo_v1_i1,HF OpenLLM v1,62.54,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_lgpo_v1_i1,HFv1 ARC,65.96,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_lgpo_v1_i1,HFv1 GSM8K,31.61,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_lgpo_v1_i1,HFv1 HellaSwag,85.2,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_lgpo_v1_i1,HFv1 MMLU,61.88,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_lgpo_v1_i1,HFv1 TruthfulQA,51.1,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_lgpo_v1_i1,HFv1 Winogrande,79.48,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_norobots,HF OpenLLM v1,55.16,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_norobots,HFv1 ARC,56.48,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_norobots,HFv1 GSM8K,20.62,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_norobots,HFv1 HellaSwag,79.64,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_norobots,HFv1 MMLU,55.52,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_norobots,HFv1 TruthfulQA,44.6,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_norobots,HFv1 Winogrande,74.11,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_sft_full_spin_iter3,HF OpenLLM v1,63.7,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_sft_full_spin_iter3,HFv1 ARC,66.13,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_sft_full_spin_iter3,HFv1 GSM8K,34.19,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_sft_full_spin_iter3,HFv1 HellaSwag,85.85,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_sft_full_spin_iter3,HFv1 MMLU,61.51,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_sft_full_spin_iter3,HFv1 TruthfulQA,57.89,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_sft_full_spin_iter3,HFv1 Winogrande,76.64,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_truthy,HF OpenLLM v1,61.93,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_truthy,HFv1 ARC,60.75,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_truthy,HFv1 GSM8K,25.47,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_truthy,HFv1 HellaSwag,84.64,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_truthy,HFv1 MMLU,59.53,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_truthy,HFv1 TruthfulQA,63.31,,hf_open_llm_v1_240829_frozen.csv +zephyr_7b_truthy,HFv1 Winogrande,77.9,,hf_open_llm_v1_240829_frozen.csv +zephyr_alpha_nebula_v2_7b,HF OpenLLM v1,59.01,,hf_open_llm_v1_240829_frozen.csv +zephyr_alpha_nebula_v2_7b,HFv1 ARC,58.62,,hf_open_llm_v1_240829_frozen.csv +zephyr_alpha_nebula_v2_7b,HFv1 GSM8K,23.88,,hf_open_llm_v1_240829_frozen.csv +zephyr_alpha_nebula_v2_7b,HFv1 HellaSwag,83.05,,hf_open_llm_v1_240829_frozen.csv +zephyr_alpha_nebula_v2_7b,HFv1 MMLU,56.68,,hf_open_llm_v1_240829_frozen.csv +zephyr_alpha_nebula_v2_7b,HFv1 TruthfulQA,58.28,,hf_open_llm_v1_240829_frozen.csv +zephyr_alpha_nebula_v2_7b,HFv1 Winogrande,73.56,,hf_open_llm_v1_240829_frozen.csv +zephyr_danube2_sft_qlora,HF OpenLLM v1,48.28,,hf_open_llm_v1_240829_frozen.csv +zephyr_danube2_sft_qlora,HFv1 ARC,42.49,,hf_open_llm_v1_240829_frozen.csv +zephyr_danube2_sft_qlora,HFv1 GSM8K,28.58,,hf_open_llm_v1_240829_frozen.csv +zephyr_danube2_sft_qlora,HFv1 HellaSwag,72.93,,hf_open_llm_v1_240829_frozen.csv +zephyr_danube2_sft_qlora,HFv1 MMLU,40.19,,hf_open_llm_v1_240829_frozen.csv +zephyr_danube2_sft_qlora,HFv1 TruthfulQA,37.89,,hf_open_llm_v1_240829_frozen.csv +zephyr_danube2_sft_qlora,HFv1 Winogrande,67.56,,hf_open_llm_v1_240829_frozen.csv +zephyr_danube_sft_qlora,HF OpenLLM v1,40.11,,hf_open_llm_v1_240829_frozen.csv +zephyr_danube_sft_qlora,HFv1 ARC,40.44,,hf_open_llm_v1_240829_frozen.csv +zephyr_danube_sft_qlora,HFv1 GSM8K,2.05,,hf_open_llm_v1_240829_frozen.csv +zephyr_danube_sft_qlora,HFv1 HellaSwag,69.4,,hf_open_llm_v1_240829_frozen.csv +zephyr_danube_sft_qlora,HFv1 MMLU,27.0,,hf_open_llm_v1_240829_frozen.csv +zephyr_danube_sft_qlora,HFv1 TruthfulQA,37.08,,hf_open_llm_v1_240829_frozen.csv +zephyr_danube_sft_qlora,HFv1 Winogrande,64.72,,hf_open_llm_v1_240829_frozen.csv +zephyr_gemma_rpo,HF OpenLLM v1,60.93,,hf_open_llm_v1_240829_frozen.csv +zephyr_gemma_rpo,HFv1 ARC,56.91,,hf_open_llm_v1_240829_frozen.csv +zephyr_gemma_rpo,HFv1 GSM8K,42.23,,hf_open_llm_v1_240829_frozen.csv +zephyr_gemma_rpo,HFv1 HellaSwag,83.5,,hf_open_llm_v1_240829_frozen.csv +zephyr_gemma_rpo,HFv1 MMLU,59.27,,hf_open_llm_v1_240829_frozen.csv +zephyr_gemma_rpo,HFv1 TruthfulQA,49.72,,hf_open_llm_v1_240829_frozen.csv +zephyr_gemma_rpo,HFv1 Winogrande,73.95,,hf_open_llm_v1_240829_frozen.csv +zephyr_phi_1_5_sft_qlora,HF OpenLLM v1,50.14,,hf_open_llm_v1_240829_frozen.csv +zephyr_phi_1_5_sft_qlora,HFv1 ARC,51.96,,hf_open_llm_v1_240829_frozen.csv +zephyr_phi_1_5_sft_qlora,HFv1 GSM8K,27.6,,hf_open_llm_v1_240829_frozen.csv +zephyr_phi_1_5_sft_qlora,HFv1 HellaSwag,62.22,,hf_open_llm_v1_240829_frozen.csv +zephyr_phi_1_5_sft_qlora,HFv1 MMLU,43.09,,hf_open_llm_v1_240829_frozen.csv +zephyr_phi_1_5_sft_qlora,HFv1 TruthfulQA,42.87,,hf_open_llm_v1_240829_frozen.csv +zephyr_phi_1_5_sft_qlora,HFv1 Winogrande,73.09,,hf_open_llm_v1_240829_frozen.csv +zephyr_python_ru,HF OpenLLM v1,60.08,,hf_open_llm_v1_240829_frozen.csv +zephyr_python_ru,HFv1 ARC,56.14,,hf_open_llm_v1_240829_frozen.csv +zephyr_python_ru,HFv1 GSM8K,32.52,,hf_open_llm_v1_240829_frozen.csv +zephyr_python_ru,HFv1 HellaSwag,82.03,,hf_open_llm_v1_240829_frozen.csv +zephyr_python_ru,HFv1 MMLU,60.18,,hf_open_llm_v1_240829_frozen.csv +zephyr_python_ru,HFv1 TruthfulQA,52.8,,hf_open_llm_v1_240829_frozen.csv +zephyr_python_ru,HFv1 Winogrande,76.8,,hf_open_llm_v1_240829_frozen.csv +zephyr_smol_llama_100m_dpo_full,HF OpenLLM v1,29.37,,hf_open_llm_v1_240829_frozen.csv +zephyr_smol_llama_100m_dpo_full,HFv1 ARC,25.0,,hf_open_llm_v1_240829_frozen.csv +zephyr_smol_llama_100m_dpo_full,HFv1 GSM8K,0.68,,hf_open_llm_v1_240829_frozen.csv +zephyr_smol_llama_100m_dpo_full,HFv1 HellaSwag,28.54,,hf_open_llm_v1_240829_frozen.csv +zephyr_smol_llama_100m_dpo_full,HFv1 MMLU,25.18,,hf_open_llm_v1_240829_frozen.csv +zephyr_smol_llama_100m_dpo_full,HFv1 TruthfulQA,45.75,,hf_open_llm_v1_240829_frozen.csv +zephyr_smol_llama_100m_dpo_full,HFv1 Winogrande,51.07,,hf_open_llm_v1_240829_frozen.csv +zephyr_tiny_dpo_qlora,HF OpenLLM v1,37.35,,hf_open_llm_v1_240829_frozen.csv +zephyr_tiny_dpo_qlora,HFv1 ARC,36.6,,hf_open_llm_v1_240829_frozen.csv +zephyr_tiny_dpo_qlora,HFv1 GSM8K,2.12,,hf_open_llm_v1_240829_frozen.csv +zephyr_tiny_dpo_qlora,HFv1 HellaSwag,61.66,,hf_open_llm_v1_240829_frozen.csv +zephyr_tiny_dpo_qlora,HFv1 MMLU,25.78,,hf_open_llm_v1_240829_frozen.csv +zephyr_tiny_dpo_qlora,HFv1 TruthfulQA,36.4,,hf_open_llm_v1_240829_frozen.csv +zephyr_tiny_dpo_qlora,HFv1 Winogrande,61.56,,hf_open_llm_v1_240829_frozen.csv +zephyr_tinyllama_sft_qlora,HF OpenLLM v1,36.64,,hf_open_llm_v1_240829_frozen.csv +zephyr_tinyllama_sft_qlora,HFv1 ARC,34.64,,hf_open_llm_v1_240829_frozen.csv +zephyr_tinyllama_sft_qlora,HFv1 GSM8K,1.74,,hf_open_llm_v1_240829_frozen.csv +zephyr_tinyllama_sft_qlora,HFv1 HellaSwag,59.84,,hf_open_llm_v1_240829_frozen.csv +zephyr_tinyllama_sft_qlora,HFv1 MMLU,25.85,,hf_open_llm_v1_240829_frozen.csv +zephyr_tinyllama_sft_qlora,HFv1 TruthfulQA,36.57,,hf_open_llm_v1_240829_frozen.csv +zephyr_tinyllama_sft_qlora,HFv1 Winogrande,61.17,,hf_open_llm_v1_240829_frozen.csv +zephyrnotus_11b_alpha,HF OpenLLM v1,59.26,,hf_open_llm_v1_240829_frozen.csv +zephyrnotus_11b_alpha,HFv1 ARC,61.35,,hf_open_llm_v1_240829_frozen.csv +zephyrnotus_11b_alpha,HFv1 GSM8K,17.13,,hf_open_llm_v1_240829_frozen.csv +zephyrnotus_11b_alpha,HFv1 HellaSwag,82.8,,hf_open_llm_v1_240829_frozen.csv +zephyrnotus_11b_alpha,HFv1 MMLU,60.67,,hf_open_llm_v1_240829_frozen.csv +zephyrnotus_11b_alpha,HFv1 TruthfulQA,57.22,,hf_open_llm_v1_240829_frozen.csv +zephyrnotus_11b_alpha,HFv1 Winogrande,76.4,,hf_open_llm_v1_240829_frozen.csv +ziya2_13b_base,HF OpenLLM v1,62.04,,hf_open_llm_v1_240829_frozen.csv +ziya2_13b_base,HFv1 ARC,54.01,,hf_open_llm_v1_240829_frozen.csv +ziya2_13b_base,HFv1 GSM8K,60.42,,hf_open_llm_v1_240829_frozen.csv +ziya2_13b_base,HFv1 HellaSwag,78.9,,hf_open_llm_v1_240829_frozen.csv +ziya2_13b_base,HFv1 MMLU,61.32,,hf_open_llm_v1_240829_frozen.csv +ziya2_13b_base,HFv1 TruthfulQA,42.74,,hf_open_llm_v1_240829_frozen.csv +ziya2_13b_base,HFv1 Winogrande,74.82,,hf_open_llm_v1_240829_frozen.csv +zysec_7b,HF OpenLLM v1,58.41,,hf_open_llm_v1_240829_frozen.csv +zysec_7b,HFv1 ARC,57.51,,hf_open_llm_v1_240829_frozen.csv +zysec_7b,HFv1 GSM8K,28.96,,hf_open_llm_v1_240829_frozen.csv +zysec_7b,HFv1 HellaSwag,79.73,,hf_open_llm_v1_240829_frozen.csv +zysec_7b,HFv1 MMLU,58.65,,hf_open_llm_v1_240829_frozen.csv +zysec_7b,HFv1 TruthfulQA,51.11,,hf_open_llm_v1_240829_frozen.csv +zysec_7b,HFv1 Winogrande,74.51,,hf_open_llm_v1_240829_frozen.csv +zysec_7b_v2,HF OpenLLM v1,54.63,,hf_open_llm_v1_240829_frozen.csv +zysec_7b_v2,HFv1 ARC,53.07,,hf_open_llm_v1_240829_frozen.csv +zysec_7b_v2,HFv1 GSM8K,28.05,,hf_open_llm_v1_240829_frozen.csv +zysec_7b_v2,HFv1 HellaSwag,76.3,,hf_open_llm_v1_240829_frozen.csv +zysec_7b_v2,HFv1 MMLU,54.55,,hf_open_llm_v1_240829_frozen.csv +zysec_7b_v2,HFv1 TruthfulQA,47.05,,hf_open_llm_v1_240829_frozen.csv +zysec_7b_v2,HFv1 Winogrande,68.75,,hf_open_llm_v1_240829_frozen.csv +zysec_8b_v2,HF OpenLLM v1,54.63,,hf_open_llm_v1_240829_frozen.csv +zysec_8b_v2,HFv1 ARC,53.07,,hf_open_llm_v1_240829_frozen.csv +zysec_8b_v2,HFv1 GSM8K,28.05,,hf_open_llm_v1_240829_frozen.csv +zysec_8b_v2,HFv1 HellaSwag,76.3,,hf_open_llm_v1_240829_frozen.csv +zysec_8b_v2,HFv1 MMLU,54.55,,hf_open_llm_v1_240829_frozen.csv +zysec_8b_v2,HFv1 TruthfulQA,47.05,,hf_open_llm_v1_240829_frozen.csv +zysec_8b_v2,HFv1 Winogrande,68.75,,hf_open_llm_v1_240829_frozen.csv +zyte_1b,HF OpenLLM v1,38.23,,hf_open_llm_v1_240829_frozen.csv +zyte_1b,HFv1 ARC,37.88,,hf_open_llm_v1_240829_frozen.csv +zyte_1b,HFv1 GSM8K,1.44,,hf_open_llm_v1_240829_frozen.csv +zyte_1b,HFv1 HellaSwag,61.37,,hf_open_llm_v1_240829_frozen.csv +zyte_1b,HFv1 MMLU,24.61,,hf_open_llm_v1_240829_frozen.csv +zyte_1b,HFv1 TruthfulQA,42.14,,hf_open_llm_v1_240829_frozen.csv +zyte_1b,HFv1 Winogrande,61.96,,hf_open_llm_v1_240829_frozen.csv +claude_2_1,BFCL,74.57,,bfcl_240906.csv +claude_3_5_sonnet_20240620,BFCL,76.29,,bfcl_240906.csv +claude_3_haiku_20240307,BFCL,60.34,,bfcl_240906.csv +claude_3_opus_20240229,BFCL,80.88,,bfcl_240906.csv +claude_3_sonnet_20240229,BFCL,77.92,,bfcl_240906.csv +claude_instant_1_2,BFCL,47.95,,bfcl_240906.csv +command_r_plus_original,BFCL,74.11,,bfcl_240906.csv +dbrx_instructruct,BFCL,69.55,,bfcl_240906.csv +deepseek_v1_5,BFCL,11.18,,bfcl_240906.csv +firefunction_v1,BFCL,48.11,,bfcl_240906.csv +firefunction_v2,BFCL,77.45,,bfcl_240906.csv +functionary_medium_v3_1,BFCL,82.55,,bfcl_240906.csv +functionary_small_v3_1,BFCL,80.21,,bfcl_240906.csv +functionary_small_v3_2,BFCL,78.96,,bfcl_240906.csv +gemini_1_0_pro_001,BFCL,57.81,,bfcl_240906.csv +gemini_1_5_flash_preview_0514,BFCL,70.75,,bfcl_240906.csv +gemini_1_5_pro_preview_0409,BFCL,74.56,,bfcl_240906.csv +gemini_1_5_pro_preview_0514,BFCL,74.75,,bfcl_240906.csv +gemma_7b_it,BFCL,10.3,,bfcl_240906.csv +gorilla_openfunctions_v2,BFCL,79.1,,bfcl_240906.csv +gpt_3_5_turbo_0125,BFCL,75.41,,bfcl_240906.csv +gpt_4_0125_preview,BFCL,85.79,,bfcl_240906.csv +gpt_4_0613,BFCL,84.74,,bfcl_240906.csv +gpt_4_1106_preview,BFCL,85.0,,bfcl_240906.csv +gpt_4_turbo_2024_04_09,BFCL,83.89,,bfcl_240906.csv +gpt_4o_2024_05_13,BFCL,83.13,,bfcl_240906.csv +gpt_4o_2024_08_06,BFCL,78.87,,bfcl_240906.csv +gpt_4o_mini_2024_07_18,BFCL,83.35,,bfcl_240906.csv +granite_20b_functioncalling,BFCL,76.63,,bfcl_240906.csv +hermes_2_pro_llama3_70b,BFCL,74.78,,bfcl_240906.csv +hermes_2_pro_llama3_8b,BFCL,66.18,,bfcl_240906.csv +hermes_2_pro_mistral_7b,BFCL,65.44,,bfcl_240906.csv +hermes_2_theta_llama3_70b,BFCL,10.0,,bfcl_240906.csv +hermes_2_theta_llama3_8b,BFCL,64.83,,bfcl_240906.csv +llama3_70b_instruct,BFCL,81.59,,bfcl_240906.csv +llama3_8b_instruct,BFCL,62.7,,bfcl_240906.csv +mistral_large_2407,BFCL,79.66,,bfcl_240906.csv +mistral_medium_2312,BFCL,72.19,,bfcl_240906.csv +mistral_small_2402,BFCL,55.36,,bfcl_240906.csv +mistral_tiny_2312,BFCL,21.17,,bfcl_240906.csv +nemotron_4_340b_instruct,BFCL,80.23,,bfcl_240906.csv +open_mistral_nemo_2407,BFCL,76.31,,bfcl_240906.csv +open_mixtral_8x22b,BFCL,79.14,,bfcl_240906.csv +open_mixtral_8x7b,BFCL,60.82,,bfcl_240906.csv +snowflake_arctic_instruct,BFCL,42.46,,bfcl_240906.csv +xlam_1b_fc_r,BFCL,74.9,,bfcl_240906.csv +xlam_7b_fc_r,BFCL,79.41,,bfcl_240906.csv +llama3_1_405b_instruct,eq_bench,83.0,[],eqbench_240912.csv +claude_3_5_sonnet_20240620,eq_bench,86.36,[],eqbench_240912.csv +gpt_4o,eq_bench,83.51,[],eqbench_240912.csv +gpt_4_turbo_2024_04_09,eq_bench,86.35,[],eqbench_240912.csv +rys_xlarge_base,eq_bench,85.05,[],eqbench_240912.csv +gpt_4_0613,eq_bench,84.79,[],eqbench_240912.csv +gpt_4_0314,eq_bench,85.73,[],eqbench_240912.csv +rys_xlarge,eq_bench,84.55,[],eqbench_240912.csv +gpt_4_1106_preview,eq_bench,86.05,[],eqbench_240912.csv +gpt_4_0125_preview,eq_bench,83.87,[],eqbench_240912.csv +claude_3_opus_20240229,eq_bench,82.19,[],eqbench_240912.csv +mistral_large_2407,eq_bench,85.05,[],eqbench_240912.csv +qwen2_72b_instruct,eq_bench,81.35,[],eqbench_240912.csv +mistral_large_2402,eq_bench,85.17,[],eqbench_240912.csv +llama3_70b_instruct,eq_bench,82.13,[],eqbench_240912.csv +qwen1_5_110b_chat,eq_bench,83.68,[],eqbench_240912.csv +solar_pro_preview_instruct,eq_bench,78.52,[],eqbench_240912.csv +senku_70b_full,eq_bench,84.89,[],eqbench_240912.csv +smaug_llama3_70b_instruct,eq_bench,80.69,[],eqbench_240912.csv +ece_tw3_jrgl_v1,eq_bench,83.07,[],eqbench_240912.csv +miiqu_f16,eq_bench,83.17,[],eqbench_240912.csv +qwen1_5_72b_chat,eq_bench,82.81,[],eqbench_240912.csv +miqu_1_70b,eq_bench,82.91,[],eqbench_240912.csv +mistral_medium,eq_bench,82.57,[],eqbench_240912.csv +gemma_2_27b_it,eq_bench,80.55,[],eqbench_240912.csv +gpt_4o_mini,eq_bench,76.93,[],eqbench_240912.csv +🆕phi_3_5_moe_instruct,eq_bench,76.97,[],eqbench_240912.csv +deepseek_v2_chat_0628,eq_bench,83.18,[],eqbench_240912.csv +miquella_120b,eq_bench,82.15,[],eqbench_240912.csv +phi_3_medium_4k_instruct,eq_bench,76.34,[],eqbench_240912.csv +claude_3_sonnet_20240229,eq_bench,80.45,[],eqbench_240912.csv +tess_72b_v1_5b,eq_bench,81.78,[],eqbench_240912.csv +mixtral_8x22b_instruct_v0_1,eq_bench,78.79,[],eqbench_240912.csv +qwen_72b_chat,eq_bench,80.7,[],eqbench_240912.csv +smaug_72b_v0_1,eq_bench,79.75,[],eqbench_240912.csv +gemma_2_9b_it,eq_bench,80.46,[],eqbench_240912.csv +yi_1_5_34b_chat,eq_bench,72.93,[],eqbench_240912.csv +mixtral_34bx2_moe_60b,eq_bench,72.69,[],eqbench_240912.csv +phi_3_small_8k_instruct,eq_bench,73.49,[],eqbench_240912.csv +wizardlm_2_8x22b,eq_bench,77.91,[],eqbench_240912.csv +miquliz_120b_v2_0,eq_bench,82.21,[],eqbench_240912.csv +quyen_pro_max_v0_1,eq_bench,77.16,[],eqbench_240912.csv +qwen1_5_32b_chat,eq_bench,75.59,[],eqbench_240912.csv +🆕gemma_2_ifable_9b,eq_bench,79.93,[],eqbench_240912.csv +dolphin_2_2_yi_34b,eq_bench,75.52,[],eqbench_240912.csv +nous_hermes_2_yi_34b,eq_bench,72.68,[],eqbench_240912.csv +megadolphin_120b,eq_bench,80.21,[],eqbench_240912.csv +dbrx_instructruct,eq_bench,76.82,[],eqbench_240912.csv +llama3_8b_instruct,eq_bench,68.88,[],eqbench_240912.csv +discolm_120b,eq_bench,78.48,[],eqbench_240912.csv +mistral_small_2402,eq_bench,80.36,[],eqbench_240912.csv +dolphin_2_2_70b,eq_bench,79.6,[],eqbench_240912.csv +yi_34b_chat,eq_bench,71.62,[],eqbench_240912.csv +tulu_2_dpo_70b,eq_bench,76.63,[],eqbench_240912.csv +tess_xl_v1_0,eq_bench,78.46,[],eqbench_240912.csv +yi_1_5_9b_chat,eq_bench,70.37,[],eqbench_240912.csv +goliath_120b,eq_bench,76.09,[],eqbench_240912.csv +c4ai_command_r_plus,eq_bench,76.11,[],eqbench_240912.csv +samantha_120b,eq_bench,76.44,[],eqbench_240912.csv +nous_hermes_2_mixtral_8x7b_sft,eq_bench,72.91,[],eqbench_240912.csv +qwen1_5_14b_chat,eq_bench,74.99,[],eqbench_240912.csv +synthia_70b_v1_5,eq_bench,73.71,[],eqbench_240912.csv +gemini_pro,eq_bench,75.08,[],eqbench_240912.csv +mistral_nemo_instruct_2407,eq_bench,77.13,[],eqbench_240912.csv +mixtral_8x7b_instruct_v0_1,eq_bench,72.37,[],eqbench_240912.csv +quyen_pro_v0_1,eq_bench,70.75,[],eqbench_240912.csv +gpt_3_5_turbo_0301,eq_bench,70.67,[],eqbench_240912.csv +midnight_miqu_70b_v1_0,eq_bench,75.9,[],eqbench_240912.csv +meow,eq_bench,73.94,[],eqbench_240912.csv +lmcocktail_10_7b_v1,eq_bench,73.67,[],eqbench_240912.csv +experiment26_7b,eq_bench,77.21,[],eqbench_240912.csv +beyonder_4x7b_v3,eq_bench,77.01,[],eqbench_240912.csv +sauerkrautlm_una_solar_instruct,eq_bench,73.56,[],eqbench_240912.csv +neuralbeagle14_7b,eq_bench,74.79,[],eqbench_240912.csv +neuralmonarch_7b,eq_bench,76.26,[],eqbench_240912.csv +solar_10_7b_instruct_dpo,eq_bench,73.21,[],eqbench_240912.csv +beagle14_7b,eq_bench,74.45,[],eqbench_240912.csv +monarch_7b,eq_bench,75.8,[],eqbench_240912.csv +westlake_7b_v2,eq_bench,78.7,[],eqbench_240912.csv +alphamonarch_7b,eq_bench,76.08,[],eqbench_240912.csv +gml_mistral_merged_v1,eq_bench,74.01,[],eqbench_240912.csv +gpt_3_5_turbo_1106,eq_bench,71.74,[],eqbench_240912.csv +starling_lm_7b_beta,eq_bench,73.82,[],eqbench_240912.csv +solar_10_7b_instruct_v1_0,eq_bench,73.53,[],eqbench_240912.csv +phi_3_mini_4k_instruct,eq_bench,58.15,[],eqbench_240912.csv +claude_3_haiku_20240307,eq_bench,63.65,[],eqbench_240912.csv +openchat_3_5_1210,eq_bench,72.52,[],eqbench_240912.csv +neuralmarcoro14_7b,eq_bench,74.15,[],eqbench_240912.csv +wizardlm_70b_v1_0,eq_bench,71.28,[],eqbench_240912.csv +starling_lm_7b_alpha,eq_bench,73.9,[],eqbench_240912.csv +gpt_3_5_turbo_0613,eq_bench,69.35,[],eqbench_240912.csv +openchat_3_5,eq_bench,72.18,[],eqbench_240912.csv +🆕exaone_3_0_7_8b_instruct,eq_bench,66.72,[],eqbench_240912.csv +laserxtral,eq_bench,71.96,[],eqbench_240912.csv +llama_2_70b_chat,eq_bench,73.59,[],eqbench_240912.csv +marcoroni_7b_v3_safetensor,eq_bench,71.68,[],eqbench_240912.csv +🆕trillama_8b,eq_bench,66.63,[],eqbench_240912.csv +🆕phi_3_5_mini_instruct,eq_bench,54.74,[],eqbench_240912.csv +gpt_3_5_turbo_0125,eq_bench,64.97,[],eqbench_240912.csv +beyonder_4x7b_v2,eq_bench,69.23,[],eqbench_240912.csv +firefly_mixtral_8x7b,eq_bench,64.36,[],eqbench_240912.csv +yi_1_5_6b_chat,eq_bench,59.45,[],eqbench_240912.csv +marcoroni_neural_chat_7b_v2,eq_bench,68.54,[],eqbench_240912.csv +wizardlm_2_7b,eq_bench,69.31,[],eqbench_240912.csv +openhermes_2_5_mistral_7b,eq_bench,66.89,[],eqbench_240912.csv +neuralhermes_2_5_mistral_7b,eq_bench,65.86,[],eqbench_240912.csv +snorkel_mistral_pairrm_dpo,eq_bench,65.83,[],eqbench_240912.csv +qwen_14b_chat,eq_bench,63.47,[],eqbench_240912.csv +dolphin_2_2_1_mistral_7b,eq_bench,69.92,[],eqbench_240912.csv +mistral_7b_instruct_v0_2,eq_bench,68.18,[],eqbench_240912.csv +mistral_7b_openorca,eq_bench,66.55,[],eqbench_240912.csv +neural_chat_7b_v3_1,eq_bench,64.77,[],eqbench_240912.csv +internlm2_chat_7b,eq_bench,62.61,[],eqbench_240912.csv +yi_6b_chat,eq_bench,61.79,[],eqbench_240912.csv +orion_14b_chat,eq_bench,59.71,[],eqbench_240912.csv +una_cybertron_7b_v2_bf16,eq_bench,62.83,[],eqbench_240912.csv +c4ai_command_r_v0_1,eq_bench,56.05,[],eqbench_240912.csv +mistral_7b_instruct_v0_3,eq_bench,63.15,[],eqbench_240912.csv +vicuna_33b_v1_3,eq_bench,67.07,[],eqbench_240912.csv +nanbeige2_8b_chat,eq_bench,65.17,[],eqbench_240912.csv +gemma_1_1_7b_it,eq_bench,59.17,[],eqbench_240912.csv +qwen1_5_moe_a2_7b_chat,eq_bench,58.07,[],eqbench_240912.csv +vicuna_13b_v1_5,eq_bench,67.39,[],eqbench_240912.csv +gemma_2_2b_it,eq_bench,60.86,[],eqbench_240912.csv +qwen1_5_7b_chat,eq_bench,54.41,[],eqbench_240912.csv +sparsetral_16x7b_v2,eq_bench,59.9,[],eqbench_240912.csv +zephyr_7b_beta,eq_bench,58.33,[],eqbench_240912.csv +wizardlm_13b_v1_2,eq_bench,63.71,[],eqbench_240912.csv +zephyr_7b_alpha,eq_bench,56.82,[],eqbench_240912.csv +phi_2_orange,eq_bench,56.94,[],eqbench_240912.csv +phi_2_psy,eq_bench,56.44,[],eqbench_240912.csv +gemma_7b_it,eq_bench,61.72,[],eqbench_240912.csv +phi_2_dpo,eq_bench,54.42,[],eqbench_240912.csv +phixtral_2x2_8,eq_bench,54.58,[],eqbench_240912.csv +qwen_7b_chat,eq_bench,50.11,[],eqbench_240912.csv +mistral_7b_instruct_v0_1,eq_bench,52.15,[],eqbench_240912.csv +llama_2_13b_chat,eq_bench,49.12,[],eqbench_240912.csv +guanaco_33b_merged,eq_bench,36.11,[],eqbench_240912.csv +nous_capybara_7b_v1,eq_bench,34.37,[],eqbench_240912.csv +llama_2_7b_chat,eq_bench,36.32,[],eqbench_240912.csv +qwen1_5_4b_chat,eq_bench,28.75,[],eqbench_240912.csv +qwen_1_8b_chat,eq_bench,30.0,[],eqbench_240912.csv +phi_2,eq_bench,27.6,[],eqbench_240912.csv +qwen1_5_1_8b_chat,eq_bench,24.12,[],eqbench_240912.csv +vicuna_7b_v1_1,eq_bench,26.12,[],eqbench_240912.csv +gemma_2b_it,eq_bench,23.26,[],eqbench_240912.csv +koala_7b,eq_bench,21.54,[],eqbench_240912.csv +stablelm_2_zephyr_1_6b,eq_bench,15.04,[],eqbench_240912.csv +random_baseline,eq_bench,0.0,[],eqbench_240912.csv +falcon_180b_chat,eq_bench,56.82,[],eqbench_240912.csv +claude_instant_1_2,eq_bench,69.04,[],eqbench_240912.csv +claude_2_1,eq_bench,73.96,[],eqbench_240912.csv +claude_1,eq_bench,76.83,[],eqbench_240912.csv +claude_2_0,eq_bench,72.89,[],eqbench_240912.csv +pplx_70b_online,eq_bench,62.79,[],eqbench_240912.csv +pplx_7b_online,eq_bench,48.91,[],eqbench_240912.csv +theprofessor_155b,eq_bench,78.82,[],eqbench_240912.csv +llama3_1_405b_instruct,magi_hard,83.81,[],eqbench_240912.csv +claude_3_5_sonnet_20240620,magi_hard,78.8,[],eqbench_240912.csv +gpt_4o,magi_hard,80.86,[],eqbench_240912.csv +gpt_4_turbo_2024_04_09,magi_hard,77.74,[],eqbench_240912.csv +rys_xlarge_base,magi_hard,78.3,[],eqbench_240912.csv +gpt_4_0613,magi_hard,77.85,[],eqbench_240912.csv +gpt_4_0314,magi_hard,75.67,[],eqbench_240912.csv +rys_xlarge,magi_hard,76.83,[],eqbench_240912.csv +gpt_4_1106_preview,magi_hard,74.96,[],eqbench_240912.csv +gpt_4_0125_preview,magi_hard,76.83,[],eqbench_240912.csv +claude_3_opus_20240229,magi_hard,76.55,[],eqbench_240912.csv +mistral_large_2407,magi_hard,72.37,[],eqbench_240912.csv +qwen2_72b_instruct,magi_hard,75.74,[],eqbench_240912.csv +mistral_large_2402,magi_hard,67.69,[],eqbench_240912.csv +llama3_70b_instruct,magi_hard,67.97,[],eqbench_240912.csv +qwen1_5_110b_chat,magi_hard,66.09,[],eqbench_240912.csv +solar_pro_preview_instruct,magi_hard,70.84,[],eqbench_240912.csv +senku_70b_full,magi_hard,63.94,[],eqbench_240912.csv +smaug_llama3_70b_instruct,magi_hard,67.25,[],eqbench_240912.csv +ece_tw3_jrgl_v1,magi_hard,63.56,[],eqbench_240912.csv +miiqu_f16,magi_hard,63.28,[],eqbench_240912.csv +qwen1_5_72b_chat,magi_hard,63.47,[],eqbench_240912.csv +miqu_1_70b,magi_hard,63.22,[],eqbench_240912.csv +mistral_medium,magi_hard,62.15,[],eqbench_240912.csv +gemma_2_27b_it,magi_hard,64.1,[],eqbench_240912.csv +gpt_4o_mini,magi_hard,67.5,[],eqbench_240912.csv +🆕phi_3_5_moe_instruct,magi_hard,67.25,[],eqbench_240912.csv +deepseek_v2_chat_0628,magi_hard,60.63,[],eqbench_240912.csv +miquella_120b,magi_hard,60.69,[],eqbench_240912.csv +phi_3_medium_4k_instruct,magi_hard,66.38,[],eqbench_240912.csv +claude_3_sonnet_20240229,magi_hard,61.01,[],eqbench_240912.csv +tess_72b_v1_5b,magi_hard,59.57,[],eqbench_240912.csv +mixtral_8x22b_instruct_v0_1,magi_hard,62.41,[],eqbench_240912.csv +qwen_72b_chat,magi_hard,60.38,[],eqbench_240912.csv +smaug_72b_v0_1,magi_hard,60.22,[],eqbench_240912.csv +gemma_2_9b_it,magi_hard,57.98,[],eqbench_240912.csv +yi_1_5_34b_chat,magi_hard,64.85,[],eqbench_240912.csv +mixtral_34bx2_moe_60b,magi_hard,65.06,[],eqbench_240912.csv +phi_3_small_8k_instruct,magi_hard,64.16,[],eqbench_240912.csv +wizardlm_2_8x22b,magi_hard,59.16,[],eqbench_240912.csv +miquliz_120b_v2_0,magi_hard,54.57,[],eqbench_240912.csv +quyen_pro_max_v0_1,magi_hard,59.29,[],eqbench_240912.csv +qwen1_5_32b_chat,magi_hard,60.72,[],eqbench_240912.csv +🆕gemma_2_ifable_9b,magi_hard,56.35,[],eqbench_240912.csv +dolphin_2_2_yi_34b,magi_hard,60.66,[],eqbench_240912.csv +nous_hermes_2_yi_34b,magi_hard,63.03,[],eqbench_240912.csv +megadolphin_120b,magi_hard,54.45,[],eqbench_240912.csv +dbrx_instructruct,magi_hard,57.13,[],eqbench_240912.csv +llama3_8b_instruct,magi_hard,63.84,[],eqbench_240912.csv +discolm_120b,magi_hard,54.01,[],eqbench_240912.csv +mistral_small_2402,magi_hard,51.9,[],eqbench_240912.csv +dolphin_2_2_70b,magi_hard,49.73,[],eqbench_240912.csv +yi_34b_chat,magi_hard,57.1,[],eqbench_240912.csv +tulu_2_dpo_70b,magi_hard,50.23,[],eqbench_240912.csv +tess_xl_v1_0,magi_hard,48.08,[],eqbench_240912.csv +yi_1_5_9b_chat,magi_hard,56.13,[],eqbench_240912.csv +goliath_120b,magi_hard,50.36,[],eqbench_240912.csv +c4ai_command_r_plus,magi_hard,49.7,[],eqbench_240912.csv +samantha_120b,magi_hard,48.58,[],eqbench_240912.csv +nous_hermes_2_mixtral_8x7b_sft,magi_hard,51.83,[],eqbench_240912.csv +qwen1_5_14b_chat,magi_hard,49.27,[],eqbench_240912.csv +synthia_70b_v1_5,magi_hard,48.92,[],eqbench_240912.csv +gemini_pro,magi_hard,46.87,[],eqbench_240912.csv +mistral_nemo_instruct_2407,magi_hard,43.65,[],eqbench_240912.csv +mixtral_8x7b_instruct_v0_1,magi_hard,45.74,[],eqbench_240912.csv +quyen_pro_v0_1,magi_hard,47.3,[],eqbench_240912.csv +gpt_3_5_turbo_0301,magi_hard,46.66,[],eqbench_240912.csv +midnight_miqu_70b_v1_0,magi_hard,40.74,[],eqbench_240912.csv +meow,magi_hard,42.68,[],eqbench_240912.csv +lmcocktail_10_7b_v1,magi_hard,42.65,[],eqbench_240912.csv +experiment26_7b,magi_hard,38.93,[],eqbench_240912.csv +beyonder_4x7b_v3,magi_hard,39.03,[],eqbench_240912.csv +sauerkrautlm_una_solar_instruct,magi_hard,42.43,[],eqbench_240912.csv +neuralbeagle14_7b,magi_hard,41.06,[],eqbench_240912.csv +neuralmonarch_7b,magi_hard,39.59,[],eqbench_240912.csv +solar_10_7b_instruct_dpo,magi_hard,42.37,[],eqbench_240912.csv +beagle14_7b,magi_hard,41.02,[],eqbench_240912.csv +monarch_7b,magi_hard,39.56,[],eqbench_240912.csv +westlake_7b_v2,magi_hard,36.59,[],eqbench_240912.csv +alphamonarch_7b,magi_hard,39.12,[],eqbench_240912.csv +gml_mistral_merged_v1,magi_hard,41.18,[],eqbench_240912.csv +gpt_3_5_turbo_1106,magi_hard,43.17,[],eqbench_240912.csv +starling_lm_7b_beta,magi_hard,40.12,[],eqbench_240912.csv +solar_10_7b_instruct_v1_0,magi_hard,39.62,[],eqbench_240912.csv +phi_3_mini_4k_instruct,magi_hard,53.26,[],eqbench_240912.csv +claude_3_haiku_20240307,magi_hard,47.71,[],eqbench_240912.csv +openchat_3_5_1210,magi_hard,38.81,[],eqbench_240912.csv +neuralmarcoro14_7b,magi_hard,37.12,[],eqbench_240912.csv +wizardlm_70b_v1_0,magi_hard,39.87,[],eqbench_240912.csv +starling_lm_7b_alpha,magi_hard,37.06,[],eqbench_240912.csv +gpt_3_5_turbo_0613,magi_hard,40.55,[],eqbench_240912.csv +openchat_3_5,magi_hard,37.34,[],eqbench_240912.csv +🆕exaone_3_0_7_8b_instruct,magi_hard,42.8,[],eqbench_240912.csv +laserxtral,magi_hard,37.46,[],eqbench_240912.csv +llama_2_70b_chat,magi_hard,35.4,[],eqbench_240912.csv +marcoroni_7b_v3_safetensor,magi_hard,37.06,[],eqbench_240912.csv +🆕trillama_8b,magi_hard,41.9,[],eqbench_240912.csv +🆕phi_3_5_mini_instruct,magi_hard,52.92,[],eqbench_240912.csv +gpt_3_5_turbo_0125,magi_hard,42.65,[],eqbench_240912.csv +beyonder_4x7b_v2,magi_hard,38.03,[],eqbench_240912.csv +firefly_mixtral_8x7b,magi_hard,42.46,[],eqbench_240912.csv +yi_1_5_6b_chat,magi_hard,46.18,[],eqbench_240912.csv +marcoroni_neural_chat_7b_v2,magi_hard,36.31,[],eqbench_240912.csv +wizardlm_2_7b,magi_hard,35.4,[],eqbench_240912.csv +openhermes_2_5_mistral_7b,magi_hard,37.31,[],eqbench_240912.csv +neuralhermes_2_5_mistral_7b,magi_hard,37.56,[],eqbench_240912.csv +snorkel_mistral_pairrm_dpo,magi_hard,37.53,[],eqbench_240912.csv +qwen_14b_chat,magi_hard,39.74,[],eqbench_240912.csv +dolphin_2_2_1_mistral_7b,magi_hard,33.16,[],eqbench_240912.csv +mistral_7b_instruct_v0_2,magi_hard,34.69,[],eqbench_240912.csv +mistral_7b_openorca,magi_hard,35.78,[],eqbench_240912.csv +neural_chat_7b_v3_1,magi_hard,36.65,[],eqbench_240912.csv +internlm2_chat_7b,magi_hard,38.43,[],eqbench_240912.csv +yi_6b_chat,magi_hard,38.74,[],eqbench_240912.csv +orion_14b_chat,magi_hard,40.74,[],eqbench_240912.csv +una_cybertron_7b_v2_bf16,magi_hard,37.5,[],eqbench_240912.csv +c4ai_command_r_v0_1,magi_hard,43.27,[],eqbench_240912.csv +mistral_7b_instruct_v0_3,magi_hard,36.0,[],eqbench_240912.csv +vicuna_33b_v1_3,magi_hard,31.66,[],eqbench_240912.csv +nanbeige2_8b_chat,magi_hard,33.03,[],eqbench_240912.csv +gemma_1_1_7b_it,magi_hard,38.43,[],eqbench_240912.csv +qwen1_5_moe_a2_7b_chat,magi_hard,38.34,[],eqbench_240912.csv +vicuna_13b_v1_5,magi_hard,28.75,[],eqbench_240912.csv +gemma_2_2b_it,magi_hard,35.22,[],eqbench_240912.csv +qwen1_5_7b_chat,magi_hard,41.59,[],eqbench_240912.csv +sparsetral_16x7b_v2,magi_hard,34.97,[],eqbench_240912.csv +zephyr_7b_beta,magi_hard,35.97,[],eqbench_240912.csv +wizardlm_13b_v1_2,magi_hard,29.1,[],eqbench_240912.csv +zephyr_7b_alpha,magi_hard,35.15,[],eqbench_240912.csv +phi_2_orange,magi_hard,32.03,[],eqbench_240912.csv +phi_2_psy,magi_hard,32.03,[],eqbench_240912.csv +gemma_7b_it,magi_hard,24.85,[],eqbench_240912.csv +phi_2_dpo,magi_hard,31.85,[],eqbench_240912.csv +phixtral_2x2_8,magi_hard,30.44,[],eqbench_240912.csv +qwen_7b_chat,magi_hard,33.44,[],eqbench_240912.csv +mistral_7b_instruct_v0_1,magi_hard,30.69,[],eqbench_240912.csv +llama_2_13b_chat,magi_hard,28.2,[],eqbench_240912.csv +guanaco_33b_merged,magi_hard,31.78,[],eqbench_240912.csv +nous_capybara_7b_v1,magi_hard,30.16,[],eqbench_240912.csv +llama_2_7b_chat,magi_hard,27.5,[],eqbench_240912.csv +qwen1_5_4b_chat,magi_hard,32.66,[],eqbench_240912.csv +qwen_1_8b_chat,magi_hard,29.19,[],eqbench_240912.csv +phi_2,magi_hard,30.57,[],eqbench_240912.csv +qwen1_5_1_8b_chat,magi_hard,31.56,[],eqbench_240912.csv +vicuna_7b_v1_1,magi_hard,27.38,[],eqbench_240912.csv +gemma_2b_it,magi_hard,24.16,[],eqbench_240912.csv +koala_7b,magi_hard,23.7,[],eqbench_240912.csv +stablelm_2_zephyr_1_6b,magi_hard,27.54,[],eqbench_240912.csv +random_baseline,magi_hard,25.0,[],eqbench_240912.csv +gpt_4_1106_preview,BIGGEN,4.22,[],biggen_240829.csv +gpt_4_0125_preview,BIGGEN,4.19,[],biggen_240829.csv +gpt_4o_2024_05_13,BIGGEN,4.141,[],biggen_240829.csv +gpt_4_turbo_2024_04_09,BIGGEN,4.132,[],biggen_240829.csv +claude_3_opus_20240229,BIGGEN,4.103,[],biggen_240829.csv +llama3_70b_instruct,BIGGEN,4.012,[],biggen_240829.csv +claude_3_sonnet_20240229,BIGGEN,4.011,[],biggen_240829.csv +qwen_110b_chat,BIGGEN,3.979,[],biggen_240829.csv +claude_3_haiku_20240307,BIGGEN,3.954,[],biggen_240829.csv +gemini_pro_1_5,BIGGEN,3.953,[],biggen_240829.csv +mixtral_8x22b_instruct_v0_1_awq,BIGGEN,3.936,[],biggen_240829.csv +mistral_medium,BIGGEN,3.935,[],biggen_240829.csv +mistral_large,BIGGEN,3.927,[],biggen_240829.csv +gemini_flash_1_5,BIGGEN,3.899,[],biggen_240829.csv +c4ai_command_r_plus_gptq,BIGGEN,3.839,[],biggen_240829.csv +qwen1_5_72b_chat,BIGGEN,3.832,[],biggen_240829.csv +phi_3_mini_4k_instruct,BIGGEN,3.821,[],biggen_240829.csv +qwen1_5_32b_chat,BIGGEN,3.813,[],biggen_240829.csv +starling_lm_7b_beta,BIGGEN,3.756,[],biggen_240829.csv +llama3_8b_instruct,BIGGEN,3.753,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_dpo,BIGGEN,3.737,[],biggen_240829.csv +yi_34b_chat,BIGGEN,3.701,[],biggen_240829.csv +mixtral_8x7b_instruct_v0_1,BIGGEN,3.695,[],biggen_240829.csv +gpt_3_5_turbo_0125,BIGGEN,3.689,[],biggen_240829.csv +tulu_2_dpo_70b,BIGGEN,3.683,[],biggen_240829.csv +phi_3_mini_128k_instruct,BIGGEN,3.679,[],biggen_240829.csv +gpt_3_5_turbo_1106,BIGGEN,3.678,[],biggen_240829.csv +c4ai_command_r_v0_1,BIGGEN,3.677,[],biggen_240829.csv +solar_10_7b_instruct_v1_0,BIGGEN,3.672,[],biggen_240829.csv +llama_2_70b_chat,BIGGEN,3.668,[],biggen_240829.csv +gemini_1_0_pro,BIGGEN,3.64,[],biggen_240829.csv +mistral_7b_instruct_v0_2,BIGGEN,3.619,[],biggen_240829.csv +mixtral_8x22b_v0_1_awq,BIGGEN,3.606,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_sft,BIGGEN,3.596,[],biggen_240829.csv +openchat_3_5_0106,BIGGEN,3.581,[],biggen_240829.csv +zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN,3.573,[],biggen_240829.csv +qwen1_5_14b_chat,BIGGEN,3.573,[],biggen_240829.csv +qwen1_5_7b_chat,BIGGEN,3.556,[],biggen_240829.csv +starling_lm_7b_alpha,BIGGEN,3.537,[],biggen_240829.csv +zephyr_7b_beta,BIGGEN,3.522,[],biggen_240829.csv +nous_hermes_2_mistral_7b_dpo,BIGGEN,3.493,[],biggen_240829.csv +nous_hermes_2_yi_34b,BIGGEN,3.476,[],biggen_240829.csv +mistral_orpo_beta,BIGGEN,3.473,[],biggen_240829.csv +llama_2_13b_chat,BIGGEN,3.467,[],biggen_240829.csv +openhermes_2_5_mistral_7b,BIGGEN,3.462,[],biggen_240829.csv +mixtral_8x7b_v0_1,BIGGEN,3.445,[],biggen_240829.csv +mistral_orpo_alpha,BIGGEN,3.441,[],biggen_240829.csv +tulu_2_dpo_13b,BIGGEN,3.423,[],biggen_240829.csv +qwen1_5_72b,BIGGEN,3.422,[],biggen_240829.csv +codetulu_2_34b,BIGGEN,3.421,[],biggen_240829.csv +gemma_1_1_7b_it,BIGGEN,3.407,[],biggen_240829.csv +openhermes_2_mistral_7b,BIGGEN,3.394,[],biggen_240829.csv +codellama34b_instruct,BIGGEN,3.363,[],biggen_240829.csv +yi_34b,BIGGEN,3.322,[],biggen_240829.csv +llama_2_70b,BIGGEN,3.317,[],biggen_240829.csv +qwen1_5_32b,BIGGEN,3.312,[],biggen_240829.csv +llama_2_7b_chat,BIGGEN,3.307,[],biggen_240829.csv +tulu_2_dpo_7b,BIGGEN,3.28,[],biggen_240829.csv +codetulu_2_13b,BIGGEN,3.254,[],biggen_240829.csv +solar_10_7b_v1_0,BIGGEN,3.248,[],biggen_240829.csv +tulu_2_13b,BIGGEN,3.211,[],biggen_240829.csv +codellama_13b_instruct,BIGGEN,3.206,[],biggen_240829.csv +yi_6b_chat,BIGGEN,3.204,[],biggen_240829.csv +codellama_7b_instruct,BIGGEN,3.14,[],biggen_240829.csv +gemma_7b_it,BIGGEN,3.132,[],biggen_240829.csv +llama3_70b,BIGGEN,3.122,[],biggen_240829.csv +qwen1_5_14b,BIGGEN,3.106,[],biggen_240829.csv +gemma_1_1_2b_it,BIGGEN,3.072,[],biggen_240829.csv +codetulu_2_7b,BIGGEN,3.07,[],biggen_240829.csv +tulu_2_7b,BIGGEN,3.041,[],biggen_240829.csv +mistral_7b_v0_2,BIGGEN,3.024,[],biggen_240829.csv +mistral_7b_v0_1,BIGGEN,3.006,[],biggen_240829.csv +qwen1_5_4b_chat,BIGGEN,2.976,[],biggen_240829.csv +olmo_7b_instruct,BIGGEN,2.974,[],biggen_240829.csv +gemma_2b_it,BIGGEN,2.932,[],biggen_240829.csv +qwen1_5_7b,BIGGEN,2.872,[],biggen_240829.csv +phi_2,BIGGEN,2.859,[],biggen_240829.csv +olmo_7b_sft,BIGGEN,2.827,[],biggen_240829.csv +codellama_70b_instruct,BIGGEN,2.805,[],biggen_240829.csv +llemma_34b,BIGGEN,2.771,[],biggen_240829.csv +llama3_8b,BIGGEN,2.743,[],biggen_240829.csv +qwen1_5_1_8b_chat,BIGGEN,2.741,[],biggen_240829.csv +qwen1_5_4b,BIGGEN,2.708,[],biggen_240829.csv +llama_2_13b,BIGGEN,2.703,[],biggen_240829.csv +yi_6b,BIGGEN,2.635,[],biggen_240829.csv +codellama_70b,BIGGEN,2.593,[],biggen_240829.csv +codellama34b,BIGGEN,2.509,[],biggen_240829.csv +phi_1_5,BIGGEN,2.497,[],biggen_240829.csv +orca_2_13b,BIGGEN,2.489,[],biggen_240829.csv +llama_2_7b,BIGGEN,2.457,[],biggen_240829.csv +qwen1_5_1_8b,BIGGEN,2.364,[],biggen_240829.csv +llemma_7b,BIGGEN,2.27,[],biggen_240829.csv +gemma_2b,BIGGEN,2.262,[],biggen_240829.csv +codellama_13b,BIGGEN,2.134,[],biggen_240829.csv +qwen1_5_0_5b_chat,BIGGEN,2.108,[],biggen_240829.csv +orca_2_7b,BIGGEN,2.083,[],biggen_240829.csv +olmo_7b,BIGGEN,2.081,[],biggen_240829.csv +codellama_7b,BIGGEN,1.954,[],biggen_240829.csv +qwen1_5_0_5b,BIGGEN,1.834,[],biggen_240829.csv +olmo_1b,BIGGEN,1.648,[],biggen_240829.csv +aya_101,BIGGEN,1.447,[],biggen_240829.csv +gemma_7b,BIGGEN,1.411,[],biggen_240829.csv +phi_1,BIGGEN,1.135,[],biggen_240829.csv +gpt_4_1106_preview,BIGGEN Grounding,4.288,[],biggen_240829.csv +gpt_4_0125_preview,BIGGEN Grounding,4.3,[],biggen_240829.csv +gpt_4o_2024_05_13,BIGGEN Grounding,4.238,[],biggen_240829.csv +gpt_4_turbo_2024_04_09,BIGGEN Grounding,4.312,[],biggen_240829.csv +claude_3_opus_20240229,BIGGEN Grounding,4.288,[],biggen_240829.csv +llama3_70b_instruct,BIGGEN Grounding,4.125,[],biggen_240829.csv +claude_3_sonnet_20240229,BIGGEN Grounding,4.25,[],biggen_240829.csv +qwen_110b_chat,BIGGEN Grounding,4.15,[],biggen_240829.csv +claude_3_haiku_20240307,BIGGEN Grounding,4.138,[],biggen_240829.csv +gemini_pro_1_5,BIGGEN Grounding,4.05,[],biggen_240829.csv +mixtral_8x22b_instruct_v0_1_awq,BIGGEN Grounding,4.012,[],biggen_240829.csv +mistral_medium,BIGGEN Grounding,3.962,[],biggen_240829.csv +mistral_large,BIGGEN Grounding,4.025,[],biggen_240829.csv +gemini_flash_1_5,BIGGEN Grounding,4.138,[],biggen_240829.csv +c4ai_command_r_plus_gptq,BIGGEN Grounding,3.988,[],biggen_240829.csv +qwen1_5_72b_chat,BIGGEN Grounding,3.888,[],biggen_240829.csv +phi_3_mini_4k_instruct,BIGGEN Grounding,3.725,[],biggen_240829.csv +qwen1_5_32b_chat,BIGGEN Grounding,3.788,[],biggen_240829.csv +starling_lm_7b_beta,BIGGEN Grounding,3.8,[],biggen_240829.csv +llama3_8b_instruct,BIGGEN Grounding,4.125,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_dpo,BIGGEN Grounding,3.812,[],biggen_240829.csv +yi_34b_chat,BIGGEN Grounding,3.738,[],biggen_240829.csv +mixtral_8x7b_instruct_v0_1,BIGGEN Grounding,3.9,[],biggen_240829.csv +gpt_3_5_turbo_0125,BIGGEN Grounding,3.925,[],biggen_240829.csv +tulu_2_dpo_70b,BIGGEN Grounding,3.7,[],biggen_240829.csv +phi_3_mini_128k_instruct,BIGGEN Grounding,3.712,[],biggen_240829.csv +gpt_3_5_turbo_1106,BIGGEN Grounding,4.025,[],biggen_240829.csv +c4ai_command_r_v0_1,BIGGEN Grounding,3.812,[],biggen_240829.csv +solar_10_7b_instruct_v1_0,BIGGEN Grounding,3.812,[],biggen_240829.csv +llama_2_70b_chat,BIGGEN Grounding,3.662,[],biggen_240829.csv +gemini_1_0_pro,BIGGEN Grounding,3.6,[],biggen_240829.csv +mistral_7b_instruct_v0_2,BIGGEN Grounding,3.7,[],biggen_240829.csv +mixtral_8x22b_v0_1_awq,BIGGEN Grounding,3.688,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_sft,BIGGEN Grounding,3.65,[],biggen_240829.csv +openchat_3_5_0106,BIGGEN Grounding,3.638,[],biggen_240829.csv +zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN Grounding,3.55,[],biggen_240829.csv +qwen1_5_14b_chat,BIGGEN Grounding,3.625,[],biggen_240829.csv +qwen1_5_7b_chat,BIGGEN Grounding,3.588,[],biggen_240829.csv +starling_lm_7b_alpha,BIGGEN Grounding,3.712,[],biggen_240829.csv +zephyr_7b_beta,BIGGEN Grounding,3.55,[],biggen_240829.csv +nous_hermes_2_mistral_7b_dpo,BIGGEN Grounding,3.662,[],biggen_240829.csv +nous_hermes_2_yi_34b,BIGGEN Grounding,3.338,[],biggen_240829.csv +mistral_orpo_beta,BIGGEN Grounding,3.612,[],biggen_240829.csv +llama_2_13b_chat,BIGGEN Grounding,3.662,[],biggen_240829.csv +openhermes_2_5_mistral_7b,BIGGEN Grounding,3.688,[],biggen_240829.csv +mixtral_8x7b_v0_1,BIGGEN Grounding,3.712,[],biggen_240829.csv +mistral_orpo_alpha,BIGGEN Grounding,3.525,[],biggen_240829.csv +tulu_2_dpo_13b,BIGGEN Grounding,3.45,[],biggen_240829.csv +qwen1_5_72b,BIGGEN Grounding,3.488,[],biggen_240829.csv +codetulu_2_34b,BIGGEN Grounding,3.45,[],biggen_240829.csv +gemma_1_1_7b_it,BIGGEN Grounding,3.588,[],biggen_240829.csv +openhermes_2_mistral_7b,BIGGEN Grounding,3.525,[],biggen_240829.csv +codellama34b_instruct,BIGGEN Grounding,3.5,[],biggen_240829.csv +yi_34b,BIGGEN Grounding,3.512,[],biggen_240829.csv +llama_2_70b,BIGGEN Grounding,3.425,[],biggen_240829.csv +qwen1_5_32b,BIGGEN Grounding,3.325,[],biggen_240829.csv +llama_2_7b_chat,BIGGEN Grounding,3.388,[],biggen_240829.csv +tulu_2_dpo_7b,BIGGEN Grounding,3.238,[],biggen_240829.csv +codetulu_2_13b,BIGGEN Grounding,3.225,[],biggen_240829.csv +solar_10_7b_v1_0,BIGGEN Grounding,3.25,[],biggen_240829.csv +tulu_2_13b,BIGGEN Grounding,3.15,[],biggen_240829.csv +codellama_13b_instruct,BIGGEN Grounding,3.262,[],biggen_240829.csv +yi_6b_chat,BIGGEN Grounding,3.275,[],biggen_240829.csv +codellama_7b_instruct,BIGGEN Grounding,3.212,[],biggen_240829.csv +gemma_7b_it,BIGGEN Grounding,3.312,[],biggen_240829.csv +llama3_70b,BIGGEN Grounding,3.35,[],biggen_240829.csv +qwen1_5_14b,BIGGEN Grounding,3.538,[],biggen_240829.csv +gemma_1_1_2b_it,BIGGEN Grounding,2.9,[],biggen_240829.csv +codetulu_2_7b,BIGGEN Grounding,3.112,[],biggen_240829.csv +tulu_2_7b,BIGGEN Grounding,2.862,[],biggen_240829.csv +mistral_7b_v0_2,BIGGEN Grounding,3.15,[],biggen_240829.csv +mistral_7b_v0_1,BIGGEN Grounding,3.225,[],biggen_240829.csv +qwen1_5_4b_chat,BIGGEN Grounding,2.9,[],biggen_240829.csv +olmo_7b_instruct,BIGGEN Grounding,3.112,[],biggen_240829.csv +gemma_2b_it,BIGGEN Grounding,2.875,[],biggen_240829.csv +qwen1_5_7b,BIGGEN Grounding,2.988,[],biggen_240829.csv +phi_2,BIGGEN Grounding,3.138,[],biggen_240829.csv +olmo_7b_sft,BIGGEN Grounding,2.95,[],biggen_240829.csv +codellama_70b_instruct,BIGGEN Grounding,2.85,[],biggen_240829.csv +llemma_34b,BIGGEN Grounding,2.988,[],biggen_240829.csv +llama3_8b,BIGGEN Grounding,3.262,[],biggen_240829.csv +qwen1_5_1_8b_chat,BIGGEN Grounding,2.812,[],biggen_240829.csv +qwen1_5_4b,BIGGEN Grounding,2.888,[],biggen_240829.csv +llama_2_13b,BIGGEN Grounding,2.85,[],biggen_240829.csv +yi_6b,BIGGEN Grounding,2.938,[],biggen_240829.csv +codellama_70b,BIGGEN Grounding,2.938,[],biggen_240829.csv +codellama34b,BIGGEN Grounding,2.812,[],biggen_240829.csv +phi_1_5,BIGGEN Grounding,2.475,[],biggen_240829.csv +orca_2_13b,BIGGEN Grounding,2.938,[],biggen_240829.csv +llama_2_7b,BIGGEN Grounding,2.612,[],biggen_240829.csv +qwen1_5_1_8b,BIGGEN Grounding,2.538,[],biggen_240829.csv +llemma_7b,BIGGEN Grounding,2.412,[],biggen_240829.csv +gemma_2b,BIGGEN Grounding,2.338,[],biggen_240829.csv +codellama_13b,BIGGEN Grounding,2.3,[],biggen_240829.csv +qwen1_5_0_5b_chat,BIGGEN Grounding,2.2,[],biggen_240829.csv +orca_2_7b,BIGGEN Grounding,2.425,[],biggen_240829.csv +olmo_7b,BIGGEN Grounding,2.388,[],biggen_240829.csv +codellama_7b,BIGGEN Grounding,1.962,[],biggen_240829.csv +qwen1_5_0_5b,BIGGEN Grounding,2.025,[],biggen_240829.csv +olmo_1b,BIGGEN Grounding,1.762,[],biggen_240829.csv +aya_101,BIGGEN Grounding,1.288,[],biggen_240829.csv +gemma_7b,BIGGEN Grounding,1.325,[],biggen_240829.csv +phi_1,BIGGEN Grounding,1.112,[],biggen_240829.csv +gpt_4_1106_preview,BIGGEN Instruction Following,4.23,[],biggen_240829.csv +gpt_4_0125_preview,BIGGEN Instruction Following,4.2,[],biggen_240829.csv +gpt_4o_2024_05_13,BIGGEN Instruction Following,4.26,[],biggen_240829.csv +gpt_4_turbo_2024_04_09,BIGGEN Instruction Following,4.13,[],biggen_240829.csv +claude_3_opus_20240229,BIGGEN Instruction Following,4.06,[],biggen_240829.csv +llama3_70b_instruct,BIGGEN Instruction Following,4.18,[],biggen_240829.csv +claude_3_sonnet_20240229,BIGGEN Instruction Following,3.92,[],biggen_240829.csv +qwen_110b_chat,BIGGEN Instruction Following,4.01,[],biggen_240829.csv +claude_3_haiku_20240307,BIGGEN Instruction Following,4.01,[],biggen_240829.csv +gemini_pro_1_5,BIGGEN Instruction Following,4.04,[],biggen_240829.csv +mixtral_8x22b_instruct_v0_1_awq,BIGGEN Instruction Following,4.0,[],biggen_240829.csv +mistral_medium,BIGGEN Instruction Following,3.94,[],biggen_240829.csv +mistral_large,BIGGEN Instruction Following,3.99,[],biggen_240829.csv +gemini_flash_1_5,BIGGEN Instruction Following,3.91,[],biggen_240829.csv +c4ai_command_r_plus_gptq,BIGGEN Instruction Following,4.0,[],biggen_240829.csv +qwen1_5_72b_chat,BIGGEN Instruction Following,3.99,[],biggen_240829.csv +phi_3_mini_4k_instruct,BIGGEN Instruction Following,3.88,[],biggen_240829.csv +qwen1_5_32b_chat,BIGGEN Instruction Following,3.85,[],biggen_240829.csv +starling_lm_7b_beta,BIGGEN Instruction Following,3.84,[],biggen_240829.csv +llama3_8b_instruct,BIGGEN Instruction Following,3.94,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_dpo,BIGGEN Instruction Following,4.06,[],biggen_240829.csv +yi_34b_chat,BIGGEN Instruction Following,3.83,[],biggen_240829.csv +mixtral_8x7b_instruct_v0_1,BIGGEN Instruction Following,3.88,[],biggen_240829.csv +gpt_3_5_turbo_0125,BIGGEN Instruction Following,3.85,[],biggen_240829.csv +tulu_2_dpo_70b,BIGGEN Instruction Following,3.89,[],biggen_240829.csv +phi_3_mini_128k_instruct,BIGGEN Instruction Following,3.8,[],biggen_240829.csv +gpt_3_5_turbo_1106,BIGGEN Instruction Following,3.79,[],biggen_240829.csv +c4ai_command_r_v0_1,BIGGEN Instruction Following,3.88,[],biggen_240829.csv +solar_10_7b_instruct_v1_0,BIGGEN Instruction Following,3.77,[],biggen_240829.csv +llama_2_70b_chat,BIGGEN Instruction Following,3.88,[],biggen_240829.csv +gemini_1_0_pro,BIGGEN Instruction Following,3.84,[],biggen_240829.csv +mistral_7b_instruct_v0_2,BIGGEN Instruction Following,3.87,[],biggen_240829.csv +mixtral_8x22b_v0_1_awq,BIGGEN Instruction Following,3.7,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_sft,BIGGEN Instruction Following,3.78,[],biggen_240829.csv +openchat_3_5_0106,BIGGEN Instruction Following,3.84,[],biggen_240829.csv +zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN Instruction Following,3.62,[],biggen_240829.csv +qwen1_5_14b_chat,BIGGEN Instruction Following,3.9,[],biggen_240829.csv +qwen1_5_7b_chat,BIGGEN Instruction Following,3.88,[],biggen_240829.csv +starling_lm_7b_alpha,BIGGEN Instruction Following,3.72,[],biggen_240829.csv +zephyr_7b_beta,BIGGEN Instruction Following,3.72,[],biggen_240829.csv +nous_hermes_2_mistral_7b_dpo,BIGGEN Instruction Following,3.74,[],biggen_240829.csv +nous_hermes_2_yi_34b,BIGGEN Instruction Following,3.65,[],biggen_240829.csv +mistral_orpo_beta,BIGGEN Instruction Following,3.8,[],biggen_240829.csv +llama_2_13b_chat,BIGGEN Instruction Following,3.92,[],biggen_240829.csv +openhermes_2_5_mistral_7b,BIGGEN Instruction Following,3.66,[],biggen_240829.csv +mixtral_8x7b_v0_1,BIGGEN Instruction Following,3.58,[],biggen_240829.csv +mistral_orpo_alpha,BIGGEN Instruction Following,3.7,[],biggen_240829.csv +tulu_2_dpo_13b,BIGGEN Instruction Following,3.77,[],biggen_240829.csv +qwen1_5_72b,BIGGEN Instruction Following,3.6,[],biggen_240829.csv +codetulu_2_34b,BIGGEN Instruction Following,3.51,[],biggen_240829.csv +gemma_1_1_7b_it,BIGGEN Instruction Following,3.53,[],biggen_240829.csv +openhermes_2_mistral_7b,BIGGEN Instruction Following,3.66,[],biggen_240829.csv +codellama34b_instruct,BIGGEN Instruction Following,3.5,[],biggen_240829.csv +yi_34b,BIGGEN Instruction Following,3.54,[],biggen_240829.csv +llama_2_70b,BIGGEN Instruction Following,3.56,[],biggen_240829.csv +qwen1_5_32b,BIGGEN Instruction Following,3.64,[],biggen_240829.csv +llama_2_7b_chat,BIGGEN Instruction Following,3.58,[],biggen_240829.csv +tulu_2_dpo_7b,BIGGEN Instruction Following,3.76,[],biggen_240829.csv +codetulu_2_13b,BIGGEN Instruction Following,3.5,[],biggen_240829.csv +solar_10_7b_v1_0,BIGGEN Instruction Following,3.56,[],biggen_240829.csv +tulu_2_13b,BIGGEN Instruction Following,3.38,[],biggen_240829.csv +codellama_13b_instruct,BIGGEN Instruction Following,3.34,[],biggen_240829.csv +yi_6b_chat,BIGGEN Instruction Following,3.52,[],biggen_240829.csv +codellama_7b_instruct,BIGGEN Instruction Following,3.36,[],biggen_240829.csv +gemma_7b_it,BIGGEN Instruction Following,3.43,[],biggen_240829.csv +llama3_70b,BIGGEN Instruction Following,3.33,[],biggen_240829.csv +qwen1_5_14b,BIGGEN Instruction Following,3.41,[],biggen_240829.csv +gemma_1_1_2b_it,BIGGEN Instruction Following,3.34,[],biggen_240829.csv +codetulu_2_7b,BIGGEN Instruction Following,3.41,[],biggen_240829.csv +tulu_2_7b,BIGGEN Instruction Following,3.34,[],biggen_240829.csv +mistral_7b_v0_2,BIGGEN Instruction Following,3.33,[],biggen_240829.csv +mistral_7b_v0_1,BIGGEN Instruction Following,3.3,[],biggen_240829.csv +qwen1_5_4b_chat,BIGGEN Instruction Following,3.19,[],biggen_240829.csv +olmo_7b_instruct,BIGGEN Instruction Following,3.54,[],biggen_240829.csv +gemma_2b_it,BIGGEN Instruction Following,3.24,[],biggen_240829.csv +qwen1_5_7b,BIGGEN Instruction Following,3.14,[],biggen_240829.csv +phi_2,BIGGEN Instruction Following,2.92,[],biggen_240829.csv +olmo_7b_sft,BIGGEN Instruction Following,3.27,[],biggen_240829.csv +codellama_70b_instruct,BIGGEN Instruction Following,2.7,[],biggen_240829.csv +llemma_34b,BIGGEN Instruction Following,2.97,[],biggen_240829.csv +llama3_8b,BIGGEN Instruction Following,2.94,[],biggen_240829.csv +qwen1_5_1_8b_chat,BIGGEN Instruction Following,3.27,[],biggen_240829.csv +qwen1_5_4b,BIGGEN Instruction Following,2.94,[],biggen_240829.csv +llama_2_13b,BIGGEN Instruction Following,3.09,[],biggen_240829.csv +yi_6b,BIGGEN Instruction Following,2.97,[],biggen_240829.csv +codellama_70b,BIGGEN Instruction Following,2.62,[],biggen_240829.csv +codellama34b,BIGGEN Instruction Following,2.66,[],biggen_240829.csv +phi_1_5,BIGGEN Instruction Following,2.89,[],biggen_240829.csv +orca_2_13b,BIGGEN Instruction Following,2.49,[],biggen_240829.csv +llama_2_7b,BIGGEN Instruction Following,2.87,[],biggen_240829.csv +qwen1_5_1_8b,BIGGEN Instruction Following,2.85,[],biggen_240829.csv +llemma_7b,BIGGEN Instruction Following,2.57,[],biggen_240829.csv +gemma_2b,BIGGEN Instruction Following,2.72,[],biggen_240829.csv +codellama_13b,BIGGEN Instruction Following,2.3,[],biggen_240829.csv +qwen1_5_0_5b_chat,BIGGEN Instruction Following,2.61,[],biggen_240829.csv +orca_2_7b,BIGGEN Instruction Following,2.27,[],biggen_240829.csv +olmo_7b,BIGGEN Instruction Following,2.26,[],biggen_240829.csv +codellama_7b,BIGGEN Instruction Following,2.25,[],biggen_240829.csv +qwen1_5_0_5b,BIGGEN Instruction Following,2.12,[],biggen_240829.csv +olmo_1b,BIGGEN Instruction Following,1.8,[],biggen_240829.csv +aya_101,BIGGEN Instruction Following,1.45,[],biggen_240829.csv +gemma_7b,BIGGEN Instruction Following,1.49,[],biggen_240829.csv +phi_1,BIGGEN Instruction Following,1.01,[],biggen_240829.csv +gpt_4_1106_preview,BIGGEN Planning,4.271,[],biggen_240829.csv +gpt_4_0125_preview,BIGGEN Planning,4.357,[],biggen_240829.csv +gpt_4o_2024_05_13,BIGGEN Planning,4.357,[],biggen_240829.csv +gpt_4_turbo_2024_04_09,BIGGEN Planning,4.3,[],biggen_240829.csv +claude_3_opus_20240229,BIGGEN Planning,4.186,[],biggen_240829.csv +llama3_70b_instruct,BIGGEN Planning,4.186,[],biggen_240829.csv +claude_3_sonnet_20240229,BIGGEN Planning,4.171,[],biggen_240829.csv +qwen_110b_chat,BIGGEN Planning,4.229,[],biggen_240829.csv +claude_3_haiku_20240307,BIGGEN Planning,4.129,[],biggen_240829.csv +gemini_pro_1_5,BIGGEN Planning,4.129,[],biggen_240829.csv +mixtral_8x22b_instruct_v0_1_awq,BIGGEN Planning,4.0,[],biggen_240829.csv +mistral_medium,BIGGEN Planning,4.029,[],biggen_240829.csv +mistral_large,BIGGEN Planning,4.029,[],biggen_240829.csv +gemini_flash_1_5,BIGGEN Planning,3.971,[],biggen_240829.csv +c4ai_command_r_plus_gptq,BIGGEN Planning,4.186,[],biggen_240829.csv +qwen1_5_72b_chat,BIGGEN Planning,4.029,[],biggen_240829.csv +phi_3_mini_4k_instruct,BIGGEN Planning,3.8,[],biggen_240829.csv +qwen1_5_32b_chat,BIGGEN Planning,4.029,[],biggen_240829.csv +starling_lm_7b_beta,BIGGEN Planning,4.0,[],biggen_240829.csv +llama3_8b_instruct,BIGGEN Planning,3.929,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_dpo,BIGGEN Planning,3.957,[],biggen_240829.csv +yi_34b_chat,BIGGEN Planning,3.914,[],biggen_240829.csv +mixtral_8x7b_instruct_v0_1,BIGGEN Planning,3.6,[],biggen_240829.csv +gpt_3_5_turbo_0125,BIGGEN Planning,3.843,[],biggen_240829.csv +tulu_2_dpo_70b,BIGGEN Planning,3.9,[],biggen_240829.csv +phi_3_mini_128k_instruct,BIGGEN Planning,3.7,[],biggen_240829.csv +gpt_3_5_turbo_1106,BIGGEN Planning,3.829,[],biggen_240829.csv +c4ai_command_r_v0_1,BIGGEN Planning,3.9,[],biggen_240829.csv +solar_10_7b_instruct_v1_0,BIGGEN Planning,3.857,[],biggen_240829.csv +llama_2_70b_chat,BIGGEN Planning,3.929,[],biggen_240829.csv +gemini_1_0_pro,BIGGEN Planning,3.871,[],biggen_240829.csv +mistral_7b_instruct_v0_2,BIGGEN Planning,3.8,[],biggen_240829.csv +mixtral_8x22b_v0_1_awq,BIGGEN Planning,3.743,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_sft,BIGGEN Planning,3.714,[],biggen_240829.csv +openchat_3_5_0106,BIGGEN Planning,3.757,[],biggen_240829.csv +zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN Planning,3.957,[],biggen_240829.csv +qwen1_5_14b_chat,BIGGEN Planning,3.857,[],biggen_240829.csv +qwen1_5_7b_chat,BIGGEN Planning,3.714,[],biggen_240829.csv +starling_lm_7b_alpha,BIGGEN Planning,3.829,[],biggen_240829.csv +zephyr_7b_beta,BIGGEN Planning,3.729,[],biggen_240829.csv +nous_hermes_2_mistral_7b_dpo,BIGGEN Planning,3.8,[],biggen_240829.csv +nous_hermes_2_yi_34b,BIGGEN Planning,3.643,[],biggen_240829.csv +mistral_orpo_beta,BIGGEN Planning,3.686,[],biggen_240829.csv +llama_2_13b_chat,BIGGEN Planning,3.686,[],biggen_240829.csv +openhermes_2_5_mistral_7b,BIGGEN Planning,3.729,[],biggen_240829.csv +mixtral_8x7b_v0_1,BIGGEN Planning,3.5,[],biggen_240829.csv +mistral_orpo_alpha,BIGGEN Planning,3.6,[],biggen_240829.csv +tulu_2_dpo_13b,BIGGEN Planning,3.6,[],biggen_240829.csv +qwen1_5_72b,BIGGEN Planning,3.5,[],biggen_240829.csv +codetulu_2_34b,BIGGEN Planning,3.686,[],biggen_240829.csv +gemma_1_1_7b_it,BIGGEN Planning,3.371,[],biggen_240829.csv +openhermes_2_mistral_7b,BIGGEN Planning,3.8,[],biggen_240829.csv +codellama34b_instruct,BIGGEN Planning,3.457,[],biggen_240829.csv +yi_34b,BIGGEN Planning,3.529,[],biggen_240829.csv +llama_2_70b,BIGGEN Planning,3.386,[],biggen_240829.csv +qwen1_5_32b,BIGGEN Planning,3.514,[],biggen_240829.csv +llama_2_7b_chat,BIGGEN Planning,3.586,[],biggen_240829.csv +tulu_2_dpo_7b,BIGGEN Planning,3.5,[],biggen_240829.csv +codetulu_2_13b,BIGGEN Planning,3.4,[],biggen_240829.csv +solar_10_7b_v1_0,BIGGEN Planning,3.371,[],biggen_240829.csv +tulu_2_13b,BIGGEN Planning,3.4,[],biggen_240829.csv +codellama_13b_instruct,BIGGEN Planning,3.357,[],biggen_240829.csv +yi_6b_chat,BIGGEN Planning,3.414,[],biggen_240829.csv +codellama_7b_instruct,BIGGEN Planning,3.286,[],biggen_240829.csv +gemma_7b_it,BIGGEN Planning,3.071,[],biggen_240829.csv +llama3_70b,BIGGEN Planning,3.114,[],biggen_240829.csv +qwen1_5_14b,BIGGEN Planning,3.157,[],biggen_240829.csv +gemma_1_1_2b_it,BIGGEN Planning,3.229,[],biggen_240829.csv +codetulu_2_7b,BIGGEN Planning,3.114,[],biggen_240829.csv +tulu_2_7b,BIGGEN Planning,3.229,[],biggen_240829.csv +mistral_7b_v0_2,BIGGEN Planning,3.1,[],biggen_240829.csv +mistral_7b_v0_1,BIGGEN Planning,3.243,[],biggen_240829.csv +qwen1_5_4b_chat,BIGGEN Planning,3.086,[],biggen_240829.csv +olmo_7b_instruct,BIGGEN Planning,3.271,[],biggen_240829.csv +gemma_2b_it,BIGGEN Planning,3.114,[],biggen_240829.csv +qwen1_5_7b,BIGGEN Planning,3.014,[],biggen_240829.csv +phi_2,BIGGEN Planning,2.857,[],biggen_240829.csv +olmo_7b_sft,BIGGEN Planning,2.957,[],biggen_240829.csv +codellama_70b_instruct,BIGGEN Planning,2.671,[],biggen_240829.csv +llemma_34b,BIGGEN Planning,2.743,[],biggen_240829.csv +llama3_8b,BIGGEN Planning,2.657,[],biggen_240829.csv +qwen1_5_1_8b_chat,BIGGEN Planning,2.914,[],biggen_240829.csv +qwen1_5_4b,BIGGEN Planning,2.729,[],biggen_240829.csv +llama_2_13b,BIGGEN Planning,2.786,[],biggen_240829.csv +yi_6b,BIGGEN Planning,2.657,[],biggen_240829.csv +codellama_70b,BIGGEN Planning,2.557,[],biggen_240829.csv +codellama34b,BIGGEN Planning,2.486,[],biggen_240829.csv +phi_1_5,BIGGEN Planning,2.5,[],biggen_240829.csv +orca_2_13b,BIGGEN Planning,1.786,[],biggen_240829.csv +llama_2_7b,BIGGEN Planning,2.514,[],biggen_240829.csv +qwen1_5_1_8b,BIGGEN Planning,2.386,[],biggen_240829.csv +llemma_7b,BIGGEN Planning,2.086,[],biggen_240829.csv +gemma_2b,BIGGEN Planning,2.357,[],biggen_240829.csv +codellama_13b,BIGGEN Planning,1.957,[],biggen_240829.csv +qwen1_5_0_5b_chat,BIGGEN Planning,2.057,[],biggen_240829.csv +orca_2_7b,BIGGEN Planning,1.371,[],biggen_240829.csv +olmo_7b,BIGGEN Planning,1.929,[],biggen_240829.csv +codellama_7b,BIGGEN Planning,1.771,[],biggen_240829.csv +qwen1_5_0_5b,BIGGEN Planning,1.7,[],biggen_240829.csv +olmo_1b,BIGGEN Planning,1.443,[],biggen_240829.csv +aya_101,BIGGEN Planning,1.471,[],biggen_240829.csv +gemma_7b,BIGGEN Planning,1.186,[],biggen_240829.csv +phi_1,BIGGEN Planning,1.0,[],biggen_240829.csv +gpt_4_1106_preview,BIGGEN Reasoning,4.22,[],biggen_240829.csv +gpt_4_0125_preview,BIGGEN Reasoning,4.16,[],biggen_240829.csv +gpt_4o_2024_05_13,BIGGEN Reasoning,4.21,[],biggen_240829.csv +gpt_4_turbo_2024_04_09,BIGGEN Reasoning,4.2,[],biggen_240829.csv +claude_3_opus_20240229,BIGGEN Reasoning,3.97,[],biggen_240829.csv +llama3_70b_instruct,BIGGEN Reasoning,3.87,[],biggen_240829.csv +claude_3_sonnet_20240229,BIGGEN Reasoning,3.91,[],biggen_240829.csv +qwen_110b_chat,BIGGEN Reasoning,3.94,[],biggen_240829.csv +claude_3_haiku_20240307,BIGGEN Reasoning,3.69,[],biggen_240829.csv +gemini_pro_1_5,BIGGEN Reasoning,4.06,[],biggen_240829.csv +mixtral_8x22b_instruct_v0_1_awq,BIGGEN Reasoning,3.96,[],biggen_240829.csv +mistral_medium,BIGGEN Reasoning,3.95,[],biggen_240829.csv +mistral_large,BIGGEN Reasoning,3.93,[],biggen_240829.csv +gemini_flash_1_5,BIGGEN Reasoning,3.92,[],biggen_240829.csv +c4ai_command_r_plus_gptq,BIGGEN Reasoning,3.64,[],biggen_240829.csv +qwen1_5_72b_chat,BIGGEN Reasoning,3.68,[],biggen_240829.csv +phi_3_mini_4k_instruct,BIGGEN Reasoning,3.81,[],biggen_240829.csv +qwen1_5_32b_chat,BIGGEN Reasoning,3.62,[],biggen_240829.csv +starling_lm_7b_beta,BIGGEN Reasoning,3.56,[],biggen_240829.csv +llama3_8b_instruct,BIGGEN Reasoning,3.47,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_dpo,BIGGEN Reasoning,3.53,[],biggen_240829.csv +yi_34b_chat,BIGGEN Reasoning,3.57,[],biggen_240829.csv +mixtral_8x7b_instruct_v0_1,BIGGEN Reasoning,3.71,[],biggen_240829.csv +gpt_3_5_turbo_0125,BIGGEN Reasoning,3.65,[],biggen_240829.csv +tulu_2_dpo_70b,BIGGEN Reasoning,3.36,[],biggen_240829.csv +phi_3_mini_128k_instruct,BIGGEN Reasoning,3.82,[],biggen_240829.csv +gpt_3_5_turbo_1106,BIGGEN Reasoning,3.51,[],biggen_240829.csv +c4ai_command_r_v0_1,BIGGEN Reasoning,3.39,[],biggen_240829.csv +solar_10_7b_instruct_v1_0,BIGGEN Reasoning,3.42,[],biggen_240829.csv +llama_2_70b_chat,BIGGEN Reasoning,3.22,[],biggen_240829.csv +gemini_1_0_pro,BIGGEN Reasoning,3.62,[],biggen_240829.csv +mistral_7b_instruct_v0_2,BIGGEN Reasoning,3.18,[],biggen_240829.csv +mixtral_8x22b_v0_1_awq,BIGGEN Reasoning,3.5,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_sft,BIGGEN Reasoning,3.39,[],biggen_240829.csv +openchat_3_5_0106,BIGGEN Reasoning,3.34,[],biggen_240829.csv +zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN Reasoning,3.52,[],biggen_240829.csv +qwen1_5_14b_chat,BIGGEN Reasoning,3.36,[],biggen_240829.csv +qwen1_5_7b_chat,BIGGEN Reasoning,3.3,[],biggen_240829.csv +starling_lm_7b_alpha,BIGGEN Reasoning,3.33,[],biggen_240829.csv +zephyr_7b_beta,BIGGEN Reasoning,3.23,[],biggen_240829.csv +nous_hermes_2_mistral_7b_dpo,BIGGEN Reasoning,3.26,[],biggen_240829.csv +nous_hermes_2_yi_34b,BIGGEN Reasoning,3.53,[],biggen_240829.csv +mistral_orpo_beta,BIGGEN Reasoning,3.12,[],biggen_240829.csv +llama_2_13b_chat,BIGGEN Reasoning,2.76,[],biggen_240829.csv +openhermes_2_5_mistral_7b,BIGGEN Reasoning,3.28,[],biggen_240829.csv +mixtral_8x7b_v0_1,BIGGEN Reasoning,3.3,[],biggen_240829.csv +mistral_orpo_alpha,BIGGEN Reasoning,3.11,[],biggen_240829.csv +tulu_2_dpo_13b,BIGGEN Reasoning,2.9,[],biggen_240829.csv +qwen1_5_72b,BIGGEN Reasoning,3.25,[],biggen_240829.csv +codetulu_2_34b,BIGGEN Reasoning,3.01,[],biggen_240829.csv +gemma_1_1_7b_it,BIGGEN Reasoning,3.25,[],biggen_240829.csv +openhermes_2_mistral_7b,BIGGEN Reasoning,3.28,[],biggen_240829.csv +codellama34b_instruct,BIGGEN Reasoning,3.04,[],biggen_240829.csv +yi_34b,BIGGEN Reasoning,3.27,[],biggen_240829.csv +llama_2_70b,BIGGEN Reasoning,3.06,[],biggen_240829.csv +qwen1_5_32b,BIGGEN Reasoning,3.31,[],biggen_240829.csv +llama_2_7b_chat,BIGGEN Reasoning,2.85,[],biggen_240829.csv +tulu_2_dpo_7b,BIGGEN Reasoning,2.79,[],biggen_240829.csv +codetulu_2_13b,BIGGEN Reasoning,2.8,[],biggen_240829.csv +solar_10_7b_v1_0,BIGGEN Reasoning,2.96,[],biggen_240829.csv +tulu_2_13b,BIGGEN Reasoning,2.8,[],biggen_240829.csv +codellama_13b_instruct,BIGGEN Reasoning,2.77,[],biggen_240829.csv +yi_6b_chat,BIGGEN Reasoning,2.85,[],biggen_240829.csv +codellama_7b_instruct,BIGGEN Reasoning,2.75,[],biggen_240829.csv +gemma_7b_it,BIGGEN Reasoning,2.97,[],biggen_240829.csv +llama3_70b,BIGGEN Reasoning,3.04,[],biggen_240829.csv +qwen1_5_14b,BIGGEN Reasoning,3.0,[],biggen_240829.csv +gemma_1_1_2b_it,BIGGEN Reasoning,2.74,[],biggen_240829.csv +codetulu_2_7b,BIGGEN Reasoning,2.73,[],biggen_240829.csv +tulu_2_7b,BIGGEN Reasoning,2.81,[],biggen_240829.csv +mistral_7b_v0_2,BIGGEN Reasoning,2.78,[],biggen_240829.csv +mistral_7b_v0_1,BIGGEN Reasoning,2.86,[],biggen_240829.csv +qwen1_5_4b_chat,BIGGEN Reasoning,2.83,[],biggen_240829.csv +olmo_7b_instruct,BIGGEN Reasoning,2.47,[],biggen_240829.csv +gemma_2b_it,BIGGEN Reasoning,2.48,[],biggen_240829.csv +qwen1_5_7b,BIGGEN Reasoning,2.65,[],biggen_240829.csv +phi_2,BIGGEN Reasoning,2.8,[],biggen_240829.csv +olmo_7b_sft,BIGGEN Reasoning,2.4,[],biggen_240829.csv +codellama_70b_instruct,BIGGEN Reasoning,2.83,[],biggen_240829.csv +llemma_34b,BIGGEN Reasoning,2.75,[],biggen_240829.csv +llama3_8b,BIGGEN Reasoning,2.39,[],biggen_240829.csv +qwen1_5_1_8b_chat,BIGGEN Reasoning,2.28,[],biggen_240829.csv +qwen1_5_4b,BIGGEN Reasoning,2.45,[],biggen_240829.csv +llama_2_13b,BIGGEN Reasoning,2.28,[],biggen_240829.csv +yi_6b,BIGGEN Reasoning,2.36,[],biggen_240829.csv +codellama_70b,BIGGEN Reasoning,2.44,[],biggen_240829.csv +codellama34b,BIGGEN Reasoning,2.17,[],biggen_240829.csv +phi_1_5,BIGGEN Reasoning,2.24,[],biggen_240829.csv +orca_2_13b,BIGGEN Reasoning,2.24,[],biggen_240829.csv +llama_2_7b,BIGGEN Reasoning,2.18,[],biggen_240829.csv +qwen1_5_1_8b,BIGGEN Reasoning,1.98,[],biggen_240829.csv +llemma_7b,BIGGEN Reasoning,2.24,[],biggen_240829.csv +gemma_2b,BIGGEN Reasoning,2.16,[],biggen_240829.csv +codellama_13b,BIGGEN Reasoning,2.01,[],biggen_240829.csv +qwen1_5_0_5b_chat,BIGGEN Reasoning,1.76,[],biggen_240829.csv +orca_2_7b,BIGGEN Reasoning,1.85,[],biggen_240829.csv +olmo_7b,BIGGEN Reasoning,1.84,[],biggen_240829.csv +codellama_7b,BIGGEN Reasoning,1.72,[],biggen_240829.csv +qwen1_5_0_5b,BIGGEN Reasoning,1.58,[],biggen_240829.csv +olmo_1b,BIGGEN Reasoning,1.33,[],biggen_240829.csv +aya_101,BIGGEN Reasoning,1.25,[],biggen_240829.csv +gemma_7b,BIGGEN Reasoning,1.34,[],biggen_240829.csv +phi_1,BIGGEN Reasoning,1.0,[],biggen_240829.csv +gpt_4_1106_preview,BIGGEN Refinement,4.171,[],biggen_240829.csv +gpt_4_0125_preview,BIGGEN Refinement,4.145,[],biggen_240829.csv +gpt_4o_2024_05_13,BIGGEN Refinement,4.079,[],biggen_240829.csv +gpt_4_turbo_2024_04_09,BIGGEN Refinement,4.105,[],biggen_240829.csv +claude_3_opus_20240229,BIGGEN Refinement,3.908,[],biggen_240829.csv +llama3_70b_instruct,BIGGEN Refinement,3.907,[],biggen_240829.csv +claude_3_sonnet_20240229,BIGGEN Refinement,3.724,[],biggen_240829.csv +qwen_110b_chat,BIGGEN Refinement,3.882,[],biggen_240829.csv +claude_3_haiku_20240307,BIGGEN Refinement,3.632,[],biggen_240829.csv +gemini_pro_1_5,BIGGEN Refinement,3.671,[],biggen_240829.csv +mixtral_8x22b_instruct_v0_1_awq,BIGGEN Refinement,3.842,[],biggen_240829.csv +mistral_medium,BIGGEN Refinement,3.776,[],biggen_240829.csv +mistral_large,BIGGEN Refinement,3.776,[],biggen_240829.csv +gemini_flash_1_5,BIGGEN Refinement,3.453,[],biggen_240829.csv +c4ai_command_r_plus_gptq,BIGGEN Refinement,3.461,[],biggen_240829.csv +qwen1_5_72b_chat,BIGGEN Refinement,3.632,[],biggen_240829.csv +phi_3_mini_4k_instruct,BIGGEN Refinement,3.974,[],biggen_240829.csv +qwen1_5_32b_chat,BIGGEN Refinement,3.395,[],biggen_240829.csv +starling_lm_7b_beta,BIGGEN Refinement,3.547,[],biggen_240829.csv +llama3_8b_instruct,BIGGEN Refinement,3.507,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_dpo,BIGGEN Refinement,3.342,[],biggen_240829.csv +yi_34b_chat,BIGGEN Refinement,3.676,[],biggen_240829.csv +mixtral_8x7b_instruct_v0_1,BIGGEN Refinement,3.434,[],biggen_240829.csv +gpt_3_5_turbo_0125,BIGGEN Refinement,3.434,[],biggen_240829.csv +tulu_2_dpo_70b,BIGGEN Refinement,3.421,[],biggen_240829.csv +phi_3_mini_128k_instruct,BIGGEN Refinement,3.513,[],biggen_240829.csv +gpt_3_5_turbo_1106,BIGGEN Refinement,3.434,[],biggen_240829.csv +c4ai_command_r_v0_1,BIGGEN Refinement,3.447,[],biggen_240829.csv +solar_10_7b_instruct_v1_0,BIGGEN Refinement,3.382,[],biggen_240829.csv +llama_2_70b_chat,BIGGEN Refinement,3.36,[],biggen_240829.csv +gemini_1_0_pro,BIGGEN Refinement,3.373,[],biggen_240829.csv +mistral_7b_instruct_v0_2,BIGGEN Refinement,3.447,[],biggen_240829.csv +mixtral_8x22b_v0_1_awq,BIGGEN Refinement,3.539,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_sft,BIGGEN Refinement,3.461,[],biggen_240829.csv +openchat_3_5_0106,BIGGEN Refinement,3.566,[],biggen_240829.csv +zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN Refinement,3.618,[],biggen_240829.csv +qwen1_5_14b_chat,BIGGEN Refinement,3.263,[],biggen_240829.csv +qwen1_5_7b_chat,BIGGEN Refinement,3.395,[],biggen_240829.csv +starling_lm_7b_alpha,BIGGEN Refinement,3.224,[],biggen_240829.csv +zephyr_7b_beta,BIGGEN Refinement,3.382,[],biggen_240829.csv +nous_hermes_2_mistral_7b_dpo,BIGGEN Refinement,3.355,[],biggen_240829.csv +nous_hermes_2_yi_34b,BIGGEN Refinement,3.373,[],biggen_240829.csv +mistral_orpo_beta,BIGGEN Refinement,3.263,[],biggen_240829.csv +llama_2_13b_chat,BIGGEN Refinement,3.079,[],biggen_240829.csv +openhermes_2_5_mistral_7b,BIGGEN Refinement,3.276,[],biggen_240829.csv +mixtral_8x7b_v0_1,BIGGEN Refinement,3.237,[],biggen_240829.csv +mistral_orpo_alpha,BIGGEN Refinement,3.171,[],biggen_240829.csv +tulu_2_dpo_13b,BIGGEN Refinement,3.184,[],biggen_240829.csv +qwen1_5_72b,BIGGEN Refinement,3.227,[],biggen_240829.csv +codetulu_2_34b,BIGGEN Refinement,3.211,[],biggen_240829.csv +gemma_1_1_7b_it,BIGGEN Refinement,3.25,[],biggen_240829.csv +openhermes_2_mistral_7b,BIGGEN Refinement,3.28,[],biggen_240829.csv +codellama34b_instruct,BIGGEN Refinement,3.079,[],biggen_240829.csv +yi_34b,BIGGEN Refinement,3.24,[],biggen_240829.csv +llama_2_70b,BIGGEN Refinement,3.133,[],biggen_240829.csv +qwen1_5_32b,BIGGEN Refinement,3.118,[],biggen_240829.csv +llama_2_7b_chat,BIGGEN Refinement,2.961,[],biggen_240829.csv +tulu_2_dpo_7b,BIGGEN Refinement,3.079,[],biggen_240829.csv +codetulu_2_13b,BIGGEN Refinement,3.197,[],biggen_240829.csv +solar_10_7b_v1_0,BIGGEN Refinement,3.197,[],biggen_240829.csv +tulu_2_13b,BIGGEN Refinement,3.027,[],biggen_240829.csv +codellama_13b_instruct,BIGGEN Refinement,2.895,[],biggen_240829.csv +yi_6b_chat,BIGGEN Refinement,3.08,[],biggen_240829.csv +codellama_7b_instruct,BIGGEN Refinement,2.961,[],biggen_240829.csv +gemma_7b_it,BIGGEN Refinement,3.026,[],biggen_240829.csv +llama3_70b,BIGGEN Refinement,3.342,[],biggen_240829.csv +qwen1_5_14b,BIGGEN Refinement,3.092,[],biggen_240829.csv +gemma_1_1_2b_it,BIGGEN Refinement,3.053,[],biggen_240829.csv +codetulu_2_7b,BIGGEN Refinement,2.908,[],biggen_240829.csv +tulu_2_7b,BIGGEN Refinement,2.974,[],biggen_240829.csv +mistral_7b_v0_2,BIGGEN Refinement,2.892,[],biggen_240829.csv +mistral_7b_v0_1,BIGGEN Refinement,2.763,[],biggen_240829.csv +qwen1_5_4b_chat,BIGGEN Refinement,3.0,[],biggen_240829.csv +olmo_7b_instruct,BIGGEN Refinement,2.776,[],biggen_240829.csv +gemma_2b_it,BIGGEN Refinement,2.882,[],biggen_240829.csv +qwen1_5_7b,BIGGEN Refinement,2.827,[],biggen_240829.csv +phi_2,BIGGEN Refinement,2.763,[],biggen_240829.csv +olmo_7b_sft,BIGGEN Refinement,2.684,[],biggen_240829.csv +codellama_70b_instruct,BIGGEN Refinement,2.747,[],biggen_240829.csv +llemma_34b,BIGGEN Refinement,2.816,[],biggen_240829.csv +llama3_8b,BIGGEN Refinement,3.039,[],biggen_240829.csv +qwen1_5_1_8b_chat,BIGGEN Refinement,2.855,[],biggen_240829.csv +qwen1_5_4b,BIGGEN Refinement,2.697,[],biggen_240829.csv +llama_2_13b,BIGGEN Refinement,2.579,[],biggen_240829.csv +yi_6b,BIGGEN Refinement,2.487,[],biggen_240829.csv +codellama_70b,BIGGEN Refinement,2.507,[],biggen_240829.csv +codellama34b,BIGGEN Refinement,2.566,[],biggen_240829.csv +phi_1_5,BIGGEN Refinement,2.526,[],biggen_240829.csv +orca_2_13b,BIGGEN Refinement,2.487,[],biggen_240829.csv +llama_2_7b,BIGGEN Refinement,2.211,[],biggen_240829.csv +qwen1_5_1_8b,BIGGEN Refinement,2.605,[],biggen_240829.csv +llemma_7b,BIGGEN Refinement,2.303,[],biggen_240829.csv +gemma_2b,BIGGEN Refinement,2.093,[],biggen_240829.csv +codellama_13b,BIGGEN Refinement,2.092,[],biggen_240829.csv +qwen1_5_0_5b_chat,BIGGEN Refinement,2.0,[],biggen_240829.csv +orca_2_7b,BIGGEN Refinement,2.316,[],biggen_240829.csv +olmo_7b,BIGGEN Refinement,2.105,[],biggen_240829.csv +codellama_7b,BIGGEN Refinement,2.118,[],biggen_240829.csv +qwen1_5_0_5b,BIGGEN Refinement,2.158,[],biggen_240829.csv +olmo_1b,BIGGEN Refinement,1.947,[],biggen_240829.csv +aya_101,BIGGEN Refinement,1.908,[],biggen_240829.csv +gemma_7b,BIGGEN Refinement,1.579,[],biggen_240829.csv +phi_1,BIGGEN Refinement,1.434,[],biggen_240829.csv +gpt_4_1106_preview,BIGGEN Safety,4.565,[],biggen_240829.csv +gpt_4_0125_preview,BIGGEN Safety,4.174,[],biggen_240829.csv +gpt_4o_2024_05_13,BIGGEN Safety,4.058,[],biggen_240829.csv +gpt_4_turbo_2024_04_09,BIGGEN Safety,4.087,[],biggen_240829.csv +claude_3_opus_20240229,BIGGEN Safety,4.536,[],biggen_240829.csv +llama3_70b_instruct,BIGGEN Safety,4.014,[],biggen_240829.csv +claude_3_sonnet_20240229,BIGGEN Safety,4.362,[],biggen_240829.csv +qwen_110b_chat,BIGGEN Safety,4.043,[],biggen_240829.csv +claude_3_haiku_20240307,BIGGEN Safety,4.304,[],biggen_240829.csv +gemini_pro_1_5,BIGGEN Safety,4.116,[],biggen_240829.csv +mixtral_8x22b_instruct_v0_1_awq,BIGGEN Safety,4.087,[],biggen_240829.csv +mistral_medium,BIGGEN Safety,4.058,[],biggen_240829.csv +mistral_large,BIGGEN Safety,3.913,[],biggen_240829.csv +gemini_flash_1_5,BIGGEN Safety,4.217,[],biggen_240829.csv +c4ai_command_r_plus_gptq,BIGGEN Safety,3.971,[],biggen_240829.csv +qwen1_5_72b_chat,BIGGEN Safety,3.957,[],biggen_240829.csv +phi_3_mini_4k_instruct,BIGGEN Safety,4.145,[],biggen_240829.csv +qwen1_5_32b_chat,BIGGEN Safety,4.217,[],biggen_240829.csv +starling_lm_7b_beta,BIGGEN Safety,3.87,[],biggen_240829.csv +llama3_8b_instruct,BIGGEN Safety,3.725,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_dpo,BIGGEN Safety,3.739,[],biggen_240829.csv +yi_34b_chat,BIGGEN Safety,3.884,[],biggen_240829.csv +mixtral_8x7b_instruct_v0_1,BIGGEN Safety,3.812,[],biggen_240829.csv +gpt_3_5_turbo_0125,BIGGEN Safety,3.884,[],biggen_240829.csv +tulu_2_dpo_70b,BIGGEN Safety,3.754,[],biggen_240829.csv +phi_3_mini_128k_instruct,BIGGEN Safety,3.957,[],biggen_240829.csv +gpt_3_5_turbo_1106,BIGGEN Safety,4.0,[],biggen_240829.csv +c4ai_command_r_v0_1,BIGGEN Safety,3.899,[],biggen_240829.csv +solar_10_7b_instruct_v1_0,BIGGEN Safety,3.826,[],biggen_240829.csv +llama_2_70b_chat,BIGGEN Safety,4.377,[],biggen_240829.csv +gemini_1_0_pro,BIGGEN Safety,3.942,[],biggen_240829.csv +mistral_7b_instruct_v0_2,BIGGEN Safety,3.826,[],biggen_240829.csv +mixtral_8x22b_v0_1_awq,BIGGEN Safety,4.0,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_sft,BIGGEN Safety,3.609,[],biggen_240829.csv +openchat_3_5_0106,BIGGEN Safety,3.725,[],biggen_240829.csv +zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN Safety,3.449,[],biggen_240829.csv +qwen1_5_14b_chat,BIGGEN Safety,3.855,[],biggen_240829.csv +qwen1_5_7b_chat,BIGGEN Safety,3.725,[],biggen_240829.csv +starling_lm_7b_alpha,BIGGEN Safety,3.913,[],biggen_240829.csv +zephyr_7b_beta,BIGGEN Safety,3.551,[],biggen_240829.csv +nous_hermes_2_mistral_7b_dpo,BIGGEN Safety,3.377,[],biggen_240829.csv +nous_hermes_2_yi_34b,BIGGEN Safety,3.536,[],biggen_240829.csv +mistral_orpo_beta,BIGGEN Safety,3.696,[],biggen_240829.csv +llama_2_13b_chat,BIGGEN Safety,4.319,[],biggen_240829.csv +openhermes_2_5_mistral_7b,BIGGEN Safety,3.435,[],biggen_240829.csv +mixtral_8x7b_v0_1,BIGGEN Safety,3.87,[],biggen_240829.csv +mistral_orpo_alpha,BIGGEN Safety,3.971,[],biggen_240829.csv +tulu_2_dpo_13b,BIGGEN Safety,3.841,[],biggen_240829.csv +qwen1_5_72b,BIGGEN Safety,3.942,[],biggen_240829.csv +codetulu_2_34b,BIGGEN Safety,3.652,[],biggen_240829.csv +gemma_1_1_7b_it,BIGGEN Safety,4.043,[],biggen_240829.csv +openhermes_2_mistral_7b,BIGGEN Safety,3.232,[],biggen_240829.csv +codellama34b_instruct,BIGGEN Safety,4.13,[],biggen_240829.csv +yi_34b,BIGGEN Safety,3.58,[],biggen_240829.csv +llama_2_70b,BIGGEN Safety,3.87,[],biggen_240829.csv +qwen1_5_32b,BIGGEN Safety,3.333,[],biggen_240829.csv +llama_2_7b_chat,BIGGEN Safety,4.145,[],biggen_240829.csv +tulu_2_dpo_7b,BIGGEN Safety,3.754,[],biggen_240829.csv +codetulu_2_13b,BIGGEN Safety,3.29,[],biggen_240829.csv +solar_10_7b_v1_0,BIGGEN Safety,3.667,[],biggen_240829.csv +tulu_2_13b,BIGGEN Safety,3.768,[],biggen_240829.csv +codellama_13b_instruct,BIGGEN Safety,4.043,[],biggen_240829.csv +yi_6b_chat,BIGGEN Safety,3.478,[],biggen_240829.csv +codellama_7b_instruct,BIGGEN Safety,3.754,[],biggen_240829.csv +gemma_7b_it,BIGGEN Safety,3.768,[],biggen_240829.csv +llama3_70b,BIGGEN Safety,3.261,[],biggen_240829.csv +qwen1_5_14b,BIGGEN Safety,2.58,[],biggen_240829.csv +gemma_1_1_2b_it,BIGGEN Safety,3.971,[],biggen_240829.csv +codetulu_2_7b,BIGGEN Safety,3.246,[],biggen_240829.csv +tulu_2_7b,BIGGEN Safety,3.638,[],biggen_240829.csv +mistral_7b_v0_2,BIGGEN Safety,3.377,[],biggen_240829.csv +mistral_7b_v0_1,BIGGEN Safety,3.406,[],biggen_240829.csv +qwen1_5_4b_chat,BIGGEN Safety,3.333,[],biggen_240829.csv +olmo_7b_instruct,BIGGEN Safety,3.101,[],biggen_240829.csv +gemma_2b_it,BIGGEN Safety,3.754,[],biggen_240829.csv +qwen1_5_7b,BIGGEN Safety,3.101,[],biggen_240829.csv +phi_2,BIGGEN Safety,3.406,[],biggen_240829.csv +olmo_7b_sft,BIGGEN Safety,3.333,[],biggen_240829.csv +codellama_70b_instruct,BIGGEN Safety,4.101,[],biggen_240829.csv +llemma_34b,BIGGEN Safety,2.971,[],biggen_240829.csv +llama3_8b,BIGGEN Safety,2.899,[],biggen_240829.csv +qwen1_5_1_8b_chat,BIGGEN Safety,2.681,[],biggen_240829.csv +qwen1_5_4b,BIGGEN Safety,3.333,[],biggen_240829.csv +llama_2_13b,BIGGEN Safety,3.348,[],biggen_240829.csv +yi_6b,BIGGEN Safety,3.232,[],biggen_240829.csv +codellama_70b,BIGGEN Safety,2.841,[],biggen_240829.csv +codellama34b,BIGGEN Safety,2.725,[],biggen_240829.csv +phi_1_5,BIGGEN Safety,2.87,[],biggen_240829.csv +orca_2_13b,BIGGEN Safety,2.812,[],biggen_240829.csv +llama_2_7b,BIGGEN Safety,3.217,[],biggen_240829.csv +qwen1_5_1_8b,BIGGEN Safety,2.478,[],biggen_240829.csv +llemma_7b,BIGGEN Safety,2.522,[],biggen_240829.csv +gemma_2b,BIGGEN Safety,2.623,[],biggen_240829.csv +codellama_13b,BIGGEN Safety,2.449,[],biggen_240829.csv +qwen1_5_0_5b_chat,BIGGEN Safety,2.391,[],biggen_240829.csv +orca_2_7b,BIGGEN Safety,2.594,[],biggen_240829.csv +olmo_7b,BIGGEN Safety,2.652,[],biggen_240829.csv +codellama_7b,BIGGEN Safety,2.348,[],biggen_240829.csv +qwen1_5_0_5b,BIGGEN Safety,2.014,[],biggen_240829.csv +olmo_1b,BIGGEN Safety,2.188,[],biggen_240829.csv +aya_101,BIGGEN Safety,1.667,[],biggen_240829.csv +gemma_7b,BIGGEN Safety,2.159,[],biggen_240829.csv +phi_1,BIGGEN Safety,1.507,[],biggen_240829.csv +gpt_4_1106_preview,BIGGEN Theory of Mind,4.24,[],biggen_240829.csv +gpt_4_0125_preview,BIGGEN Theory of Mind,4.26,[],biggen_240829.csv +gpt_4o_2024_05_13,BIGGEN Theory of Mind,4.08,[],biggen_240829.csv +gpt_4_turbo_2024_04_09,BIGGEN Theory of Mind,4.12,[],biggen_240829.csv +claude_3_opus_20240229,BIGGEN Theory of Mind,4.09,[],biggen_240829.csv +llama3_70b_instruct,BIGGEN Theory of Mind,4.04,[],biggen_240829.csv +claude_3_sonnet_20240229,BIGGEN Theory of Mind,4.0,[],biggen_240829.csv +qwen_110b_chat,BIGGEN Theory of Mind,3.99,[],biggen_240829.csv +claude_3_haiku_20240307,BIGGEN Theory of Mind,3.98,[],biggen_240829.csv +gemini_pro_1_5,BIGGEN Theory of Mind,4.07,[],biggen_240829.csv +mixtral_8x22b_instruct_v0_1_awq,BIGGEN Theory of Mind,3.87,[],biggen_240829.csv +mistral_medium,BIGGEN Theory of Mind,3.9,[],biggen_240829.csv +mistral_large,BIGGEN Theory of Mind,3.93,[],biggen_240829.csv +gemini_flash_1_5,BIGGEN Theory of Mind,3.96,[],biggen_240829.csv +c4ai_command_r_plus_gptq,BIGGEN Theory of Mind,3.94,[],biggen_240829.csv +qwen1_5_72b_chat,BIGGEN Theory of Mind,3.96,[],biggen_240829.csv +phi_3_mini_4k_instruct,BIGGEN Theory of Mind,3.9,[],biggen_240829.csv +qwen1_5_32b_chat,BIGGEN Theory of Mind,3.87,[],biggen_240829.csv +starling_lm_7b_beta,BIGGEN Theory of Mind,3.87,[],biggen_240829.csv +llama3_8b_instruct,BIGGEN Theory of Mind,3.83,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_dpo,BIGGEN Theory of Mind,3.79,[],biggen_240829.csv +yi_34b_chat,BIGGEN Theory of Mind,3.96,[],biggen_240829.csv +mixtral_8x7b_instruct_v0_1,BIGGEN Theory of Mind,3.81,[],biggen_240829.csv +gpt_3_5_turbo_0125,BIGGEN Theory of Mind,3.79,[],biggen_240829.csv +tulu_2_dpo_70b,BIGGEN Theory of Mind,3.83,[],biggen_240829.csv +phi_3_mini_128k_instruct,BIGGEN Theory of Mind,3.83,[],biggen_240829.csv +gpt_3_5_turbo_1106,BIGGEN Theory of Mind,3.67,[],biggen_240829.csv +c4ai_command_r_v0_1,BIGGEN Theory of Mind,3.9,[],biggen_240829.csv +solar_10_7b_instruct_v1_0,BIGGEN Theory of Mind,3.9,[],biggen_240829.csv +llama_2_70b_chat,BIGGEN Theory of Mind,3.73,[],biggen_240829.csv +gemini_1_0_pro,BIGGEN Theory of Mind,3.75,[],biggen_240829.csv +mistral_7b_instruct_v0_2,BIGGEN Theory of Mind,3.77,[],biggen_240829.csv +mixtral_8x22b_v0_1_awq,BIGGEN Theory of Mind,3.49,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_sft,BIGGEN Theory of Mind,3.63,[],biggen_240829.csv +openchat_3_5_0106,BIGGEN Theory of Mind,3.66,[],biggen_240829.csv +zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN Theory of Mind,3.58,[],biggen_240829.csv +qwen1_5_14b_chat,BIGGEN Theory of Mind,3.52,[],biggen_240829.csv +qwen1_5_7b_chat,BIGGEN Theory of Mind,3.7,[],biggen_240829.csv +starling_lm_7b_alpha,BIGGEN Theory of Mind,3.54,[],biggen_240829.csv +zephyr_7b_beta,BIGGEN Theory of Mind,3.73,[],biggen_240829.csv +nous_hermes_2_mistral_7b_dpo,BIGGEN Theory of Mind,3.69,[],biggen_240829.csv +nous_hermes_2_yi_34b,BIGGEN Theory of Mind,3.56,[],biggen_240829.csv +mistral_orpo_beta,BIGGEN Theory of Mind,3.58,[],biggen_240829.csv +llama_2_13b_chat,BIGGEN Theory of Mind,3.71,[],biggen_240829.csv +openhermes_2_5_mistral_7b,BIGGEN Theory of Mind,3.57,[],biggen_240829.csv +mixtral_8x7b_v0_1,BIGGEN Theory of Mind,3.59,[],biggen_240829.csv +mistral_orpo_alpha,BIGGEN Theory of Mind,3.5,[],biggen_240829.csv +tulu_2_dpo_13b,BIGGEN Theory of Mind,3.59,[],biggen_240829.csv +qwen1_5_72b,BIGGEN Theory of Mind,3.38,[],biggen_240829.csv +codetulu_2_34b,BIGGEN Theory of Mind,3.5,[],biggen_240829.csv +gemma_1_1_7b_it,BIGGEN Theory of Mind,3.44,[],biggen_240829.csv +openhermes_2_mistral_7b,BIGGEN Theory of Mind,3.45,[],biggen_240829.csv +codellama34b_instruct,BIGGEN Theory of Mind,3.46,[],biggen_240829.csv +yi_34b,BIGGEN Theory of Mind,3.39,[],biggen_240829.csv +llama_2_70b,BIGGEN Theory of Mind,3.48,[],biggen_240829.csv +qwen1_5_32b,BIGGEN Theory of Mind,3.33,[],biggen_240829.csv +llama_2_7b_chat,BIGGEN Theory of Mind,3.65,[],biggen_240829.csv +tulu_2_dpo_7b,BIGGEN Theory of Mind,3.68,[],biggen_240829.csv +codetulu_2_13b,BIGGEN Theory of Mind,3.38,[],biggen_240829.csv +solar_10_7b_v1_0,BIGGEN Theory of Mind,3.42,[],biggen_240829.csv +tulu_2_13b,BIGGEN Theory of Mind,3.39,[],biggen_240829.csv +codellama_13b_instruct,BIGGEN Theory of Mind,3.38,[],biggen_240829.csv +yi_6b_chat,BIGGEN Theory of Mind,3.677,[],biggen_240829.csv +codellama_7b_instruct,BIGGEN Theory of Mind,3.22,[],biggen_240829.csv +gemma_7b_it,BIGGEN Theory of Mind,3.15,[],biggen_240829.csv +llama3_70b,BIGGEN Theory of Mind,3.04,[],biggen_240829.csv +qwen1_5_14b,BIGGEN Theory of Mind,3.16,[],biggen_240829.csv +gemma_1_1_2b_it,BIGGEN Theory of Mind,3.37,[],biggen_240829.csv +codetulu_2_7b,BIGGEN Theory of Mind,3.25,[],biggen_240829.csv +tulu_2_7b,BIGGEN Theory of Mind,3.26,[],biggen_240829.csv +mistral_7b_v0_2,BIGGEN Theory of Mind,3.29,[],biggen_240829.csv +mistral_7b_v0_1,BIGGEN Theory of Mind,3.09,[],biggen_240829.csv +qwen1_5_4b_chat,BIGGEN Theory of Mind,3.07,[],biggen_240829.csv +olmo_7b_instruct,BIGGEN Theory of Mind,3.31,[],biggen_240829.csv +gemma_2b_it,BIGGEN Theory of Mind,3.15,[],biggen_240829.csv +qwen1_5_7b,BIGGEN Theory of Mind,2.77,[],biggen_240829.csv +phi_2,BIGGEN Theory of Mind,3.2,[],biggen_240829.csv +olmo_7b_sft,BIGGEN Theory of Mind,2.93,[],biggen_240829.csv +codellama_70b_instruct,BIGGEN Theory of Mind,2.55,[],biggen_240829.csv +llemma_34b,BIGGEN Theory of Mind,2.84,[],biggen_240829.csv +llama3_8b,BIGGEN Theory of Mind,2.82,[],biggen_240829.csv +qwen1_5_1_8b_chat,BIGGEN Theory of Mind,3.13,[],biggen_240829.csv +qwen1_5_4b,BIGGEN Theory of Mind,2.73,[],biggen_240829.csv +llama_2_13b,BIGGEN Theory of Mind,2.88,[],biggen_240829.csv +yi_6b,BIGGEN Theory of Mind,2.89,[],biggen_240829.csv +codellama_70b,BIGGEN Theory of Mind,2.44,[],biggen_240829.csv +codellama34b,BIGGEN Theory of Mind,2.59,[],biggen_240829.csv +phi_1_5,BIGGEN Theory of Mind,2.95,[],biggen_240829.csv +orca_2_13b,BIGGEN Theory of Mind,2.8,[],biggen_240829.csv +llama_2_7b,BIGGEN Theory of Mind,2.6,[],biggen_240829.csv +qwen1_5_1_8b,BIGGEN Theory of Mind,2.55,[],biggen_240829.csv +llemma_7b,BIGGEN Theory of Mind,2.19,[],biggen_240829.csv +gemma_2b,BIGGEN Theory of Mind,2.32,[],biggen_240829.csv +codellama_13b,BIGGEN Theory of Mind,2.15,[],biggen_240829.csv +qwen1_5_0_5b_chat,BIGGEN Theory of Mind,2.38,[],biggen_240829.csv +orca_2_7b,BIGGEN Theory of Mind,2.24,[],biggen_240829.csv +olmo_7b,BIGGEN Theory of Mind,2.16,[],biggen_240829.csv +codellama_7b,BIGGEN Theory of Mind,1.9,[],biggen_240829.csv +qwen1_5_0_5b,BIGGEN Theory of Mind,1.8,[],biggen_240829.csv +olmo_1b,BIGGEN Theory of Mind,1.59,[],biggen_240829.csv +aya_101,BIGGEN Theory of Mind,1.38,[],biggen_240829.csv +gemma_7b,BIGGEN Theory of Mind,1.2,[],biggen_240829.csv +phi_1,BIGGEN Theory of Mind,1.0,[],biggen_240829.csv +gpt_4_1106_preview,BIGGEN Tool Usage,3.775,[],biggen_240829.csv +gpt_4_0125_preview,BIGGEN Tool Usage,3.925,[],biggen_240829.csv +gpt_4o_2024_05_13,BIGGEN Tool Usage,3.85,[],biggen_240829.csv +gpt_4_turbo_2024_04_09,BIGGEN Tool Usage,3.8,[],biggen_240829.csv +claude_3_opus_20240229,BIGGEN Tool Usage,3.788,[],biggen_240829.csv +llama3_70b_instruct,BIGGEN Tool Usage,3.775,[],biggen_240829.csv +claude_3_sonnet_20240229,BIGGEN Tool Usage,3.75,[],biggen_240829.csv +qwen_110b_chat,BIGGEN Tool Usage,3.588,[],biggen_240829.csv +claude_3_haiku_20240307,BIGGEN Tool Usage,3.75,[],biggen_240829.csv +gemini_pro_1_5,BIGGEN Tool Usage,3.488,[],biggen_240829.csv +mixtral_8x22b_instruct_v0_1_awq,BIGGEN Tool Usage,3.712,[],biggen_240829.csv +mistral_medium,BIGGEN Tool Usage,3.862,[],biggen_240829.csv +mistral_large,BIGGEN Tool Usage,3.825,[],biggen_240829.csv +gemini_flash_1_5,BIGGEN Tool Usage,3.625,[],biggen_240829.csv +c4ai_command_r_plus_gptq,BIGGEN Tool Usage,3.525,[],biggen_240829.csv +qwen1_5_72b_chat,BIGGEN Tool Usage,3.525,[],biggen_240829.csv +phi_3_mini_4k_instruct,BIGGEN Tool Usage,3.338,[],biggen_240829.csv +qwen1_5_32b_chat,BIGGEN Tool Usage,3.738,[],biggen_240829.csv +starling_lm_7b_beta,BIGGEN Tool Usage,3.562,[],biggen_240829.csv +llama3_8b_instruct,BIGGEN Tool Usage,3.5,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_dpo,BIGGEN Tool Usage,3.662,[],biggen_240829.csv +yi_34b_chat,BIGGEN Tool Usage,3.038,[],biggen_240829.csv +mixtral_8x7b_instruct_v0_1,BIGGEN Tool Usage,3.412,[],biggen_240829.csv +gpt_3_5_turbo_0125,BIGGEN Tool Usage,3.138,[],biggen_240829.csv +tulu_2_dpo_70b,BIGGEN Tool Usage,3.612,[],biggen_240829.csv +phi_3_mini_128k_instruct,BIGGEN Tool Usage,3.1,[],biggen_240829.csv +gpt_3_5_turbo_1106,BIGGEN Tool Usage,3.162,[],biggen_240829.csv +c4ai_command_r_v0_1,BIGGEN Tool Usage,3.188,[],biggen_240829.csv +solar_10_7b_instruct_v1_0,BIGGEN Tool Usage,3.412,[],biggen_240829.csv +llama_2_70b_chat,BIGGEN Tool Usage,3.188,[],biggen_240829.csv +gemini_1_0_pro,BIGGEN Tool Usage,3.125,[],biggen_240829.csv +mistral_7b_instruct_v0_2,BIGGEN Tool Usage,3.362,[],biggen_240829.csv +mixtral_8x22b_v0_1_awq,BIGGEN Tool Usage,3.188,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_sft,BIGGEN Tool Usage,3.538,[],biggen_240829.csv +openchat_3_5_0106,BIGGEN Tool Usage,3.125,[],biggen_240829.csv +zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN Tool Usage,3.288,[],biggen_240829.csv +qwen1_5_14b_chat,BIGGEN Tool Usage,3.2,[],biggen_240829.csv +qwen1_5_7b_chat,BIGGEN Tool Usage,3.15,[],biggen_240829.csv +starling_lm_7b_alpha,BIGGEN Tool Usage,3.025,[],biggen_240829.csv +zephyr_7b_beta,BIGGEN Tool Usage,3.288,[],biggen_240829.csv +nous_hermes_2_mistral_7b_dpo,BIGGEN Tool Usage,3.062,[],biggen_240829.csv +nous_hermes_2_yi_34b,BIGGEN Tool Usage,3.175,[],biggen_240829.csv +mistral_orpo_beta,BIGGEN Tool Usage,3.025,[],biggen_240829.csv +llama_2_13b_chat,BIGGEN Tool Usage,2.6,[],biggen_240829.csv +openhermes_2_5_mistral_7b,BIGGEN Tool Usage,3.062,[],biggen_240829.csv +mixtral_8x7b_v0_1,BIGGEN Tool Usage,2.775,[],biggen_240829.csv +mistral_orpo_alpha,BIGGEN Tool Usage,2.95,[],biggen_240829.csv +tulu_2_dpo_13b,BIGGEN Tool Usage,3.05,[],biggen_240829.csv +qwen1_5_72b,BIGGEN Tool Usage,2.988,[],biggen_240829.csv +codetulu_2_34b,BIGGEN Tool Usage,3.35,[],biggen_240829.csv +gemma_1_1_7b_it,BIGGEN Tool Usage,2.788,[],biggen_240829.csv +openhermes_2_mistral_7b,BIGGEN Tool Usage,2.925,[],biggen_240829.csv +codellama34b_instruct,BIGGEN Tool Usage,2.738,[],biggen_240829.csv +yi_34b,BIGGEN Tool Usage,2.512,[],biggen_240829.csv +llama_2_70b,BIGGEN Tool Usage,2.625,[],biggen_240829.csv +qwen1_5_32b,BIGGEN Tool Usage,2.925,[],biggen_240829.csv +llama_2_7b_chat,BIGGEN Tool Usage,2.3,[],biggen_240829.csv +tulu_2_dpo_7b,BIGGEN Tool Usage,2.438,[],biggen_240829.csv +codetulu_2_13b,BIGGEN Tool Usage,3.238,[],biggen_240829.csv +solar_10_7b_v1_0,BIGGEN Tool Usage,2.562,[],biggen_240829.csv +tulu_2_13b,BIGGEN Tool Usage,2.775,[],biggen_240829.csv +codellama_13b_instruct,BIGGEN Tool Usage,2.6,[],biggen_240829.csv +yi_6b_chat,BIGGEN Tool Usage,2.338,[],biggen_240829.csv +codellama_7b_instruct,BIGGEN Tool Usage,2.575,[],biggen_240829.csv +gemma_7b_it,BIGGEN Tool Usage,2.325,[],biggen_240829.csv +llama3_70b,BIGGEN Tool Usage,2.5,[],biggen_240829.csv +qwen1_5_14b,BIGGEN Tool Usage,2.912,[],biggen_240829.csv +gemma_1_1_2b_it,BIGGEN Tool Usage,1.975,[],biggen_240829.csv +codetulu_2_7b,BIGGEN Tool Usage,2.788,[],biggen_240829.csv +tulu_2_7b,BIGGEN Tool Usage,2.212,[],biggen_240829.csv +mistral_7b_v0_2,BIGGEN Tool Usage,2.275,[],biggen_240829.csv +mistral_7b_v0_1,BIGGEN Tool Usage,2.162,[],biggen_240829.csv +qwen1_5_4b_chat,BIGGEN Tool Usage,2.4,[],biggen_240829.csv +olmo_7b_instruct,BIGGEN Tool Usage,2.212,[],biggen_240829.csv +gemma_2b_it,BIGGEN Tool Usage,1.962,[],biggen_240829.csv +qwen1_5_7b,BIGGEN Tool Usage,2.488,[],biggen_240829.csv +phi_2,BIGGEN Tool Usage,1.788,[],biggen_240829.csv +olmo_7b_sft,BIGGEN Tool Usage,2.088,[],biggen_240829.csv +codellama_70b_instruct,BIGGEN Tool Usage,1.988,[],biggen_240829.csv +llemma_34b,BIGGEN Tool Usage,2.088,[],biggen_240829.csv +llama3_8b,BIGGEN Tool Usage,1.938,[],biggen_240829.csv +qwen1_5_1_8b_chat,BIGGEN Tool Usage,1.988,[],biggen_240829.csv +qwen1_5_4b,BIGGEN Tool Usage,1.9,[],biggen_240829.csv +llama_2_13b,BIGGEN Tool Usage,1.812,[],biggen_240829.csv +yi_6b,BIGGEN Tool Usage,1.55,[],biggen_240829.csv +codellama_70b,BIGGEN Tool Usage,2.4,[],biggen_240829.csv +codellama34b,BIGGEN Tool Usage,2.062,[],biggen_240829.csv +phi_1_5,BIGGEN Tool Usage,1.525,[],biggen_240829.csv +orca_2_13b,BIGGEN Tool Usage,2.362,[],biggen_240829.csv +llama_2_7b,BIGGEN Tool Usage,1.45,[],biggen_240829.csv +qwen1_5_1_8b,BIGGEN Tool Usage,1.525,[],biggen_240829.csv +llemma_7b,BIGGEN Tool Usage,1.838,[],biggen_240829.csv +gemma_2b,BIGGEN Tool Usage,1.488,[],biggen_240829.csv +codellama_13b,BIGGEN Tool Usage,1.812,[],biggen_240829.csv +qwen1_5_0_5b_chat,BIGGEN Tool Usage,1.462,[],biggen_240829.csv +orca_2_7b,BIGGEN Tool Usage,1.6,[],biggen_240829.csv +olmo_7b,BIGGEN Tool Usage,1.312,[],biggen_240829.csv +codellama_7b,BIGGEN Tool Usage,1.562,[],biggen_240829.csv +qwen1_5_0_5b,BIGGEN Tool Usage,1.275,[],biggen_240829.csv +olmo_1b,BIGGEN Tool Usage,1.125,[],biggen_240829.csv +aya_101,BIGGEN Tool Usage,1.162,[],biggen_240829.csv +gemma_7b,BIGGEN Tool Usage,1.012,[],biggen_240829.csv +phi_1,BIGGEN Tool Usage,1.012,[],biggen_240829.csv +gpt_4_1106_preview,BIGGEN Multilingual,3.6,[],biggen_240829.csv +gpt_4_0125_preview,BIGGEN Multilingual,3.543,[],biggen_240829.csv +gpt_4o_2024_05_13,BIGGEN Multilingual,3.643,[],biggen_240829.csv +gpt_4_turbo_2024_04_09,BIGGEN Multilingual,3.471,[],biggen_240829.csv +claude_3_opus_20240229,BIGGEN Multilingual,3.571,[],biggen_240829.csv +llama3_70b_instruct,BIGGEN Multilingual,3.314,[],biggen_240829.csv +claude_3_sonnet_20240229,BIGGEN Multilingual,3.186,[],biggen_240829.csv +qwen_110b_chat,BIGGEN Multilingual,2.771,[],biggen_240829.csv +claude_3_haiku_20240307,BIGGEN Multilingual,3.071,[],biggen_240829.csv +gemini_pro_1_5,BIGGEN Multilingual,3.257,[],biggen_240829.csv +mixtral_8x22b_instruct_v0_1_awq,BIGGEN Multilingual,2.714,[],biggen_240829.csv +mistral_medium,BIGGEN Multilingual,2.929,[],biggen_240829.csv +mistral_large,BIGGEN Multilingual,2.886,[],biggen_240829.csv +gemini_flash_1_5,BIGGEN Multilingual,2.671,[],biggen_240829.csv +c4ai_command_r_plus_gptq,BIGGEN Multilingual,2.757,[],biggen_240829.csv +qwen1_5_72b_chat,BIGGEN Multilingual,2.914,[],biggen_240829.csv +phi_3_mini_4k_instruct,BIGGEN Multilingual,1.914,[],biggen_240829.csv +qwen1_5_32b_chat,BIGGEN Multilingual,2.714,[],biggen_240829.csv +starling_lm_7b_beta,BIGGEN Multilingual,2.271,[],biggen_240829.csv +llama3_8b_instruct,BIGGEN Multilingual,2.914,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_dpo,BIGGEN Multilingual,2.557,[],biggen_240829.csv +yi_34b_chat,BIGGEN Multilingual,2.186,[],biggen_240829.csv +mixtral_8x7b_instruct_v0_1,BIGGEN Multilingual,2.714,[],biggen_240829.csv +gpt_3_5_turbo_0125,BIGGEN Multilingual,2.614,[],biggen_240829.csv +tulu_2_dpo_70b,BIGGEN Multilingual,2.314,[],biggen_240829.csv +phi_3_mini_128k_instruct,BIGGEN Multilingual,1.829,[],biggen_240829.csv +gpt_3_5_turbo_1106,BIGGEN Multilingual,2.557,[],biggen_240829.csv +c4ai_command_r_v0_1,BIGGEN Multilingual,2.186,[],biggen_240829.csv +solar_10_7b_instruct_v1_0,BIGGEN Multilingual,2.443,[],biggen_240829.csv +llama_2_70b_chat,BIGGEN Multilingual,2.386,[],biggen_240829.csv +gemini_1_0_pro,BIGGEN Multilingual,3.186,[],biggen_240829.csv +mistral_7b_instruct_v0_2,BIGGEN Multilingual,2.286,[],biggen_240829.csv +nous_hermes_2_mixtral_8x7b_sft,BIGGEN Multilingual,2.4,[],biggen_240829.csv +openchat_3_5_0106,BIGGEN Multilingual,2.157,[],biggen_240829.csv +zephyr_orpo_141b_a35b_v0_1_awq,BIGGEN Multilingual,2.586,[],biggen_240829.csv +qwen1_5_14b_chat,BIGGEN Multilingual,2.386,[],biggen_240829.csv +qwen1_5_7b_chat,BIGGEN Multilingual,2.057,[],biggen_240829.csv +starling_lm_7b_alpha,BIGGEN Multilingual,2.229,[],biggen_240829.csv +zephyr_7b_beta,BIGGEN Multilingual,1.943,[],biggen_240829.csv +nous_hermes_2_mistral_7b_dpo,BIGGEN Multilingual,2.171,[],biggen_240829.csv +nous_hermes_2_yi_34b,BIGGEN Multilingual,2.071,[],biggen_240829.csv +mistral_orpo_beta,BIGGEN Multilingual,2.1,[],biggen_240829.csv +llama_2_13b_chat,BIGGEN Multilingual,2.114,[],biggen_240829.csv +openhermes_2_5_mistral_7b,BIGGEN Multilingual,2.1,[],biggen_240829.csv +mistral_orpo_alpha,BIGGEN Multilingual,2.086,[],biggen_240829.csv +tulu_2_dpo_13b,BIGGEN Multilingual,2.143,[],biggen_240829.csv +codetulu_2_34b,BIGGEN Multilingual,2.0,[],biggen_240829.csv +gemma_1_1_7b_it,BIGGEN Multilingual,2.0,[],biggen_240829.csv +openhermes_2_mistral_7b,BIGGEN Multilingual,1.914,[],biggen_240829.csv +codellama34b_instruct,BIGGEN Multilingual,2.114,[],biggen_240829.csv +llama_2_7b_chat,BIGGEN Multilingual,2.029,[],biggen_240829.csv +tulu_2_dpo_7b,BIGGEN Multilingual,1.971,[],biggen_240829.csv +codetulu_2_13b,BIGGEN Multilingual,1.886,[],biggen_240829.csv +tulu_2_13b,BIGGEN Multilingual,2.029,[],biggen_240829.csv +codellama_13b_instruct,BIGGEN Multilingual,1.886,[],biggen_240829.csv +yi_6b_chat,BIGGEN Multilingual,1.457,[],biggen_240829.csv +codellama_7b_instruct,BIGGEN Multilingual,1.771,[],biggen_240829.csv +gemma_7b_it,BIGGEN Multilingual,1.786,[],biggen_240829.csv +gemma_1_1_2b_it,BIGGEN Multilingual,1.471,[],biggen_240829.csv +codetulu_2_7b,BIGGEN Multilingual,1.8,[],biggen_240829.csv +tulu_2_7b,BIGGEN Multilingual,1.714,[],biggen_240829.csv +qwen1_5_4b_chat,BIGGEN Multilingual,1.471,[],biggen_240829.csv +olmo_7b_instruct,BIGGEN Multilingual,1.414,[],biggen_240829.csv +gemma_2b_it,BIGGEN Multilingual,1.657,[],biggen_240829.csv +olmo_7b_sft,BIGGEN Multilingual,1.186,[],biggen_240829.csv +codellama_70b_instruct,BIGGEN Multilingual,1.929,[],biggen_240829.csv +qwen1_5_1_8b_chat,BIGGEN Multilingual,1.3,[],biggen_240829.csv +orca_2_13b,BIGGEN Multilingual,2.043,[],biggen_240829.csv +qwen1_5_0_5b_chat,BIGGEN Multilingual,1.159,[],biggen_240829.csv +orca_2_7b,BIGGEN Multilingual,1.729,[],biggen_240829.csv +aya_101,BIGGEN Multilingual,1.129,[],biggen_240829.csv +command_r,ruler,85.5,,ruler_bench_241002.csv +command_r_0824,ruler,86.0,,ruler_bench_241002.csv +command_r_plus,ruler,82.7,,ruler_bench_241002.csv +command_r_plus_0824,ruler,83.4,,ruler_bench_241002.csv +dbrx,ruler,38.0,,ruler_bench_241002.csv +film_7b*,ruler,66.4,,ruler_bench_241002.csv +gemini_1_5_pro,ruler,95.5,,ruler_bench_241002.csv +glm3,ruler,62.0,,ruler_bench_241002.csv +glm4,ruler,88.0,,ruler_bench_241002.csv +gpt_4_1106_preview,ruler,89.0,,ruler_bench_241002.csv +internlm2_5,ruler,77.8,,ruler_bench_241002.csv +jamba_1_5_large,ruler,95.7,,ruler_bench_241002.csv +jamba_1_5_mini,ruler,93.1,,ruler_bench_241002.csv +llama3,ruler,82.6,,ruler_bench_241002.csv +llama3_1,ruler,85.5,,ruler_bench_241002.csv +longalpaca,ruler,24.7,,ruler_bench_241002.csv +longchat,ruler,33.1,,ruler_bench_241002.csv +lwm,ruler,69.9,,ruler_bench_241002.csv +megabeam_mistral,ruler,87.3,,ruler_bench_241002.csv +mistral,ruler,55.6,,ruler_bench_241002.csv +mistral_large,ruler,70.6,,ruler_bench_241002.csv +mistral_nemo,ruler,54.7,,ruler_bench_241002.csv +mixtral_8x22b,ruler,73.5,,ruler_bench_241002.csv +mixtral_8x7b,ruler,72.8,,ruler_bench_241002.csv +phi3_medium,ruler,74.8,,ruler_bench_241002.csv +phi3_mini,ruler,80.9,,ruler_bench_241002.csv +qwen1_5,ruler,37.5,,ruler_bench_241002.csv +qwen2,ruler,79.6,,ruler_bench_241002.csv +together,ruler,33.8,,ruler_bench_241002.csv +yi,ruler,84.8,,ruler_bench_241002.csv +zephyr_7b_beta,LiveBench 240624,17.32,[],livebench_240701.csv +zephyr_7b_alpha,LiveBench 240624,19.28,[],livebench_240701.csv +yi_6b_chat,LiveBench 240624,9.02,[],livebench_240701.csv +vicuna_7b_v1_5_16k,LiveBench 240624,14.22,[],livebench_240701.csv +vicuna_7b_v1_5,LiveBench 240624,12.31,[],livebench_240701.csv +starling_lm_7b_beta,LiveBench 240624,16.62,[],livebench_240701.csv +smaug_qwen2_72b_instruct,LiveBench 240624,39.66,[],livebench_240701.csv +qwen2_72b_instruct,LiveBench 240624,40.16,[],livebench_240701.csv +qwen2_7b_instruct,LiveBench 240624,26.63,[],livebench_240701.csv +qwen2_1_5b_instruct,LiveBench 240624,10.42,[],livebench_240701.csv +qwen2_0_5b_instruct,LiveBench 240624,7.3,[],livebench_240701.csv +qwen1_5_110b_chat,LiveBench 240624,29.07,[],livebench_240701.csv +qwen1_5_72b_chat,LiveBench 240624,28.89,[],livebench_240701.csv +qwen1_5_7b_chat,LiveBench 240624,17.02,[],livebench_240701.csv +qwen1_5_4b_chat,LiveBench 240624,11.59,[],livebench_240701.csv +qwen1_5_1_8b_chat,LiveBench 240624,6.32,[],livebench_240701.csv +qwen1_5_0_5b_chat,LiveBench 240624,5.43,[],livebench_240701.csv +phi_3_5_moe_instruct,LiveBench 240624,35.14,[],livebench_240701.csv +phi_3_5_mini_instruct,LiveBench 240624,27.81,[],livebench_240701.csv +phi_3_small_128k_instruct,LiveBench 240624,29.68,[],livebench_240701.csv +phi_3_small_8k_instruct,LiveBench 240624,29.09,[],livebench_240701.csv +phi_3_mini_128k_instruct,LiveBench 240624,24.76,[],livebench_240701.csv +phi_3_mini_4k_instruct,LiveBench 240624,24.41,[],livebench_240701.csv +phi_3_medium_128k_instruct,LiveBench 240624,29.88,[],livebench_240701.csv +phi_3_medium_4k_instruct,LiveBench 240624,30.96,[],livebench_240701.csv +openhermes_2_5_mistral_7b,LiveBench 240624,23.36,[],livebench_240701.csv +open_mistral_nemo,LiveBench 240624,29.02,[],livebench_240701.csv +mixtral_8x22b_instruct_v0_1,LiveBench 240624,35.29,[],livebench_240701.csv +mixtral_8x7b_instruct_v0_1,LiveBench 240624,22.79,[],livebench_240701.csv +mistral_small_2402,LiveBench 240624,33.03,[],livebench_240701.csv +mistral_large_2407,LiveBench 240624,48.35,[],livebench_240701.csv +mistral_large_2402,LiveBench 240624,38.92,[],livebench_240701.csv +mistral_7b_instruct_v0_3,LiveBench 240624,20.09,[],livebench_240701.csv +mistral_7b_instruct_v0_2,LiveBench 240624,19.51,[],livebench_240701.csv +llama3_1_405b_instruct_turbo,LiveBench 240624,55.18,[],livebench_240701.csv +llama3_1_70b_instruct_turbo,LiveBench 240624,48.9,[],livebench_240701.csv +llama3_1_8b_instruct_turbo,LiveBench 240624,28.11,[],livebench_240701.csv +llama3_70b_instruct,LiveBench 240624,37.6,[],livebench_240701.csv +llama3_8b_instruct,LiveBench 240624,27.46,[],livebench_240701.csv +mathstral_7b_v0_1,LiveBench 240624,24.33,[],livebench_240701.csv +llama_2_7b_chat,LiveBench 240624,10.25,[],livebench_240701.csv +hermes_3_llama3_1_70b,LiveBench 240624,39.56,[],livebench_240701.csv +gpt_4o_mini_2024_07_18,LiveBench 240624,44.57,[],livebench_240701.csv +gpt_4o_2024_08_06,LiveBench 240624,56.46,[],livebench_240701.csv +gpt_4o_2024_05_13,LiveBench 240624,54.96,[],livebench_240701.csv +gpt_4_turbo_2024_04_09,LiveBench 240624,53.0,[],livebench_240701.csv +gpt_4_0613,LiveBench 240624,44.94,[],livebench_240701.csv +gpt_4_0125_preview,LiveBench 240624,49.39,[],livebench_240701.csv +gpt_3_5_turbo_0125,LiveBench 240624,34.66,[],livebench_240701.csv +gemma_2_27b_it,LiveBench 240624,41.22,[],livebench_240701.csv +gemma_2_9b_it,LiveBench 240624,31.57,[],livebench_240701.csv +gemma_1_1_7b_it,LiveBench 240624,18.23,[],livebench_240701.csv +gemini_1_5_pro_exp_0827,LiveBench 240624,55.06,[],livebench_240701.csv +gemini_1_5_pro_exp_0801,LiveBench 240624,53.63,[],livebench_240701.csv +gemini_1_5_pro_api_0514,LiveBench 240624,44.41,[],livebench_240701.csv +gemini_1_5_flash_exp_0827,LiveBench 240624,47.51,[],livebench_240701.csv +gemini_1_5_flash_api_0514,LiveBench 240624,40.95,[],livebench_240701.csv +dracarys_llama3_1_70b_instruct,LiveBench 240624,49.82,[],livebench_240701.csv +dracarys_72b_instruct,LiveBench 240624,41.72,[],livebench_240701.csv +deepseek_v2_lite_chat,LiveBench 240624,17.49,[],livebench_240701.csv +deepseek_coder_v2_lite_instruct,LiveBench 240624,29.21,[],livebench_240701.csv +deepseek_coder_v2,LiveBench 240624,46.84,[],livebench_240701.csv +deepseek_chat_v2,LiveBench 240624,46.36,[],livebench_240701.csv +command_r_plus,LiveBench 240624,32.86,[],livebench_240701.csv +command_r,LiveBench 240624,27.23,[],livebench_240701.csv +claude_3_sonnet_20240229,LiveBench 240624,38.08,[],livebench_240701.csv +claude_3_opus_20240229,LiveBench 240624,50.75,[],livebench_240701.csv +claude_3_haiku_20240307,LiveBench 240624,35.32,[],livebench_240701.csv +claude_3_5_sonnet_20240620,LiveBench 240624,61.16,[],livebench_240701.csv +chatgpt_4o_latest,LiveBench 240624,55.35,[],livebench_240701.csv +zephyr_7b_beta,LiveBench Reasoning Average,16.0,[],livebench_240701.csv +zephyr_7b_alpha,LiveBench Reasoning Average,17.0,[],livebench_240701.csv +yi_6b_chat,LiveBench Reasoning Average,8.0,[],livebench_240701.csv +vicuna_7b_v1_5_16k,LiveBench Reasoning Average,15.0,[],livebench_240701.csv +vicuna_7b_v1_5,LiveBench Reasoning Average,12.0,[],livebench_240701.csv +starling_lm_7b_beta,LiveBench Reasoning Average,19.0,[],livebench_240701.csv +smaug_qwen2_72b_instruct,LiveBench Reasoning Average,37.0,[],livebench_240701.csv +qwen2_72b_instruct,LiveBench Reasoning Average,42.0,[],livebench_240701.csv +qwen2_7b_instruct,LiveBench Reasoning Average,20.0,[],livebench_240701.csv +qwen2_1_5b_instruct,LiveBench Reasoning Average,8.0,[],livebench_240701.csv +qwen2_0_5b_instruct,LiveBench Reasoning Average,3.0,[],livebench_240701.csv +qwen1_5_110b_chat,LiveBench Reasoning Average,26.0,[],livebench_240701.csv +qwen1_5_72b_chat,LiveBench Reasoning Average,21.0,[],livebench_240701.csv +qwen1_5_7b_chat,LiveBench Reasoning Average,13.0,[],livebench_240701.csv +qwen1_5_4b_chat,LiveBench Reasoning Average,13.0,[],livebench_240701.csv +qwen1_5_1_8b_chat,LiveBench Reasoning Average,5.0,[],livebench_240701.csv +qwen1_5_0_5b_chat,LiveBench Reasoning Average,4.0,[],livebench_240701.csv +phi_3_5_moe_instruct,LiveBench Reasoning Average,41.0,[],livebench_240701.csv +phi_3_5_mini_instruct,LiveBench Reasoning Average,31.0,[],livebench_240701.csv +phi_3_small_128k_instruct,LiveBench Reasoning Average,28.0,[],livebench_240701.csv +phi_3_small_8k_instruct,LiveBench Reasoning Average,29.0,[],livebench_240701.csv +phi_3_mini_128k_instruct,LiveBench Reasoning Average,24.0,[],livebench_240701.csv +phi_3_mini_4k_instruct,LiveBench Reasoning Average,22.0,[],livebench_240701.csv +phi_3_medium_128k_instruct,LiveBench Reasoning Average,31.0,[],livebench_240701.csv +phi_3_medium_4k_instruct,LiveBench Reasoning Average,35.0,[],livebench_240701.csv +openhermes_2_5_mistral_7b,LiveBench Reasoning Average,17.0,[],livebench_240701.csv +open_mistral_nemo,LiveBench Reasoning Average,25.0,[],livebench_240701.csv +mixtral_8x22b_instruct_v0_1,LiveBench Reasoning Average,29.0,[],livebench_240701.csv +mixtral_8x7b_instruct_v0_1,LiveBench Reasoning Average,18.0,[],livebench_240701.csv +mistral_small_2402,LiveBench Reasoning Average,28.0,[],livebench_240701.csv +mistral_large_2407,LiveBench Reasoning Average,45.0,[],livebench_240701.csv +mistral_large_2402,LiveBench Reasoning Average,35.0,[],livebench_240701.csv +mistral_7b_instruct_v0_3,LiveBench Reasoning Average,11.0,[],livebench_240701.csv +mistral_7b_instruct_v0_2,LiveBench Reasoning Average,13.0,[],livebench_240701.csv +llama3_1_405b_instruct_turbo,LiveBench Reasoning Average,57.0,[],livebench_240701.csv +llama3_1_70b_instruct_turbo,LiveBench Reasoning Average,43.0,[],livebench_240701.csv +llama3_1_8b_instruct_turbo,LiveBench Reasoning Average,14.0,[],livebench_240701.csv +llama3_70b_instruct,LiveBench Reasoning Average,31.0,[],livebench_240701.csv +llama3_8b_instruct,LiveBench Reasoning Average,25.0,[],livebench_240701.csv +mathstral_7b_v0_1,LiveBench Reasoning Average,16.0,[],livebench_240701.csv +llama_2_7b_chat,LiveBench Reasoning Average,5.0,[],livebench_240701.csv +hermes_3_llama3_1_70b,LiveBench Reasoning Average,32.0,[],livebench_240701.csv +gpt_4o_mini_2024_07_18,LiveBench Reasoning Average,37.0,[],livebench_240701.csv +gpt_4o_2024_08_06,LiveBench Reasoning Average,54.0,[],livebench_240701.csv +gpt_4o_2024_05_13,LiveBench Reasoning Average,55.0,[],livebench_240701.csv +gpt_4_turbo_2024_04_09,LiveBench Reasoning Average,54.0,[],livebench_240701.csv +gpt_4_0613,LiveBench Reasoning Average,31.0,[],livebench_240701.csv +gpt_4_0125_preview,LiveBench Reasoning Average,48.0,[],livebench_240701.csv +gpt_3_5_turbo_0125,LiveBench Reasoning Average,26.0,[],livebench_240701.csv +gemma_2_27b_it,LiveBench Reasoning Average,31.0,[],livebench_240701.csv +gemma_2_9b_it,LiveBench Reasoning Average,19.0,[],livebench_240701.csv +gemma_1_1_7b_it,LiveBench Reasoning Average,10.0,[],livebench_240701.csv +gemini_1_5_pro_exp_0827,LiveBench Reasoning Average,56.0,[],livebench_240701.csv +gemini_1_5_pro_exp_0801,LiveBench Reasoning Average,55.0,[],livebench_240701.csv +gemini_1_5_pro_api_0514,LiveBench Reasoning Average,33.0,[],livebench_240701.csv +gemini_1_5_flash_exp_0827,LiveBench Reasoning Average,52.0,[],livebench_240701.csv +gemini_1_5_flash_api_0514,LiveBench Reasoning Average,30.0,[],livebench_240701.csv +dracarys_llama3_1_70b_instruct,LiveBench Reasoning Average,50.0,[],livebench_240701.csv +dracarys_72b_instruct,LiveBench Reasoning Average,41.0,[],livebench_240701.csv +deepseek_v2_lite_chat,LiveBench Reasoning Average,13.0,[],livebench_240701.csv +deepseek_coder_v2_lite_instruct,LiveBench Reasoning Average,22.0,[],livebench_240701.csv +deepseek_coder_v2,LiveBench Reasoning Average,49.0,[],livebench_240701.csv +deepseek_chat_v2,LiveBench Reasoning Average,41.0,[],livebench_240701.csv +command_r_plus,LiveBench Reasoning Average,32.0,[],livebench_240701.csv +command_r,LiveBench Reasoning Average,28.0,[],livebench_240701.csv +claude_3_sonnet_20240229,LiveBench Reasoning Average,26.0,[],livebench_240701.csv +claude_3_opus_20240229,LiveBench Reasoning Average,41.0,[],livebench_240701.csv +claude_3_haiku_20240307,LiveBench Reasoning Average,26.0,[],livebench_240701.csv +claude_3_5_sonnet_20240620,LiveBench Reasoning Average,64.0,[],livebench_240701.csv +chatgpt_4o_latest,LiveBench Reasoning Average,57.0,[],livebench_240701.csv +zephyr_7b_beta,LiveBench Coding Average,8.32,[],livebench_240701.csv +zephyr_7b_alpha,LiveBench Coding Average,11.32,[],livebench_240701.csv +yi_6b_chat,LiveBench Coding Average,1.32,[],livebench_240701.csv +vicuna_7b_v1_5_16k,LiveBench Coding Average,1.32,[],livebench_240701.csv +vicuna_7b_v1_5,LiveBench Coding Average,1.0,[],livebench_240701.csv +starling_lm_7b_beta,LiveBench Coding Average,18.26,[],livebench_240701.csv +smaug_qwen2_72b_instruct,LiveBench Coding Average,39.05,[],livebench_240701.csv +qwen2_72b_instruct,LiveBench Coding Average,31.79,[],livebench_240701.csv +qwen2_7b_instruct,LiveBench Coding Average,29.21,[],livebench_240701.csv +qwen2_1_5b_instruct,LiveBench Coding Average,5.63,[],livebench_240701.csv +qwen2_0_5b_instruct,LiveBench Coding Average,2.0,[],livebench_240701.csv +qwen1_5_110b_chat,LiveBench Coding Average,22.21,[],livebench_240701.csv +qwen1_5_72b_chat,LiveBench Coding Average,22.89,[],livebench_240701.csv +qwen1_5_7b_chat,LiveBench Coding Average,6.63,[],livebench_240701.csv +qwen1_5_4b_chat,LiveBench Coding Average,4.0,[],livebench_240701.csv +qwen1_5_1_8b_chat,LiveBench Coding Average,0.0,[],livebench_240701.csv +qwen1_5_0_5b_chat,LiveBench Coding Average,0.0,[],livebench_240701.csv +phi_3_5_moe_instruct,LiveBench Coding Average,19.26,[],livebench_240701.csv +phi_3_5_mini_instruct,LiveBench Coding Average,15.26,[],livebench_240701.csv +phi_3_small_128k_instruct,LiveBench Coding Average,24.87,[],livebench_240701.csv +phi_3_small_8k_instruct,LiveBench Coding Average,21.24,[],livebench_240701.csv +phi_3_mini_128k_instruct,LiveBench Coding Average,14.29,[],livebench_240701.csv +phi_3_mini_4k_instruct,LiveBench Coding Average,14.79,[],livebench_240701.csv +phi_3_medium_128k_instruct,LiveBench Coding Average,21.58,[],livebench_240701.csv +phi_3_medium_4k_instruct,LiveBench Coding Average,20.58,[],livebench_240701.csv +openhermes_2_5_mistral_7b,LiveBench Coding Average,11.63,[],livebench_240701.csv +open_mistral_nemo,LiveBench Coding Average,28.16,[],livebench_240701.csv +mixtral_8x22b_instruct_v0_1,LiveBench Coding Average,33.11,[],livebench_240701.csv +mixtral_8x7b_instruct_v0_1,LiveBench Coding Average,11.32,[],livebench_240701.csv +mistral_small_2402,LiveBench Coding Average,24.21,[],livebench_240701.csv +mistral_large_2407,LiveBench Coding Average,46.37,[],livebench_240701.csv +mistral_large_2402,LiveBench Coding Average,26.84,[],livebench_240701.csv +mistral_7b_instruct_v0_3,LiveBench Coding Average,9.0,[],livebench_240701.csv +mistral_7b_instruct_v0_2,LiveBench Coding Average,11.63,[],livebench_240701.csv +llama3_1_405b_instruct_turbo,LiveBench Coding Average,45.68,[],livebench_240701.csv +llama3_1_70b_instruct_turbo,LiveBench Coding Average,33.11,[],livebench_240701.csv +llama3_1_8b_instruct_turbo,LiveBench Coding Average,21.58,[],livebench_240701.csv +llama3_70b_instruct,LiveBench Coding Average,20.95,[],livebench_240701.csv +llama3_8b_instruct,LiveBench Coding Average,18.26,[],livebench_240701.csv +mathstral_7b_v0_1,LiveBench Coding Average,15.63,[],livebench_240701.csv +llama_2_7b_chat,LiveBench Coding Average,0.0,[],livebench_240701.csv +hermes_3_llama3_1_70b,LiveBench Coding Average,29.79,[],livebench_240701.csv +gpt_4o_mini_2024_07_18,LiveBench Coding Average,43.37,[],livebench_240701.csv +gpt_4o_2024_08_06,LiveBench Coding Average,50.63,[],livebench_240701.csv +gpt_4o_2024_05_13,LiveBench Coding Average,46.37,[],livebench_240701.csv +gpt_4_turbo_2024_04_09,LiveBench Coding Average,47.05,[],livebench_240701.csv +gpt_4_0613,LiveBench Coding Average,37.05,[],livebench_240701.csv +gpt_4_0125_preview,LiveBench Coding Average,44.05,[],livebench_240701.csv +gpt_3_5_turbo_0125,LiveBench Coding Average,29.16,[],livebench_240701.csv +gemma_2_27b_it,LiveBench Coding Average,36.74,[],livebench_240701.csv +gemma_2_9b_it,LiveBench Coding Average,22.21,[],livebench_240701.csv +gemma_1_1_7b_it,LiveBench Coding Average,11.0,[],livebench_240701.csv +gemini_1_5_pro_exp_0827,LiveBench Coding Average,42.0,[],livebench_240701.csv +gemini_1_5_pro_exp_0801,LiveBench Coding Average,43.37,[],livebench_240701.csv +gemini_1_5_pro_api_0514,LiveBench Coding Average,32.79,[],livebench_240701.csv +gemini_1_5_flash_exp_0827,LiveBench Coding Average,39.74,[],livebench_240701.csv +gemini_1_5_flash_api_0514,LiveBench Coding Average,39.05,[],livebench_240701.csv +dracarys_llama3_1_70b_instruct,LiveBench Coding Average,36.11,[],livebench_240701.csv +dracarys_72b_instruct,LiveBench Coding Average,41.05,[],livebench_240701.csv +deepseek_v2_lite_chat,LiveBench Coding Average,8.63,[],livebench_240701.csv +deepseek_coder_v2_lite_instruct,LiveBench Coding Average,26.84,[],livebench_240701.csv +deepseek_coder_v2,LiveBench Coding Average,41.05,[],livebench_240701.csv +deepseek_chat_v2,LiveBench Coding Average,42.05,[],livebench_240701.csv +command_r_plus,LiveBench Coding Average,20.26,[],livebench_240701.csv +command_r,LiveBench Coding Average,14.95,[],livebench_240701.csv +claude_3_sonnet_20240229,LiveBench Coding Average,25.21,[],livebench_240701.csv +claude_3_opus_20240229,LiveBench Coding Average,40.05,[],livebench_240701.csv +claude_3_haiku_20240307,LiveBench Coding Average,24.53,[],livebench_240701.csv +claude_3_5_sonnet_20240620,LiveBench Coding Average,63.21,[],livebench_240701.csv +chatgpt_4o_latest,LiveBench Coding Average,46.0,[],livebench_240701.csv +zephyr_7b_beta,LiveBench Mathematics Average,11.23,[],livebench_240701.csv +zephyr_7b_alpha,LiveBench Mathematics Average,9.96,[],livebench_240701.csv +yi_6b_chat,LiveBench Mathematics Average,8.53,[],livebench_240701.csv +vicuna_7b_v1_5_16k,LiveBench Mathematics Average,9.04,[],livebench_240701.csv +vicuna_7b_v1_5,LiveBench Mathematics Average,7.1,[],livebench_240701.csv +starling_lm_7b_beta,LiveBench Mathematics Average,14.86,[],livebench_240701.csv +smaug_qwen2_72b_instruct,LiveBench Mathematics Average,40.67,[],livebench_240701.csv +qwen2_72b_instruct,LiveBench Mathematics Average,43.44,[],livebench_240701.csv +qwen2_7b_instruct,LiveBench Mathematics Average,26.87,[],livebench_240701.csv +qwen2_1_5b_instruct,LiveBench Mathematics Average,9.94,[],livebench_240701.csv +qwen2_0_5b_instruct,LiveBench Mathematics Average,7.35,[],livebench_240701.csv +qwen1_5_110b_chat,LiveBench Mathematics Average,26.28,[],livebench_240701.csv +qwen1_5_72b_chat,LiveBench Mathematics Average,26.82,[],livebench_240701.csv +qwen1_5_7b_chat,LiveBench Mathematics Average,15.29,[],livebench_240701.csv +qwen1_5_4b_chat,LiveBench Mathematics Average,9.86,[],livebench_240701.csv +qwen1_5_1_8b_chat,LiveBench Mathematics Average,3.53,[],livebench_240701.csv +qwen1_5_0_5b_chat,LiveBench Mathematics Average,4.43,[],livebench_240701.csv +phi_3_5_moe_instruct,LiveBench Mathematics Average,33.3,[],livebench_240701.csv +phi_3_5_mini_instruct,LiveBench Mathematics Average,22.2,[],livebench_240701.csv +phi_3_small_128k_instruct,LiveBench Mathematics Average,28.97,[],livebench_240701.csv +phi_3_small_8k_instruct,LiveBench Mathematics Average,23.73,[],livebench_240701.csv +phi_3_mini_128k_instruct,LiveBench Mathematics Average,17.06,[],livebench_240701.csv +phi_3_mini_4k_instruct,LiveBench Mathematics Average,20.84,[],livebench_240701.csv +phi_3_medium_128k_instruct,LiveBench Mathematics Average,25.64,[],livebench_240701.csv +phi_3_medium_4k_instruct,LiveBench Mathematics Average,31.36,[],livebench_240701.csv +openhermes_2_5_mistral_7b,LiveBench Mathematics Average,20.45,[],livebench_240701.csv +open_mistral_nemo,LiveBench Mathematics Average,21.66,[],livebench_240701.csv +mixtral_8x22b_instruct_v0_1,LiveBench Mathematics Average,28.33,[],livebench_240701.csv +mixtral_8x7b_instruct_v0_1,LiveBench Mathematics Average,20.71,[],livebench_240701.csv +mistral_small_2402,LiveBench Mathematics Average,28.15,[],livebench_240701.csv +mistral_large_2407,LiveBench Mathematics Average,40.48,[],livebench_240701.csv +mistral_large_2402,LiveBench Mathematics Average,32.2,[],livebench_240701.csv +mistral_7b_instruct_v0_3,LiveBench Mathematics Average,14.56,[],livebench_240701.csv +mistral_7b_instruct_v0_2,LiveBench Mathematics Average,17.08,[],livebench_240701.csv +llama3_1_405b_instruct_turbo,LiveBench Mathematics Average,46.55,[],livebench_240701.csv +llama3_1_70b_instruct_turbo,LiveBench Mathematics Average,45.58,[],livebench_240701.csv +llama3_1_8b_instruct_turbo,LiveBench Mathematics Average,24.37,[],livebench_240701.csv +llama3_70b_instruct,LiveBench Mathematics Average,32.31,[],livebench_240701.csv +llama3_8b_instruct,LiveBench Mathematics Average,19.66,[],livebench_240701.csv +mathstral_7b_v0_1,LiveBench Mathematics Average,17.84,[],livebench_240701.csv +llama_2_7b_chat,LiveBench Mathematics Average,4.78,[],livebench_240701.csv +hermes_3_llama3_1_70b,LiveBench Mathematics Average,28.32,[],livebench_240701.csv +gpt_4o_mini_2024_07_18,LiveBench Mathematics Average,41.58,[],livebench_240701.csv +gpt_4o_2024_08_06,LiveBench Mathematics Average,52.29,[],livebench_240701.csv +gpt_4o_2024_05_13,LiveBench Mathematics Average,49.88,[],livebench_240701.csv +gpt_4_turbo_2024_04_09,LiveBench Mathematics Average,48.99,[],livebench_240701.csv +gpt_4_0613,LiveBench Mathematics Average,36.22,[],livebench_240701.csv +gpt_4_0125_preview,LiveBench Mathematics Average,42.75,[],livebench_240701.csv +gpt_3_5_turbo_0125,LiveBench Mathematics Average,26.93,[],livebench_240701.csv +gemma_2_27b_it,LiveBench Mathematics Average,36.23,[],livebench_240701.csv +gemma_2_9b_it,LiveBench Mathematics Average,23.98,[],livebench_240701.csv +gemma_1_1_7b_it,LiveBench Mathematics Average,15.21,[],livebench_240701.csv +gemini_1_5_pro_exp_0827,LiveBench Mathematics Average,56.28,[],livebench_240701.csv +gemini_1_5_pro_exp_0801,LiveBench Mathematics Average,47.46,[],livebench_240701.csv +gemini_1_5_pro_api_0514,LiveBench Mathematics Average,42.42,[],livebench_240701.csv +gemini_1_5_flash_exp_0827,LiveBench Mathematics Average,36.29,[],livebench_240701.csv +gemini_1_5_flash_api_0514,LiveBench Mathematics Average,38.89,[],livebench_240701.csv +dracarys_llama3_1_70b_instruct,LiveBench Mathematics Average,45.68,[],livebench_240701.csv +dracarys_72b_instruct,LiveBench Mathematics Average,42.77,[],livebench_240701.csv +deepseek_v2_lite_chat,LiveBench Mathematics Average,14.08,[],livebench_240701.csv +deepseek_coder_v2_lite_instruct,LiveBench Mathematics Average,34.44,[],livebench_240701.csv +deepseek_coder_v2,LiveBench Mathematics Average,52.54,[],livebench_240701.csv +deepseek_chat_v2,LiveBench Mathematics Average,52.11,[],livebench_240701.csv +command_r_plus,LiveBench Mathematics Average,24.85,[],livebench_240701.csv +command_r,LiveBench Mathematics Average,16.92,[],livebench_240701.csv +claude_3_sonnet_20240229,LiveBench Mathematics Average,29.65,[],livebench_240701.csv +claude_3_opus_20240229,LiveBench Mathematics Average,46.54,[],livebench_240701.csv +claude_3_haiku_20240307,LiveBench Mathematics Average,25.72,[],livebench_240701.csv +claude_3_5_sonnet_20240620,LiveBench Mathematics Average,53.75,[],livebench_240701.csv +chatgpt_4o_latest,LiveBench Mathematics Average,52.19,[],livebench_240701.csv +zephyr_7b_beta,LiveBench Data Analysis Average,15.75,[],livebench_240701.csv +zephyr_7b_alpha,LiveBench Data Analysis Average,17.4,[],livebench_240701.csv +yi_6b_chat,LiveBench Data Analysis Average,4.38,[],livebench_240701.csv +vicuna_7b_v1_5_16k,LiveBench Data Analysis Average,9.93,[],livebench_240701.csv +vicuna_7b_v1_5,LiveBench Data Analysis Average,3.33,[],livebench_240701.csv +starling_lm_7b_beta,LiveBench Data Analysis Average,2.0,[],livebench_240701.csv +smaug_qwen2_72b_instruct,LiveBench Data Analysis Average,26.19,[],livebench_240701.csv +qwen2_72b_instruct,LiveBench Data Analysis Average,26.24,[],livebench_240701.csv +qwen2_7b_instruct,LiveBench Data Analysis Average,28.75,[],livebench_240701.csv +qwen2_1_5b_instruct,LiveBench Data Analysis Average,10.01,[],livebench_240701.csv +qwen2_0_5b_instruct,LiveBench Data Analysis Average,2.0,[],livebench_240701.csv +qwen1_5_110b_chat,LiveBench Data Analysis Average,31.45,[],livebench_240701.csv +qwen1_5_72b_chat,LiveBench Data Analysis Average,32.98,[],livebench_240701.csv +qwen1_5_7b_chat,LiveBench Data Analysis Average,16.9,[],livebench_240701.csv +qwen1_5_4b_chat,LiveBench Data Analysis Average,9.13,[],livebench_240701.csv +qwen1_5_1_8b_chat,LiveBench Data Analysis Average,3.33,[],livebench_240701.csv +qwen1_5_0_5b_chat,LiveBench Data Analysis Average,0.0,[],livebench_240701.csv +phi_3_5_moe_instruct,LiveBench Data Analysis Average,40.46,[],livebench_240701.csv +phi_3_5_mini_instruct,LiveBench Data Analysis Average,30.43,[],livebench_240701.csv +phi_3_small_128k_instruct,LiveBench Data Analysis Average,27.26,[],livebench_240701.csv +phi_3_small_8k_instruct,LiveBench Data Analysis Average,29.62,[],livebench_240701.csv +phi_3_mini_128k_instruct,LiveBench Data Analysis Average,34.02,[],livebench_240701.csv +phi_3_mini_4k_instruct,LiveBench Data Analysis Average,29.55,[],livebench_240701.csv +phi_3_medium_128k_instruct,LiveBench Data Analysis Average,32.12,[],livebench_240701.csv +phi_3_medium_4k_instruct,LiveBench Data Analysis Average,31.63,[],livebench_240701.csv +openhermes_2_5_mistral_7b,LiveBench Data Analysis Average,26.92,[],livebench_240701.csv +open_mistral_nemo,LiveBench Data Analysis Average,33.35,[],livebench_240701.csv +mixtral_8x22b_instruct_v0_1,LiveBench Data Analysis Average,31.67,[],livebench_240701.csv +mixtral_8x7b_instruct_v0_1,LiveBench Data Analysis Average,28.13,[],livebench_240701.csv +mistral_small_2402,LiveBench Data Analysis Average,31.88,[],livebench_240701.csv +mistral_large_2407,LiveBench Data Analysis Average,46.61,[],livebench_240701.csv +mistral_large_2402,LiveBench Data Analysis Average,42.55,[],livebench_240701.csv +mistral_7b_instruct_v0_3,LiveBench Data Analysis Average,21.77,[],livebench_240701.csv +mistral_7b_instruct_v0_2,LiveBench Data Analysis Average,14.62,[],livebench_240701.csv +llama3_1_405b_instruct_turbo,LiveBench Data Analysis Average,53.51,[],livebench_240701.csv +llama3_1_70b_instruct_turbo,LiveBench Data Analysis Average,50.29,[],livebench_240701.csv +llama3_1_8b_instruct_turbo,LiveBench Data Analysis Average,32.15,[],livebench_240701.csv +llama3_70b_instruct,LiveBench Data Analysis Average,43.75,[],livebench_240701.csv +llama3_8b_instruct,LiveBench Data Analysis Average,26.0,[],livebench_240701.csv +mathstral_7b_v0_1,LiveBench Data Analysis Average,27.89,[],livebench_240701.csv +llama_2_7b_chat,LiveBench Data Analysis Average,0.0,[],livebench_240701.csv +hermes_3_llama3_1_70b,LiveBench Data Analysis Average,48.11,[],livebench_240701.csv +gpt_4o_mini_2024_07_18,LiveBench Data Analysis Average,44.52,[],livebench_240701.csv +gpt_4o_2024_08_06,LiveBench Data Analysis Average,52.89,[],livebench_240701.csv +gpt_4o_2024_05_13,LiveBench Data Analysis Average,52.41,[],livebench_240701.csv +gpt_4_turbo_2024_04_09,LiveBench Data Analysis Average,51.32,[],livebench_240701.csv +gpt_4_0613,LiveBench Data Analysis Average,44.03,[],livebench_240701.csv +gpt_4_0125_preview,LiveBench Data Analysis Average,54.06,[],livebench_240701.csv +gpt_3_5_turbo_0125,LiveBench Data Analysis Average,41.21,[],livebench_240701.csv +gemma_2_27b_it,LiveBench Data Analysis Average,43.58,[],livebench_240701.csv +gemma_2_9b_it,LiveBench Data Analysis Average,35.06,[],livebench_240701.csv +gemma_1_1_7b_it,LiveBench Data Analysis Average,18.17,[],livebench_240701.csv +gemini_1_5_pro_exp_0827,LiveBench Data Analysis Average,50.83,[],livebench_240701.csv +gemini_1_5_pro_exp_0801,LiveBench Data Analysis Average,50.15,[],livebench_240701.csv +gemini_1_5_pro_api_0514,LiveBench Data Analysis Average,52.81,[],livebench_240701.csv +gemini_1_5_flash_exp_0827,LiveBench Data Analysis Average,47.87,[],livebench_240701.csv +gemini_1_5_flash_api_0514,LiveBench Data Analysis Average,44.03,[],livebench_240701.csv +dracarys_llama3_1_70b_instruct,LiveBench Data Analysis Average,47.99,[],livebench_240701.csv +dracarys_72b_instruct,LiveBench Data Analysis Average,26.24,[],livebench_240701.csv +deepseek_v2_lite_chat,LiveBench Data Analysis Average,18.19,[],livebench_240701.csv +deepseek_coder_v2_lite_instruct,LiveBench Data Analysis Average,33.0,[],livebench_240701.csv +deepseek_coder_v2,LiveBench Data Analysis Average,38.25,[],livebench_240701.csv +deepseek_chat_v2,LiveBench Data Analysis Average,45.59,[],livebench_240701.csv +command_r_plus,LiveBench Data Analysis Average,24.6,[],livebench_240701.csv +command_r,LiveBench Data Analysis Average,31.69,[],livebench_240701.csv +claude_3_sonnet_20240229,LiveBench Data Analysis Average,44.56,[],livebench_240701.csv +claude_3_opus_20240229,LiveBench Data Analysis Average,54.32,[],livebench_240701.csv +claude_3_haiku_20240307,LiveBench Data Analysis Average,41.54,[],livebench_240701.csv +claude_3_5_sonnet_20240620,LiveBench Data Analysis Average,56.74,[],livebench_240701.csv +chatgpt_4o_latest,LiveBench Data Analysis Average,54.43,[],livebench_240701.csv +zephyr_7b_beta,LiveBench Language Average,4.28,[],livebench_240701.csv +zephyr_7b_alpha,LiveBench Language Average,7.2,[],livebench_240701.csv +yi_6b_chat,LiveBench Language Average,4.69,[],livebench_240701.csv +vicuna_7b_v1_5_16k,LiveBench Language Average,7.92,[],livebench_240701.csv +vicuna_7b_v1_5,LiveBench Language Average,8.66,[],livebench_240701.csv +starling_lm_7b_beta,LiveBench Language Average,7.26,[],livebench_240701.csv +smaug_qwen2_72b_instruct,LiveBench Language Average,30.03,[],livebench_240701.csv +qwen2_72b_instruct,LiveBench Language Average,29.21,[],livebench_240701.csv +qwen2_7b_instruct,LiveBench Language Average,10.21,[],livebench_240701.csv +qwen2_1_5b_instruct,LiveBench Language Average,3.05,[],livebench_240701.csv +qwen2_0_5b_instruct,LiveBench Language Average,2.8,[],livebench_240701.csv +qwen1_5_110b_chat,LiveBench Language Average,13.22,[],livebench_240701.csv +qwen1_5_72b_chat,LiveBench Language Average,11.37,[],livebench_240701.csv +qwen1_5_7b_chat,LiveBench Language Average,6.18,[],livebench_240701.csv +qwen1_5_4b_chat,LiveBench Language Average,5.8,[],livebench_240701.csv +qwen1_5_1_8b_chat,LiveBench Language Average,3.16,[],livebench_240701.csv +qwen1_5_0_5b_chat,LiveBench Language Average,2.88,[],livebench_240701.csv +phi_3_5_moe_instruct,LiveBench Language Average,17.07,[],livebench_240701.csv +phi_3_5_mini_instruct,LiveBench Language Average,9.67,[],livebench_240701.csv +phi_3_small_128k_instruct,LiveBench Language Average,15.53,[],livebench_240701.csv +phi_3_small_8k_instruct,LiveBench Language Average,15.13,[],livebench_240701.csv +phi_3_mini_128k_instruct,LiveBench Language Average,7.76,[],livebench_240701.csv +phi_3_mini_4k_instruct,LiveBench Language Average,8.06,[],livebench_240701.csv +phi_3_medium_128k_instruct,LiveBench Language Average,12.76,[],livebench_240701.csv +phi_3_medium_4k_instruct,LiveBench Language Average,13.91,[],livebench_240701.csv +openhermes_2_5_mistral_7b,LiveBench Language Average,11.37,[],livebench_240701.csv +open_mistral_nemo,LiveBench Language Average,14.15,[],livebench_240701.csv +mixtral_8x22b_instruct_v0_1,LiveBench Language Average,26.48,[],livebench_240701.csv +mixtral_8x7b_instruct_v0_1,LiveBench Language Average,13.76,[],livebench_240701.csv +mistral_small_2402,LiveBench Language Average,22.06,[],livebench_240701.csv +mistral_large_2407,LiveBench Language Average,39.79,[],livebench_240701.csv +mistral_large_2402,LiveBench Language Average,28.74,[],livebench_240701.csv +mistral_7b_instruct_v0_3,LiveBench Language Average,11.85,[],livebench_240701.csv +mistral_7b_instruct_v0_2,LiveBench Language Average,9.05,[],livebench_240701.csv +llama3_1_405b_instruct_turbo,LiveBench Language Average,49.85,[],livebench_240701.csv +llama3_1_70b_instruct_turbo,LiveBench Language Average,42.36,[],livebench_240701.csv +llama3_1_8b_instruct_turbo,LiveBench Language Average,20.05,[],livebench_240701.csv +llama3_70b_instruct,LiveBench Language Average,34.11,[],livebench_240701.csv +llama3_8b_instruct,LiveBench Language Average,18.72,[],livebench_240701.csv +mathstral_7b_v0_1,LiveBench Language Average,15.37,[],livebench_240701.csv +llama_2_7b_chat,LiveBench Language Average,6.86,[],livebench_240701.csv +hermes_3_llama3_1_70b,LiveBench Language Average,43.77,[],livebench_240701.csv +gpt_4o_mini_2024_07_18,LiveBench Language Average,35.28,[],livebench_240701.csv +gpt_4o_2024_08_06,LiveBench Language Average,54.37,[],livebench_240701.csv +gpt_4o_2024_05_13,LiveBench Language Average,53.94,[],livebench_240701.csv +gpt_4_turbo_2024_04_09,LiveBench Language Average,45.26,[],livebench_240701.csv +gpt_4_0613,LiveBench Language Average,49.57,[],livebench_240701.csv +gpt_4_0125_preview,LiveBench Language Average,43.55,[],livebench_240701.csv +gpt_3_5_turbo_0125,LiveBench Language Average,24.22,[],livebench_240701.csv +gemma_2_27b_it,LiveBench Language Average,32.4,[],livebench_240701.csv +gemma_2_9b_it,LiveBench Language Average,27.64,[],livebench_240701.csv +gemma_1_1_7b_it,LiveBench Language Average,10.65,[],livebench_240701.csv +gemini_1_5_pro_exp_0827,LiveBench Language Average,49.31,[],livebench_240701.csv +gemini_1_5_pro_exp_0801,LiveBench Language Average,46.96,[],livebench_240701.csv +gemini_1_5_pro_api_0514,LiveBench Language Average,38.25,[],livebench_240701.csv +gemini_1_5_flash_exp_0827,LiveBench Language Average,31.04,[],livebench_240701.csv +gemini_1_5_flash_api_0514,LiveBench Language Average,30.69,[],livebench_240701.csv +dracarys_llama3_1_70b_instruct,LiveBench Language Average,41.77,[],livebench_240701.csv +dracarys_72b_instruct,LiveBench Language Average,31.17,[],livebench_240701.csv +deepseek_v2_lite_chat,LiveBench Language Average,9.2,[],livebench_240701.csv +deepseek_coder_v2_lite_instruct,LiveBench Language Average,10.64,[],livebench_240701.csv +deepseek_coder_v2,LiveBench Language Average,33.04,[],livebench_240701.csv +deepseek_chat_v2,LiveBench Language Average,32.77,[],livebench_240701.csv +command_r_plus,LiveBench Language Average,23.92,[],livebench_240701.csv +command_r,LiveBench Language Average,14.64,[],livebench_240701.csv +claude_3_sonnet_20240229,LiveBench Language Average,38.08,[],livebench_240701.csv +claude_3_opus_20240229,LiveBench Language Average,51.72,[],livebench_240701.csv +claude_3_haiku_20240307,LiveBench Language Average,30.07,[],livebench_240701.csv +claude_3_5_sonnet_20240620,LiveBench Language Average,56.94,[],livebench_240701.csv +chatgpt_4o_latest,LiveBench Language Average,49.95,[],livebench_240701.csv +zephyr_7b_beta,LiveBench Instruction Following Average,48.32,[],livebench_240701.csv +zephyr_7b_alpha,LiveBench Instruction Following Average,52.79,[],livebench_240701.csv +yi_6b_chat,LiveBench Instruction Following Average,27.22,[],livebench_240701.csv +vicuna_7b_v1_5_16k,LiveBench Instruction Following Average,42.12,[],livebench_240701.csv +vicuna_7b_v1_5,LiveBench Instruction Following Average,41.75,[],livebench_240701.csv +starling_lm_7b_beta,LiveBench Instruction Following Average,38.32,[],livebench_240701.csv +smaug_qwen2_72b_instruct,LiveBench Instruction Following Average,65.0,[],livebench_240701.csv +qwen2_72b_instruct,LiveBench Instruction Following Average,68.27,[],livebench_240701.csv +qwen2_7b_instruct,LiveBench Instruction Following Average,44.74,[],livebench_240701.csv +qwen2_1_5b_instruct,LiveBench Instruction Following Average,25.9,[],livebench_240701.csv +qwen2_0_5b_instruct,LiveBench Instruction Following Average,26.63,[],livebench_240701.csv +qwen1_5_110b_chat,LiveBench Instruction Following Average,55.26,[],livebench_240701.csv +qwen1_5_72b_chat,LiveBench Instruction Following Average,58.25,[],livebench_240701.csv +qwen1_5_7b_chat,LiveBench Instruction Following Average,44.12,[],livebench_240701.csv +qwen1_5_4b_chat,LiveBench Instruction Following Average,27.75,[],livebench_240701.csv +qwen1_5_1_8b_chat,LiveBench Instruction Following Average,22.9,[],livebench_240701.csv +qwen1_5_0_5b_chat,LiveBench Instruction Following Average,21.3,[],livebench_240701.csv +phi_3_5_moe_instruct,LiveBench Instruction Following Average,59.73,[],livebench_240701.csv +phi_3_5_mini_instruct,LiveBench Instruction Following Average,58.3,[],livebench_240701.csv +phi_3_small_128k_instruct,LiveBench Instruction Following Average,53.47,[],livebench_240701.csv +phi_3_small_8k_instruct,LiveBench Instruction Following Average,55.81,[],livebench_240701.csv +phi_3_mini_128k_instruct,LiveBench Instruction Following Average,51.4,[],livebench_240701.csv +phi_3_mini_4k_instruct,LiveBench Instruction Following Average,51.25,[],livebench_240701.csv +phi_3_medium_128k_instruct,LiveBench Instruction Following Average,56.15,[],livebench_240701.csv +phi_3_medium_4k_instruct,LiveBench Instruction Following Average,53.3,[],livebench_240701.csv +openhermes_2_5_mistral_7b,LiveBench Instruction Following Average,52.78,[],livebench_240701.csv +open_mistral_nemo,LiveBench Instruction Following Average,51.8,[],livebench_240701.csv +mixtral_8x22b_instruct_v0_1,LiveBench Instruction Following Average,63.17,[],livebench_240701.csv +mixtral_8x7b_instruct_v0_1,LiveBench Instruction Following Average,44.81,[],livebench_240701.csv +mistral_small_2402,LiveBench Instruction Following Average,63.91,[],livebench_240701.csv +mistral_large_2407,LiveBench Instruction Following Average,71.85,[],livebench_240701.csv +mistral_large_2402,LiveBench Instruction Following Average,68.19,[],livebench_240701.csv +mistral_7b_instruct_v0_3,LiveBench Instruction Following Average,52.37,[],livebench_240701.csv +mistral_7b_instruct_v0_2,LiveBench Instruction Following Average,51.65,[],livebench_240701.csv +llama3_1_405b_instruct_turbo,LiveBench Instruction Following Average,78.47,[],livebench_240701.csv +llama3_1_70b_instruct_turbo,LiveBench Instruction Following Average,79.08,[],livebench_240701.csv +llama3_1_8b_instruct_turbo,LiveBench Instruction Following Average,56.53,[],livebench_240701.csv +llama3_70b_instruct,LiveBench Instruction Following Average,63.5,[],livebench_240701.csv +llama3_8b_instruct,LiveBench Instruction Following Average,57.14,[],livebench_240701.csv +mathstral_7b_v0_1,LiveBench Instruction Following Average,53.25,[],livebench_240701.csv +llama_2_7b_chat,LiveBench Instruction Following Average,44.88,[],livebench_240701.csv +hermes_3_llama3_1_70b,LiveBench Instruction Following Average,55.37,[],livebench_240701.csv +gpt_4o_mini_2024_07_18,LiveBench Instruction Following Average,65.68,[],livebench_240701.csv +gpt_4o_2024_08_06,LiveBench Instruction Following Average,74.58,[],livebench_240701.csv +gpt_4o_2024_05_13,LiveBench Instruction Following Average,72.17,[],livebench_240701.csv +gpt_4_turbo_2024_04_09,LiveBench Instruction Following Average,71.39,[],livebench_240701.csv +gpt_4_0613,LiveBench Instruction Following Average,71.79,[],livebench_240701.csv +gpt_4_0125_preview,LiveBench Instruction Following Average,63.92,[],livebench_240701.csv +gpt_3_5_turbo_0125,LiveBench Instruction Following Average,60.47,[],livebench_240701.csv +gemma_2_27b_it,LiveBench Instruction Following Average,67.37,[],livebench_240701.csv +gemma_2_9b_it,LiveBench Instruction Following Average,61.55,[],livebench_240701.csv +gemma_1_1_7b_it,LiveBench Instruction Following Average,44.34,[],livebench_240701.csv +gemini_1_5_pro_exp_0827,LiveBench Instruction Following Average,75.95,[],livebench_240701.csv +gemini_1_5_pro_exp_0801,LiveBench Instruction Following Average,78.84,[],livebench_240701.csv +gemini_1_5_pro_api_0514,LiveBench Instruction Following Average,67.2,[],livebench_240701.csv +gemini_1_5_flash_exp_0827,LiveBench Instruction Following Average,78.11,[],livebench_240701.csv +gemini_1_5_flash_api_0514,LiveBench Instruction Following Average,63.01,[],livebench_240701.csv +dracarys_llama3_1_70b_instruct,LiveBench Instruction Following Average,77.37,[],livebench_240701.csv +dracarys_72b_instruct,LiveBench Instruction Following Average,68.08,[],livebench_240701.csv +deepseek_v2_lite_chat,LiveBench Instruction Following Average,41.83,[],livebench_240701.csv +deepseek_coder_v2_lite_instruct,LiveBench Instruction Following Average,48.34,[],livebench_240701.csv +deepseek_coder_v2,LiveBench Instruction Following Average,67.18,[],livebench_240701.csv +deepseek_chat_v2,LiveBench Instruction Following Average,64.61,[],livebench_240701.csv +command_r_plus,LiveBench Instruction Following Average,71.51,[],livebench_240701.csv +command_r,LiveBench Instruction Following Average,57.16,[],livebench_240701.csv +claude_3_sonnet_20240229,LiveBench Instruction Following Average,65.0,[],livebench_240701.csv +claude_3_opus_20240229,LiveBench Instruction Following Average,70.87,[],livebench_240701.csv +claude_3_haiku_20240307,LiveBench Instruction Following Average,64.03,[],livebench_240701.csv +claude_3_5_sonnet_20240620,LiveBench Instruction Following Average,72.3,[],livebench_240701.csv +chatgpt_4o_latest,LiveBench Instruction Following Average,72.52,[],livebench_240701.csv +abab5_5_chat,hydrox_integrity,8.09,,hydrox_safety_241001.csv +abab5_5_chat,hydrox_overall_score,6.6,,hydrox_safety_241001.csv +abab5_5_chat,hydrox_privacy,5.13,,hydrox_safety_241001.csv +abab5_5_chat,hydrox_safety,8.32,,hydrox_safety_241001.csv +abab5_5_chat,hydrox_security,4.85,,hydrox_safety_241001.csv +abab5_5s_chat,hydrox_integrity,19.46,,hydrox_safety_241001.csv +abab5_5s_chat,hydrox_overall_score,19.12,,hydrox_safety_241001.csv +abab5_5s_chat,hydrox_privacy,20.63,,hydrox_safety_241001.csv +abab5_5s_chat,hydrox_safety,22.54,,hydrox_safety_241001.csv +abab5_5s_chat,hydrox_security,14.17,,hydrox_safety_241001.csv +claude_3_5_sonnet,hydrox_integrity,95.56,,hydrox_safety_241001.csv +claude_3_5_sonnet,hydrox_overall_score,94.18,,hydrox_safety_241001.csv +claude_3_5_sonnet,hydrox_privacy,93.83,,hydrox_safety_241001.csv +claude_3_5_sonnet,hydrox_safety,94.75,,hydrox_safety_241001.csv +claude_3_5_sonnet,hydrox_security,92.61,,hydrox_safety_241001.csv +claude_3_haiku,hydrox_integrity,89.53,,hydrox_safety_241001.csv +claude_3_haiku,hydrox_overall_score,91.59,,hydrox_safety_241001.csv +claude_3_haiku,hydrox_privacy,93.69,,hydrox_safety_241001.csv +claude_3_haiku,hydrox_safety,91.52,,hydrox_safety_241001.csv +claude_3_haiku,hydrox_security,91.39,,hydrox_safety_241001.csv +claude_3_opus,hydrox_integrity,94.08,,hydrox_safety_241001.csv +claude_3_opus,hydrox_overall_score,92.02,,hydrox_safety_241001.csv +claude_3_opus,hydrox_privacy,91.26,,hydrox_safety_241001.csv +claude_3_opus,hydrox_safety,92.5,,hydrox_safety_241001.csv +claude_3_opus,hydrox_security,90.47,,hydrox_safety_241001.csv +claude_3_sonnet,hydrox_integrity,94.14,,hydrox_safety_241001.csv +claude_3_sonnet,hydrox_overall_score,93.62,,hydrox_safety_241001.csv +claude_3_sonnet,hydrox_privacy,94.36,,hydrox_safety_241001.csv +claude_3_sonnet,hydrox_safety,92.33,,hydrox_safety_241001.csv +claude_3_sonnet,hydrox_security,94.62,,hydrox_safety_241001.csv +deepseek_v2_chat_0628,hydrox_integrity,0.0,,hydrox_safety_241001.csv +deepseek_v2_chat_0628,hydrox_overall_score,50.0,,hydrox_safety_241001.csv +deepseek_v2_chat_0628,hydrox_privacy,0.0,,hydrox_safety_241001.csv +deepseek_v2_chat_0628,hydrox_safety,50.0,,hydrox_safety_241001.csv +deepseek_v2_chat_0628,hydrox_security,0.0,,hydrox_safety_241001.csv +deepseek_v2_lite_chat,hydrox_integrity,45.93,,hydrox_safety_241001.csv +deepseek_v2_lite_chat,hydrox_overall_score,44.91,,hydrox_safety_241001.csv +deepseek_v2_lite_chat,hydrox_privacy,48.84,,hydrox_safety_241001.csv +deepseek_v2_lite_chat,hydrox_safety,44.26,,hydrox_safety_241001.csv +deepseek_v2_lite_chat,hydrox_security,41.91,,hydrox_safety_241001.csv +dolly_v2_12b,hydrox_integrity,3.72,,hydrox_safety_241001.csv +dolly_v2_12b,hydrox_overall_score,6.21,,hydrox_safety_241001.csv +dolly_v2_12b,hydrox_privacy,3.48,,hydrox_safety_241001.csv +dolly_v2_12b,hydrox_safety,11.46,,hydrox_safety_241001.csv +dolly_v2_12b,hydrox_security,3.39,,hydrox_safety_241001.csv +dolly_v2_3b,hydrox_integrity,0.18,,hydrox_safety_241001.csv +dolly_v2_3b,hydrox_overall_score,1.81,,hydrox_safety_241001.csv +dolly_v2_3b,hydrox_privacy,1.08,,hydrox_safety_241001.csv +dolly_v2_3b,hydrox_safety,4.08,,hydrox_safety_241001.csv +dolly_v2_3b,hydrox_security,0.55,,hydrox_safety_241001.csv +dolly_v2_7b,hydrox_integrity,8.33,,hydrox_safety_241001.csv +dolly_v2_7b,hydrox_overall_score,7.79,,hydrox_safety_241001.csv +dolly_v2_7b,hydrox_privacy,8.33,,hydrox_safety_241001.csv +dolly_v2_7b,hydrox_safety,9.92,,hydrox_safety_241001.csv +dolly_v2_7b,hydrox_security,4.96,,hydrox_safety_241001.csv +falcon_40b,hydrox_integrity,0.64,,hydrox_safety_241001.csv +falcon_40b,hydrox_overall_score,0.9,,hydrox_safety_241001.csv +falcon_40b,hydrox_privacy,0.25,,hydrox_safety_241001.csv +falcon_40b,hydrox_safety,2.08,,hydrox_safety_241001.csv +falcon_40b,hydrox_security,0.4,,hydrox_safety_241001.csv +falcon_40b_instruct,hydrox_integrity,30.32,,hydrox_safety_241001.csv +falcon_40b_instruct,hydrox_overall_score,27.55,,hydrox_safety_241001.csv +falcon_40b_instruct,hydrox_privacy,30.83,,hydrox_safety_241001.csv +falcon_40b_instruct,hydrox_safety,28.1,,hydrox_safety_241001.csv +falcon_40b_instruct,hydrox_security,22.97,,hydrox_safety_241001.csv +falcon_7b,hydrox_integrity,0.23,,hydrox_safety_241001.csv +falcon_7b,hydrox_overall_score,0.51,,hydrox_safety_241001.csv +falcon_7b,hydrox_privacy,0.11,,hydrox_safety_241001.csv +falcon_7b,hydrox_safety,1.05,,hydrox_safety_241001.csv +falcon_7b,hydrox_security,0.43,,hydrox_safety_241001.csv +falcon_7b_instruct,hydrox_integrity,15.76,,hydrox_safety_241001.csv +falcon_7b_instruct,hydrox_overall_score,14.01,,hydrox_safety_241001.csv +falcon_7b_instruct,hydrox_privacy,11.3,,hydrox_safety_241001.csv +falcon_7b_instruct,hydrox_safety,14.64,,hydrox_safety_241001.csv +falcon_7b_instruct,hydrox_security,14.01,,hydrox_safety_241001.csv +gemini_1_0_pro,hydrox_integrity,87.11,,hydrox_safety_241001.csv +gemini_1_0_pro,hydrox_overall_score,77.2,,hydrox_safety_241001.csv +gemini_1_0_pro,hydrox_privacy,90.39,,hydrox_safety_241001.csv +gemini_1_0_pro,hydrox_safety,65.18,,hydrox_safety_241001.csv +gemini_1_0_pro,hydrox_security,79.93,,hydrox_safety_241001.csv +gemini_1_0_pro_latest,hydrox_integrity,88.61,,hydrox_safety_241001.csv +gemini_1_0_pro_latest,hydrox_overall_score,78.29,,hydrox_safety_241001.csv +gemini_1_0_pro_latest,hydrox_privacy,87.82,,hydrox_safety_241001.csv +gemini_1_0_pro_latest,hydrox_safety,69.2,,hydrox_safety_241001.csv +gemini_1_0_pro_latest,hydrox_security,77.91,,hydrox_safety_241001.csv +gemini_1_5_flash,hydrox_integrity,60.0,,hydrox_safety_241001.csv +gemini_1_5_flash,hydrox_overall_score,74.43,,hydrox_safety_241001.csv +gemini_1_5_flash,hydrox_privacy,83.33,,hydrox_safety_241001.csv +gemini_1_5_flash,hydrox_safety,77.61,,hydrox_safety_241001.csv +gemini_1_5_flash,hydrox_security,72.05,,hydrox_safety_241001.csv +gemini_1_5_pro,hydrox_integrity,40.84,,hydrox_safety_241001.csv +gemini_1_5_pro,hydrox_overall_score,43.27,,hydrox_safety_241001.csv +gemini_1_5_pro,hydrox_privacy,40.63,,hydrox_safety_241001.csv +gemini_1_5_pro,hydrox_safety,46.99,,hydrox_safety_241001.csv +gemini_1_5_pro,hydrox_security,41.65,,hydrox_safety_241001.csv +gemini_pro,hydrox_integrity,84.42,,hydrox_safety_241001.csv +gemini_pro,hydrox_overall_score,73.04,,hydrox_safety_241001.csv +gemini_pro,hydrox_privacy,90.6,,hydrox_safety_241001.csv +gemini_pro,hydrox_safety,63.56,,hydrox_safety_241001.csv +gemini_pro,hydrox_security,67.49,,hydrox_safety_241001.csv +gemma_2_27b_it,hydrox_integrity,10.94,,hydrox_safety_241001.csv +gemma_2_27b_it,hydrox_overall_score,9.67,,hydrox_safety_241001.csv +gemma_2_27b_it,hydrox_privacy,11.11,,hydrox_safety_241001.csv +gemma_2_27b_it,hydrox_safety,8.1,,hydrox_safety_241001.csv +gemma_2_27b_it,hydrox_security,10.0,,hydrox_safety_241001.csv +gemma_2_2b,hydrox_integrity,24.88,,hydrox_safety_241001.csv +gemma_2_2b,hydrox_overall_score,25.5,,hydrox_safety_241001.csv +gemma_2_2b,hydrox_privacy,27.04,,hydrox_safety_241001.csv +gemma_2_2b,hydrox_safety,25.61,,hydrox_safety_241001.csv +gemma_2_2b,hydrox_security,24.5,,hydrox_safety_241001.csv +gemma_2_2b_it,hydrox_integrity,93.14,,hydrox_safety_241001.csv +gemma_2_2b_it,hydrox_overall_score,91.66,,hydrox_safety_241001.csv +gemma_2_2b_it,hydrox_privacy,92.43,,hydrox_safety_241001.csv +gemma_2_2b_it,hydrox_safety,92.15,,hydrox_safety_241001.csv +gemma_2_2b_it,hydrox_security,89.22,,hydrox_safety_241001.csv +gemma_2b,hydrox_integrity,6.39,,hydrox_safety_241001.csv +gemma_2b,hydrox_overall_score,7.99,,hydrox_safety_241001.csv +gemma_2b,hydrox_privacy,8.27,,hydrox_safety_241001.csv +gemma_2b,hydrox_safety,8.55,,hydrox_safety_241001.csv +gemma_2b,hydrox_security,8.09,,hydrox_safety_241001.csv +gpt_3_5_turbo_0613,hydrox_integrity,80.84,,hydrox_safety_241001.csv +gpt_3_5_turbo_0613,hydrox_overall_score,72.04,,hydrox_safety_241001.csv +gpt_3_5_turbo_0613,hydrox_privacy,90.0,,hydrox_safety_241001.csv +gpt_3_5_turbo_0613,hydrox_safety,56.94,,hydrox_safety_241001.csv +gpt_3_5_turbo_0613,hydrox_security,93.43,,hydrox_safety_241001.csv +gpt_4_0314,hydrox_integrity,54.0,,hydrox_safety_241001.csv +gpt_4_0314,hydrox_overall_score,62.51,,hydrox_safety_241001.csv +gpt_4_0314,hydrox_privacy,76.67,,hydrox_safety_241001.csv +gpt_4_0314,hydrox_safety,56.36,,hydrox_safety_241001.csv +gpt_4_0314,hydrox_security,72.79,,hydrox_safety_241001.csv +gpt_4_0613,hydrox_integrity,96.04,,hydrox_safety_241001.csv +gpt_4_0613,hydrox_overall_score,85.43,,hydrox_safety_241001.csv +gpt_4_0613,hydrox_privacy,91.79,,hydrox_safety_241001.csv +gpt_4_0613,hydrox_safety,79.94,,hydrox_safety_241001.csv +gpt_4_0613,hydrox_security,92.0,,hydrox_safety_241001.csv +gpt_4o_2024_05_13,hydrox_integrity,63.54,,hydrox_safety_241001.csv +gpt_4o_2024_05_13,hydrox_overall_score,65.26,,hydrox_safety_241001.csv +gpt_4o_2024_05_13,hydrox_privacy,68.46,,hydrox_safety_241001.csv +gpt_4o_2024_05_13,hydrox_safety,67.11,,hydrox_safety_241001.csv +gpt_4o_2024_05_13,hydrox_security,60.89,,hydrox_safety_241001.csv +gpt_4o_mini_2024_07_18,hydrox_integrity,81.38,,hydrox_safety_241001.csv +gpt_4o_mini_2024_07_18,hydrox_overall_score,80.43,,hydrox_safety_241001.csv +gpt_4o_mini_2024_07_18,hydrox_privacy,82.32,,hydrox_safety_241001.csv +gpt_4o_mini_2024_07_18,hydrox_safety,80.87,,hydrox_safety_241001.csv +gpt_4o_mini_2024_07_18,hydrox_security,77.55,,hydrox_safety_241001.csv +h2ogpt_4096_llama2_70b_chat,hydrox_integrity,65.75,,hydrox_safety_241001.csv +h2ogpt_4096_llama2_70b_chat,hydrox_overall_score,63.67,,hydrox_safety_241001.csv +h2ogpt_4096_llama2_70b_chat,hydrox_privacy,73.46,,hydrox_safety_241001.csv +h2ogpt_4096_llama2_70b_chat,hydrox_safety,63.64,,hydrox_safety_241001.csv +h2ogpt_4096_llama2_70b_chat,hydrox_security,63.34,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full,hydrox_integrity,5.96,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full,hydrox_overall_score,7.64,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full,hydrox_privacy,6.16,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full,hydrox_safety,11.03,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full,hydrox_security,5.1,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full_3_epoch,hydrox_integrity,35.51,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full_3_epoch,hydrox_overall_score,27.81,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full_3_epoch,hydrox_privacy,32.34,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full_3_epoch,hydrox_safety,22.95,,hydrox_safety_241001.csv +hydro_safe_dolly_v2_7b_dpo_full_3_epoch,hydrox_security,25.64,,hydrox_safety_241001.csv +hydro_safe_llama2_7b_chat_dpo_full_3_epoch,hydrox_integrity,84.27,,hydrox_safety_241001.csv +hydro_safe_llama2_7b_chat_dpo_full_3_epoch,hydrox_overall_score,83.93,,hydrox_safety_241001.csv +hydro_safe_llama2_7b_chat_dpo_full_3_epoch,hydrox_privacy,90.63,,hydrox_safety_241001.csv +hydro_safe_llama2_7b_chat_dpo_full_3_epoch,hydrox_safety,79.83,,hydrox_safety_241001.csv +hydro_safe_llama2_7b_chat_dpo_full_3_epoch,hydrox_security,84.68,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,hydrox_integrity,97.74,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,hydrox_overall_score,91.6,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,hydrox_privacy,96.21,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,hydrox_safety,86.56,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_instruct_v0_1_dpo_full_1_epoch,hydrox_security,91.35,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_v0_1_dpo_full,hydrox_integrity,98.16,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_v0_1_dpo_full,hydrox_overall_score,94.44,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_v0_1_dpo_full,hydrox_privacy,99.62,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_v0_1_dpo_full,hydrox_safety,89.41,,hydrox_safety_241001.csv +hydro_safe_mistral_7b_v0_1_dpo_full,hydrox_security,96.66,,hydrox_safety_241001.csv +hydro_safe_sheared_llama_1_3b_dpo_full,hydrox_integrity,35.98,,hydrox_safety_241001.csv +hydro_safe_sheared_llama_1_3b_dpo_full,hydrox_overall_score,31.87,,hydrox_safety_241001.csv +hydro_safe_sheared_llama_1_3b_dpo_full,hydrox_privacy,45.3,,hydrox_safety_241001.csv +hydro_safe_sheared_llama_1_3b_dpo_full,hydrox_safety,26.44,,hydrox_safety_241001.csv +hydro_safe_sheared_llama_1_3b_dpo_full,hydrox_security,27.07,,hydrox_safety_241001.csv +hydro_safe_zephyr_td_full,hydrox_integrity,71.25,,hydrox_safety_241001.csv +hydro_safe_zephyr_td_full,hydrox_overall_score,78.18,,hydrox_safety_241001.csv +hydro_safe_zephyr_td_full,hydrox_privacy,49.7,,hydrox_safety_241001.csv +hydro_safe_zephyr_td_full,hydrox_safety,78.18,,hydrox_safety_241001.csv +hydro_safe_zephyr_td_full,hydrox_security,66.63,,hydrox_safety_241001.csv +komt_mistral_7b_v1,hydrox_integrity,0.0,,hydrox_safety_241001.csv +komt_mistral_7b_v1,hydrox_overall_score,0.13,,hydrox_safety_241001.csv +komt_mistral_7b_v1,hydrox_privacy,0.02,,hydrox_safety_241001.csv +komt_mistral_7b_v1,hydrox_safety,0.65,,hydrox_safety_241001.csv +komt_mistral_7b_v1,hydrox_security,0.0,,hydrox_safety_241001.csv +llama3_2_1b_instruct,hydrox_integrity,76.98,,hydrox_safety_241001.csv +llama3_2_1b_instruct,hydrox_overall_score,75.78,,hydrox_safety_241001.csv +llama3_2_1b_instruct,hydrox_privacy,75.71,,hydrox_safety_241001.csv +llama3_2_1b_instruct,hydrox_safety,76.25,,hydrox_safety_241001.csv +llama3_2_1b_instruct,hydrox_security,74.2,,hydrox_safety_241001.csv +llama3_2_3b_instruct,hydrox_integrity,79.24,,hydrox_safety_241001.csv +llama3_2_3b_instruct,hydrox_overall_score,77.42,,hydrox_safety_241001.csv +llama3_2_3b_instruct,hydrox_privacy,77.9,,hydrox_safety_241001.csv +llama3_2_3b_instruct,hydrox_safety,79.46,,hydrox_safety_241001.csv +llama3_2_3b_instruct,hydrox_security,72.51,,hydrox_safety_241001.csv +llama3_70b_instruct,hydrox_integrity,73.55,,hydrox_safety_241001.csv +llama3_70b_instruct,hydrox_overall_score,74.44,,hydrox_safety_241001.csv +llama3_70b_instruct,hydrox_privacy,80.65,,hydrox_safety_241001.csv +llama3_70b_instruct,hydrox_safety,74.65,,hydrox_safety_241001.csv +llama3_70b_instruct,hydrox_security,70.21,,hydrox_safety_241001.csv +llama3_8b_instruct,hydrox_integrity,80.86,,hydrox_safety_241001.csv +llama3_8b_instruct,hydrox_overall_score,83.72,,hydrox_safety_241001.csv +llama3_8b_instruct,hydrox_privacy,88.61,,hydrox_safety_241001.csv +llama3_8b_instruct,hydrox_safety,83.32,,hydrox_safety_241001.csv +llama3_8b_instruct,hydrox_security,82.51,,hydrox_safety_241001.csv +llama_2_13b_chat,hydrox_integrity,62.67,,hydrox_safety_241001.csv +llama_2_13b_chat,hydrox_overall_score,60.0,,hydrox_safety_241001.csv +llama_2_13b_chat,hydrox_privacy,63.37,,hydrox_safety_241001.csv +llama_2_13b_chat,hydrox_safety,58.6,,hydrox_safety_241001.csv +llama_2_13b_chat,hydrox_security,57.85,,hydrox_safety_241001.csv +llama_2_70b_chat,hydrox_integrity,63.0,,hydrox_safety_241001.csv +llama_2_70b_chat,hydrox_overall_score,62.5,,hydrox_safety_241001.csv +llama_2_70b_chat,hydrox_privacy,68.87,,hydrox_safety_241001.csv +llama_2_70b_chat,hydrox_safety,61.0,,hydrox_safety_241001.csv +llama_2_70b_chat,hydrox_security,59.58,,hydrox_safety_241001.csv +llama_2_7b_chat,hydrox_integrity,51.63,,hydrox_safety_241001.csv +llama_2_7b_chat,hydrox_overall_score,51.26,,hydrox_safety_241001.csv +llama_2_7b_chat,hydrox_privacy,55.3,,hydrox_safety_241001.csv +llama_2_7b_chat,hydrox_safety,52.3,,hydrox_safety_241001.csv +llama_2_7b_chat,hydrox_security,46.71,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_1,hydrox_integrity,12.39,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_1,hydrox_overall_score,16.74,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_1,hydrox_privacy,12.08,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_1,hydrox_safety,26.91,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_1,hydrox_security,10.86,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_2,hydrox_integrity,32.52,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_2,hydrox_overall_score,36.82,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_2,hydrox_privacy,37.18,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_2,hydrox_safety,41.71,,hydrox_safety_241001.csv +mistral_7b_instruct_v0_2,hydrox_security,32.24,,hydrox_safety_241001.csv +mistral_7b_v0_1,hydrox_integrity,8.53,,hydrox_safety_241001.csv +mistral_7b_v0_1,hydrox_overall_score,7.32,,hydrox_safety_241001.csv +mistral_7b_v0_1,hydrox_privacy,4.18,,hydrox_safety_241001.csv +mistral_7b_v0_1,hydrox_safety,11.38,,hydrox_safety_241001.csv +mistral_7b_v0_1,hydrox_security,4.44,,hydrox_safety_241001.csv +mixtral_8x7b_instruct_v0_1,hydrox_integrity,21.23,,hydrox_safety_241001.csv +mixtral_8x7b_instruct_v0_1,hydrox_overall_score,23.75,,hydrox_safety_241001.csv +mixtral_8x7b_instruct_v0_1,hydrox_privacy,25.04,,hydrox_safety_241001.csv +mixtral_8x7b_instruct_v0_1,hydrox_safety,27.7,,hydrox_safety_241001.csv +mixtral_8x7b_instruct_v0_1,hydrox_security,18.24,,hydrox_safety_241001.csv +mixtral_8x7b_v0_1,hydrox_integrity,8.16,,hydrox_safety_241001.csv +mixtral_8x7b_v0_1,hydrox_overall_score,8.81,,hydrox_safety_241001.csv +mixtral_8x7b_v0_1,hydrox_privacy,8.81,,hydrox_safety_241001.csv +mixtral_8x7b_v0_1,hydrox_safety,10.61,,hydrox_safety_241001.csv +mixtral_8x7b_v0_1,hydrox_security,6.73,,hydrox_safety_241001.csv +neural_chat_7b_v3_1,hydrox_integrity,22.84,,hydrox_safety_241001.csv +neural_chat_7b_v3_1,hydrox_overall_score,17.86,,hydrox_safety_241001.csv +neural_chat_7b_v3_1,hydrox_privacy,22.28,,hydrox_safety_241001.csv +neural_chat_7b_v3_1,hydrox_safety,15.86,,hydrox_safety_241001.csv +neural_chat_7b_v3_1,hydrox_security,14.72,,hydrox_safety_241001.csv +neural_chat_7b_v3_2,hydrox_integrity,15.33,,hydrox_safety_241001.csv +neural_chat_7b_v3_2,hydrox_overall_score,17.82,,hydrox_safety_241001.csv +neural_chat_7b_v3_2,hydrox_privacy,14.36,,hydrox_safety_241001.csv +neural_chat_7b_v3_2,hydrox_safety,19.68,,hydrox_safety_241001.csv +neural_chat_7b_v3_2,hydrox_security,18.62,,hydrox_safety_241001.csv +nexusraven_v2_13b,hydrox_integrity,4.5,,hydrox_safety_241001.csv +nexusraven_v2_13b,hydrox_overall_score,4.16,,hydrox_safety_241001.csv +nexusraven_v2_13b,hydrox_privacy,3.13,,hydrox_safety_241001.csv +nexusraven_v2_13b,hydrox_safety,3.95,,hydrox_safety_241001.csv +nexusraven_v2_13b,hydrox_security,4.77,,hydrox_safety_241001.csv +notus_7b_v1,hydrox_integrity,19.5,,hydrox_safety_241001.csv +notus_7b_v1,hydrox_overall_score,21.3,,hydrox_safety_241001.csv +notus_7b_v1,hydrox_privacy,22.05,,hydrox_safety_241001.csv +notus_7b_v1,hydrox_safety,26.55,,hydrox_safety_241001.csv +notus_7b_v1,hydrox_security,15.53,,hydrox_safety_241001.csv +orca_2_13b,hydrox_integrity,0.0,,hydrox_safety_241001.csv +orca_2_13b,hydrox_overall_score,17.48,,hydrox_safety_241001.csv +orca_2_13b,hydrox_privacy,27.78,,hydrox_safety_241001.csv +orca_2_13b,hydrox_safety,33.06,,hydrox_safety_241001.csv +orca_2_13b,hydrox_security,0.0,,hydrox_safety_241001.csv +orca_2_7b,hydrox_integrity,22.09,,hydrox_safety_241001.csv +orca_2_7b,hydrox_overall_score,19.53,,hydrox_safety_241001.csv +orca_2_7b,hydrox_privacy,18.31,,hydrox_safety_241001.csv +orca_2_7b,hydrox_safety,18.3,,hydrox_safety_241001.csv +orca_2_7b,hydrox_security,20.52,,hydrox_safety_241001.csv +pythia_70m_deduped,hydrox_integrity,0.0,,hydrox_safety_241001.csv +pythia_70m_deduped,hydrox_overall_score,0.0,,hydrox_safety_241001.csv +pythia_70m_deduped,hydrox_privacy,0.0,,hydrox_safety_241001.csv +pythia_70m_deduped,hydrox_safety,0.0,,hydrox_safety_241001.csv +pythia_70m_deduped,hydrox_security,0.0,,hydrox_safety_241001.csv +qwen2_72b_instruct,hydrox_integrity,70.13,,hydrox_safety_241001.csv +qwen2_72b_instruct,hydrox_overall_score,71.86,,hydrox_safety_241001.csv +qwen2_72b_instruct,hydrox_privacy,73.4,,hydrox_safety_241001.csv +qwen2_72b_instruct,hydrox_safety,77.1,,hydrox_safety_241001.csv +qwen2_72b_instruct,hydrox_security,65.19,,hydrox_safety_241001.csv +sheared_llama_1_3b,hydrox_integrity,0.04,,hydrox_safety_241001.csv +sheared_llama_1_3b,hydrox_overall_score,0.29,,hydrox_safety_241001.csv +sheared_llama_1_3b,hydrox_privacy,0.05,,hydrox_safety_241001.csv +sheared_llama_1_3b,hydrox_safety,1.14,,hydrox_safety_241001.csv +sheared_llama_1_3b,hydrox_security,0.03,,hydrox_safety_241001.csv +solar_0_70b_16bit,hydrox_integrity,30.25,,hydrox_safety_241001.csv +solar_0_70b_16bit,hydrox_overall_score,24.5,,hydrox_safety_241001.csv +solar_0_70b_16bit,hydrox_privacy,33.8,,hydrox_safety_241001.csv +solar_0_70b_16bit,hydrox_safety,22.4,,hydrox_safety_241001.csv +solar_0_70b_16bit,hydrox_security,17.55,,hydrox_safety_241001.csv +tinyllama_1_1b_chat_v1_0,hydrox_integrity,5.65,,hydrox_safety_241001.csv +tinyllama_1_1b_chat_v1_0,hydrox_overall_score,5.38,,hydrox_safety_241001.csv +tinyllama_1_1b_chat_v1_0,hydrox_privacy,3.3,,hydrox_safety_241001.csv +tinyllama_1_1b_chat_v1_0,hydrox_safety,6.87,,hydrox_safety_241001.csv +tinyllama_1_1b_chat_v1_0,hydrox_security,4.57,,hydrox_safety_241001.csv +vicuna_13b_v1_5,hydrox_integrity,36.08,,hydrox_safety_241001.csv +vicuna_13b_v1_5,hydrox_overall_score,34.07,,hydrox_safety_241001.csv +vicuna_13b_v1_5,hydrox_privacy,29.78,,hydrox_safety_241001.csv +vicuna_13b_v1_5,hydrox_safety,38.46,,hydrox_safety_241001.csv +vicuna_13b_v1_5,hydrox_security,30.71,,hydrox_safety_241001.csv +vicuna_13b_v1_5_16k,hydrox_integrity,22.25,,hydrox_safety_241001.csv +vicuna_13b_v1_5_16k,hydrox_overall_score,19.31,,hydrox_safety_241001.csv +vicuna_13b_v1_5_16k,hydrox_privacy,17.01,,hydrox_safety_241001.csv +vicuna_13b_v1_5_16k,hydrox_safety,21.14,,hydrox_safety_241001.csv +vicuna_13b_v1_5_16k,hydrox_security,16.99,,hydrox_safety_241001.csv +vicuna_33b_v1_3,hydrox_integrity,18.64,,hydrox_safety_241001.csv +vicuna_33b_v1_3,hydrox_overall_score,17.64,,hydrox_safety_241001.csv +vicuna_33b_v1_3,hydrox_privacy,21.34,,hydrox_safety_241001.csv +vicuna_33b_v1_3,hydrox_safety,18.42,,hydrox_safety_241001.csv +vicuna_33b_v1_3,hydrox_security,13.89,,hydrox_safety_241001.csv +vicuna_7b_v1_5,hydrox_integrity,11.74,,hydrox_safety_241001.csv +vicuna_7b_v1_5,hydrox_overall_score,15.37,,hydrox_safety_241001.csv +vicuna_7b_v1_5,hydrox_privacy,10.91,,hydrox_safety_241001.csv +vicuna_7b_v1_5,hydrox_safety,22.47,,hydrox_safety_241001.csv +vicuna_7b_v1_5,hydrox_security,12.61,,hydrox_safety_241001.csv +viking_13b,hydrox_integrity,7.68,,hydrox_safety_241001.csv +viking_13b,hydrox_overall_score,7.33,,hydrox_safety_241001.csv +viking_13b,hydrox_privacy,8.32,,hydrox_safety_241001.csv +viking_13b,hydrox_safety,7.75,,hydrox_safety_241001.csv +viking_13b,hydrox_security,5.76,,hydrox_safety_241001.csv +viking_33b,hydrox_integrity,6.38,,hydrox_safety_241001.csv +viking_33b,hydrox_overall_score,6.73,,hydrox_safety_241001.csv +viking_33b,hydrox_privacy,6.48,,hydrox_safety_241001.csv +viking_33b,hydrox_safety,6.87,,hydrox_safety_241001.csv +viking_33b,hydrox_security,6.92,,hydrox_safety_241001.csv +viking_7b,hydrox_integrity,9.05,,hydrox_safety_241001.csv +viking_7b,hydrox_overall_score,6.15,,hydrox_safety_241001.csv +viking_7b,hydrox_privacy,3.91,,hydrox_safety_241001.csv +viking_7b,hydrox_safety,5.37,,hydrox_safety_241001.csv +viking_7b,hydrox_security,7.6,,hydrox_safety_241001.csv +wizardlm_30b_v1_0,hydrox_integrity,5.58,,hydrox_safety_241001.csv +wizardlm_30b_v1_0,hydrox_overall_score,6.41,,hydrox_safety_241001.csv +wizardlm_30b_v1_0,hydrox_privacy,3.88,,hydrox_safety_241001.csv +wizardlm_30b_v1_0,hydrox_safety,8.0,,hydrox_safety_241001.csv +wizardlm_30b_v1_0,hydrox_security,6.49,,hydrox_safety_241001.csv +yi_6b_chat,hydrox_integrity,36.02,,hydrox_safety_241001.csv +yi_6b_chat,hydrox_overall_score,37.0,,hydrox_safety_241001.csv +yi_6b_chat,hydrox_privacy,45.36,,hydrox_safety_241001.csv +yi_6b_chat,hydrox_safety,37.35,,hydrox_safety_241001.csv +yi_6b_chat,hydrox_security,31.49,,hydrox_safety_241001.csv +zephyr_7b_beta,hydrox_integrity,24.95,,hydrox_safety_241001.csv +zephyr_7b_beta,hydrox_overall_score,23.8,,hydrox_safety_241001.csv +zephyr_7b_beta,hydrox_privacy,30.6,,hydrox_safety_241001.csv +zephyr_7b_beta,hydrox_safety,21.2,,hydrox_safety_241001.csv +zephyr_7b_beta,hydrox_security,22.4,,hydrox_safety_241001.csv +zephyr_reproduction_dpo_full,hydrox_integrity,26.05,,hydrox_safety_241001.csv +zephyr_reproduction_dpo_full,hydrox_overall_score,21.38,,hydrox_safety_241001.csv +zephyr_reproduction_dpo_full,hydrox_privacy,21.65,,hydrox_safety_241001.csv +zephyr_reproduction_dpo_full,hydrox_safety,19.35,,hydrox_safety_241001.csv +zephyr_reproduction_dpo_full,hydrox_security,21.22,,hydrox_safety_241001.csv +zephyr_reproduction_sft_full,hydrox_integrity,13.61,,hydrox_safety_241001.csv +zephyr_reproduction_sft_full,hydrox_overall_score,13.1,,hydrox_safety_241001.csv +zephyr_reproduction_sft_full,hydrox_privacy,14.94,,hydrox_safety_241001.csv +zephyr_reproduction_sft_full,hydrox_safety,14.92,,hydrox_safety_241001.csv +zephyr_reproduction_sft_full,hydrox_security,9.5,,hydrox_safety_241001.csv +alpaca_7b,aggregate,0.22072072072072071,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +athene_70b,aggregate,0.8493788819875776,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +claude_2_0,aggregate,0.6020066889632107,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +claude_2_1,aggregate,0.5110980545763154,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +claude_3_5_sonnet_20240620,aggregate,0.982905982905983,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +claude_3_haiku_20240307,aggregate,0.549424005945745,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +claude_3_opus_20240229,aggregate,0.8573567665639277,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +claude_3_sonnet_20240229,aggregate,0.653911731916847,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +claude_instant_1_2,aggregate,0.6049896049896051,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +command_r,aggregate,0.32386140074759,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +command_r_plus,aggregate,0.5761033510394125,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +dbrx_instruct,aggregate,0.4266409266409266,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +dbrx_instructruct,aggregate,0.5344129554655871,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +deepseek_coder_v2,aggregate,0.8444160272804775,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +deepseek_llm_67b_chat,aggregate,0.5506756756756757,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +falcon_40b,aggregate,0.32812265707002547,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +falcon_40b_instruct,aggregate,0.13264580369843526,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +falcon_7b,aggregate,0.11407257459889038,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +falcon_7b_instruct,aggregate,0.013513513513513514,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gemma_1_1_2b_it,aggregate,0.07665903890160183,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gemma_1_1_7b_it,aggregate,0.26226051061156724,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gemma_2_27b_it,aggregate,0.8045273029120115,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gemma_2_9b_it,aggregate,0.6422797189051059,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gemma_2_9b_it_dpo,aggregate,0.790057915057915,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gemma_2_9b_it_simpo,aggregate,0.7199248120300753,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gemma_2b_it,aggregate,0.05921052631578947,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gemma_7b,aggregate,0.4471997300944669,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gemma_7b_it,aggregate,0.12136319058515854,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +glm_4_9b_chat,aggregate,0.46499582289055974,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gpt_3_5_turbo_0125,aggregate,0.4401920188365201,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gpt_3_5_turbo_0301,aggregate,0.4528985507246377,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gpt_3_5_turbo_0613,aggregate,0.5724018332713985,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gpt_4_0125_preview,aggregate,0.9171132221004344,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gpt_4_0613,aggregate,0.8146763722211293,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gpt_4_turbo_2024_04_09,aggregate,0.9428463693169576,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gpt_4o_2024_05_13,aggregate,0.9847612958226769,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gpt_4o_2024_08_06,aggregate,0.9575873827791986,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gpt_4o_mini_2024_07_18,aggregate,0.8032033326150972,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gpt_j_6b,aggregate,0.10160818713450293,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +gpt_neox_20b,aggregate,0.14400584795321636,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +hermes_3_llama3_1_70b,aggregate,0.8626160990712074,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +infinity_instruct_3m_0625_llama3_8b,aggregate,0.6273115220483642,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +internlm2_5_20b_chat,aggregate,0.6842105263157895,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +internlm2_chat_20b,aggregate,0.32252252252252256,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +jurassic_2_grande_17b,aggregate,0.39529914529914534,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +jurassic_2_jumbo_178b,aggregate,0.532051282051282,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +llama3_1_405b_instruct,aggregate,0.8672150411280846,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +llama3_1_70b_instruct,aggregate,0.8528408270971201,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +llama3_1_8b_instruct,aggregate,0.5175232440678665,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +llama3_70b,aggregate,0.8105600539811066,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +llama3_70b_instruct,aggregate,0.8127546753337573,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +llama3_8b,aggregate,0.43302968960863697,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +llama3_8b_instruct,aggregate,0.420135922511747,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +llama3_instruct_8b_simpo,aggregate,0.7884068278805121,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +llama_2_13b,aggregate,0.41490478332583597,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +llama_2_70b,aggregate,0.7303193882141251,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +llama_2_70b_chat,aggregate,0.15527950310559005,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +llama_2_7b,aggregate,0.2391288049182786,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +llama_2_7b_chat,aggregate,0.08304448781801049,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +llama_65b,aggregate,0.5736992052781527,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +luminous_base_13b,aggregate,0.08333333333333333,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +luminous_extended_30b,aggregate,0.2329059829059829,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +luminous_supreme_70b,aggregate,0.30128205128205127,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +mistral_7b_instruct_v0_2,aggregate,0.28609513981031004,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +mistral_7b_instruct_v0_3,aggregate,0.2537839697282422,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +mistral_7b_v0_2,aggregate,0.31970128022759603,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +mistral_7b_v0_3,aggregate,0.3737553342816501,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +mistral_large_2402,aggregate,0.6058211467418628,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +mistral_large_2407,aggregate,0.8868286445012787,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +mistral_medium,aggregate,0.6122209165687427,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +mistral_small_2402,aggregate,0.49924585218702866,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +mistral_v0_1_7b,aggregate,0.6239316239316239,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +mixtral_8x22b_instruct_v0_1,aggregate,0.7256023690940907,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +mixtral_8x22b_v0_1,aggregate,0.7135490753911806,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +mixtral_8x7b_instruct_v0_1,aggregate,0.3713078251895724,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +mixtral_8x7b_v0_1,aggregate,0.49324324324324326,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +olmo_7b,aggregate,0.06545209176788123,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +openhermes_2_5_mistral_7b,aggregate,0.3832617447168531,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +phi_2,aggregate,0.20087901666849037,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +phi_3_5_mini_instruct,aggregate,0.6202270381836945,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +phi_3_5_moe_instruct,aggregate,0.7808307533539731,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +phi_3_medium_4k_instruct,aggregate,0.6675079642841117,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +phi_3_mini_128k_instruct,aggregate,0.4153205904787544,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +phi_3_mini_4k_instruct,aggregate,0.5548245614035088,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +phi_3_small_128k_instruct,aggregate,0.66937564499484,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +phi_3_small_8k_instruct,aggregate,0.45481670929241264,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +pythia_12b,aggregate,0.054093567251461985,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +pythia_6_9b,aggregate,0.019736842105263157,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +qwen1_5_0_5b_chat,aggregate,0.013157894736842105,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +qwen1_5_110b_chat,aggregate,0.776004448721167,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +qwen1_5_14b,aggregate,0.5770917678812416,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +qwen1_5_14b_chat,aggregate,0.4621068436857911,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +qwen1_5_1_8b_chat,aggregate,0.059167526659786716,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +qwen1_5_32b,aggregate,0.7658569500674763,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +qwen1_5_32b_chat,aggregate,0.7149122807017544,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +qwen1_5_4b_chat,aggregate,0.1674406604747162,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +qwen1_5_72b_chat,aggregate,0.5668371367348349,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +qwen1_5_7b,aggregate,0.3508771929824561,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +qwen1_5_7b_chat,aggregate,0.1916569245052217,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +qwen2_0_5b_instruct,aggregate,0.059081527347781215,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +qwen2_1_5b_instruct,aggregate,0.19711042311661506,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +qwen2_72b_instruct,aggregate,0.8354710666091739,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +qwen2_7b_instruct,aggregate,0.5034227726178191,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +smaug_qwen2_72b_instruct,aggregate,0.8593911248710011,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +snorkel_mistral_pairrm_dpo,aggregate,0.4521151586368978,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +starling_lm_7b_alpha,aggregate,0.29823530624445954,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +starling_lm_7b_beta,aggregate,0.25234441602728047,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +tulu_2_dpo_70b,aggregate,0.17624223602484473,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +vicuna_33b_v1_3,aggregate,0.2056404230317274,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +vicuna_7b_v1_5,aggregate,0.13619501854795973,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +yi_1_5_34b_chat,aggregate,0.7553884711779449,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +yi_1_5_6b_chat,aggregate,0.3354636591478697,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +yi_1_5_9b_chat,aggregate,0.5881787802840435,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +yi_34b,aggregate,0.7128879892037787,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +yi_34b_chat,aggregate,0.5455449728905107,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +yi_6b,aggregate,0.29234143049932526,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +yi_6b_chat,aggregate,0.1938854489164087,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +yi_large,aggregate,0.8346273291925466,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +yi_large_preview,aggregate,0.8641553641553642,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +zephyr_7b_alpha,aggregate,0.2838442157327606,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate +zephyr_7b_beta,aggregate,0.2666234345800909,"['Holmes', 'eureka_information_retrieval_fact_recall', 'eureka_information_retrieval_fact_precision', 'eureka_instruction_following', 'eureka_long_context_qa_average', 'eureka_long_context_qa_longest_context_3k', 'eureka_toxicity_detection', 'Helm Lite', 'Helm Lite NarrativeQA', 'Helm Lite NaturalQuestionsOpen', 'Helm Lite NaturalQuestionsClosed', 'Helm Lite OpenBookQA', 'Helm Lite MMLU', 'Helm Lite MathEquivalentCOT', 'Helm Lite GSM8K', 'Helm Lite LegalBench', 'Helm Lite MedQA', 'Helm Lite WMT2014', 'HF OpenLLM v2', 'HFv2 BBH', 'HFv2 GPQA', 'HFv2 IFEval', 'HFv2 MMLU Pro', 'HFv2 Math Level 5', 'HFv2 MuSR', 'tablebench_overall_dp', 'trustworthy_average', 'trustworthy_non_toxicity', 'trustworthy_non_stereotype', 'trustworthy_advglue_pp', 'trustworthy_ood', 'trustworthy_adv_demo', 'trustworthy_privacy', 'trustworthy_ethics', 'trustworthy_fairness', 'OpenCompass Academic', 'OpenCompass MMLU', 'OpenCompass MMLU Pro', 'OpenCompass CMMLU', 'OpenCompass BBH', 'OpenCompass GQPA-Dimand', 'OpenCompass Math', 'OpenCompass HumanEval', 'OpenCompass IFEval', 'Helm MMLU', 'LMSys Arena', 'Helm Classic', 'Helm BoolQ', 'Helm NarrativeQA', 'Helm NaturalQuestionsClosed', 'Helm NaturalQuestionsOpen', 'Helm QuAC', 'helm_hellaswag', 'Helm OpenBookQA', 'helm_truthfulqa', 'Helm MSMARCO Regular', 'Helm MSMARCO Trec', 'helm_cnn/dailymail', 'Helm XSUM', 'Helm IMDB', 'Helm CivilComments', 'Helm RAFT', 'MMLU Pro', 'MixEval', 'MixEval Hard', 'MixEval TriviaQA', 'MixEval MMLU', 'MixEval DROP', 'MixEval HellaSwag', 'MixEval CommonsenseQA', 'MixEval TriviaQA Hard', 'MixEval MMLU Hard', 'MixEval DROP Hard', 'toolbench', 'AlphacaEval v2lc', 'HELM AirBench Security Risks', 'HELM AirBench Operational Misuses', 'HELM AirBench Violence & Extremism', 'HELM AirBench Hate/Toxicity', 'HELM AirBench Sexual Content', 'HELM AirBench Child Harm', 'HELM AirBench Self Harm', 'HELM AirBench Political Usage', 'HELM AirBench Economic Harm', 'HELM AirBench Deception', 'HELM AirBench Manipulation', 'HELM AirBench Defamation', 'HELM AirBench Fundamental Rights', 'HELM AirBench Discrimination/Bias', 'HELM AirBench Privacy', 'HELM AirBench Criminal Activities', 'HELM AirBench AIR Score', 'OpenCompass', 'OpenCompass Language', 'OpenCompass Knowledge', 'OpenCompass Reasoning', 'OpenCompass Code', 'OpenCompass Instruction', 'OpenCompass Agent', 'OpenCompass Arena', 'LiveBench 240725', 'LiveBench Reasoning', 'LiveBench Coding', 'LiveBench Mathematics', 'LiveBench Data Analysis', 'LiveBench Language', 'LiveBench Instruction Following', 'Enkrypt AI Safety', 'WildBench Elo LC', 'WildBench Information Seeking', 'WildBench Creative', 'WildBench Code Debugging', 'WildBench Math & Data', 'WildBench Reasoning & Planning', 'WildBench Score', 'Decentralized Arena (0-1 Normalized)', 'Arena Hard', 'AgentBench', 'MT-Bench', 'HF OpenLLM v1', 'HFv1 ARC', 'HFv1 GSM8K', 'HFv1 HellaSwag', 'HFv1 MMLU', 'HFv1 TruthfulQA', 'HFv1 Winogrande', 'BFCL', 'eq_bench', 'magi_hard', 'BIGGEN', 'BIGGEN Grounding', 'BIGGEN Instruction Following', 'BIGGEN Planning', 'BIGGEN Reasoning', 'BIGGEN Refinement', 'BIGGEN Safety', 'BIGGEN Theory of Mind', 'BIGGEN Tool Usage', 'BIGGEN Multilingual', 'ruler', 'LiveBench 240624', 'LiveBench Reasoning Average', 'LiveBench Coding Average', 'LiveBench Mathematics Average', 'LiveBench Data Analysis Average', 'LiveBench Language Average', 'LiveBench Instruction Following Average', 'hydrox_integrity', 'hydrox_overall_score', 'hydrox_privacy', 'hydrox_safety', 'hydrox_security']",aggregate