{ "results": { "hendrycksTest-abstract_algebra": { "acc": 0.34, "acc_stderr": 0.04760952285695236, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236 }, "hendrycksTest-anatomy": { "acc": 0.45925925925925926, "acc_stderr": 0.04304979692464242, "acc_norm": 0.45925925925925926, "acc_norm_stderr": 0.04304979692464242 }, "hendrycksTest-astronomy": { "acc": 0.40131578947368424, "acc_stderr": 0.039889037033362836, "acc_norm": 0.40131578947368424, "acc_norm_stderr": 0.039889037033362836 }, "hendrycksTest-business_ethics": { "acc": 0.49, "acc_stderr": 0.05024183937956912, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956912 }, "hendrycksTest-clinical_knowledge": { "acc": 0.44150943396226416, "acc_stderr": 0.030561590426731844, "acc_norm": 0.44150943396226416, "acc_norm_stderr": 0.030561590426731844 }, "hendrycksTest-college_biology": { "acc": 0.4513888888888889, "acc_stderr": 0.04161402398403279, "acc_norm": 0.4513888888888889, "acc_norm_stderr": 0.04161402398403279 }, "hendrycksTest-college_chemistry": { "acc": 0.35, "acc_stderr": 0.0479372485441102, "acc_norm": 0.35, "acc_norm_stderr": 0.0479372485441102 }, "hendrycksTest-college_computer_science": { "acc": 0.36, "acc_stderr": 0.04824181513244218, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "hendrycksTest-college_mathematics": { "acc": 0.35, "acc_stderr": 0.047937248544110196, "acc_norm": 0.35, "acc_norm_stderr": 0.047937248544110196 }, "hendrycksTest-college_medicine": { "acc": 0.42196531791907516, "acc_stderr": 0.0376574669386515, "acc_norm": 0.42196531791907516, "acc_norm_stderr": 0.0376574669386515 }, "hendrycksTest-college_physics": { "acc": 0.23529411764705882, "acc_stderr": 0.04220773659171453, "acc_norm": 0.23529411764705882, "acc_norm_stderr": 0.04220773659171453 }, "hendrycksTest-computer_security": { "acc": 0.59, "acc_stderr": 0.049431107042371025, "acc_norm": 0.59, "acc_norm_stderr": 0.049431107042371025 }, "hendrycksTest-conceptual_physics": { "acc": 0.4340425531914894, "acc_stderr": 0.03240038086792747, "acc_norm": 0.4340425531914894, "acc_norm_stderr": 0.03240038086792747 }, "hendrycksTest-econometrics": { "acc": 0.32456140350877194, "acc_stderr": 0.04404556157374768, "acc_norm": 0.32456140350877194, "acc_norm_stderr": 0.04404556157374768 }, "hendrycksTest-electrical_engineering": { "acc": 0.3931034482758621, "acc_stderr": 0.0407032901370707, "acc_norm": 0.3931034482758621, "acc_norm_stderr": 0.0407032901370707 }, "hendrycksTest-elementary_mathematics": { "acc": 0.28835978835978837, "acc_stderr": 0.0233306540545359, "acc_norm": 0.28835978835978837, "acc_norm_stderr": 0.0233306540545359 }, "hendrycksTest-formal_logic": { "acc": 0.30158730158730157, "acc_stderr": 0.04104947269903394, "acc_norm": 0.30158730158730157, "acc_norm_stderr": 0.04104947269903394 }, "hendrycksTest-global_facts": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "hendrycksTest-high_school_biology": { "acc": 0.47419354838709676, "acc_stderr": 0.02840609505765332, "acc_norm": 0.47419354838709676, "acc_norm_stderr": 0.02840609505765332 }, "hendrycksTest-high_school_chemistry": { "acc": 0.3448275862068966, "acc_stderr": 0.03344283744280458, "acc_norm": 0.3448275862068966, "acc_norm_stderr": 0.03344283744280458 }, "hendrycksTest-high_school_computer_science": { "acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, "hendrycksTest-high_school_european_history": { "acc": 0.6060606060606061, "acc_stderr": 0.03815494308688931, 
"acc_norm": 0.6060606060606061, "acc_norm_stderr": 0.03815494308688931 }, "hendrycksTest-high_school_geography": { "acc": 0.5505050505050505, "acc_stderr": 0.035441324919479704, "acc_norm": 0.5505050505050505, "acc_norm_stderr": 0.035441324919479704 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.689119170984456, "acc_stderr": 0.03340361906276585, "acc_norm": 0.689119170984456, "acc_norm_stderr": 0.03340361906276585 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.4282051282051282, "acc_stderr": 0.025088301454694834, "acc_norm": 0.4282051282051282, "acc_norm_stderr": 0.025088301454694834 }, "hendrycksTest-high_school_mathematics": { "acc": 0.2851851851851852, "acc_stderr": 0.027528599210340496, "acc_norm": 0.2851851851851852, "acc_norm_stderr": 0.027528599210340496 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.38235294117647056, "acc_stderr": 0.03156663099215416, "acc_norm": 0.38235294117647056, "acc_norm_stderr": 0.03156663099215416 }, "hendrycksTest-high_school_physics": { "acc": 0.3509933774834437, "acc_stderr": 0.03896981964257375, "acc_norm": 0.3509933774834437, "acc_norm_stderr": 0.03896981964257375 }, "hendrycksTest-high_school_psychology": { "acc": 0.6146788990825688, "acc_stderr": 0.02086585085279412, "acc_norm": 0.6146788990825688, "acc_norm_stderr": 0.02086585085279412 }, "hendrycksTest-high_school_statistics": { "acc": 0.25462962962962965, "acc_stderr": 0.02971127586000535, "acc_norm": 0.25462962962962965, "acc_norm_stderr": 0.02971127586000535 }, "hendrycksTest-high_school_us_history": { "acc": 0.5490196078431373, "acc_stderr": 0.03492406104163613, "acc_norm": 0.5490196078431373, "acc_norm_stderr": 0.03492406104163613 }, "hendrycksTest-high_school_world_history": { "acc": 0.6244725738396625, "acc_stderr": 0.03152256243091156, "acc_norm": 0.6244725738396625, "acc_norm_stderr": 0.03152256243091156 }, "hendrycksTest-human_aging": { "acc": 0.5381165919282511, "acc_stderr": 0.033460150119732274, "acc_norm": 0.5381165919282511, "acc_norm_stderr": 0.033460150119732274 }, "hendrycksTest-human_sexuality": { "acc": 0.5343511450381679, "acc_stderr": 0.043749285605997376, "acc_norm": 0.5343511450381679, "acc_norm_stderr": 0.043749285605997376 }, "hendrycksTest-international_law": { "acc": 0.6033057851239669, "acc_stderr": 0.044658697805310094, "acc_norm": 0.6033057851239669, "acc_norm_stderr": 0.044658697805310094 }, "hendrycksTest-jurisprudence": { "acc": 0.49074074074074076, "acc_stderr": 0.04832853553437055, "acc_norm": 0.49074074074074076, "acc_norm_stderr": 0.04832853553437055 }, "hendrycksTest-logical_fallacies": { "acc": 0.48466257668711654, "acc_stderr": 0.039265223787088424, "acc_norm": 0.48466257668711654, "acc_norm_stderr": 0.039265223787088424 }, "hendrycksTest-machine_learning": { "acc": 0.375, "acc_stderr": 0.04595091388086298, "acc_norm": 0.375, "acc_norm_stderr": 0.04595091388086298 }, "hendrycksTest-management": { "acc": 0.5145631067961165, "acc_stderr": 0.049486373240266356, "acc_norm": 0.5145631067961165, "acc_norm_stderr": 0.049486373240266356 }, "hendrycksTest-marketing": { "acc": 0.6837606837606838, "acc_stderr": 0.03046365674734027, "acc_norm": 0.6837606837606838, "acc_norm_stderr": 0.03046365674734027 }, "hendrycksTest-medical_genetics": { "acc": 0.53, "acc_stderr": 0.05016135580465919, "acc_norm": 0.53, "acc_norm_stderr": 0.05016135580465919 }, "hendrycksTest-miscellaneous": { "acc": 0.6232439335887612, "acc_stderr": 0.01732829290730305, "acc_norm": 0.6232439335887612, "acc_norm_stderr": 0.01732829290730305 }, 
"hendrycksTest-moral_disputes": { "acc": 0.49421965317919075, "acc_stderr": 0.026917296179149116, "acc_norm": 0.49421965317919075, "acc_norm_stderr": 0.026917296179149116 }, "hendrycksTest-moral_scenarios": { "acc": 0.2446927374301676, "acc_stderr": 0.014378169884098435, "acc_norm": 0.2446927374301676, "acc_norm_stderr": 0.014378169884098435 }, "hendrycksTest-nutrition": { "acc": 0.4673202614379085, "acc_stderr": 0.02856869975222588, "acc_norm": 0.4673202614379085, "acc_norm_stderr": 0.02856869975222588 }, "hendrycksTest-philosophy": { "acc": 0.5884244372990354, "acc_stderr": 0.027950481494401262, "acc_norm": 0.5884244372990354, "acc_norm_stderr": 0.027950481494401262 }, "hendrycksTest-prehistory": { "acc": 0.5, "acc_stderr": 0.02782074420373286, "acc_norm": 0.5, "acc_norm_stderr": 0.02782074420373286 }, "hendrycksTest-professional_accounting": { "acc": 0.3475177304964539, "acc_stderr": 0.028406627809590954, "acc_norm": 0.3475177304964539, "acc_norm_stderr": 0.028406627809590954 }, "hendrycksTest-professional_law": { "acc": 0.3683181225554107, "acc_stderr": 0.012319403369564639, "acc_norm": 0.3683181225554107, "acc_norm_stderr": 0.012319403369564639 }, "hendrycksTest-professional_medicine": { "acc": 0.5330882352941176, "acc_stderr": 0.03030625772246832, "acc_norm": 0.5330882352941176, "acc_norm_stderr": 0.03030625772246832 }, "hendrycksTest-professional_psychology": { "acc": 0.42320261437908496, "acc_stderr": 0.019987809769482064, "acc_norm": 0.42320261437908496, "acc_norm_stderr": 0.019987809769482064 }, "hendrycksTest-public_relations": { "acc": 0.5363636363636364, "acc_stderr": 0.04776449162396197, "acc_norm": 0.5363636363636364, "acc_norm_stderr": 0.04776449162396197 }, "hendrycksTest-security_studies": { "acc": 0.42448979591836733, "acc_stderr": 0.031642094879429414, "acc_norm": 0.42448979591836733, "acc_norm_stderr": 0.031642094879429414 }, "hendrycksTest-sociology": { "acc": 0.5621890547263682, "acc_stderr": 0.035080801121998406, "acc_norm": 0.5621890547263682, "acc_norm_stderr": 0.035080801121998406 }, "hendrycksTest-us_foreign_policy": { "acc": 0.69, "acc_stderr": 0.04648231987117316, "acc_norm": 0.69, "acc_norm_stderr": 0.04648231987117316 }, "hendrycksTest-virology": { "acc": 0.3855421686746988, "acc_stderr": 0.037891344246115496, "acc_norm": 0.3855421686746988, "acc_norm_stderr": 0.037891344246115496 }, "hendrycksTest-world_religions": { "acc": 0.6666666666666666, "acc_stderr": 0.036155076303109365, "acc_norm": 0.6666666666666666, "acc_norm_stderr": 0.036155076303109365 } }, "versions": { "hendrycksTest-abstract_algebra": 1, "hendrycksTest-anatomy": 1, "hendrycksTest-astronomy": 1, "hendrycksTest-business_ethics": 1, "hendrycksTest-clinical_knowledge": 1, "hendrycksTest-college_biology": 1, "hendrycksTest-college_chemistry": 1, "hendrycksTest-college_computer_science": 1, "hendrycksTest-college_mathematics": 1, "hendrycksTest-college_medicine": 1, "hendrycksTest-college_physics": 1, "hendrycksTest-computer_security": 1, "hendrycksTest-conceptual_physics": 1, "hendrycksTest-econometrics": 1, "hendrycksTest-electrical_engineering": 1, "hendrycksTest-elementary_mathematics": 1, "hendrycksTest-formal_logic": 1, "hendrycksTest-global_facts": 1, "hendrycksTest-high_school_biology": 1, "hendrycksTest-high_school_chemistry": 1, "hendrycksTest-high_school_computer_science": 1, "hendrycksTest-high_school_european_history": 1, "hendrycksTest-high_school_geography": 1, "hendrycksTest-high_school_government_and_politics": 1, "hendrycksTest-high_school_macroeconomics": 1, 
"hendrycksTest-high_school_mathematics": 1, "hendrycksTest-high_school_microeconomics": 1, "hendrycksTest-high_school_physics": 1, "hendrycksTest-high_school_psychology": 1, "hendrycksTest-high_school_statistics": 1, "hendrycksTest-high_school_us_history": 1, "hendrycksTest-high_school_world_history": 1, "hendrycksTest-human_aging": 1, "hendrycksTest-human_sexuality": 1, "hendrycksTest-international_law": 1, "hendrycksTest-jurisprudence": 1, "hendrycksTest-logical_fallacies": 1, "hendrycksTest-machine_learning": 1, "hendrycksTest-management": 1, "hendrycksTest-marketing": 1, "hendrycksTest-medical_genetics": 1, "hendrycksTest-miscellaneous": 1, "hendrycksTest-moral_disputes": 1, "hendrycksTest-moral_scenarios": 1, "hendrycksTest-nutrition": 1, "hendrycksTest-philosophy": 1, "hendrycksTest-prehistory": 1, "hendrycksTest-professional_accounting": 1, "hendrycksTest-professional_law": 1, "hendrycksTest-professional_medicine": 1, "hendrycksTest-professional_psychology": 1, "hendrycksTest-public_relations": 1, "hendrycksTest-security_studies": 1, "hendrycksTest-sociology": 1, "hendrycksTest-us_foreign_policy": 1, "hendrycksTest-virology": 1, "hendrycksTest-world_religions": 1 }, "config": { "model": "hf-causal-experimental", "model_args": "pretrained=/home/vmagent/app/data/Llama-2-7b-hf,peft=/home/vmagent/app/data/llama-2-delta-tune-model-div-sal,use_accelerate=True,delta=/home/vmagent/app/data/llama-2-delta-tune-model-div-sal/best_model_structure.txt", "num_fewshot": 5, "batch_size": "auto", "batch_sizes": [ 8 ], "device": null, "no_cache": false, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }