{
  "results": {
    "hendrycksTest-abstract_algebra": {
      "acc": 0.34,
      "acc_stderr": 0.04760952285695236,
      "acc_norm": 0.34,
      "acc_norm_stderr": 0.04760952285695236
    },
    "hendrycksTest-anatomy": {
      "acc": 0.45925925925925926,
      "acc_stderr": 0.04304979692464242,
      "acc_norm": 0.45925925925925926,
      "acc_norm_stderr": 0.04304979692464242
    },
    "hendrycksTest-astronomy": {
      "acc": 0.40131578947368424,
      "acc_stderr": 0.039889037033362836,
      "acc_norm": 0.40131578947368424,
      "acc_norm_stderr": 0.039889037033362836
    },
    "hendrycksTest-business_ethics": {
      "acc": 0.49,
      "acc_stderr": 0.05024183937956912,
      "acc_norm": 0.49,
      "acc_norm_stderr": 0.05024183937956912
    },
    "hendrycksTest-clinical_knowledge": {
      "acc": 0.44150943396226416,
      "acc_stderr": 0.030561590426731844,
      "acc_norm": 0.44150943396226416,
      "acc_norm_stderr": 0.030561590426731844
    },
    "hendrycksTest-college_biology": {
      "acc": 0.4513888888888889,
      "acc_stderr": 0.04161402398403279,
      "acc_norm": 0.4513888888888889,
      "acc_norm_stderr": 0.04161402398403279
    },
    "hendrycksTest-college_chemistry": {
      "acc": 0.35,
      "acc_stderr": 0.0479372485441102,
      "acc_norm": 0.35,
      "acc_norm_stderr": 0.0479372485441102
    },
    "hendrycksTest-college_computer_science": {
      "acc": 0.36,
      "acc_stderr": 0.04824181513244218,
      "acc_norm": 0.36,
      "acc_norm_stderr": 0.04824181513244218
    },
    "hendrycksTest-college_mathematics": {
      "acc": 0.35,
      "acc_stderr": 0.047937248544110196,
      "acc_norm": 0.35,
      "acc_norm_stderr": 0.047937248544110196
    },
    "hendrycksTest-college_medicine": {
      "acc": 0.42196531791907516,
      "acc_stderr": 0.0376574669386515,
      "acc_norm": 0.42196531791907516,
      "acc_norm_stderr": 0.0376574669386515
    },
    "hendrycksTest-college_physics": {
      "acc": 0.23529411764705882,
      "acc_stderr": 0.04220773659171453,
      "acc_norm": 0.23529411764705882,
      "acc_norm_stderr": 0.04220773659171453
    },
    "hendrycksTest-computer_security": {
      "acc": 0.59,
      "acc_stderr": 0.049431107042371025,
      "acc_norm": 0.59,
      "acc_norm_stderr": 0.049431107042371025
    },
    "hendrycksTest-conceptual_physics": {
      "acc": 0.4340425531914894,
      "acc_stderr": 0.03240038086792747,
      "acc_norm": 0.4340425531914894,
      "acc_norm_stderr": 0.03240038086792747
    },
    "hendrycksTest-econometrics": {
      "acc": 0.32456140350877194,
      "acc_stderr": 0.04404556157374768,
      "acc_norm": 0.32456140350877194,
      "acc_norm_stderr": 0.04404556157374768
    },
    "hendrycksTest-electrical_engineering": {
      "acc": 0.3931034482758621,
      "acc_stderr": 0.0407032901370707,
      "acc_norm": 0.3931034482758621,
      "acc_norm_stderr": 0.0407032901370707
    },
    "hendrycksTest-elementary_mathematics": {
      "acc": 0.28835978835978837,
      "acc_stderr": 0.0233306540545359,
      "acc_norm": 0.28835978835978837,
      "acc_norm_stderr": 0.0233306540545359
    },
    "hendrycksTest-formal_logic": {
      "acc": 0.30158730158730157,
      "acc_stderr": 0.04104947269903394,
      "acc_norm": 0.30158730158730157,
      "acc_norm_stderr": 0.04104947269903394
    },
    "hendrycksTest-global_facts": {
      "acc": 0.3,
      "acc_stderr": 0.046056618647183814,
      "acc_norm": 0.3,
      "acc_norm_stderr": 0.046056618647183814
    },
    "hendrycksTest-high_school_biology": {
      "acc": 0.47419354838709676,
      "acc_stderr": 0.02840609505765332,
      "acc_norm": 0.47419354838709676,
      "acc_norm_stderr": 0.02840609505765332
    },
    "hendrycksTest-high_school_chemistry": {
      "acc": 0.3448275862068966,
      "acc_stderr": 0.03344283744280458,
      "acc_norm": 0.3448275862068966,
      "acc_norm_stderr": 0.03344283744280458
    },
    "hendrycksTest-high_school_computer_science": {
      "acc": 0.39,
      "acc_stderr": 0.04902071300001975,
      "acc_norm": 0.39,
      "acc_norm_stderr": 0.04902071300001975
    },
    "hendrycksTest-high_school_european_history": {
      "acc": 0.6060606060606061,
      "acc_stderr": 0.03815494308688931,
      "acc_norm": 0.6060606060606061,
      "acc_norm_stderr": 0.03815494308688931
    },
    "hendrycksTest-high_school_geography": {
      "acc": 0.5505050505050505,
      "acc_stderr": 0.035441324919479704,
      "acc_norm": 0.5505050505050505,
      "acc_norm_stderr": 0.035441324919479704
    },
    "hendrycksTest-high_school_government_and_politics": {
      "acc": 0.689119170984456,
      "acc_stderr": 0.03340361906276585,
      "acc_norm": 0.689119170984456,
      "acc_norm_stderr": 0.03340361906276585
    },
    "hendrycksTest-high_school_macroeconomics": {
      "acc": 0.4282051282051282,
      "acc_stderr": 0.025088301454694834,
      "acc_norm": 0.4282051282051282,
      "acc_norm_stderr": 0.025088301454694834
    },
    "hendrycksTest-high_school_mathematics": {
      "acc": 0.2851851851851852,
      "acc_stderr": 0.027528599210340496,
      "acc_norm": 0.2851851851851852,
      "acc_norm_stderr": 0.027528599210340496
    },
    "hendrycksTest-high_school_microeconomics": {
      "acc": 0.38235294117647056,
      "acc_stderr": 0.03156663099215416,
      "acc_norm": 0.38235294117647056,
      "acc_norm_stderr": 0.03156663099215416
    },
    "hendrycksTest-high_school_physics": {
      "acc": 0.3509933774834437,
      "acc_stderr": 0.03896981964257375,
      "acc_norm": 0.3509933774834437,
      "acc_norm_stderr": 0.03896981964257375
    },
    "hendrycksTest-high_school_psychology": {
      "acc": 0.6146788990825688,
      "acc_stderr": 0.02086585085279412,
      "acc_norm": 0.6146788990825688,
      "acc_norm_stderr": 0.02086585085279412
    },
    "hendrycksTest-high_school_statistics": {
      "acc": 0.25462962962962965,
      "acc_stderr": 0.02971127586000535,
      "acc_norm": 0.25462962962962965,
      "acc_norm_stderr": 0.02971127586000535
    },
    "hendrycksTest-high_school_us_history": {
      "acc": 0.5490196078431373,
      "acc_stderr": 0.03492406104163613,
      "acc_norm": 0.5490196078431373,
      "acc_norm_stderr": 0.03492406104163613
    },
    "hendrycksTest-high_school_world_history": {
      "acc": 0.6244725738396625,
      "acc_stderr": 0.03152256243091156,
      "acc_norm": 0.6244725738396625,
      "acc_norm_stderr": 0.03152256243091156
    },
    "hendrycksTest-human_aging": {
      "acc": 0.5381165919282511,
      "acc_stderr": 0.033460150119732274,
      "acc_norm": 0.5381165919282511,
      "acc_norm_stderr": 0.033460150119732274
    },
    "hendrycksTest-human_sexuality": {
      "acc": 0.5343511450381679,
      "acc_stderr": 0.043749285605997376,
      "acc_norm": 0.5343511450381679,
      "acc_norm_stderr": 0.043749285605997376
    },
    "hendrycksTest-international_law": {
      "acc": 0.6033057851239669,
      "acc_stderr": 0.044658697805310094,
      "acc_norm": 0.6033057851239669,
      "acc_norm_stderr": 0.044658697805310094
    },
    "hendrycksTest-jurisprudence": {
      "acc": 0.49074074074074076,
      "acc_stderr": 0.04832853553437055,
      "acc_norm": 0.49074074074074076,
      "acc_norm_stderr": 0.04832853553437055
    },
    "hendrycksTest-logical_fallacies": {
      "acc": 0.48466257668711654,
      "acc_stderr": 0.039265223787088424,
      "acc_norm": 0.48466257668711654,
      "acc_norm_stderr": 0.039265223787088424
    },
    "hendrycksTest-machine_learning": {
      "acc": 0.375,
      "acc_stderr": 0.04595091388086298,
      "acc_norm": 0.375,
      "acc_norm_stderr": 0.04595091388086298
    },
    "hendrycksTest-management": {
      "acc": 0.5145631067961165,
      "acc_stderr": 0.049486373240266356,
      "acc_norm": 0.5145631067961165,
      "acc_norm_stderr": 0.049486373240266356
    },
    "hendrycksTest-marketing": {
      "acc": 0.6837606837606838,
      "acc_stderr": 0.03046365674734027,
      "acc_norm": 0.6837606837606838,
      "acc_norm_stderr": 0.03046365674734027
    },
    "hendrycksTest-medical_genetics": {
      "acc": 0.53,
      "acc_stderr": 0.05016135580465919,
      "acc_norm": 0.53,
      "acc_norm_stderr": 0.05016135580465919
    },
    "hendrycksTest-miscellaneous": {
      "acc": 0.6232439335887612,
      "acc_stderr": 0.01732829290730305,
      "acc_norm": 0.6232439335887612,
      "acc_norm_stderr": 0.01732829290730305
    },
    "hendrycksTest-moral_disputes": {
      "acc": 0.49421965317919075,
      "acc_stderr": 0.026917296179149116,
      "acc_norm": 0.49421965317919075,
      "acc_norm_stderr": 0.026917296179149116
    },
    "hendrycksTest-moral_scenarios": {
      "acc": 0.2446927374301676,
      "acc_stderr": 0.014378169884098435,
      "acc_norm": 0.2446927374301676,
      "acc_norm_stderr": 0.014378169884098435
    },
    "hendrycksTest-nutrition": {
      "acc": 0.4673202614379085,
      "acc_stderr": 0.02856869975222588,
      "acc_norm": 0.4673202614379085,
      "acc_norm_stderr": 0.02856869975222588
    },
    "hendrycksTest-philosophy": {
      "acc": 0.5884244372990354,
      "acc_stderr": 0.027950481494401262,
      "acc_norm": 0.5884244372990354,
      "acc_norm_stderr": 0.027950481494401262
    },
    "hendrycksTest-prehistory": {
      "acc": 0.5,
      "acc_stderr": 0.02782074420373286,
      "acc_norm": 0.5,
      "acc_norm_stderr": 0.02782074420373286
    },
    "hendrycksTest-professional_accounting": {
      "acc": 0.3475177304964539,
      "acc_stderr": 0.028406627809590954,
      "acc_norm": 0.3475177304964539,
      "acc_norm_stderr": 0.028406627809590954
    },
    "hendrycksTest-professional_law": {
      "acc": 0.3683181225554107,
      "acc_stderr": 0.012319403369564639,
      "acc_norm": 0.3683181225554107,
      "acc_norm_stderr": 0.012319403369564639
    },
    "hendrycksTest-professional_medicine": {
      "acc": 0.5330882352941176,
      "acc_stderr": 0.03030625772246832,
      "acc_norm": 0.5330882352941176,
      "acc_norm_stderr": 0.03030625772246832
    },
    "hendrycksTest-professional_psychology": {
      "acc": 0.42320261437908496,
      "acc_stderr": 0.019987809769482064,
      "acc_norm": 0.42320261437908496,
      "acc_norm_stderr": 0.019987809769482064
    },
    "hendrycksTest-public_relations": {
      "acc": 0.5363636363636364,
      "acc_stderr": 0.04776449162396197,
      "acc_norm": 0.5363636363636364,
      "acc_norm_stderr": 0.04776449162396197
    },
    "hendrycksTest-security_studies": {
      "acc": 0.42448979591836733,
      "acc_stderr": 0.031642094879429414,
      "acc_norm": 0.42448979591836733,
      "acc_norm_stderr": 0.031642094879429414
    },
    "hendrycksTest-sociology": {
      "acc": 0.5621890547263682,
      "acc_stderr": 0.035080801121998406,
      "acc_norm": 0.5621890547263682,
      "acc_norm_stderr": 0.035080801121998406
    },
    "hendrycksTest-us_foreign_policy": {
      "acc": 0.69,
      "acc_stderr": 0.04648231987117316,
      "acc_norm": 0.69,
      "acc_norm_stderr": 0.04648231987117316
    },
    "hendrycksTest-virology": {
      "acc": 0.3855421686746988,
      "acc_stderr": 0.037891344246115496,
      "acc_norm": 0.3855421686746988,
      "acc_norm_stderr": 0.037891344246115496
    },
    "hendrycksTest-world_religions": {
      "acc": 0.6666666666666666,
      "acc_stderr": 0.036155076303109365,
      "acc_norm": 0.6666666666666666,
      "acc_norm_stderr": 0.036155076303109365
    }
  },
  "versions": {
    "hendrycksTest-abstract_algebra": 1,
    "hendrycksTest-anatomy": 1,
    "hendrycksTest-astronomy": 1,
    "hendrycksTest-business_ethics": 1,
    "hendrycksTest-clinical_knowledge": 1,
    "hendrycksTest-college_biology": 1,
    "hendrycksTest-college_chemistry": 1,
    "hendrycksTest-college_computer_science": 1,
    "hendrycksTest-college_mathematics": 1,
    "hendrycksTest-college_medicine": 1,
    "hendrycksTest-college_physics": 1,
    "hendrycksTest-computer_security": 1,
    "hendrycksTest-conceptual_physics": 1,
    "hendrycksTest-econometrics": 1,
    "hendrycksTest-electrical_engineering": 1,
    "hendrycksTest-elementary_mathematics": 1,
    "hendrycksTest-formal_logic": 1,
    "hendrycksTest-global_facts": 1,
    "hendrycksTest-high_school_biology": 1,
    "hendrycksTest-high_school_chemistry": 1,
    "hendrycksTest-high_school_computer_science": 1,
    "hendrycksTest-high_school_european_history": 1,
    "hendrycksTest-high_school_geography": 1,
    "hendrycksTest-high_school_government_and_politics": 1,
    "hendrycksTest-high_school_macroeconomics": 1,
    "hendrycksTest-high_school_mathematics": 1,
    "hendrycksTest-high_school_microeconomics": 1,
    "hendrycksTest-high_school_physics": 1,
    "hendrycksTest-high_school_psychology": 1,
    "hendrycksTest-high_school_statistics": 1,
    "hendrycksTest-high_school_us_history": 1,
    "hendrycksTest-high_school_world_history": 1,
    "hendrycksTest-human_aging": 1,
    "hendrycksTest-human_sexuality": 1,
    "hendrycksTest-international_law": 1,
    "hendrycksTest-jurisprudence": 1,
    "hendrycksTest-logical_fallacies": 1,
    "hendrycksTest-machine_learning": 1,
    "hendrycksTest-management": 1,
    "hendrycksTest-marketing": 1,
    "hendrycksTest-medical_genetics": 1,
    "hendrycksTest-miscellaneous": 1,
    "hendrycksTest-moral_disputes": 1,
    "hendrycksTest-moral_scenarios": 1,
    "hendrycksTest-nutrition": 1,
    "hendrycksTest-philosophy": 1,
    "hendrycksTest-prehistory": 1,
    "hendrycksTest-professional_accounting": 1,
    "hendrycksTest-professional_law": 1,
    "hendrycksTest-professional_medicine": 1,
    "hendrycksTest-professional_psychology": 1,
    "hendrycksTest-public_relations": 1,
    "hendrycksTest-security_studies": 1,
    "hendrycksTest-sociology": 1,
    "hendrycksTest-us_foreign_policy": 1,
    "hendrycksTest-virology": 1,
    "hendrycksTest-world_religions": 1
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=/home/vmagent/app/data/Llama-2-7b-hf,peft=/home/vmagent/app/data/llama-2-delta-tune-model-div-sal,use_accelerate=True,delta=/home/vmagent/app/data/llama-2-delta-tune-model-div-sal/best_model_structure.txt",
    "num_fewshot": 5,
    "batch_size": "auto",
    "batch_sizes": [
      8
    ],
    "device": null,
    "no_cache": false,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}