|
[ |
|
{ |
|
"config": { |
|
"model_name": "ChatGPT-4o-latest (2024-09-03)", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/10" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 87.33082346779815, |
|
"Standard Deviation": 1.4853337406399776, |
|
"Rank": 3 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.976028578, |
|
"Standard Deviation": 0.01507912373, |
|
"Rank": 3 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.951199453, |
|
"Standard Deviation": 0.08452452108, |
|
"Rank": 3 |
|
}, |
|
"Probability": { |
|
"Average Score": 80.1332207690739, |
|
"Standard Deviation": null, |
|
"Rank": 7 |
|
}, |
|
"Logical": { |
|
"Average Score": 84.12975867250425, |
|
"Standard Deviation": 0.21211547702245045, |
|
"Rank": 6 |
|
}, |
|
"Social": { |
|
"Average Score": 0.815902987, |
|
"Standard Deviation": 0.0196254222, |
|
"Rank": 3 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 89.92480228064885, |
|
"Standard Deviation": null, |
|
"Rank": 4 |
|
}, |
|
"CPP": { |
|
"Average Score": 100.0, |
|
"Standard Deviation": null, |
|
"Rank": 1 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gpt-4o-2024-08-06", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/10" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 77.7818546246671, |
|
"Standard Deviation": 2.7097581088879505, |
|
"Rank": 5 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.99773096, |
|
"Standard Deviation": 0.002835555172, |
|
"Rank": 1 |
|
}, |
|
"Algebra": { |
|
"Average Score": 1.0, |
|
"Standard Deviation": 0.0, |
|
"Rank": 1 |
|
}, |
|
"Probability": { |
|
"Average Score": 74.97136205481755, |
|
"Standard Deviation": null, |
|
"Rank": 11 |
|
}, |
|
"Logical": { |
|
"Average Score": 66.0597109743056, |
|
"Standard Deviation": 1.5021351704575163, |
|
"Rank": 14 |
|
}, |
|
"Social": { |
|
"Average Score": 0.680417314, |
|
"Standard Deviation": 0.00656867063, |
|
"Rank": 8 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 82.55189735524202, |
|
"Standard Deviation": null, |
|
"Rank": 7 |
|
}, |
|
"CPP": { |
|
"Average Score": 92.43090226400756, |
|
"Standard Deviation": null, |
|
"Rank": 2 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gpt-4o-2024-05-13", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/10" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 72.6093654197998, |
|
"Standard Deviation": 13.515345690976028, |
|
"Rank": 10 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.972472377, |
|
"Standard Deviation": 0.01648274205, |
|
"Rank": 4 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.995511298, |
|
"Standard Deviation": 0.004097802515, |
|
"Rank": 2 |
|
}, |
|
"Probability": { |
|
"Average Score": 77.97816201050715, |
|
"Standard Deviation": null, |
|
"Rank": 8 |
|
}, |
|
"Logical": { |
|
"Average Score": 75.65058939137873, |
|
"Standard Deviation": 0.07522785572103825, |
|
"Rank": 9 |
|
}, |
|
"Social": { |
|
"Average Score": 0.609875087, |
|
"Standard Deviation": 0.038729239, |
|
"Rank": 13 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 76.03377031297643, |
|
"Standard Deviation": null, |
|
"Rank": 9 |
|
}, |
|
"CPP": { |
|
"Average Score": 79.1592634699295, |
|
"Standard Deviation": null, |
|
"Rank": 6 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gpt-4-turbo-2024-04-09", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/12" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 73.32308543749606, |
|
"Standard Deviation": 6.562777844134629, |
|
"Rank": 9 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.95374588, |
|
"Standard Deviation": 0.03109307166, |
|
"Rank": 5 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.930945223, |
|
"Standard Deviation": 0.06705136813, |
|
"Rank": 4 |
|
}, |
|
"Probability": { |
|
"Average Score": 74.97144205445957, |
|
"Standard Deviation": null, |
|
"Rank": 12 |
|
}, |
|
"Logical": { |
|
"Average Score": 76.82291715624933, |
|
"Standard Deviation": 0.03462548327631355, |
|
"Rank": 7 |
|
}, |
|
"Social": { |
|
"Average Score": 0.715935163, |
|
"Standard Deviation": 0.1209141409, |
|
"Rank": 6 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 70.44329321394066, |
|
"Standard Deviation": null, |
|
"Rank": 12 |
|
}, |
|
"CPP": { |
|
"Average Score": 70.73143363230263, |
|
"Standard Deviation": null, |
|
"Rank": 11 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemini-1.5-pro-001", |
|
"organization": "Google", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/11" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 74.27365448117855, |
|
"Standard Deviation": 3.9515447172901847, |
|
"Rank": 8 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.9947169, |
|
"Standard Deviation": 0.009150597621, |
|
"Rank": 2 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.857464301, |
|
"Standard Deviation": 0.05014285338, |
|
"Rank": 5 |
|
}, |
|
"Probability": { |
|
"Average Score": 64.77713215500482, |
|
"Standard Deviation": null, |
|
"Rank": 15 |
|
}, |
|
"Logical": { |
|
"Average Score": 74.3275461555815, |
|
"Standard Deviation": 0.8092355737847541, |
|
"Rank": 10 |
|
}, |
|
"Social": { |
|
"Average Score": 0.649601885, |
|
"Standard Deviation": 0.104854889, |
|
"Rank": 11 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "qwen2-72b-instruct", |
|
"organization": "Alibaba", |
|
"license": "Qianwen LICENSE", |
|
"knowledge_cutoff": "2024/09" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 71.00423311357184, |
|
"Standard Deviation": 1.6189609141983887, |
|
"Rank": 12 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.796870305, |
|
"Standard Deviation": 0.0509025346, |
|
"Rank": 9 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.836194231, |
|
"Standard Deviation": 0.04517093028, |
|
"Rank": 6 |
|
}, |
|
"Probability": { |
|
"Average Score": 76.33751777233937, |
|
"Standard Deviation": null, |
|
"Rank": 10 |
|
}, |
|
"Logical": { |
|
"Average Score": 61.22020517318166, |
|
"Standard Deviation": 10.241399997578569, |
|
"Rank": 17 |
|
}, |
|
"Social": { |
|
"Average Score": 0.652578786, |
|
"Standard Deviation": 0.04259293171, |
|
"Rank": 10 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 70.44342338869497, |
|
"Standard Deviation": null, |
|
"Rank": 12 |
|
}, |
|
"CPP": { |
|
"Average Score": 73.54037778797029, |
|
"Standard Deviation": null, |
|
"Rank": 7 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gpt-4o-mini-2024-07-18", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/10" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 77.35427394420829, |
|
"Standard Deviation": 3.162321541714492, |
|
"Rank": 6 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.946650435, |
|
"Standard Deviation": 0.01831236482, |
|
"Rank": 7 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.796243022, |
|
"Standard Deviation": 0.05537539202, |
|
"Rank": 7 |
|
}, |
|
"Probability": { |
|
"Average Score": 77.63972720989734, |
|
"Standard Deviation": null, |
|
"Rank": 9 |
|
}, |
|
"Logical": { |
|
"Average Score": 71.81267717239906, |
|
"Standard Deviation": 0.3393593163824375, |
|
"Rank": 11 |
|
}, |
|
"Social": { |
|
"Average Score": 0.691949855, |
|
"Standard Deviation": 0.02072934333, |
|
"Rank": 7 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 78.10636943659426, |
|
"Standard Deviation": null, |
|
"Rank": 8 |
|
}, |
|
"CPP": { |
|
"Average Score": 88.3877070580296, |
|
"Standard Deviation": null, |
|
"Rank": 3 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "claude-3.5-sonnet", |
|
"organization": "Anthropic", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024/04" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 75.97534774560863, |
|
"Standard Deviation": 9.237316832705584, |
|
"Rank": 7 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.95316419, |
|
"Standard Deviation": 0.02081192856, |
|
"Rank": 6 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.759789952, |
|
"Standard Deviation": 0.02611765096, |
|
"Rank": 8 |
|
}, |
|
"Probability": { |
|
"Average Score": 65.4531881044298, |
|
"Standard Deviation": null, |
|
"Rank": 14 |
|
}, |
|
"Logical": { |
|
"Average Score": 76.47424588300288, |
|
"Standard Deviation": 0.07699328617321737, |
|
"Rank": 8 |
|
}, |
|
"Social": { |
|
"Average Score": 0.790002247, |
|
"Standard Deviation": 0.1007410022, |
|
"Rank": 4 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 85.17654674052096, |
|
"Standard Deviation": null, |
|
"Rank": 6 |
|
}, |
|
"CPP": { |
|
"Average Score": 82.37734076815008, |
|
"Standard Deviation": null, |
|
"Rank": 5 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "o1-mini", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/10" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 87.92989248183513, |
|
"Standard Deviation": 1.3401058431409953, |
|
"Rank": 2 |
|
}, |
|
"Geometry": { |
|
"Average Score": "N/A", |
|
"Standard Deviation": "N/A", |
|
"Rank": "N/A" |
|
}, |
|
"Algebra": { |
|
"Average Score": "N/A", |
|
"Standard Deviation": "N/A", |
|
"Rank": "N/A" |
|
}, |
|
"Probability": { |
|
"Average Score": 100.0, |
|
"Standard Deviation": null, |
|
"Rank": 1 |
|
}, |
|
"Logical": { |
|
"Average Score": 99.15920225407733, |
|
"Standard Deviation": 0.49801294410288666, |
|
"Rank": 2 |
|
}, |
|
"Social": { |
|
"Average Score": 0.993974241, |
|
"Standard Deviation": 0.001996882328, |
|
"Rank": 2 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "o1-preview", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/10" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 85.40247108906188, |
|
"Standard Deviation": 1.5796898764998464, |
|
"Rank": 4 |
|
}, |
|
"Geometry": { |
|
"Average Score": "N/A", |
|
"Standard Deviation": "N/A", |
|
"Rank": "N/A" |
|
}, |
|
"Algebra": { |
|
"Average Score": "N/A", |
|
"Standard Deviation": "N/A", |
|
"Rank": "N/A" |
|
}, |
|
"Probability": { |
|
"Average Score": 90.32625019320989, |
|
"Standard Deviation": null, |
|
"Rank": 5 |
|
}, |
|
"Logical": { |
|
"Average Score": 98.18241651273537, |
|
"Standard Deviation": 0.16231417987288874, |
|
"Rank": 4 |
|
}, |
|
"Social": { |
|
"Average Score": 1.0, |
|
"Standard Deviation": 0.0, |
|
"Rank": 1 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemini-1.5-flash-001", |
|
"organization": "Google", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/11" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 67.67997467963976, |
|
"Standard Deviation": 2.624276751646549, |
|
"Rank": 13 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.804144103, |
|
"Standard Deviation": 0.1327142178, |
|
"Rank": 8 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.731776765, |
|
"Standard Deviation": 0.02594657111, |
|
"Rank": 9 |
|
}, |
|
"Probability": { |
|
"Average Score": 61.17190439316032, |
|
"Standard Deviation": null, |
|
"Rank": 19 |
|
}, |
|
"Logical": { |
|
"Average Score": 62.284381466778335, |
|
"Standard Deviation": 3.9592476945909674, |
|
"Rank": 16 |
|
}, |
|
"Social": { |
|
"Average Score": 0.555933822, |
|
"Standard Deviation": 0.1029934524, |
|
"Rank": 15 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 70.24726462490831, |
|
"Standard Deviation": null, |
|
"Rank": 15 |
|
}, |
|
"CPP": { |
|
"Average Score": 72.1127762005651, |
|
"Standard Deviation": null, |
|
"Rank": 10 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gpt4-1106", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024/04" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 72.24829405851214, |
|
"Standard Deviation": 13.633826990442946, |
|
"Rank": 11 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.71843088, |
|
"Standard Deviation": 0.04778038294, |
|
"Rank": 11 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.712910417, |
|
"Standard Deviation": 0.02581828898, |
|
"Rank": 10 |
|
}, |
|
"Probability": { |
|
"Average Score": 63.29462909293814, |
|
"Standard Deviation": null, |
|
"Rank": 16 |
|
}, |
|
"Logical": { |
|
"Average Score": 62.987098158883875, |
|
"Standard Deviation": 4.027795425350514, |
|
"Rank": 15 |
|
}, |
|
"Social": { |
|
"Average Score": 0.450609816, |
|
"Standard Deviation": 0.05208655446, |
|
"Rank": 21 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 67.34047237109209, |
|
"Standard Deviation": null, |
|
"Rank": 16 |
|
}, |
|
"CPP": { |
|
"Average Score": 69.11824072252848, |
|
"Standard Deviation": null, |
|
"Rank": 12 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemma-2-27b-it", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2024/06" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 62.70975283121063, |
|
"Standard Deviation": 6.376450054715319, |
|
"Rank": 15 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.60112744, |
|
"Standard Deviation": 0.0469109952, |
|
"Rank": 17 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.687955914, |
|
"Standard Deviation": 0.01959958192, |
|
"Rank": 11 |
|
}, |
|
"Probability": { |
|
"Average Score": 60.04180799425261, |
|
"Standard Deviation": null, |
|
"Rank": 20 |
|
}, |
|
"Logical": { |
|
"Average Score": 60.77082327163094, |
|
"Standard Deviation": 7.2164902432618625, |
|
"Rank": 19 |
|
}, |
|
"Social": { |
|
"Average Score": 0.487844257, |
|
"Standard Deviation": 0.05857760809, |
|
"Rank": 18 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 61.68181926111706, |
|
"Standard Deviation": null, |
|
"Rank": 18 |
|
}, |
|
"CPP": { |
|
"Average Score": 63.28920072143611, |
|
"Standard Deviation": null, |
|
"Rank": 14 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "claude-3-opus", |
|
"organization": "Anthropic", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/08" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 60.56449573632771, |
|
"Standard Deviation": 8.485936885427277, |
|
"Rank": 17 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.7215743, |
|
"Standard Deviation": 0.04712598358, |
|
"Rank": 10 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.68777327, |
|
"Standard Deviation": 0.02382683713, |
|
"Rank": 12 |
|
}, |
|
"Probability": { |
|
"Average Score": 62.296041016641176, |
|
"Standard Deviation": null, |
|
"Rank": 17 |
|
}, |
|
"Logical": { |
|
"Average Score": 68.36295609287292, |
|
"Standard Deviation": 1.6558271236588655, |
|
"Rank": 13 |
|
}, |
|
"Social": { |
|
"Average Score": 0.663410854, |
|
"Standard Deviation": 0.09540220876, |
|
"Rank": 9 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 70.44337273504232, |
|
"Standard Deviation": null, |
|
"Rank": 12 |
|
}, |
|
"CPP": { |
|
"Average Score": 73.5404403567132, |
|
"Standard Deviation": null, |
|
"Rank": 8 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemma-2-9b-it-simpo", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2024/07" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": "N/A", |
|
"Standard Deviation": "N/A", |
|
"Rank": "N/A" |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.582787508, |
|
"Standard Deviation": 0.03965204074, |
|
"Rank": 18 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.658648133, |
|
"Standard Deviation": 0.02565919856, |
|
"Rank": 13 |
|
}, |
|
"Probability": { |
|
"Average Score": 57.545408188912894, |
|
"Standard Deviation": null, |
|
"Rank": 23 |
|
}, |
|
"Logical": { |
|
"Average Score": 53.1996479262466, |
|
"Standard Deviation": 2.690106544431167, |
|
"Rank": 23 |
|
}, |
|
"Social": { |
|
"Average Score": 0.635266187, |
|
"Standard Deviation": 0.03620021751, |
|
"Rank": 12 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 74.44267231381626, |
|
"Standard Deviation": null, |
|
"Rank": 11 |
|
}, |
|
"CPP": { |
|
"Average Score": 73.43757596214863, |
|
"Standard Deviation": null, |
|
"Rank": 9 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "qwen1.5-72b-chat", |
|
"organization": "Alibaba", |
|
"license": "Qianwen LICENSE", |
|
"knowledge_cutoff": "2024/03" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 52.983715751652085, |
|
"Standard Deviation": 3.097613966427763, |
|
"Rank": 18 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.543139301, |
|
"Standard Deviation": 0.03425202326, |
|
"Rank": 22 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.635228729, |
|
"Standard Deviation": 0.01944043425, |
|
"Rank": 14 |
|
}, |
|
"Probability": { |
|
"Average Score": 52.650033879924905, |
|
"Standard Deviation": null, |
|
"Rank": 26 |
|
}, |
|
"Logical": { |
|
"Average Score": 32.628853250402074, |
|
"Standard Deviation": 3.227745519436025, |
|
"Rank": 37 |
|
}, |
|
"Social": { |
|
"Average Score": 0.415007627, |
|
"Standard Deviation": 0.03920053159, |
|
"Rank": 22 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 47.5126781973184, |
|
"Standard Deviation": null, |
|
"Rank": 24 |
|
}, |
|
"CPP": { |
|
"Average Score": 48.69302376665551, |
|
"Standard Deviation": null, |
|
"Rank": 20 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "qwen1.5-32b-chat", |
|
"organization": "Alibaba", |
|
"license": "Qianwen LICENSE", |
|
"knowledge_cutoff": "2024/03" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 26.978561942890224, |
|
"Standard Deviation": 1.575986887925592, |
|
"Rank": 32 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.51086835, |
|
"Standard Deviation": 0.04052471998, |
|
"Rank": 25 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.609003168, |
|
"Standard Deviation": 0.04874143541, |
|
"Rank": 15 |
|
}, |
|
"Probability": { |
|
"Average Score": 49.50617919486678, |
|
"Standard Deviation": null, |
|
"Rank": 29 |
|
}, |
|
"Logical": { |
|
"Average Score": 34.07387941414556, |
|
"Standard Deviation": 4.616974831074921, |
|
"Rank": 34 |
|
}, |
|
"Social": { |
|
"Average Score": 0.380987334, |
|
"Standard Deviation": 0.03762251776, |
|
"Rank": 24 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 44.06627265183811, |
|
"Standard Deviation": null, |
|
"Rank": 28 |
|
}, |
|
"CPP": { |
|
"Average Score": 45.14284028264288, |
|
"Standard Deviation": null, |
|
"Rank": 24 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "google-gemma-2-9b-it", |
|
"organization": "Google", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024/06" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 52.23013018580635, |
|
"Standard Deviation": 3.3939236141078495, |
|
"Rank": 19 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.575371308, |
|
"Standard Deviation": 0.03556220251, |
|
"Rank": 20 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.597045661, |
|
"Standard Deviation": 0.0313828123, |
|
"Rank": 16 |
|
}, |
|
"Probability": { |
|
"Average Score": 58.73062101843859, |
|
"Standard Deviation": null, |
|
"Rank": 21 |
|
}, |
|
"Logical": { |
|
"Average Score": 58.01791397899675, |
|
"Standard Deviation": 5.751983660134971, |
|
"Rank": 21 |
|
}, |
|
"Social": { |
|
"Average Score": 0.768337958, |
|
"Standard Deviation": 0.04078610476, |
|
"Rank": 5 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 52.69494515004607, |
|
"Standard Deviation": null, |
|
"Rank": 21 |
|
}, |
|
"CPP": { |
|
"Average Score": 54.03167523687635, |
|
"Standard Deviation": null, |
|
"Rank": 17 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "yi-1.5-34b-chat", |
|
"organization": "01 AI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024/05" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 62.568637878216464, |
|
"Standard Deviation": 8.554205798418673, |
|
"Rank": 16 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.566666724, |
|
"Standard Deviation": 0.04001381658, |
|
"Rank": 21 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.590997292, |
|
"Standard Deviation": 0.03594087315, |
|
"Rank": 17 |
|
}, |
|
"Probability": { |
|
"Average Score": 57.545207891104354, |
|
"Standard Deviation": null, |
|
"Rank": 22 |
|
}, |
|
"Logical": { |
|
"Average Score": 56.598158131627194, |
|
"Standard Deviation": 1.1072821075127297, |
|
"Rank": 22 |
|
}, |
|
"Social": { |
|
"Average Score": 0.516980832, |
|
"Standard Deviation": 0.03369347985, |
|
"Rank": 17 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 50.867343712131174, |
|
"Standard Deviation": null, |
|
"Rank": 22 |
|
}, |
|
"CPP": { |
|
"Average Score": 52.148798061768964, |
|
"Standard Deviation": null, |
|
"Rank": 18 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "meta-llama-3.1-70b-instruct", |
|
"organization": "Meta", |
|
"license": "Llama 3.1 Community", |
|
"knowledge_cutoff": "2023/12" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 65.61302047306724, |
|
"Standard Deviation": 7.113338386318571, |
|
"Rank": 14 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.76184398, |
|
"Standard Deviation": 0.01790377984, |
|
"Rank": 10 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.732041699, |
|
"Standard Deviation": 0.02621439062, |
|
"Rank": 9 |
|
}, |
|
"Probability": { |
|
"Average Score": 65.4531285887158, |
|
"Standard Deviation": null, |
|
"Rank": 13 |
|
}, |
|
"Logical": { |
|
"Average Score": 61.16321386785366, |
|
"Standard Deviation": 0.8920966760646541, |
|
"Rank": 18 |
|
}, |
|
"Social": { |
|
"Average Score": 0.45872939, |
|
"Standard Deviation": 0.05347039576, |
|
"Rank": 20 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 76.03374498429748, |
|
"Standard Deviation": null, |
|
"Rank": 9 |
|
}, |
|
"CPP": { |
|
"Average Score": 84.36815192532764, |
|
"Standard Deviation": null, |
|
"Rank": 4 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "meta-llama-3.1-8b-instruct", |
|
"organization": "Meta", |
|
"license": "Llama 3.1 Community", |
|
"knowledge_cutoff": "2023/12" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 48.86242501618216, |
|
"Standard Deviation": 3.7761459978540257, |
|
"Rank": 21 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.522442162, |
|
"Standard Deviation": 0.03908236317, |
|
"Rank": 23 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.582702645, |
|
"Standard Deviation": 0.05002277711, |
|
"Rank": 18 |
|
}, |
|
"Probability": { |
|
"Average Score": 52.44179989233465, |
|
"Standard Deviation": null, |
|
"Rank": 27 |
|
}, |
|
"Logical": { |
|
"Average Score": 43.3706774850582, |
|
"Standard Deviation": 2.820707319899787, |
|
"Rank": 28 |
|
}, |
|
"Social": { |
|
"Average Score": 0.329195941, |
|
"Standard Deviation": 0.03925019528, |
|
"Rank": 28 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 43.36264580455019, |
|
"Standard Deviation": null, |
|
"Rank": 30 |
|
}, |
|
"CPP": { |
|
"Average Score": 44.41846841004584, |
|
"Standard Deviation": null, |
|
"Rank": 26 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gpt3.5-turbo-0125", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2021/09" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 18.951737690142235, |
|
"Standard Deviation": 0.7967088395458379, |
|
"Rank": 42 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.678714519, |
|
"Standard Deviation": 0.05926546762, |
|
"Rank": 12 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.569296173, |
|
"Standard Deviation": 0.05277281097, |
|
"Rank": 19 |
|
}, |
|
"Probability": { |
|
"Average Score": 45.77959177088119, |
|
"Standard Deviation": null, |
|
"Rank": 30 |
|
}, |
|
"Logical": { |
|
"Average Score": 17.159084771200394, |
|
"Standard Deviation": 2.5845422782742546, |
|
"Rank": 48 |
|
}, |
|
"Social": { |
|
"Average Score": 0.235071541, |
|
"Standard Deviation": 0.02632892457, |
|
"Rank": 37 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 39.52885225927276, |
|
"Standard Deviation": null, |
|
"Rank": 33 |
|
}, |
|
"CPP": { |
|
"Average Score": 40.46958736582551, |
|
"Standard Deviation": null, |
|
"Rank": 29 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "llama-3-70b-instruct", |
|
"organization": "Meta", |
|
"license": "Llama 3 Community", |
|
"knowledge_cutoff": "2023/12" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 40.57810915454436, |
|
"Standard Deviation": 1.3134243733127455, |
|
"Rank": 26 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.516865529, |
|
"Standard Deviation": 0.03858112564, |
|
"Rank": 24 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.566756531, |
|
"Standard Deviation": 0.03369826926, |
|
"Rank": 20 |
|
}, |
|
"Probability": { |
|
"Average Score": 52.64997876875813, |
|
"Standard Deviation": null, |
|
"Rank": 25 |
|
}, |
|
"Logical": { |
|
"Average Score": 70.51651844158742, |
|
"Standard Deviation": 0.12355022869457871, |
|
"Rank": 12 |
|
}, |
|
"Social": { |
|
"Average Score": 0.45872939, |
|
"Standard Deviation": 0.05347039576, |
|
"Rank": 20 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 63.65476403379996, |
|
"Standard Deviation": null, |
|
"Rank": 17 |
|
}, |
|
"CPP": { |
|
"Average Score": 65.32140697218945, |
|
"Standard Deviation": null, |
|
"Rank": 13 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "claude-3-sonnet", |
|
"organization": "Anthropic", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/08" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 52.19088595402735, |
|
"Standard Deviation": 3.743258734262917, |
|
"Rank": 20 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.675613638, |
|
"Standard Deviation": 0.05275594408, |
|
"Rank": 13 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.552025728, |
|
"Standard Deviation": 0.04122192409, |
|
"Rank": 21 |
|
}, |
|
"Probability": { |
|
"Average Score": 54.0284459891417, |
|
"Standard Deviation": null, |
|
"Rank": 24 |
|
}, |
|
"Logical": { |
|
"Average Score": 58.099761779812475, |
|
"Standard Deviation": 7.815595203680491, |
|
"Rank": 20 |
|
}, |
|
"Social": { |
|
"Average Score": 0.570437582, |
|
"Standard Deviation": 0.08607040862, |
|
"Rank": 14 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 59.784958090634056, |
|
"Standard Deviation": null, |
|
"Rank": 19 |
|
}, |
|
"CPP": { |
|
"Average Score": 61.33538592327427, |
|
"Standard Deviation": null, |
|
"Rank": 15 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "qwen1.5-14b-chat", |
|
"organization": "Alibaba", |
|
"license": "Qianwen LICENSE", |
|
"knowledge_cutoff": "2024/02" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 31.56999734729493, |
|
"Standard Deviation": 5.42704987916441, |
|
"Rank": 29 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.452504016, |
|
"Standard Deviation": 0.04225594393, |
|
"Rank": 26 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.538655725, |
|
"Standard Deviation": 0.03721542594, |
|
"Rank": 22 |
|
}, |
|
"Probability": { |
|
"Average Score": 41.027908758027046, |
|
"Standard Deviation": null, |
|
"Rank": 35 |
|
}, |
|
"Logical": { |
|
"Average Score": 31.638560769720616, |
|
"Standard Deviation": 3.175225377796435, |
|
"Rank": 38 |
|
}, |
|
"Social": { |
|
"Average Score": 0.287370142, |
|
"Standard Deviation": 0.04264085315, |
|
"Rank": 30 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 37.667977565724996, |
|
"Standard Deviation": null, |
|
"Rank": 35 |
|
}, |
|
"CPP": { |
|
"Average Score": 38.552779976347026, |
|
"Standard Deviation": null, |
|
"Rank": 31 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "claude-3-haiku", |
|
"organization": "Anthropic", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/08" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 42.975259650014074, |
|
"Standard Deviation": 2.248602505751528, |
|
"Rank": 25 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.607993912, |
|
"Standard Deviation": 0.05793460748, |
|
"Rank": 15 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.520054055, |
|
"Standard Deviation": 0.03333544511, |
|
"Rank": 23 |
|
}, |
|
"Probability": { |
|
"Average Score": 52.44184603289214, |
|
"Standard Deviation": null, |
|
"Rank": 28 |
|
}, |
|
"Logical": { |
|
"Average Score": 50.38523351226464, |
|
"Standard Deviation": 1.9928131873345676, |
|
"Rank": 24 |
|
}, |
|
"Social": { |
|
"Average Score": 0.551083976, |
|
"Standard Deviation": 0.05374722539, |
|
"Rank": 16 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 54.99584531372778, |
|
"Standard Deviation": null, |
|
"Rank": 20 |
|
}, |
|
"CPP": { |
|
"Average Score": 56.40200048817984, |
|
"Standard Deviation": null, |
|
"Rank": 16 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "claude-2.1", |
|
"organization": "Anthropic", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "Unknown" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 23.82704986290717, |
|
"Standard Deviation": 1.6337262681919007, |
|
"Rank": 37 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.62752395, |
|
"Standard Deviation": 0.07232659398, |
|
"Rank": 14 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.508849609, |
|
"Standard Deviation": 0.0346897465, |
|
"Rank": 24 |
|
}, |
|
"Probability": { |
|
"Average Score": 42.82280874207299, |
|
"Standard Deviation": null, |
|
"Rank": 32 |
|
}, |
|
"Logical": { |
|
"Average Score": 47.40647506260718, |
|
"Standard Deviation": 3.5140099122016686, |
|
"Rank": 25 |
|
}, |
|
"Social": { |
|
"Average Score": 0.333804568, |
|
"Standard Deviation": 0.03775548253, |
|
"Rank": 27 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 46.09889239661357, |
|
"Standard Deviation": null, |
|
"Rank": 25 |
|
}, |
|
"CPP": { |
|
"Average Score": 47.23672563994903, |
|
"Standard Deviation": null, |
|
"Rank": 21 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "mistral-8x7b-instruct-v0.1", |
|
"organization": "Mistral", |
|
"license": "Apache 2.0", |
|
"knowledge_cutoff": "2023/12" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 26.279729527476174, |
|
"Standard Deviation": 1.7823676900027476, |
|
"Rank": 33 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.432216097, |
|
"Standard Deviation": 0.04747949254, |
|
"Rank": 29 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.478314888, |
|
"Standard Deviation": 0.01998797419, |
|
"Rank": 25 |
|
}, |
|
"Probability": { |
|
"Average Score": 42.27303178662447, |
|
"Standard Deviation": null, |
|
"Rank": 33 |
|
}, |
|
"Logical": { |
|
"Average Score": 34.58281320758576, |
|
"Standard Deviation": 2.5548927504271073, |
|
"Rank": 33 |
|
}, |
|
"Social": { |
|
"Average Score": 0.251949622, |
|
"Standard Deviation": 0.03346674405, |
|
"Rank": 35 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 43.47423835615602, |
|
"Standard Deviation": null, |
|
"Rank": 29 |
|
}, |
|
"CPP": { |
|
"Average Score": 44.533118241976666, |
|
"Standard Deviation": null, |
|
"Rank": 25 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "claude-2.0", |
|
"organization": "Anthropic", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "Unknown" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 20.490629074737296, |
|
"Standard Deviation": 0.4821482730133453, |
|
"Rank": 40 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.604141967, |
|
"Standard Deviation": 0.05116441826, |
|
"Rank": 16 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.474350734, |
|
"Standard Deviation": 0.01510393066, |
|
"Rank": 26 |
|
}, |
|
"Probability": { |
|
"Average Score": 45.15580067803421, |
|
"Standard Deviation": null, |
|
"Rank": 31 |
|
}, |
|
"Logical": { |
|
"Average Score": 43.65660021552717, |
|
"Standard Deviation": 4.959029305063026, |
|
"Rank": 27 |
|
}, |
|
"Social": { |
|
"Average Score": 0.469422836, |
|
"Standard Deviation": 0.05999901796, |
|
"Rank": 19 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 49.53201090067431, |
|
"Standard Deviation": null, |
|
"Rank": 23 |
|
}, |
|
"CPP": { |
|
"Average Score": 50.773143448036464, |
|
"Standard Deviation": null, |
|
"Rank": 19 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "starling-lm-7b-beta", |
|
"organization": "Nexusflow", |
|
"license": "Apache-2.0", |
|
"knowledge_cutoff": "2024/03" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 43.0415265396966, |
|
"Standard Deviation": 0.8770524316858576, |
|
"Rank": 24 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.446654388, |
|
"Standard Deviation": 0.05637864999, |
|
"Rank": 28 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.473952749, |
|
"Standard Deviation": 0.01584301288, |
|
"Rank": 27 |
|
}, |
|
"Probability": { |
|
"Average Score": 41.320066911500234, |
|
"Standard Deviation": null, |
|
"Rank": 34 |
|
}, |
|
"Logical": { |
|
"Average Score": 39.79665241383638, |
|
"Standard Deviation": 3.4711628274016544, |
|
"Rank": 30 |
|
}, |
|
"Social": { |
|
"Average Score": 0.380021662, |
|
"Standard Deviation": 0.04622452748, |
|
"Rank": 25 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 37.39896886078588, |
|
"Standard Deviation": null, |
|
"Rank": 36 |
|
}, |
|
"CPP": { |
|
"Average Score": 38.27587102395908, |
|
"Standard Deviation": null, |
|
"Rank": 32 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemini-1.0-pro-001", |
|
"organization": "Google", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/04" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 45.78126809517331, |
|
"Standard Deviation": 3.7275133674569783, |
|
"Rank": 23 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.578347959, |
|
"Standard Deviation": 0.04242873607, |
|
"Rank": 19 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.462417786, |
|
"Standard Deviation": 0.01668313635, |
|
"Rank": 28 |
|
}, |
|
"Probability": { |
|
"Average Score": 31.410607001114293, |
|
"Standard Deviation": null, |
|
"Rank": 42 |
|
}, |
|
"Logical": { |
|
"Average Score": 21.717362428653246, |
|
"Standard Deviation": 4.392290522642325, |
|
"Rank": 44 |
|
}, |
|
"Social": { |
|
"Average Score": 0.130790863, |
|
"Standard Deviation": 0.02800188173, |
|
"Rank": 45 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 44.14314678087462, |
|
"Standard Deviation": null, |
|
"Rank": 27 |
|
}, |
|
"CPP": { |
|
"Average Score": 45.22204471452975, |
|
"Standard Deviation": null, |
|
"Rank": 23 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "openchat-3.5-0106", |
|
"organization": "OpenChat", |
|
"license": "Apache-2.0", |
|
"knowledge_cutoff": "2024/01" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 23.85666609339201, |
|
"Standard Deviation": 1.341285455536348, |
|
"Rank": 36 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.38715246, |
|
"Standard Deviation": 0.03701851946, |
|
"Rank": 32 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.441233712, |
|
"Standard Deviation": 0.01135753754, |
|
"Rank": 29 |
|
}, |
|
"Probability": { |
|
"Average Score": 40.37790468557232, |
|
"Standard Deviation": null, |
|
"Rank": 36 |
|
}, |
|
"Logical": { |
|
"Average Score": 35.1573373260624, |
|
"Standard Deviation": 2.485128777146724, |
|
"Rank": 32 |
|
}, |
|
"Social": { |
|
"Average Score": 0.250891608, |
|
"Standard Deviation": 0.03253769914, |
|
"Rank": 36 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 32.96322247853182, |
|
"Standard Deviation": null, |
|
"Rank": 37 |
|
}, |
|
"CPP": { |
|
"Average Score": 33.70639271807677, |
|
"Standard Deviation": null, |
|
"Rank": 33 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "openchat-3.5", |
|
"organization": "OpenChat", |
|
"license": "Apache-2.0", |
|
"knowledge_cutoff": "2023/11" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 23.63538251797928, |
|
"Standard Deviation": 2.0516295921862095, |
|
"Rank": 38 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.401699069, |
|
"Standard Deviation": 0.03410726557, |
|
"Rank": 30 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.414095336, |
|
"Standard Deviation": 0.01881964261, |
|
"Rank": 31 |
|
}, |
|
"Probability": { |
|
"Average Score": 36.00454588244476, |
|
"Standard Deviation": null, |
|
"Rank": 38 |
|
}, |
|
"Logical": { |
|
"Average Score": 34.029859502735654, |
|
"Standard Deviation": 3.354098427500673, |
|
"Rank": 35 |
|
}, |
|
"Social": { |
|
"Average Score": 0.319991655, |
|
"Standard Deviation": 0.04502478724, |
|
"Rank": 29 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 32.29778226319944, |
|
"Standard Deviation": null, |
|
"Rank": 38 |
|
}, |
|
"CPP": { |
|
"Average Score": 33.020911255646965, |
|
"Standard Deviation": null, |
|
"Rank": 34 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "command-r-(08-2024)", |
|
"organization": "Cohere", |
|
"license": "CC-BY-NC-4.0", |
|
"knowledge_cutoff": "2024/08" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 38.783798277856995, |
|
"Standard Deviation": 1.1948096596199191, |
|
"Rank": 27 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.448300727, |
|
"Standard Deviation": 0.04996362328, |
|
"Rank": 27 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.417519167, |
|
"Standard Deviation": 0.01822196902, |
|
"Rank": 30 |
|
}, |
|
"Probability": { |
|
"Average Score": 38.019523941917335, |
|
"Standard Deviation": null, |
|
"Rank": 37 |
|
}, |
|
"Logical": { |
|
"Average Score": 23.408826179018206, |
|
"Standard Deviation": 0.9355701468205376, |
|
"Rank": 42 |
|
}, |
|
"Social": { |
|
"Average Score": 0.276088379, |
|
"Standard Deviation": 0.03295234688, |
|
"Rank": 32 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 38.699171059988636, |
|
"Standard Deviation": null, |
|
"Rank": 34 |
|
}, |
|
"CPP": { |
|
"Average Score": 39.61492485677676, |
|
"Standard Deviation": null, |
|
"Rank": 30 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemma-1.1-7b-it", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2024/02" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 20.965269549151657, |
|
"Standard Deviation": 0.6031600560715249, |
|
"Rank": 39 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.324170977, |
|
"Standard Deviation": 0.04668553765, |
|
"Rank": 35 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.398684697, |
|
"Standard Deviation": 0.01982398259, |
|
"Rank": 32 |
|
}, |
|
"Probability": { |
|
"Average Score": 30.98345832281905, |
|
"Standard Deviation": null, |
|
"Rank": 43 |
|
}, |
|
"Logical": { |
|
"Average Score": 33.36570116785516, |
|
"Standard Deviation": 3.8824795120929765, |
|
"Rank": 36 |
|
}, |
|
"Social": { |
|
"Average Score": 0.179073276, |
|
"Standard Deviation": 0.02009658805, |
|
"Rank": 41 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 41.66173653808921, |
|
"Standard Deviation": null, |
|
"Rank": 31 |
|
}, |
|
"CPP": { |
|
"Average Score": 42.666504105798204, |
|
"Standard Deviation": null, |
|
"Rank": 27 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "llama3-8b-instruct", |
|
"organization": "Meta", |
|
"license": "Llama 3 Community", |
|
"knowledge_cutoff": "2023/03" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 30.183633696164936, |
|
"Standard Deviation": 3.5901082045571266, |
|
"Rank": 31 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.367143758, |
|
"Standard Deviation": 0.04363680358, |
|
"Rank": 33 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.391480973, |
|
"Standard Deviation": 0.02757445266, |
|
"Rank": 33 |
|
}, |
|
"Probability": { |
|
"Average Score": 34.51621975866105, |
|
"Standard Deviation": null, |
|
"Rank": 39 |
|
}, |
|
"Logical": { |
|
"Average Score": 45.27560737491475, |
|
"Standard Deviation": 4.639305724878496, |
|
"Rank": 26 |
|
}, |
|
"Social": { |
|
"Average Score": 0.336373622, |
|
"Standard Deviation": 0.05762408512, |
|
"Rank": 26 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 44.271144265487514, |
|
"Standard Deviation": null, |
|
"Rank": 26 |
|
}, |
|
"CPP": { |
|
"Average Score": 45.35392139264795, |
|
"Standard Deviation": null, |
|
"Rank": 22 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemma-2-2b-it", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2024/07" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 47.37377937645159, |
|
"Standard Deviation": 2.72420190928707, |
|
"Rank": 22 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.395006676, |
|
"Standard Deviation": 0.05882607713, |
|
"Rank": 31 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.379391887, |
|
"Standard Deviation": 0.01722410785, |
|
"Rank": 34 |
|
}, |
|
"Probability": { |
|
"Average Score": 33.90530403382374, |
|
"Standard Deviation": null, |
|
"Rank": 41 |
|
}, |
|
"Logical": { |
|
"Average Score": 37.64262561604027, |
|
"Standard Deviation": 3.0627256408495804, |
|
"Rank": 31 |
|
}, |
|
"Social": { |
|
"Average Score": 0.393482094, |
|
"Standard Deviation": 0.06450214024, |
|
"Rank": 23 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 29.883648650177584, |
|
"Standard Deviation": null, |
|
"Rank": 40 |
|
}, |
|
"CPP": { |
|
"Average Score": 30.53406933106768, |
|
"Standard Deviation": null, |
|
"Rank": 36 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "starling-lm-7b-alpha", |
|
"organization": "Nexusflow", |
|
"license": "Apache-2.0", |
|
"knowledge_cutoff": "2023/11" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 24.34505731078066, |
|
"Standard Deviation": 1.4660872513914562, |
|
"Rank": 35 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.336782578, |
|
"Standard Deviation": 0.04069449132, |
|
"Rank": 34 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.371551932, |
|
"Standard Deviation": 0.03367241745, |
|
"Rank": 35 |
|
}, |
|
"Probability": { |
|
"Average Score": 34.51613212227484, |
|
"Standard Deviation": null, |
|
"Rank": 40 |
|
}, |
|
"Logical": { |
|
"Average Score": 29.88612695085449, |
|
"Standard Deviation": 2.4070524024678672, |
|
"Rank": 40 |
|
}, |
|
"Social": { |
|
"Average Score": 0.271975534, |
|
"Standard Deviation": 0.04266753408, |
|
"Rank": 33 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 29.442057363491365, |
|
"Standard Deviation": null, |
|
"Rank": 41 |
|
}, |
|
"CPP": { |
|
"Average Score": 30.07926487356878, |
|
"Standard Deviation": null, |
|
"Rank": 37 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "qwen1.5-4b-chat", |
|
"organization": "Alibaba", |
|
"license": "Qianwen LICENSE", |
|
"knowledge_cutoff": "2024/02" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 7.19753150259024, |
|
"Standard Deviation": 0.6175113365944395, |
|
"Rank": 52 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.215834522, |
|
"Standard Deviation": 0.0363766363, |
|
"Rank": 39 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.305589811, |
|
"Standard Deviation": 0.02354198912, |
|
"Rank": 36 |
|
}, |
|
"Probability": { |
|
"Average Score": 15.124506890648007, |
|
"Standard Deviation": null, |
|
"Rank": 49 |
|
}, |
|
"Logical": { |
|
"Average Score": 11.67206257803879, |
|
"Standard Deviation": 1.140401009846497, |
|
"Rank": 51 |
|
}, |
|
"Social": { |
|
"Average Score": 0.18195615, |
|
"Standard Deviation": 0.02269805277, |
|
"Rank": 40 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 12.825435835657133, |
|
"Standard Deviation": null, |
|
"Rank": 52 |
|
}, |
|
"CPP": { |
|
"Average Score": 13.21208067122554, |
|
"Standard Deviation": null, |
|
"Rank": 47 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "command-r-(04-2024)", |
|
"organization": "Cohere", |
|
"license": "CC-BY-NC-4.0", |
|
"knowledge_cutoff": "2024/04" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 26.20787727166716, |
|
"Standard Deviation": 1.6793980036057201, |
|
"Rank": 34 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.300416698, |
|
"Standard Deviation": 0.03485612736, |
|
"Rank": 36 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.293120231, |
|
"Standard Deviation": 0.032926484, |
|
"Rank": 37 |
|
}, |
|
"Probability": { |
|
"Average Score": 28.551833516483626, |
|
"Standard Deviation": null, |
|
"Rank": 44 |
|
}, |
|
"Logical": { |
|
"Average Score": 30.83782425033377, |
|
"Standard Deviation": 3.4266833154577383, |
|
"Rank": 39 |
|
}, |
|
"Social": { |
|
"Average Score": 0.283882949, |
|
"Standard Deviation": 0.03336901148, |
|
"Rank": 31 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 40.38004181614496, |
|
"Standard Deviation": null, |
|
"Rank": 32 |
|
}, |
|
"CPP": { |
|
"Average Score": 41.346336503003236, |
|
"Standard Deviation": null, |
|
"Rank": 28 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "vicuna-33b", |
|
"organization": "LMSYS", |
|
"license": "Non-commercial", |
|
"knowledge_cutoff": "2023/08" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 19.726298678709266, |
|
"Standard Deviation": 1.0771354692793496, |
|
"Rank": 41 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.208284679, |
|
"Standard Deviation": 0.03937771461, |
|
"Rank": 40 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.248994048, |
|
"Standard Deviation": 0.02668175054, |
|
"Rank": 39 |
|
}, |
|
"Probability": { |
|
"Average Score": 23.2308538772627, |
|
"Standard Deviation": null, |
|
"Rank": 47 |
|
}, |
|
"Logical": { |
|
"Average Score": 19.488409585540122, |
|
"Standard Deviation": 0.7913465863319494, |
|
"Rank": 46 |
|
}, |
|
"Social": { |
|
"Average Score": 0.257623798, |
|
"Standard Deviation": 0.02653724437, |
|
"Rank": 34 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 27.198874596635843, |
|
"Standard Deviation": null, |
|
"Rank": 43 |
|
}, |
|
"CPP": { |
|
"Average Score": 28.01838653090379, |
|
"Standard Deviation": null, |
|
"Rank": 38 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemma-7b-it", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2024/02" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 18.339626858215343, |
|
"Standard Deviation": 0.1553156123023995, |
|
"Rank": 43 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.244791417, |
|
"Standard Deviation": 0.0289612078, |
|
"Rank": 37 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.250614794, |
|
"Standard Deviation": 0.01991678295, |
|
"Rank": 38 |
|
}, |
|
"Probability": { |
|
"Average Score": 18.066869704202595, |
|
"Standard Deviation": null, |
|
"Rank": 48 |
|
}, |
|
"Logical": { |
|
"Average Score": 22.446113532575186, |
|
"Standard Deviation": 1.1759308097806727, |
|
"Rank": 43 |
|
}, |
|
"Social": { |
|
"Average Score": 0.202138025, |
|
"Standard Deviation": 0.02098346639, |
|
"Rank": 39 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 27.195166540671735, |
|
"Standard Deviation": null, |
|
"Rank": 43 |
|
}, |
|
"CPP": { |
|
"Average Score": 28.014658234926813, |
|
"Standard Deviation": null, |
|
"Rank": 39 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "mistral-7b-instruct-2", |
|
"organization": "Mistral", |
|
"license": "Apache 2.0", |
|
"knowledge_cutoff": "2023/12" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 32.27919528900069, |
|
"Standard Deviation": 2.070593349377193, |
|
"Rank": 28 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.216402626, |
|
"Standard Deviation": 0.03338414918, |
|
"Rank": 38 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.233777838, |
|
"Standard Deviation": 0.0155226054, |
|
"Rank": 40 |
|
}, |
|
"Probability": { |
|
"Average Score": 25.70261650740474, |
|
"Standard Deviation": null, |
|
"Rank": 45 |
|
}, |
|
"Logical": { |
|
"Average Score": 26.165635051797608, |
|
"Standard Deviation": 1.5009510944001014, |
|
"Rank": 41 |
|
}, |
|
"Social": { |
|
"Average Score": 0.209386782, |
|
"Standard Deviation": 0.02738569921, |
|
"Rank": 38 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 30.70773868184025, |
|
"Standard Deviation": null, |
|
"Rank": 39 |
|
}, |
|
"CPP": { |
|
"Average Score": 31.382959631870822, |
|
"Standard Deviation": null, |
|
"Rank": 35 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "mistral-7b-instruct-1", |
|
"organization": "Mistral", |
|
"license": "Apache 2.0", |
|
"knowledge_cutoff": "2023/12" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 14.750363553682964, |
|
"Standard Deviation": 0.442399072321264, |
|
"Rank": 48 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.161799938, |
|
"Standard Deviation": 0.03595278559, |
|
"Rank": 44 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.210341624, |
|
"Standard Deviation": 0.01736539119, |
|
"Rank": 41 |
|
}, |
|
"Probability": { |
|
"Average Score": 24.69501890202338, |
|
"Standard Deviation": null, |
|
"Rank": 46 |
|
}, |
|
"Logical": { |
|
"Average Score": 15.957706802740889, |
|
"Standard Deviation": 2.080778273455708, |
|
"Rank": 50 |
|
}, |
|
"Social": { |
|
"Average Score": 0.117646827, |
|
"Standard Deviation": 0.009321202779, |
|
"Rank": 47 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 18.375111202411667, |
|
"Standard Deviation": null, |
|
"Rank": 47 |
|
}, |
|
"CPP": { |
|
"Average Score": 18.929093202755805, |
|
"Standard Deviation": null, |
|
"Rank": 42 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "vicuna-13b", |
|
"organization": "LMSYS", |
|
"license": "Non-commercial", |
|
"knowledge_cutoff": "2023/07" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 13.302607436757697, |
|
"Standard Deviation": 0.570272227659312, |
|
"Rank": 50 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.200941928, |
|
"Standard Deviation": 0.03366817781, |
|
"Rank": 41 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.196123323, |
|
"Standard Deviation": 0.0135715643, |
|
"Rank": 42 |
|
}, |
|
"Probability": { |
|
"Average Score": 15.08476669604627, |
|
"Standard Deviation": null, |
|
"Rank": 50 |
|
}, |
|
"Logical": { |
|
"Average Score": 16.548339412104294, |
|
"Standard Deviation": 3.443370777556759, |
|
"Rank": 49 |
|
}, |
|
"Social": { |
|
"Average Score": 0.124655135, |
|
"Standard Deviation": 0.01122382671, |
|
"Rank": 46 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 21.201173318496842, |
|
"Standard Deviation": null, |
|
"Rank": 45 |
|
}, |
|
"CPP": { |
|
"Average Score": 21.840013221590294, |
|
"Standard Deviation": null, |
|
"Rank": 40 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "zephyr-7b-beta", |
|
"organization": "HuggingFace", |
|
"license": "MIT", |
|
"knowledge_cutoff": "2023/10" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 7.378234886105356, |
|
"Standard Deviation": 1.1456147261693999, |
|
"Rank": 51 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.114005544, |
|
"Standard Deviation": 0.03144354365, |
|
"Rank": 45 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.141766633, |
|
"Standard Deviation": 0.03179520129, |
|
"Rank": 43 |
|
}, |
|
"Probability": { |
|
"Average Score": 8.92696070171298, |
|
"Standard Deviation": null, |
|
"Rank": 53 |
|
}, |
|
"Logical": { |
|
"Average Score": 6.971377981442089, |
|
"Standard Deviation": 0.31669853263737413, |
|
"Rank": 55 |
|
}, |
|
"Social": { |
|
"Average Score": 0.0, |
|
"Standard Deviation": 0.0, |
|
"Rank": 52 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 18.374948840997902, |
|
"Standard Deviation": null, |
|
"Rank": 47 |
|
}, |
|
"CPP": { |
|
"Average Score": 18.92902220864132, |
|
"Standard Deviation": null, |
|
"Rank": 43 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemma-1.1-2b-it", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2024/02" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 16.083251992757752, |
|
"Standard Deviation": 0.7340624884005772, |
|
"Rank": 46 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.183974034, |
|
"Standard Deviation": 0.0215548886, |
|
"Rank": 43 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.13422252, |
|
"Standard Deviation": 0.01922819511, |
|
"Rank": 44 |
|
}, |
|
"Probability": { |
|
"Average Score": 9.992136776217318, |
|
"Standard Deviation": null, |
|
"Rank": 52 |
|
}, |
|
"Logical": { |
|
"Average Score": 9.537233946101678, |
|
"Standard Deviation": 0.7567112693269967, |
|
"Rank": 53 |
|
}, |
|
"Social": { |
|
"Average Score": 0.167796727, |
|
"Standard Deviation": 0.01666541942, |
|
"Rank": 42 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 20.11834233400297, |
|
"Standard Deviation": null, |
|
"Rank": 46 |
|
}, |
|
"CPP": { |
|
"Average Score": 20.724691953843916, |
|
"Standard Deviation": null, |
|
"Rank": 41 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "llama2-7b-chat", |
|
"organization": "Meta", |
|
"license": "Llama 2 Community", |
|
"knowledge_cutoff": "2023/07" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 17.319161859655946, |
|
"Standard Deviation": 0.495520710612214, |
|
"Rank": 45 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.087067276, |
|
"Standard Deviation": 0.04274343402, |
|
"Rank": 46 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.12308805, |
|
"Standard Deviation": 0.01856053622, |
|
"Rank": 45 |
|
}, |
|
"Probability": { |
|
"Average Score": 8.860911732515305, |
|
"Standard Deviation": null, |
|
"Rank": 54 |
|
}, |
|
"Logical": { |
|
"Average Score": 18.812132126028335, |
|
"Standard Deviation": 3.0846832107977433, |
|
"Rank": 47 |
|
}, |
|
"Social": { |
|
"Average Score": 0.152905272, |
|
"Standard Deviation": 0.007166957097, |
|
"Rank": 43 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 15.270334671133512, |
|
"Standard Deviation": null, |
|
"Rank": 50 |
|
}, |
|
"CPP": { |
|
"Average Score": 15.730513733660898, |
|
"Standard Deviation": null, |
|
"Rank": 45 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemma-2b-it", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2024/02" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 15.029602991101632, |
|
"Standard Deviation": 0.4529017602377039, |
|
"Rank": 47 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.198571153, |
|
"Standard Deviation": 0.01699161031, |
|
"Rank": 42 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.109883009, |
|
"Standard Deviation": 0.01520005833, |
|
"Rank": 46 |
|
}, |
|
"Probability": { |
|
"Average Score": 6.561360414966015, |
|
"Standard Deviation": null, |
|
"Rank": 56 |
|
}, |
|
"Logical": { |
|
"Average Score": 3.9858662356708785, |
|
"Standard Deviation": 0.5609499073366407, |
|
"Rank": 56 |
|
}, |
|
"Social": { |
|
"Average Score": 0.087452913, |
|
"Standard Deviation": 0.008170146562, |
|
"Rank": 50 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 16.766144078336097, |
|
"Standard Deviation": null, |
|
"Rank": 49 |
|
}, |
|
"CPP": { |
|
"Average Score": 17.2715657115764, |
|
"Standard Deviation": null, |
|
"Rank": 44 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "llama2-13b-chat", |
|
"organization": "Meta", |
|
"license": "Llama 2 Community", |
|
"knowledge_cutoff": "2023/07" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 17.47902371074294, |
|
"Standard Deviation": 0.4047581815962028, |
|
"Rank": 44 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.072729954, |
|
"Standard Deviation": 0.02315988261, |
|
"Rank": 48 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.080371692, |
|
"Standard Deviation": 0.01277569453, |
|
"Rank": 47 |
|
}, |
|
"Probability": { |
|
"Average Score": 12.738302754764042, |
|
"Standard Deviation": null, |
|
"Rank": 51 |
|
}, |
|
"Logical": { |
|
"Average Score": 21.708359515217182, |
|
"Standard Deviation": 1.4862481594434973, |
|
"Rank": 45 |
|
}, |
|
"Social": { |
|
"Average Score": 0.149125922, |
|
"Standard Deviation": 0.01157416827, |
|
"Rank": 44 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 12.786967781868814, |
|
"Standard Deviation": null, |
|
"Rank": 53 |
|
}, |
|
"CPP": { |
|
"Average Score": 13.17258252933903, |
|
"Standard Deviation": null, |
|
"Rank": 48 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "vicuna-7b", |
|
"organization": "LMSYS", |
|
"license": "Non-commercial", |
|
"knowledge_cutoff": "2023/07" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 13.31896682669754, |
|
"Standard Deviation": 0.30441157156016124, |
|
"Rank": 49 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.083457058, |
|
"Standard Deviation": 0.02520989111, |
|
"Rank": 47 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.070883882, |
|
"Standard Deviation": 0.007315853253, |
|
"Rank": 48 |
|
}, |
|
"Probability": { |
|
"Average Score": 8.255246380068842, |
|
"Standard Deviation": null, |
|
"Rank": 55 |
|
}, |
|
"Logical": { |
|
"Average Score": 10.046676845257544, |
|
"Standard Deviation": 0.6816182835206797, |
|
"Rank": 52 |
|
}, |
|
"Social": { |
|
"Average Score": 0.111076414, |
|
"Standard Deviation": 0.004805626512, |
|
"Rank": 48 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 13.838150481781991, |
|
"Standard Deviation": null, |
|
"Rank": 51 |
|
}, |
|
"CPP": { |
|
"Average Score": 14.255194156624162, |
|
"Standard Deviation": null, |
|
"Rank": 46 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "koala-13b", |
|
"organization": "UC Berkeley", |
|
"license": "Non-commercial", |
|
"knowledge_cutoff": "2023/04" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 6.419305623111718, |
|
"Standard Deviation": 0.19611070515647736, |
|
"Rank": 53 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.017374001, |
|
"Standard Deviation": 0.01747053557, |
|
"Rank": 49 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.018129197, |
|
"Standard Deviation": 0.01054371383, |
|
"Rank": 49 |
|
}, |
|
"Probability": { |
|
"Average Score": 4.1717283559090035, |
|
"Standard Deviation": null, |
|
"Rank": 57 |
|
}, |
|
"Logical": { |
|
"Average Score": 7.484701131693112, |
|
"Standard Deviation": 0.172417770163525, |
|
"Rank": 54 |
|
}, |
|
"Social": { |
|
"Average Score": 0.096983835, |
|
"Standard Deviation": 0.007847059783, |
|
"Rank": 49 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 6.177985738164252, |
|
"Standard Deviation": null, |
|
"Rank": 54 |
|
}, |
|
"CPP": { |
|
"Average Score": 6.36433272373514, |
|
"Standard Deviation": null, |
|
"Rank": 49 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "openassistant-pythia-12b", |
|
"organization": "OpenAssistant", |
|
"license": "Non-commercial", |
|
"knowledge_cutoff": "2023/04" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 0.0, |
|
"Standard Deviation": 0.0, |
|
"Rank": 54 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.0, |
|
"Standard Deviation": 0.0, |
|
"Rank": 50 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.0, |
|
"Standard Deviation": 0.0, |
|
"Rank": 50 |
|
}, |
|
"Probability": { |
|
"Average Score": 0.0, |
|
"Standard Deviation": null, |
|
"Rank": 58 |
|
}, |
|
"Logical": { |
|
"Average Score": 0.0, |
|
"Standard Deviation": 0.0, |
|
"Rank": 57 |
|
}, |
|
"Social": { |
|
"Average Score": 0.030792528, |
|
"Standard Deviation": 0.007518796391, |
|
"Rank": 51 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 0.0, |
|
"Standard Deviation": null, |
|
"Rank": 55 |
|
}, |
|
"CPP": { |
|
"Average Score": 0.0, |
|
"Standard Deviation": null, |
|
"Rank": 50 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "nemotron-70b", |
|
"organization": "NVIDIA", |
|
"license": "Unknown", |
|
"knowledge_cutoff": "Unknown" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 100.0, |
|
"Standard Deviation": 0.0, |
|
"Rank": 1 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 96.00601450276388, |
|
"Standard Deviation": null, |
|
"Rank": 3 |
|
}, |
|
"Logical": { |
|
"Average Score": 98.08807085219765, |
|
"Standard Deviation": 0.832489959144682, |
|
"Rank": 5 |
|
}, |
|
"Probability": { |
|
"Average Score": 91.16755514126538, |
|
"Standard Deviation": null, |
|
"Rank": 4 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "llama-3.2-3b-it", |
|
"organization": "Meta", |
|
"license": "Llama 3 Community", |
|
"knowledge_cutoff": "Unknown" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 30.40742747938681, |
|
"Standard Deviation": 1.6816556668351852, |
|
"Rank": 30 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 27.43049468475638, |
|
"Standard Deviation": null, |
|
"Rank": 42 |
|
}, |
|
"Logical": { |
|
"Average Score": 41.58905844173492, |
|
"Standard Deviation": 5.2798221527591, |
|
"Rank": 29 |
|
}, |
|
"Probability": { |
|
"Average Score": 62.02868227997844, |
|
"Standard Deviation": null, |
|
"Rank": 18 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "yi-lightning", |
|
"organization": "01 AI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "Unknown" |
|
}, |
|
"results": { |
|
"Chemistry": { |
|
"Average Score": 100.0, |
|
"Standard Deviation": null, |
|
"Rank": 1 |
|
}, |
|
"Logical": { |
|
"Average Score": 98.816765663456, |
|
"Standard Deviation": 0.3271335810663529, |
|
"Rank": 3 |
|
}, |
|
"Probability": { |
|
"Average Score": 95.8842044402052, |
|
"Standard Deviation": null, |
|
"Rank": 2 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "glm-4-plus", |
|
"organization": "Zhipu AI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "Unknown" |
|
}, |
|
"results": { |
|
"Chemistry": { |
|
"Average Score": 99.05822908668402, |
|
"Standard Deviation": null, |
|
"Rank": 2 |
|
}, |
|
"Logical": { |
|
"Average Score": 99.45307787995229, |
|
"Standard Deviation": 0.5982476107949444, |
|
"Rank": 1 |
|
}, |
|
"Probability": { |
|
"Average Score": 92.04426702796823, |
|
"Standard Deviation": null, |
|
"Rank": 3 |
|
} |
|
} |
|
} |
|
] |