[ { "config": { "model_name": "ChatGPT-4o-latest (2024-09-03)", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 87.33082346779815, "Standard Deviation": 1.4853337406399776, "Rank": 3 }, "Geometry": { "Average Score": 0.976028578, "Standard Deviation": 0.01507912373, "Rank": 3 }, "Algebra": { "Average Score": 0.951199453, "Standard Deviation": 0.08452452108, "Rank": 3 }, "Probability": { "Average Score": 80.1332207690739, "Standard Deviation": null, "Rank": 7 }, "Logical": { "Average Score": 84.12975867250425, "Standard Deviation": 0.21211547702245045, "Rank": 6 }, "Social": { "Average Score": 0.815902987, "Standard Deviation": 0.0196254222, "Rank": 3 }, "Chemistry": { "Average Score": 89.92480228064885, "Standard Deviation": null, "Rank": 4 }, "CPP": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 } } }, { "config": { "model_name": "gpt-4o-2024-08-06", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 77.7818546246671, "Standard Deviation": 2.7097581088879505, "Rank": 5 }, "Geometry": { "Average Score": 0.99773096, "Standard Deviation": 0.002835555172, "Rank": 1 }, "Algebra": { "Average Score": 1.0, "Standard Deviation": 0.0, "Rank": 1 }, "Probability": { "Average Score": 74.97136205481755, "Standard Deviation": null, "Rank": 11 }, "Logical": { "Average Score": 66.0597109743056, "Standard Deviation": 1.5021351704575163, "Rank": 14 }, "Social": { "Average Score": 0.680417314, "Standard Deviation": 0.00656867063, "Rank": 8 }, "Chemistry": { "Average Score": 82.55189735524202, "Standard Deviation": null, "Rank": 7 }, "CPP": { "Average Score": 92.43090226400756, "Standard Deviation": null, "Rank": 2 } } }, { "config": { "model_name": "gpt-4o-2024-05-13", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 72.6093654197998, 
"Standard Deviation": 13.515345690976028, "Rank": 10 }, "Geometry": { "Average Score": 0.972472377, "Standard Deviation": 0.01648274205, "Rank": 4 }, "Algebra": { "Average Score": 0.995511298, "Standard Deviation": 0.004097802515, "Rank": 2 }, "Probability": { "Average Score": 77.97816201050715, "Standard Deviation": null, "Rank": 8 }, "Logical": { "Average Score": 75.65058939137873, "Standard Deviation": 0.07522785572103825, "Rank": 9 }, "Social": { "Average Score": 0.609875087, "Standard Deviation": 0.038729239, "Rank": 13 }, "Chemistry": { "Average Score": 76.03377031297643, "Standard Deviation": null, "Rank": 9 }, "CPP": { "Average Score": 79.1592634699295, "Standard Deviation": null, "Rank": 6 } } }, { "config": { "model_name": "gpt-4-turbo-2024-04-09", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 73.32308543749606, "Standard Deviation": 6.562777844134629, "Rank": 9 }, "Geometry": { "Average Score": 0.95374588, "Standard Deviation": 0.03109307166, "Rank": 5 }, "Algebra": { "Average Score": 0.930945223, "Standard Deviation": 0.06705136813, "Rank": 4 }, "Probability": { "Average Score": 74.97144205445957, "Standard Deviation": null, "Rank": 12 }, "Logical": { "Average Score": 76.82291715624933, "Standard Deviation": 0.03462548327631355, "Rank": 7 }, "Social": { "Average Score": 0.715935163, "Standard Deviation": 0.1209141409, "Rank": 6 }, "Chemistry": { "Average Score": 70.44329321394066, "Standard Deviation": null, "Rank": 12 }, "CPP": { "Average Score": 70.73143363230263, "Standard Deviation": null, "Rank": 11 } } }, { "config": { "model_name": "gemini-1.5-pro-001", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2023/11" }, "results": { "OVERALL": { "Average Score": 74.27365448117855, "Standard Deviation": 3.9515447172901847, "Rank": 8 }, "Geometry": { "Average Score": 0.9947169, "Standard Deviation": 0.009150597621, "Rank": 2 }, "Algebra": { "Average 
Score": 0.857464301, "Standard Deviation": 0.05014285338, "Rank": 5 }, "Probability": { "Average Score": 64.77713215500482, "Standard Deviation": null, "Rank": 15 }, "Logical": { "Average Score": 74.3275461555815, "Standard Deviation": 0.8092355737847541, "Rank": 10 }, "Social": { "Average Score": 0.649601885, "Standard Deviation": 0.104854889, "Rank": 11 } } }, { "config": { "model_name": "qwen2-72b-instruct", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/09" }, "results": { "OVERALL": { "Average Score": 71.00423311357184, "Standard Deviation": 1.6189609141983887, "Rank": 12 }, "Geometry": { "Average Score": 0.796870305, "Standard Deviation": 0.0509025346, "Rank": 9 }, "Algebra": { "Average Score": 0.836194231, "Standard Deviation": 0.04517093028, "Rank": 6 }, "Probability": { "Average Score": 76.33751777233937, "Standard Deviation": null, "Rank": 10 }, "Logical": { "Average Score": 61.22020517318166, "Standard Deviation": 10.241399997578569, "Rank": 17 }, "Social": { "Average Score": 0.652578786, "Standard Deviation": 0.04259293171, "Rank": 10 }, "Chemistry": { "Average Score": 70.44342338869497, "Standard Deviation": null, "Rank": 12 }, "CPP": { "Average Score": 73.54037778797029, "Standard Deviation": null, "Rank": 7 } } }, { "config": { "model_name": "gpt-4o-mini-2024-07-18", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 77.35427394420829, "Standard Deviation": 3.162321541714492, "Rank": 6 }, "Geometry": { "Average Score": 0.946650435, "Standard Deviation": 0.01831236482, "Rank": 7 }, "Algebra": { "Average Score": 0.796243022, "Standard Deviation": 0.05537539202, "Rank": 7 }, "Probability": { "Average Score": 77.63972720989734, "Standard Deviation": null, "Rank": 9 }, "Logical": { "Average Score": 71.81267717239906, "Standard Deviation": 0.3393593163824375, "Rank": 11 }, "Social": { "Average Score": 0.691949855, "Standard Deviation": 
0.02072934333, "Rank": 7 }, "Chemistry": { "Average Score": 78.10636943659426, "Standard Deviation": null, "Rank": 8 }, "CPP": { "Average Score": 88.3877070580296, "Standard Deviation": null, "Rank": 3 } } }, { "config": { "model_name": "claude-3.5-sonnet", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2024/04" }, "results": { "OVERALL": { "Average Score": 75.97534774560863, "Standard Deviation": 9.237316832705584, "Rank": 7 }, "Geometry": { "Average Score": 0.95316419, "Standard Deviation": 0.02081192856, "Rank": 6 }, "Algebra": { "Average Score": 0.759789952, "Standard Deviation": 0.02611765096, "Rank": 8 }, "Probability": { "Average Score": 65.4531881044298, "Standard Deviation": null, "Rank": 14 }, "Logical": { "Average Score": 76.47424588300288, "Standard Deviation": 0.07699328617321737, "Rank": 8 }, "Social": { "Average Score": 0.790002247, "Standard Deviation": 0.1007410022, "Rank": 4 }, "Chemistry": { "Average Score": 85.17654674052096, "Standard Deviation": null, "Rank": 6 }, "CPP": { "Average Score": 82.37734076815008, "Standard Deviation": null, "Rank": 5 } } }, { "config": { "model_name": "o1-mini", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 87.92989248183513, "Standard Deviation": 1.3401058431409953, "Rank": 2 }, "Geometry": { "Average Score": "N/A", "Standard Deviation": "N/A", "Rank": "N/A" }, "Algebra": { "Average Score": "N/A", "Standard Deviation": "N/A", "Rank": "N/A" }, "Probability": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Logical": { "Average Score": 99.15920225407733, "Standard Deviation": 0.49801294410288666, "Rank": 2 }, "Social": { "Average Score": 0.993974241, "Standard Deviation": 0.001996882328, "Rank": 2 } } }, { "config": { "model_name": "o1-preview", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 
85.40247108906188, "Standard Deviation": 1.5796898764998464, "Rank": 4 }, "Geometry": { "Average Score": "N/A", "Standard Deviation": "N/A", "Rank": "N/A" }, "Algebra": { "Average Score": "N/A", "Standard Deviation": "N/A", "Rank": "N/A" }, "Probability": { "Average Score": 90.32625019320989, "Standard Deviation": null, "Rank": 5 }, "Logical": { "Average Score": 98.18241651273537, "Standard Deviation": 0.16231417987288874, "Rank": 4 }, "Social": { "Average Score": 1.0, "Standard Deviation": 0.0, "Rank": 1 } } }, { "config": { "model_name": "gemini-1.5-flash-001", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2023/11" }, "results": { "OVERALL": { "Average Score": 67.67997467963976, "Standard Deviation": 2.624276751646549, "Rank": 13 }, "Geometry": { "Average Score": 0.804144103, "Standard Deviation": 0.1327142178, "Rank": 8 }, "Algebra": { "Average Score": 0.731776765, "Standard Deviation": 0.02594657111, "Rank": 9 }, "Probability": { "Average Score": 61.17190439316032, "Standard Deviation": null, "Rank": 19 }, "Logical": { "Average Score": 62.284381466778335, "Standard Deviation": 3.9592476945909674, "Rank": 16 }, "Social": { "Average Score": 0.555933822, "Standard Deviation": 0.1029934524, "Rank": 15 }, "Chemistry": { "Average Score": 70.24726462490831, "Standard Deviation": null, "Rank": 15 }, "CPP": { "Average Score": 72.1127762005651, "Standard Deviation": null, "Rank": 10 } } }, { "config": { "model_name": "gpt4-1106", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/04" }, "results": { "OVERALL": { "Average Score": 72.24829405851214, "Standard Deviation": 13.633826990442946, "Rank": 11 }, "Geometry": { "Average Score": 0.71843088, "Standard Deviation": 0.04778038294, "Rank": 11 }, "Algebra": { "Average Score": 0.712910417, "Standard Deviation": 0.02581828898, "Rank": 10 }, "Probability": { "Average Score": 63.29462909293814, "Standard Deviation": null, "Rank": 16 }, "Logical": { "Average Score": 
62.987098158883875, "Standard Deviation": 4.027795425350514, "Rank": 15 }, "Social": { "Average Score": 0.450609816, "Standard Deviation": 0.05208655446, "Rank": 21 }, "Chemistry": { "Average Score": 67.34047237109209, "Standard Deviation": null, "Rank": 16 }, "CPP": { "Average Score": 69.11824072252848, "Standard Deviation": null, "Rank": 12 } } }, { "config": { "model_name": "gemma-2-27b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/06" }, "results": { "OVERALL": { "Average Score": 62.70975283121063, "Standard Deviation": 6.376450054715319, "Rank": 15 }, "Geometry": { "Average Score": 0.60112744, "Standard Deviation": 0.0469109952, "Rank": 17 }, "Algebra": { "Average Score": 0.687955914, "Standard Deviation": 0.01959958192, "Rank": 11 }, "Probability": { "Average Score": 60.04180799425261, "Standard Deviation": null, "Rank": 20 }, "Logical": { "Average Score": 60.77082327163094, "Standard Deviation": 7.2164902432618625, "Rank": 19 }, "Social": { "Average Score": 0.487844257, "Standard Deviation": 0.05857760809, "Rank": 18 }, "Chemistry": { "Average Score": 61.68181926111706, "Standard Deviation": null, "Rank": 18 }, "CPP": { "Average Score": 63.28920072143611, "Standard Deviation": null, "Rank": 14 } } }, { "config": { "model_name": "claude-3-opus", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2023/08" }, "results": { "OVERALL": { "Average Score": 60.56449573632771, "Standard Deviation": 8.485936885427277, "Rank": 17 }, "Geometry": { "Average Score": 0.7215743, "Standard Deviation": 0.04712598358, "Rank": 10 }, "Algebra": { "Average Score": 0.68777327, "Standard Deviation": 0.02382683713, "Rank": 12 }, "Probability": { "Average Score": 62.296041016641176, "Standard Deviation": null, "Rank": 17 }, "Logical": { "Average Score": 68.36295609287292, "Standard Deviation": 1.6558271236588655, "Rank": 13 }, "Social": { "Average Score": 0.663410854, "Standard Deviation": 0.09540220876, "Rank": 9 }, 
"Chemistry": { "Average Score": 70.44337273504232, "Standard Deviation": null, "Rank": 12 }, "CPP": { "Average Score": 73.5404403567132, "Standard Deviation": null, "Rank": 8 } } }, { "config": { "model_name": "gemma-2-9b-it-simpo", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/07" }, "results": { "OVERALL": { "Average Score": "N/A", "Standard Deviation": "N/A", "Rank": "N/A" }, "Geometry": { "Average Score": 0.582787508, "Standard Deviation": 0.03965204074, "Rank": 18 }, "Algebra": { "Average Score": 0.658648133, "Standard Deviation": 0.02565919856, "Rank": 13 }, "Probability": { "Average Score": 57.545408188912894, "Standard Deviation": null, "Rank": 23 }, "Logical": { "Average Score": 53.1996479262466, "Standard Deviation": 2.690106544431167, "Rank": 23 }, "Social": { "Average Score": 0.635266187, "Standard Deviation": 0.03620021751, "Rank": 12 }, "Chemistry": { "Average Score": 74.44267231381626, "Standard Deviation": null, "Rank": 11 }, "CPP": { "Average Score": 73.43757596214863, "Standard Deviation": null, "Rank": 9 } } }, { "config": { "model_name": "qwen1.5-72b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/03" }, "results": { "OVERALL": { "Average Score": 52.983715751652085, "Standard Deviation": 3.097613966427763, "Rank": 18 }, "Geometry": { "Average Score": 0.543139301, "Standard Deviation": 0.03425202326, "Rank": 22 }, "Algebra": { "Average Score": 0.635228729, "Standard Deviation": 0.01944043425, "Rank": 14 }, "Probability": { "Average Score": 52.650033879924905, "Standard Deviation": null, "Rank": 26 }, "Logical": { "Average Score": 32.628853250402074, "Standard Deviation": 3.227745519436025, "Rank": 37 }, "Social": { "Average Score": 0.415007627, "Standard Deviation": 0.03920053159, "Rank": 22 }, "Chemistry": { "Average Score": 47.5126781973184, "Standard Deviation": null, "Rank": 24 }, "CPP": { "Average Score": 48.69302376665551, "Standard Deviation": null, "Rank": 20 } } 
}, { "config": { "model_name": "qwen1.5-32b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/03" }, "results": { "OVERALL": { "Average Score": 26.978561942890224, "Standard Deviation": 1.575986887925592, "Rank": 32 }, "Geometry": { "Average Score": 0.51086835, "Standard Deviation": 0.04052471998, "Rank": 25 }, "Algebra": { "Average Score": 0.609003168, "Standard Deviation": 0.04874143541, "Rank": 15 }, "Probability": { "Average Score": 49.50617919486678, "Standard Deviation": null, "Rank": 29 }, "Logical": { "Average Score": 34.07387941414556, "Standard Deviation": 4.616974831074921, "Rank": 34 }, "Social": { "Average Score": 0.380987334, "Standard Deviation": 0.03762251776, "Rank": 24 }, "Chemistry": { "Average Score": 44.06627265183811, "Standard Deviation": null, "Rank": 28 }, "CPP": { "Average Score": 45.14284028264288, "Standard Deviation": null, "Rank": 24 } } }, { "config": { "model_name": "google-gemma-2-9b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/06" }, "results": { "OVERALL": { "Average Score": 52.23013018580635, "Standard Deviation": 3.3939236141078495, "Rank": 19 }, "Geometry": { "Average Score": 0.575371308, "Standard Deviation": 0.03556220251, "Rank": 20 }, "Algebra": { "Average Score": 0.597045661, "Standard Deviation": 0.0313828123, "Rank": 16 }, "Probability": { "Average Score": 58.73062101843859, "Standard Deviation": null, "Rank": 21 }, "Logical": { "Average Score": 58.01791397899675, "Standard Deviation": 5.751983660134971, "Rank": 21 }, "Social": { "Average Score": 0.768337958, "Standard Deviation": 0.04078610476, "Rank": 5 }, "Chemistry": { "Average Score": 52.69494515004607, "Standard Deviation": null, "Rank": 21 }, "CPP": { "Average Score": 54.03167523687635, "Standard Deviation": null, "Rank": 17 } } }, { "config": { "model_name": "yi-1.5-34b-chat", "organization": "01 AI", "license": "Proprietary", "knowledge_cutoff": "2024/05" }, "results": { "OVERALL": { 
"Average Score": 62.568637878216464, "Standard Deviation": 8.554205798418673, "Rank": 16 }, "Geometry": { "Average Score": 0.566666724, "Standard Deviation": 0.04001381658, "Rank": 21 }, "Algebra": { "Average Score": 0.590997292, "Standard Deviation": 0.03594087315, "Rank": 17 }, "Probability": { "Average Score": 57.545207891104354, "Standard Deviation": null, "Rank": 22 }, "Logical": { "Average Score": 56.598158131627194, "Standard Deviation": 1.1072821075127297, "Rank": 22 }, "Social": { "Average Score": 0.516980832, "Standard Deviation": 0.03369347985, "Rank": 17 }, "Chemistry": { "Average Score": 50.867343712131174, "Standard Deviation": null, "Rank": 22 }, "CPP": { "Average Score": 52.148798061768964, "Standard Deviation": null, "Rank": 18 } } }, { "config": { "model_name": "meta-llama-3.1-70b-instruct", "organization": "Meta", "license": "Llama 3.1 Community", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 65.61302047306724, "Standard Deviation": 7.113338386318571, "Rank": 14 }, "Geometry": { "Average Score": 0.76184398, "Standard Deviation": 0.01790377984, "Rank": 10 }, "Algebra": { "Average Score": 0.732041699, "Standard Deviation": 0.02621439062, "Rank": 9 }, "Probability": { "Average Score": 65.4531285887158, "Standard Deviation": null, "Rank": 13 }, "Logical": { "Average Score": 61.16321386785366, "Standard Deviation": 0.8920966760646541, "Rank": 18 }, "Social": { "Average Score": 0.45872939, "Standard Deviation": 0.05347039576, "Rank": 20 }, "Chemistry": { "Average Score": 76.03374498429748, "Standard Deviation": null, "Rank": 9 }, "CPP": { "Average Score": 84.36815192532764, "Standard Deviation": null, "Rank": 4 } } }, { "config": { "model_name": "meta-llama-3.1-8b-instruct", "organization": "Meta", "license": "Llama 3.1 Community", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 48.86242501618216, "Standard Deviation": 3.7761459978540257, "Rank": 21 }, "Geometry": { "Average Score": 0.522442162, 
"Standard Deviation": 0.03908236317, "Rank": 23 }, "Algebra": { "Average Score": 0.582702645, "Standard Deviation": 0.05002277711, "Rank": 18 }, "Probability": { "Average Score": 52.44179989233465, "Standard Deviation": null, "Rank": 27 }, "Logical": { "Average Score": 43.3706774850582, "Standard Deviation": 2.820707319899787, "Rank": 28 }, "Social": { "Average Score": 0.329195941, "Standard Deviation": 0.03925019528, "Rank": 28 }, "Chemistry": { "Average Score": 43.36264580455019, "Standard Deviation": null, "Rank": 30 }, "CPP": { "Average Score": 44.41846841004584, "Standard Deviation": null, "Rank": 26 } } }, { "config": { "model_name": "gpt3.5-turbo-0125", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2021/09" }, "results": { "OVERALL": { "Average Score": 18.951737690142235, "Standard Deviation": 0.7967088395458379, "Rank": 42 }, "Geometry": { "Average Score": 0.678714519, "Standard Deviation": 0.05926546762, "Rank": 12 }, "Algebra": { "Average Score": 0.569296173, "Standard Deviation": 0.05277281097, "Rank": 19 }, "Probability": { "Average Score": 45.77959177088119, "Standard Deviation": null, "Rank": 30 }, "Logical": { "Average Score": 17.159084771200394, "Standard Deviation": 2.5845422782742546, "Rank": 48 }, "Social": { "Average Score": 0.235071541, "Standard Deviation": 0.02632892457, "Rank": 37 }, "Chemistry": { "Average Score": 39.52885225927276, "Standard Deviation": null, "Rank": 33 }, "CPP": { "Average Score": 40.46958736582551, "Standard Deviation": null, "Rank": 29 } } }, { "config": { "model_name": "llama-3-70b-instruct", "organization": "Meta", "license": "Llama 3 Community", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 40.57810915454436, "Standard Deviation": 1.3134243733127455, "Rank": 26 }, "Geometry": { "Average Score": 0.516865529, "Standard Deviation": 0.03858112564, "Rank": 24 }, "Algebra": { "Average Score": 0.566756531, "Standard Deviation": 0.03369826926, "Rank": 20 }, 
"Probability": { "Average Score": 52.64997876875813, "Standard Deviation": null, "Rank": 25 }, "Logical": { "Average Score": 70.51651844158742, "Standard Deviation": 0.12355022869457871, "Rank": 12 }, "Social": { "Average Score": 0.45872939, "Standard Deviation": 0.05347039576, "Rank": 20 }, "Chemistry": { "Average Score": 63.65476403379996, "Standard Deviation": null, "Rank": 17 }, "CPP": { "Average Score": 65.32140697218945, "Standard Deviation": null, "Rank": 13 } } }, { "config": { "model_name": "claude-3-sonnet", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2023/08" }, "results": { "OVERALL": { "Average Score": 52.19088595402735, "Standard Deviation": 3.743258734262917, "Rank": 20 }, "Geometry": { "Average Score": 0.675613638, "Standard Deviation": 0.05275594408, "Rank": 13 }, "Algebra": { "Average Score": 0.552025728, "Standard Deviation": 0.04122192409, "Rank": 21 }, "Probability": { "Average Score": 54.0284459891417, "Standard Deviation": null, "Rank": 24 }, "Logical": { "Average Score": 58.099761779812475, "Standard Deviation": 7.815595203680491, "Rank": 20 }, "Social": { "Average Score": 0.570437582, "Standard Deviation": 0.08607040862, "Rank": 14 }, "Chemistry": { "Average Score": 59.784958090634056, "Standard Deviation": null, "Rank": 19 }, "CPP": { "Average Score": 61.33538592327427, "Standard Deviation": null, "Rank": 15 } } }, { "config": { "model_name": "qwen1.5-14b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 31.56999734729493, "Standard Deviation": 5.42704987916441, "Rank": 29 }, "Geometry": { "Average Score": 0.452504016, "Standard Deviation": 0.04225594393, "Rank": 26 }, "Algebra": { "Average Score": 0.538655725, "Standard Deviation": 0.03721542594, "Rank": 22 }, "Probability": { "Average Score": 41.027908758027046, "Standard Deviation": null, "Rank": 35 }, "Logical": { "Average Score": 31.638560769720616, "Standard 
Deviation": 3.175225377796435, "Rank": 38 }, "Social": { "Average Score": 0.287370142, "Standard Deviation": 0.04264085315, "Rank": 30 }, "Chemistry": { "Average Score": 37.667977565724996, "Standard Deviation": null, "Rank": 35 }, "CPP": { "Average Score": 38.552779976347026, "Standard Deviation": null, "Rank": 31 } } }, { "config": { "model_name": "claude-3-haiku", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2023/08" }, "results": { "OVERALL": { "Average Score": 42.975259650014074, "Standard Deviation": 2.248602505751528, "Rank": 25 }, "Geometry": { "Average Score": 0.607993912, "Standard Deviation": 0.05793460748, "Rank": 15 }, "Algebra": { "Average Score": 0.520054055, "Standard Deviation": 0.03333544511, "Rank": 23 }, "Probability": { "Average Score": 52.44184603289214, "Standard Deviation": null, "Rank": 28 }, "Logical": { "Average Score": 50.38523351226464, "Standard Deviation": 1.9928131873345676, "Rank": 24 }, "Social": { "Average Score": 0.551083976, "Standard Deviation": 0.05374722539, "Rank": 16 }, "Chemistry": { "Average Score": 54.99584531372778, "Standard Deviation": null, "Rank": 20 }, "CPP": { "Average Score": 56.40200048817984, "Standard Deviation": null, "Rank": 16 } } }, { "config": { "model_name": "claude-2.1", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 23.82704986290717, "Standard Deviation": 1.6337262681919007, "Rank": 37 }, "Geometry": { "Average Score": 0.62752395, "Standard Deviation": 0.07232659398, "Rank": 14 }, "Algebra": { "Average Score": 0.508849609, "Standard Deviation": 0.0346897465, "Rank": 24 }, "Probability": { "Average Score": 42.82280874207299, "Standard Deviation": null, "Rank": 32 }, "Logical": { "Average Score": 47.40647506260718, "Standard Deviation": 3.5140099122016686, "Rank": 25 }, "Social": { "Average Score": 0.333804568, "Standard Deviation": 0.03775548253, "Rank": 27 }, "Chemistry": { "Average 
Score": 46.09889239661357, "Standard Deviation": null, "Rank": 25 }, "CPP": { "Average Score": 47.23672563994903, "Standard Deviation": null, "Rank": 21 } } }, { "config": { "model_name": "mixtral-8x7b-instruct-v0.1", "organization": "Mistral", "license": "Apache-2.0", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 26.279729527476174, "Standard Deviation": 1.7823676900027476, "Rank": 33 }, "Geometry": { "Average Score": 0.432216097, "Standard Deviation": 0.04747949254, "Rank": 29 }, "Algebra": { "Average Score": 0.478314888, "Standard Deviation": 0.01998797419, "Rank": 25 }, "Probability": { "Average Score": 42.27303178662447, "Standard Deviation": null, "Rank": 33 }, "Logical": { "Average Score": 34.58281320758576, "Standard Deviation": 2.5548927504271073, "Rank": 33 }, "Social": { "Average Score": 0.251949622, "Standard Deviation": 0.03346674405, "Rank": 35 }, "Chemistry": { "Average Score": 43.47423835615602, "Standard Deviation": null, "Rank": 29 }, "CPP": { "Average Score": 44.533118241976666, "Standard Deviation": null, "Rank": 25 } } }, { "config": { "model_name": "claude-2.0", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 20.490629074737296, "Standard Deviation": 0.4821482730133453, "Rank": 40 }, "Geometry": { "Average Score": 0.604141967, "Standard Deviation": 0.05116441826, "Rank": 16 }, "Algebra": { "Average Score": 0.474350734, "Standard Deviation": 0.01510393066, "Rank": 26 }, "Probability": { "Average Score": 45.15580067803421, "Standard Deviation": null, "Rank": 31 }, "Logical": { "Average Score": 43.65660021552717, "Standard Deviation": 4.959029305063026, "Rank": 27 }, "Social": { "Average Score": 0.469422836, "Standard Deviation": 0.05999901796, "Rank": 19 }, "Chemistry": { "Average Score": 49.53201090067431, "Standard Deviation": null, "Rank": 23 }, "CPP": { "Average Score": 50.773143448036464, "Standard Deviation": null, "Rank": 19 } 
} }, { "config": { "model_name": "starling-lm-7b-beta", "organization": "Nexusflow", "license": "Apache-2.0", "knowledge_cutoff": "2024/03" }, "results": { "OVERALL": { "Average Score": 43.0415265396966, "Standard Deviation": 0.8770524316858576, "Rank": 24 }, "Geometry": { "Average Score": 0.446654388, "Standard Deviation": 0.05637864999, "Rank": 28 }, "Algebra": { "Average Score": 0.473952749, "Standard Deviation": 0.01584301288, "Rank": 27 }, "Probability": { "Average Score": 41.320066911500234, "Standard Deviation": null, "Rank": 34 }, "Logical": { "Average Score": 39.79665241383638, "Standard Deviation": 3.4711628274016544, "Rank": 30 }, "Social": { "Average Score": 0.380021662, "Standard Deviation": 0.04622452748, "Rank": 25 }, "Chemistry": { "Average Score": 37.39896886078588, "Standard Deviation": null, "Rank": 36 }, "CPP": { "Average Score": 38.27587102395908, "Standard Deviation": null, "Rank": 32 } } }, { "config": { "model_name": "gemini-1.0-pro-001", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2023/04" }, "results": { "OVERALL": { "Average Score": 45.78126809517331, "Standard Deviation": 3.7275133674569783, "Rank": 23 }, "Geometry": { "Average Score": 0.578347959, "Standard Deviation": 0.04242873607, "Rank": 19 }, "Algebra": { "Average Score": 0.462417786, "Standard Deviation": 0.01668313635, "Rank": 28 }, "Probability": { "Average Score": 31.410607001114293, "Standard Deviation": null, "Rank": 42 }, "Logical": { "Average Score": 21.717362428653246, "Standard Deviation": 4.392290522642325, "Rank": 44 }, "Social": { "Average Score": 0.130790863, "Standard Deviation": 0.02800188173, "Rank": 45 }, "Chemistry": { "Average Score": 44.14314678087462, "Standard Deviation": null, "Rank": 27 }, "CPP": { "Average Score": 45.22204471452975, "Standard Deviation": null, "Rank": 23 } } }, { "config": { "model_name": "openchat-3.5-0106", "organization": "OpenChat", "license": "Apache-2.0", "knowledge_cutoff": "2024/01" }, "results": { 
"OVERALL": { "Average Score": 23.85666609339201, "Standard Deviation": 1.341285455536348, "Rank": 36 }, "Geometry": { "Average Score": 0.38715246, "Standard Deviation": 0.03701851946, "Rank": 32 }, "Algebra": { "Average Score": 0.441233712, "Standard Deviation": 0.01135753754, "Rank": 29 }, "Probability": { "Average Score": 40.37790468557232, "Standard Deviation": null, "Rank": 36 }, "Logical": { "Average Score": 35.1573373260624, "Standard Deviation": 2.485128777146724, "Rank": 32 }, "Social": { "Average Score": 0.250891608, "Standard Deviation": 0.03253769914, "Rank": 36 }, "Chemistry": { "Average Score": 32.96322247853182, "Standard Deviation": null, "Rank": 37 }, "CPP": { "Average Score": 33.70639271807677, "Standard Deviation": null, "Rank": 33 } } }, { "config": { "model_name": "openchat-3.5", "organization": "OpenChat", "license": "Apache-2.0", "knowledge_cutoff": "2023/11" }, "results": { "OVERALL": { "Average Score": 23.63538251797928, "Standard Deviation": 2.0516295921862095, "Rank": 38 }, "Geometry": { "Average Score": 0.401699069, "Standard Deviation": 0.03410726557, "Rank": 30 }, "Algebra": { "Average Score": 0.414095336, "Standard Deviation": 0.01881964261, "Rank": 31 }, "Probability": { "Average Score": 36.00454588244476, "Standard Deviation": null, "Rank": 38 }, "Logical": { "Average Score": 34.029859502735654, "Standard Deviation": 3.354098427500673, "Rank": 35 }, "Social": { "Average Score": 0.319991655, "Standard Deviation": 0.04502478724, "Rank": 29 }, "Chemistry": { "Average Score": 32.29778226319944, "Standard Deviation": null, "Rank": 38 }, "CPP": { "Average Score": 33.020911255646965, "Standard Deviation": null, "Rank": 34 } } }, { "config": { "model_name": "command-r-(08-2024)", "organization": "Cohere", "license": "CC-BY-NC-4.0", "knowledge_cutoff": "2024/08" }, "results": { "OVERALL": { "Average Score": 38.783798277856995, "Standard Deviation": 1.1948096596199191, "Rank": 27 }, "Geometry": { "Average Score": 0.448300727, "Standard 
Deviation": 0.04996362328, "Rank": 27 }, "Algebra": { "Average Score": 0.417519167, "Standard Deviation": 0.01822196902, "Rank": 30 }, "Probability": { "Average Score": 38.019523941917335, "Standard Deviation": null, "Rank": 37 }, "Logical": { "Average Score": 23.408826179018206, "Standard Deviation": 0.9355701468205376, "Rank": 42 }, "Social": { "Average Score": 0.276088379, "Standard Deviation": 0.03295234688, "Rank": 32 }, "Chemistry": { "Average Score": 38.699171059988636, "Standard Deviation": null, "Rank": 34 }, "CPP": { "Average Score": 39.61492485677676, "Standard Deviation": null, "Rank": 30 } } }, { "config": { "model_name": "gemma-1.1-7b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 20.965269549151657, "Standard Deviation": 0.6031600560715249, "Rank": 39 }, "Geometry": { "Average Score": 0.324170977, "Standard Deviation": 0.04668553765, "Rank": 35 }, "Algebra": { "Average Score": 0.398684697, "Standard Deviation": 0.01982398259, "Rank": 32 }, "Probability": { "Average Score": 30.98345832281905, "Standard Deviation": null, "Rank": 43 }, "Logical": { "Average Score": 33.36570116785516, "Standard Deviation": 3.8824795120929765, "Rank": 36 }, "Social": { "Average Score": 0.179073276, "Standard Deviation": 0.02009658805, "Rank": 41 }, "Chemistry": { "Average Score": 41.66173653808921, "Standard Deviation": null, "Rank": 31 }, "CPP": { "Average Score": 42.666504105798204, "Standard Deviation": null, "Rank": 27 } } }, { "config": { "model_name": "llama3-8b-instruct", "organization": "Meta", "license": "Llama 3 Community", "knowledge_cutoff": "2023/03" }, "results": { "OVERALL": { "Average Score": 30.183633696164936, "Standard Deviation": 3.5901082045571266, "Rank": 31 }, "Geometry": { "Average Score": 0.367143758, "Standard Deviation": 0.04363680358, "Rank": 33 }, "Algebra": { "Average Score": 0.391480973, "Standard Deviation": 0.02757445266, "Rank": 33 }, "Probability": { 
"Average Score": 34.51621975866105, "Standard Deviation": null, "Rank": 39 }, "Logical": { "Average Score": 45.27560737491475, "Standard Deviation": 4.639305724878496, "Rank": 26 }, "Social": { "Average Score": 0.336373622, "Standard Deviation": 0.05762408512, "Rank": 26 }, "Chemistry": { "Average Score": 44.271144265487514, "Standard Deviation": null, "Rank": 26 }, "CPP": { "Average Score": 45.35392139264795, "Standard Deviation": null, "Rank": 22 } } }, { "config": { "model_name": "gemma-2-2b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/07" }, "results": { "OVERALL": { "Average Score": 47.37377937645159, "Standard Deviation": 2.72420190928707, "Rank": 22 }, "Geometry": { "Average Score": 0.395006676, "Standard Deviation": 0.05882607713, "Rank": 31 }, "Algebra": { "Average Score": 0.379391887, "Standard Deviation": 0.01722410785, "Rank": 34 }, "Probability": { "Average Score": 33.90530403382374, "Standard Deviation": null, "Rank": 41 }, "Logical": { "Average Score": 37.64262561604027, "Standard Deviation": 3.0627256408495804, "Rank": 31 }, "Social": { "Average Score": 0.393482094, "Standard Deviation": 0.06450214024, "Rank": 23 }, "Chemistry": { "Average Score": 29.883648650177584, "Standard Deviation": null, "Rank": 40 }, "CPP": { "Average Score": 30.53406933106768, "Standard Deviation": null, "Rank": 36 } } }, { "config": { "model_name": "starling-lm-7b-alpha", "organization": "Nexusflow", "license": "Apache-2.0", "knowledge_cutoff": "2023/11" }, "results": { "OVERALL": { "Average Score": 24.34505731078066, "Standard Deviation": 1.4660872513914562, "Rank": 35 }, "Geometry": { "Average Score": 0.336782578, "Standard Deviation": 0.04069449132, "Rank": 34 }, "Algebra": { "Average Score": 0.371551932, "Standard Deviation": 0.03367241745, "Rank": 35 }, "Probability": { "Average Score": 34.51613212227484, "Standard Deviation": null, "Rank": 40 }, "Logical": { "Average Score": 29.88612695085449, "Standard Deviation": 
2.4070524024678672, "Rank": 40 }, "Social": { "Average Score": 0.271975534, "Standard Deviation": 0.04266753408, "Rank": 33 }, "Chemistry": { "Average Score": 29.442057363491365, "Standard Deviation": null, "Rank": 41 }, "CPP": { "Average Score": 30.07926487356878, "Standard Deviation": null, "Rank": 37 } } }, { "config": { "model_name": "qwen1.5-4b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 7.19753150259024, "Standard Deviation": 0.6175113365944395, "Rank": 52 }, "Geometry": { "Average Score": 0.215834522, "Standard Deviation": 0.0363766363, "Rank": 39 }, "Algebra": { "Average Score": 0.305589811, "Standard Deviation": 0.02354198912, "Rank": 36 }, "Probability": { "Average Score": 15.124506890648007, "Standard Deviation": null, "Rank": 49 }, "Logical": { "Average Score": 11.67206257803879, "Standard Deviation": 1.140401009846497, "Rank": 51 }, "Social": { "Average Score": 0.18195615, "Standard Deviation": 0.02269805277, "Rank": 40 }, "Chemistry": { "Average Score": 12.825435835657133, "Standard Deviation": null, "Rank": 52 }, "CPP": { "Average Score": 13.21208067122554, "Standard Deviation": null, "Rank": 47 } } }, { "config": { "model_name": "command-r-(04-2024)", "organization": "Cohere", "license": "CC-BY-NC-4.0", "knowledge_cutoff": "2024/04" }, "results": { "OVERALL": { "Average Score": 26.20787727166716, "Standard Deviation": 1.6793980036057201, "Rank": 34 }, "Geometry": { "Average Score": 0.300416698, "Standard Deviation": 0.03485612736, "Rank": 36 }, "Algebra": { "Average Score": 0.293120231, "Standard Deviation": 0.032926484, "Rank": 37 }, "Probability": { "Average Score": 28.551833516483626, "Standard Deviation": null, "Rank": 44 }, "Logical": { "Average Score": 30.83782425033377, "Standard Deviation": 3.4266833154577383, "Rank": 39 }, "Social": { "Average Score": 0.283882949, "Standard Deviation": 0.03336901148, "Rank": 31 }, "Chemistry": { "Average Score": 
40.38004181614496, "Standard Deviation": null, "Rank": 32 }, "CPP": { "Average Score": 41.346336503003236, "Standard Deviation": null, "Rank": 28 } } }, { "config": { "model_name": "vicuna-33b", "organization": "LMSYS", "license": "Non-commercial", "knowledge_cutoff": "2023/08" }, "results": { "OVERALL": { "Average Score": 19.726298678709266, "Standard Deviation": 1.0771354692793496, "Rank": 41 }, "Geometry": { "Average Score": 0.208284679, "Standard Deviation": 0.03937771461, "Rank": 40 }, "Algebra": { "Average Score": 0.248994048, "Standard Deviation": 0.02668175054, "Rank": 39 }, "Probability": { "Average Score": 23.2308538772627, "Standard Deviation": null, "Rank": 47 }, "Logical": { "Average Score": 19.488409585540122, "Standard Deviation": 0.7913465863319494, "Rank": 46 }, "Social": { "Average Score": 0.257623798, "Standard Deviation": 0.02653724437, "Rank": 34 }, "Chemistry": { "Average Score": 27.198874596635843, "Standard Deviation": null, "Rank": 43 }, "CPP": { "Average Score": 28.01838653090379, "Standard Deviation": null, "Rank": 38 } } }, { "config": { "model_name": "gemma-7b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 18.339626858215343, "Standard Deviation": 0.1553156123023995, "Rank": 43 }, "Geometry": { "Average Score": 0.244791417, "Standard Deviation": 0.0289612078, "Rank": 37 }, "Algebra": { "Average Score": 0.250614794, "Standard Deviation": 0.01991678295, "Rank": 38 }, "Probability": { "Average Score": 18.066869704202595, "Standard Deviation": null, "Rank": 48 }, "Logical": { "Average Score": 22.446113532575186, "Standard Deviation": 1.1759308097806727, "Rank": 43 }, "Social": { "Average Score": 0.202138025, "Standard Deviation": 0.02098346639, "Rank": 39 }, "Chemistry": { "Average Score": 27.195166540671735, "Standard Deviation": null, "Rank": 43 }, "CPP": { "Average Score": 28.014658234926813, "Standard Deviation": null, "Rank": 39 } } }, { "config": { 
"model_name": "mistral-7b-instruct-2", "organization": "Mistral", "license": "Apache 2.0", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 32.27919528900069, "Standard Deviation": 2.070593349377193, "Rank": 28 }, "Geometry": { "Average Score": 0.216402626, "Standard Deviation": 0.03338414918, "Rank": 38 }, "Algebra": { "Average Score": 0.233777838, "Standard Deviation": 0.0155226054, "Rank": 40 }, "Probability": { "Average Score": 25.70261650740474, "Standard Deviation": null, "Rank": 45 }, "Logical": { "Average Score": 26.165635051797608, "Standard Deviation": 1.5009510944001014, "Rank": 41 }, "Social": { "Average Score": 0.209386782, "Standard Deviation": 0.02738569921, "Rank": 38 }, "Chemistry": { "Average Score": 30.70773868184025, "Standard Deviation": null, "Rank": 39 }, "CPP": { "Average Score": 31.382959631870822, "Standard Deviation": null, "Rank": 35 } } }, { "config": { "model_name": "mistral-7b-instruct-1", "organization": "Mistral", "license": "Apache 2.0", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 14.750363553682964, "Standard Deviation": 0.442399072321264, "Rank": 48 }, "Geometry": { "Average Score": 0.161799938, "Standard Deviation": 0.03595278559, "Rank": 44 }, "Algebra": { "Average Score": 0.210341624, "Standard Deviation": 0.01736539119, "Rank": 41 }, "Probability": { "Average Score": 24.69501890202338, "Standard Deviation": null, "Rank": 46 }, "Logical": { "Average Score": 15.957706802740889, "Standard Deviation": 2.080778273455708, "Rank": 50 }, "Social": { "Average Score": 0.117646827, "Standard Deviation": 0.009321202779, "Rank": 47 }, "Chemistry": { "Average Score": 18.375111202411667, "Standard Deviation": null, "Rank": 47 }, "CPP": { "Average Score": 18.929093202755805, "Standard Deviation": null, "Rank": 42 } } }, { "config": { "model_name": "vicuna-13b", "organization": "LMSYS", "license": "Non-commercial", "knowledge_cutoff": "2023/07" }, "results": { "OVERALL": { "Average 
Score": 13.302607436757697, "Standard Deviation": 0.570272227659312, "Rank": 50 }, "Geometry": { "Average Score": 0.200941928, "Standard Deviation": 0.03366817781, "Rank": 41 }, "Algebra": { "Average Score": 0.196123323, "Standard Deviation": 0.0135715643, "Rank": 42 }, "Probability": { "Average Score": 15.08476669604627, "Standard Deviation": null, "Rank": 50 }, "Logical": { "Average Score": 16.548339412104294, "Standard Deviation": 3.443370777556759, "Rank": 49 }, "Social": { "Average Score": 0.124655135, "Standard Deviation": 0.01122382671, "Rank": 46 }, "Chemistry": { "Average Score": 21.201173318496842, "Standard Deviation": null, "Rank": 45 }, "CPP": { "Average Score": 21.840013221590294, "Standard Deviation": null, "Rank": 40 } } }, { "config": { "model_name": "zephyr-7b-beta", "organization": "HuggingFace", "license": "MIT", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 7.378234886105356, "Standard Deviation": 1.1456147261693999, "Rank": 51 }, "Geometry": { "Average Score": 0.114005544, "Standard Deviation": 0.03144354365, "Rank": 45 }, "Algebra": { "Average Score": 0.141766633, "Standard Deviation": 0.03179520129, "Rank": 43 }, "Probability": { "Average Score": 8.92696070171298, "Standard Deviation": null, "Rank": 53 }, "Logical": { "Average Score": 6.971377981442089, "Standard Deviation": 0.31669853263737413, "Rank": 55 }, "Social": { "Average Score": 0.0, "Standard Deviation": 0.0, "Rank": 52 }, "Chemistry": { "Average Score": 18.374948840997902, "Standard Deviation": null, "Rank": 47 }, "CPP": { "Average Score": 18.92902220864132, "Standard Deviation": null, "Rank": 43 } } }, { "config": { "model_name": "gemma-1.1-2b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 16.083251992757752, "Standard Deviation": 0.7340624884005772, "Rank": 46 }, "Geometry": { "Average Score": 0.183974034, "Standard Deviation": 0.0215548886, "Rank": 43 }, 
"Algebra": { "Average Score": 0.13422252, "Standard Deviation": 0.01922819511, "Rank": 44 }, "Probability": { "Average Score": 9.992136776217318, "Standard Deviation": null, "Rank": 52 }, "Logical": { "Average Score": 9.537233946101678, "Standard Deviation": 0.7567112693269967, "Rank": 53 }, "Social": { "Average Score": 0.167796727, "Standard Deviation": 0.01666541942, "Rank": 42 }, "Chemistry": { "Average Score": 20.11834233400297, "Standard Deviation": null, "Rank": 46 }, "CPP": { "Average Score": 20.724691953843916, "Standard Deviation": null, "Rank": 41 } } }, { "config": { "model_name": "llama2-7b-chat", "organization": "Meta", "license": "Llama 2 Community", "knowledge_cutoff": "2023/07" }, "results": { "OVERALL": { "Average Score": 17.319161859655946, "Standard Deviation": 0.495520710612214, "Rank": 45 }, "Geometry": { "Average Score": 0.087067276, "Standard Deviation": 0.04274343402, "Rank": 46 }, "Algebra": { "Average Score": 0.12308805, "Standard Deviation": 0.01856053622, "Rank": 45 }, "Probability": { "Average Score": 8.860911732515305, "Standard Deviation": null, "Rank": 54 }, "Logical": { "Average Score": 18.812132126028335, "Standard Deviation": 3.0846832107977433, "Rank": 47 }, "Social": { "Average Score": 0.152905272, "Standard Deviation": 0.007166957097, "Rank": 43 }, "Chemistry": { "Average Score": 15.270334671133512, "Standard Deviation": null, "Rank": 50 }, "CPP": { "Average Score": 15.730513733660898, "Standard Deviation": null, "Rank": 45 } } }, { "config": { "model_name": "gemma-2b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 15.029602991101632, "Standard Deviation": 0.4529017602377039, "Rank": 47 }, "Geometry": { "Average Score": 0.198571153, "Standard Deviation": 0.01699161031, "Rank": 42 }, "Algebra": { "Average Score": 0.109883009, "Standard Deviation": 0.01520005833, "Rank": 46 }, "Probability": { "Average Score": 6.561360414966015, "Standard 
Deviation": null, "Rank": 56 }, "Logical": { "Average Score": 3.9858662356708785, "Standard Deviation": 0.5609499073366407, "Rank": 56 }, "Social": { "Average Score": 0.087452913, "Standard Deviation": 0.008170146562, "Rank": 50 }, "Chemistry": { "Average Score": 16.766144078336097, "Standard Deviation": null, "Rank": 49 }, "CPP": { "Average Score": 17.2715657115764, "Standard Deviation": null, "Rank": 44 } } }, { "config": { "model_name": "llama2-13b-chat", "organization": "Meta", "license": "Llama 2 Community", "knowledge_cutoff": "2023/07" }, "results": { "OVERALL": { "Average Score": 17.47902371074294, "Standard Deviation": 0.4047581815962028, "Rank": 44 }, "Geometry": { "Average Score": 0.072729954, "Standard Deviation": 0.02315988261, "Rank": 48 }, "Algebra": { "Average Score": 0.080371692, "Standard Deviation": 0.01277569453, "Rank": 47 }, "Probability": { "Average Score": 12.738302754764042, "Standard Deviation": null, "Rank": 51 }, "Logical": { "Average Score": 21.708359515217182, "Standard Deviation": 1.4862481594434973, "Rank": 45 }, "Social": { "Average Score": 0.149125922, "Standard Deviation": 0.01157416827, "Rank": 44 }, "Chemistry": { "Average Score": 12.786967781868814, "Standard Deviation": null, "Rank": 53 }, "CPP": { "Average Score": 13.17258252933903, "Standard Deviation": null, "Rank": 48 } } }, { "config": { "model_name": "vicuna-7b", "organization": "LMSYS", "license": "Non-commercial", "knowledge_cutoff": "2023/07" }, "results": { "OVERALL": { "Average Score": 13.31896682669754, "Standard Deviation": 0.30441157156016124, "Rank": 49 }, "Geometry": { "Average Score": 0.083457058, "Standard Deviation": 0.02520989111, "Rank": 47 }, "Algebra": { "Average Score": 0.070883882, "Standard Deviation": 0.007315853253, "Rank": 48 }, "Probability": { "Average Score": 8.255246380068842, "Standard Deviation": null, "Rank": 55 }, "Logical": { "Average Score": 10.046676845257544, "Standard Deviation": 0.6816182835206797, "Rank": 52 }, "Social": { "Average 
Score": 0.111076414, "Standard Deviation": 0.004805626512, "Rank": 48 }, "Chemistry": { "Average Score": 13.838150481781991, "Standard Deviation": null, "Rank": 51 }, "CPP": { "Average Score": 14.255194156624162, "Standard Deviation": null, "Rank": 46 } } }, { "config": { "model_name": "koala-13b", "organization": "UC Berkeley", "license": "Non-commercial", "knowledge_cutoff": "2023/04" }, "results": { "OVERALL": { "Average Score": 6.419305623111718, "Standard Deviation": 0.19611070515647736, "Rank": 53 }, "Geometry": { "Average Score": 0.017374001, "Standard Deviation": 0.01747053557, "Rank": 49 }, "Algebra": { "Average Score": 0.018129197, "Standard Deviation": 0.01054371383, "Rank": 49 }, "Probability": { "Average Score": 4.1717283559090035, "Standard Deviation": null, "Rank": 57 }, "Logical": { "Average Score": 7.484701131693112, "Standard Deviation": 0.172417770163525, "Rank": 54 }, "Social": { "Average Score": 0.096983835, "Standard Deviation": 0.007847059783, "Rank": 49 }, "Chemistry": { "Average Score": 6.177985738164252, "Standard Deviation": null, "Rank": 54 }, "CPP": { "Average Score": 6.36433272373514, "Standard Deviation": null, "Rank": 49 } } }, { "config": { "model_name": "openassistant-pythia-12b", "organization": "OpenAssistant", "license": "Non-commercial", "knowledge_cutoff": "2023/04" }, "results": { "OVERALL": { "Average Score": 0.0, "Standard Deviation": 0.0, "Rank": 54 }, "Geometry": { "Average Score": 0.0, "Standard Deviation": 0.0, "Rank": 50 }, "Algebra": { "Average Score": 0.0, "Standard Deviation": 0.0, "Rank": 50 }, "Probability": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 58 }, "Logical": { "Average Score": 0.0, "Standard Deviation": 0.0, "Rank": 57 }, "Social": { "Average Score": 0.030792528, "Standard Deviation": 0.007518796391, "Rank": 51 }, "Chemistry": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 55 }, "CPP": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 50 } } }, { "config": { 
"model_name": "nemotron-70b", "organization": "NVIDIA", "license": "Unknown", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 100.0, "Standard Deviation": 0.0, "Rank": 1 }, "Chemistry": { "Average Score": 96.00601450276388, "Standard Deviation": null, "Rank": 3 }, "Logical": { "Average Score": 98.08807085219765, "Standard Deviation": 0.832489959144682, "Rank": 5 }, "Probability": { "Average Score": 91.16755514126538, "Standard Deviation": null, "Rank": 4 } } }, { "config": { "model_name": "llama-3.2-3b-it", "organization": "Meta", "license": "Llama 3 Community", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 30.40742747938681, "Standard Deviation": 1.6816556668351852, "Rank": 30 }, "Chemistry": { "Average Score": 27.43049468475638, "Standard Deviation": null, "Rank": 42 }, "Logical": { "Average Score": 41.58905844173492, "Standard Deviation": 5.2798221527591, "Rank": 29 }, "Probability": { "Average Score": 62.02868227997844, "Standard Deviation": null, "Rank": 18 } } }, { "config": { "model_name": "yi-lightning", "organization": "01 AI", "license": "Proprietary", "knowledge_cutoff": "Unknown" }, "results": { "Chemistry": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Logical": { "Average Score": 98.816765663456, "Standard Deviation": 0.3271335810663529, "Rank": 3 }, "Probability": { "Average Score": 95.8842044402052, "Standard Deviation": null, "Rank": 2 } } }, { "config": { "model_name": "glm-4-plus", "organization": "Zhipu AI", "license": "Proprietary", "knowledge_cutoff": "Unknown" }, "results": { "Chemistry": { "Average Score": 99.05822908668402, "Standard Deviation": null, "Rank": 2 }, "Logical": { "Average Score": 99.45307787995229, "Standard Deviation": 0.5982476107949444, "Rank": 1 }, "Probability": { "Average Score": 92.04426702796823, "Standard Deviation": null, "Rank": 3 } } } ]