[ { "config": { "model_name": "ChatGPT-4o-latest (2024-09-03)", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Score": 0.974329609, "Standard Deviation": 0.005024959031 }, "Geometry": { "Score": 0.976028578, "Standard Deviation": 0.01507912373 }, "Algebra": { "Score": 0.951199453, "Standard Deviation": 0.08452452108 }, "Probability": { "Score": 0.842116641, "Standard Deviation": 0.006267759054 }, "Logical": { "Score": 0.828490728, "Standard Deviation": 0.009134213144 }, "Social": { "Score": 0.815902987, "Standard Deviation": 0.0196254222 } } }, { "config": { "model_name": "gpt-4o-2024-08-06", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Score": 0.846571548, "Standard Deviation": 0.03394056554 }, "Geometry": { "Score": 0.99773096, "Standard Deviation": 0.002835555172 }, "Algebra": { "Score": 1.0, "Standard Deviation": 0.0 }, "Probability": { "Score": 0.78855795, "Standard Deviation": 0.008188675452 }, "Logical": { "Score": 0.668635768, "Standard Deviation": 0.03466314094 }, "Social": { "Score": 0.680417314, "Standard Deviation": 0.00656867063 } } }, { "config": { "model_name": "gpt-4o-2024-05-13", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Score": 0.846334477, "Standard Deviation": 0.09377911572 }, "Geometry": { "Score": 0.972472377, "Standard Deviation": 0.01648274205 }, "Algebra": { "Score": 0.995511298, "Standard Deviation": 0.004097802515 }, "Probability": { "Score": 0.812149974, "Standard Deviation": 0.007669585485 }, "Logical": { "Score": 0.755019692, "Standard Deviation": 0.008149588572 }, "Social": { "Score": 0.609875087, "Standard Deviation": 0.038729239 } } }, { "config": { "model_name": "gpt-4-turbo-2024-04-09", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Score": 0.855357972, "Standard Deviation": 0.1016986368 }, "Geometry": { "Score": 0.95374588, "Standard Deviation": 0.03109307166 }, "Algebra": { "Score": 0.930945223, "Standard Deviation": 0.06705136813 }, "Probability": { "Score": 0.750705448, "Standard Deviation": 0.05944483103 }, "Logical": { "Score": 0.77906699, "Standard Deviation": 0.007406734161 }, "Social": { "Score": 0.715935163, "Standard Deviation": 0.1209141409 } } }, { "config": { "model_name": "gemini-1.5-pro-001", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2024-01" }, "results": { "OVERALL": { "Score": 0.797187842, "Standard Deviation": 0.0272375249 }, "Geometry": { "Score": 0.9947169, "Standard Deviation": 0.009150597621 }, "Algebra": { "Score": 0.857464301, "Standard Deviation": 0.05014285338 }, "Probability": { "Score": 0.651781767, "Standard Deviation": 0.04156998547 }, "Logical": { "Score": 0.739745471, "Standard Deviation": 0.01631532019 }, "Social": { "Score": 0.649601885, "Standard Deviation": 0.104854889 } } }, { "config": { "model_name": "qwen2-72b-instruct", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024-02" }, "results": { "OVERALL": { "Score": 0.737918558, "Standard Deviation": 0.09069077339 }, "Geometry": { "Score": 0.796870305, "Standard Deviation": 0.0509025346 }, "Algebra": { "Score": 0.836194231, "Standard Deviation": 0.04517093028 }, "Probability": { "Score": 0.788068004, "Standard Deviation": 0.007288989044 }, "Logical": { "Score": 0.619300904, "Standard Deviation": 0.06377931612 }, "Social": { "Score": 0.652578786, "Standard Deviation": 0.04259293171 } } }, { "config": { "model_name": "gpt-4o-mini-2024-07-18", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2024-07" }, "results": { "OVERALL": { "Score": 0.847694133, "Standard Deviation": 0.02164304402 }, "Geometry": { "Score": 0.946650435, "Standard Deviation": 0.01831236482 }, "Algebra": { "Score": 0.796243022, "Standard Deviation": 0.05537539202 }, "Probability": { "Score": 0.798402685, "Standard Deviation": 0.009404491967 }, "Logical": { "Score": 0.727009735, "Standard Deviation": 0.02628110141 }, "Social": { "Score": 0.691949855, "Standard Deviation": 0.02072934333 } } }, { "config": { "model_name": "claude-3.5-sonnet", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2024-03" }, "results": { "OVERALL": { "Score": 0.839004422, "Standard Deviation": 0.1461079564 }, "Geometry": { "Score": 0.95316419, "Standard Deviation": 0.02081192856 }, "Algebra": { "Score": 0.759789952, "Standard Deviation": 0.02611765096 }, "Probability": { "Score": 0.707730127, "Standard Deviation": 0.0394436664 }, "Logical": { "Score": 0.77342666, "Standard Deviation": 0.002892426458 }, "Social": { "Score": 0.790002247, "Standard Deviation": 0.1007410022 } } }, { "config": { "model_name": "o1-mini", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2024-01" }, "results": { "OVERALL": { "Score": 1.0, "Standard Deviation": 0.0 }, "Geometry": { "Score": "N/A", "Standard Deviation": "N/A" }, "Algebra": { "Score": "N/A", "Standard Deviation": "N/A" }, "Probability": { "Score": 1.0, "Standard Deviation": 0.0 }, "Logical": { "Score": 1.0, "Standard Deviation": 0.0 }, "Social": { "Score": 0.993974241, "Standard Deviation": 0.001996882328 } } }, { "config": { "model_name": "o1-preview", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2024-01" }, "results": { "OVERALL": { "Score": 0.945884589, "Standard Deviation": 0.01059250762 }, "Geometry": { "Score": "N/A", "Standard Deviation": "N/A" }, "Algebra": { "Score": "N/A", "Standard Deviation": "N/A" }, "Probability": { "Score": 0.964666392, "Standard Deviation": 0.003139983398 }, "Logical": { "Score": 0.987950057, "Standard Deviation": 0.004881220327 }, "Social": { "Score": 1.0, "Standard Deviation": 0.0 } } }, { "config": { "model_name": "gemini-1.5-flash-001", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2024-02" }, "results": { "OVERALL": { "Score": 0.726493401, "Standard Deviation": 0.01113913725 }, "Geometry": { "Score": 0.804144103, "Standard Deviation": 0.1327142178 }, "Algebra": { "Score": 0.731776765, "Standard Deviation": 0.02594657111 }, "Probability": { "Score": 0.614461891, "Standard Deviation": 0.04690131826 }, "Logical": { "Score": 0.630805991, "Standard Deviation": 0.04871350612 }, "Social": { "Score": 0.555933822, "Standard Deviation": 0.1029934524 } } }, { "config": { "model_name": "gpt4-1106", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2024-04" }, "results": { "OVERALL": { "Score": 0.816347784, "Standard Deviation": 0.1566815755 }, "Geometry": { "Score": 0.71843088, "Standard Deviation": 0.04778038294 }, "Algebra": { "Score": 0.712910417, "Standard Deviation": 0.02581828898 }, "Probability": { "Score": 0.623947619, "Standard Deviation": 0.03502982933 }, "Logical": { "Score": 0.637482274, "Standard Deviation": 0.04158809888 }, "Social": { "Score": 0.450609816, "Standard Deviation": 0.05208655446 } } }, { "config": { "model_name": "gemma-2-27b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024-03" }, "results": { "OVERALL": { "Score": 0.624169623, "Standard Deviation": 0.1048365121 }, "Geometry": { "Score": 0.60112744, "Standard Deviation": 0.0469109952 }, "Algebra": { "Score": 0.687955914, "Standard Deviation": 0.01959958192 }, "Probability": { "Score": 0.589524771, "Standard Deviation": 0.03112689325 }, "Logical": { "Score": 0.614978944, "Standard Deviation": 0.05710657859 }, "Social": { "Score": 0.487844257, "Standard Deviation": 0.05857760809 } } }, { "config": { "model_name": "claude-3-opus", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2024-01" }, "results": { "OVERALL": { "Score": 0.650636271, "Standard Deviation": 0.1197773541 }, "Geometry": { "Score": 0.7215743, "Standard Deviation": 0.04712598358 }, "Algebra": { "Score": 0.68777327, "Standard Deviation": 0.02382683713 }, "Probability": { "Score": 0.626471421, "Standard Deviation": 0.02911817976 }, "Logical": { "Score": 0.692346381, "Standard Deviation": 0.03617185198 }, "Social": { "Score": 0.663410854, "Standard Deviation": 0.09540220876 } } }, { "config": { "model_name": "gemma-2-9b-it-simpo", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024-02" }, "results": { "OVERALL": { "Score": "N/A", "Standard Deviation": "N/A" }, "Geometry": { "Score": 0.582787508, "Standard Deviation": 0.03965204074 }, "Algebra": { "Score": 0.658648133, "Standard Deviation": 0.02565919856 }, "Probability": { "Score": 0.547861265, "Standard Deviation": 0.02885209131 }, "Logical": { "Score": 0.540720893, "Standard Deviation": 0.01970134508 }, "Social": { "Score": 0.635266187, "Standard Deviation": 0.03620021751 } } }, { "config": { "model_name": "qwen1.5-72b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024-03" }, "results": { "OVERALL": { "Score": 0.519549796, "Standard Deviation": 0.00903634343 }, "Geometry": { "Score": 0.543139301, "Standard Deviation": 0.03425202326 }, "Algebra": { "Score": 0.635228729, "Standard Deviation": 0.01944043425 }, "Probability": { "Score": 0.486948658, "Standard Deviation": 0.06064655315 }, "Logical": { "Score": 0.284069394, "Standard Deviation": 0.02686608506 }, "Social": { "Score": 0.415007627, "Standard Deviation": 0.03920053159 } } }, { "config": { "model_name": "qwen1.5-32b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024-03" }, "results": { "OVERALL": { "Score": 0.393789407, "Standard Deviation": 0.05413770095 }, "Geometry": { "Score": 0.51086835, "Standard Deviation": 0.04052471998 }, "Algebra": { "Score": 0.609003168, "Standard Deviation": 0.04874143541 }, "Probability": { "Score": 0.476300002, "Standard Deviation": 0.05322403912 }, "Logical": { "Score": 0.331781014, "Standard Deviation": 0.004938997686 }, "Social": { "Score": 0.380987334, "Standard Deviation": 0.03762251776 } } }, { "config": { "model_name": "google-gemma-2-9b-it", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2024-01" }, "results": { "OVERALL": { "Score": 0.489663449, "Standard Deviation": 0.002595702019 }, "Geometry": { "Score": 0.575371308, "Standard Deviation": 0.03556220251 }, "Algebra": { "Score": 0.597045661, "Standard Deviation": 0.0313828123 }, "Probability": { "Score": 0.589221807, "Standard Deviation": 0.03110811656 }, "Logical": { "Score": 0.587579897, "Standard Deviation": 0.05512716783 }, "Social": { "Score": 0.768337958, "Standard Deviation": 0.04078610476 } } }, { "config": { "model_name": "yi-1.5-34b-chat", "organization": "01 AI", "license": "Proprietary", "knowledge_cutoff": "2024-01" }, "results": { "OVERALL": { "Score": 0.607812897, "Standard Deviation": 0.1440881293 }, "Geometry": { "Score": 0.566666724, "Standard Deviation": 0.04001381658 }, "Algebra": { "Score": 0.590997292, "Standard Deviation": 0.03594087315 }, "Probability": { "Score": 0.589524589, "Standard Deviation": 0.03112618772 }, "Logical": { "Score": 0.574105508, "Standard Deviation": 0.03441737941 }, "Social": { "Score": 0.516980832, "Standard Deviation": 0.03369347985 } } }, { "config": { "model_name": "meta-llama-3.1-8b-instruct", "organization": "Meta", "license": "Llama 3.1 Community", "knowledge_cutoff": "2024-02" }, "results": { "OVERALL": { "Score": 0.505936324, "Standard Deviation": 0.05286756493 }, "Geometry": { "Score": 0.522442162, "Standard Deviation": 0.03908236317 }, "Algebra": { "Score": 0.582702645, "Standard Deviation": 0.05002277711 }, "Probability": { "Score": 0.495001149, "Standard Deviation": 0.05244587037 }, "Logical": { "Score": 0.443030561, "Standard Deviation": 0.01343820628 }, "Social": { "Score": 0.329195941, "Standard Deviation": 0.03925019528 } } }, { "config": { "model_name": "gpt3.5-turbo-0125", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023-12" }, "results": { "OVERALL": { "Score": 0.313398088, "Standard Deviation": 0.09322528606 }, "Geometry": { "Score": 0.678714519, "Standard Deviation": 0.05926546762 }, "Algebra": { "Score": 0.569296173, "Standard Deviation": 0.05277281097 }, "Probability": { "Score": 0.448460767, "Standard Deviation": 0.05768095196 }, "Logical": { "Score": 0.148521348, "Standard Deviation": 0.04033712907 }, "Social": { "Score": 0.235071541, "Standard Deviation": 0.02632892457 } } }, { "config": { "model_name": "llama-3-70b-instruct", "organization": "Meta", "license": "Llama 3 Community", "knowledge_cutoff": "2024-03" }, "results": { "OVERALL": { "Score": 0.456689885, "Standard Deviation": 0.01385989995 }, "Geometry": { "Score": 0.516865529, "Standard Deviation": 0.03858112564 }, "Algebra": { "Score": 0.566756531, "Standard Deviation": 0.03369826926 }, "Probability": { "Score": 0.513857306, "Standard Deviation": 0.05453699062 }, "Logical": { "Score": 0.713796415, "Standard Deviation": 0.02031215107 }, "Social": { "Score": 0.45872939, "Standard Deviation": 0.05347039576 } } }, { "config": { "model_name": "claude-3-sonnet", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2024-02" }, "results": { "OVERALL": { "Score": 0.520010833, "Standard Deviation": 0.005030563799 }, "Geometry": { "Score": 0.675613638, "Standard Deviation": 0.05275594408 }, "Algebra": { "Score": 0.552025728, "Standard Deviation": 0.04122192409 }, "Probability": { "Score": 0.516192848, "Standard Deviation": 0.04152293217 }, "Logical": { "Score": 0.588545747, "Standard Deviation": 0.06068211943 }, "Social": { "Score": 0.570437582, "Standard Deviation": 0.08607040862 } } }, { "config": { "model_name": "qwen1.5-14b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024-01" }, "results": { "OVERALL": { "Score": 0.415328996, "Standard Deviation": 0.0743938717 }, "Geometry": { "Score": 0.452504016, "Standard Deviation": 0.04225594393 }, "Algebra": { "Score": 0.538655725, "Standard Deviation": 0.03721542594 }, "Probability": { "Score": 0.397185975, "Standard Deviation": 0.05607695946 }, "Logical": { "Score": 0.264573129, "Standard Deviation": 0.03936133174 }, "Social": { "Score": 0.287370142, "Standard Deviation": 0.04264085315 } } }, { "config": { "model_name": "claude-3-haiku", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2024-01" }, "results": { "OVERALL": { "Score": 0.453901163, "Standard Deviation": 0.003604084261 }, "Geometry": { "Score": 0.607993912, "Standard Deviation": 0.05793460748 }, "Algebra": { "Score": 0.520054055, "Standard Deviation": 0.03333544511 }, "Probability": { "Score": 0.474460688, "Standard Deviation": 0.0446501933 }, "Logical": { "Score": 0.512815976, "Standard Deviation": 0.0163264281 }, "Social": { "Score": 0.551083976, "Standard Deviation": 0.05374722539 } } }, { "config": { "model_name": "claude-2.1", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2023-12" }, "results": { "OVERALL": { "Score": 0.35814708, "Standard Deviation": 0.09168134168 }, "Geometry": { "Score": 0.62752395, "Standard Deviation": 0.07232659398 }, "Algebra": { "Score": 0.508849609, "Standard Deviation": 0.0346897465 }, "Probability": { "Score": 0.41477086, "Standard Deviation": 0.05964060239 }, "Logical": { "Score": 0.482923674, "Standard Deviation": 0.01989147048 }, "Social": { "Score": 0.333804568, "Standard Deviation": 0.03775548253 } } }, { "config": { "model_name": "mistral-8x7b-instruct-v0.1", "organization": "Mistral", "license": "Apache 2.0", "knowledge_cutoff": "2023-12" }, "results": { "OVERALL": { "Score": 0.382659161, "Standard Deviation": 0.07594496929 }, "Geometry": { "Score": 0.432216097, "Standard Deviation": 0.04747949254 }, "Algebra": { "Score": 0.478314888, "Standard Deviation": 0.01998797419 }, "Probability": { "Score": 0.427144725, "Standard Deviation": 0.0590923329 }, "Logical": { "Score": 0.340041983, "Standard Deviation": 0.008397574592 }, "Social": { "Score": 0.251949622, "Standard Deviation": 0.03346674405 } } }, { "config": { "model_name": "claude-2.0", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2023-10" }, "results": { "OVERALL": { "Score": 0.322718057, "Standard Deviation": 0.08369883584 }, "Geometry": { "Score": 0.604141967, "Standard Deviation": 0.05116441826 }, "Algebra": { "Score": 0.474350734, "Standard Deviation": 0.01510393066 }, "Probability": { "Score": 0.437950412, "Standard Deviation": 0.05985594317 }, "Logical": { "Score": 0.445620646, "Standard Deviation": 0.01812614805 }, "Social": { "Score": 0.469422836, "Standard Deviation": 0.05999901796 } } }, { "config": { "model_name": "starling-lm-7b-beta", "organization": "Nexusflow", "license": "Apache-2.0", "knowledge_cutoff": "2024-01" }, "results": { "OVERALL": { "Score": 0.479391856, "Standard Deviation": 0.04199990887 }, "Geometry": { "Score": 0.446654388, "Standard Deviation": 0.05637864999 }, "Algebra": { "Score": 0.473952749, "Standard Deviation": 0.01584301288 }, "Probability": { "Score": 0.395197837, "Standard Deviation": 0.05814798892 }, "Logical": { "Score": 0.39927199, "Standard Deviation": 0.02125277518 }, "Social": { "Score": 0.380021662, "Standard Deviation": 0.04622452748 } } }, { "config": { "model_name": "gemini-1.0-pro-001", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2023-11" }, "results": { "OVERALL": { "Score": 0.449040654, "Standard Deviation": 0.0450610177 }, "Geometry": { "Score": 0.578347959, "Standard Deviation": 0.04242873607 }, "Algebra": { "Score": 0.462417786, "Standard Deviation": 0.01668313635 }, "Probability": { "Score": 0.289836324, "Standard Deviation": 0.05739831115 }, "Logical": { "Score": 0.191140355, "Standard Deviation": 0.03394652499 }, "Social": { "Score": 0.130790863, "Standard Deviation": 0.02800188173 } } }, { "config": { "model_name": "openchat-3.5-0106", "organization": "OpenChat", "license": "Apache-2.0", "knowledge_cutoff": "2024-01" }, "results": { "OVERALL": { "Score": 0.363929888, "Standard Deviation": 0.08602347145 }, "Geometry": { "Score": 0.38715246, "Standard Deviation": 0.03701851946 }, "Algebra": { "Score": 0.441233712, "Standard Deviation": 0.01135753754 }, "Probability": { "Score": 0.38802618, "Standard Deviation": 0.05663879714 }, "Logical": { "Score": 0.336754383, "Standard Deviation": 0.01608478079 }, "Social": { "Score": 0.250891608, "Standard Deviation": 0.03253769914 } } }, { "config": { "model_name": "openchat-3.5", "organization": "OpenChat", "license": "Apache-2.0", "knowledge_cutoff": "2023-12" }, "results": { "OVERALL": { "Score": 0.361341296, "Standard Deviation": 0.09034869493 }, "Geometry": { "Score": 0.401699069, "Standard Deviation": 0.03410726557 }, "Algebra": { "Score": 0.414095336, "Standard Deviation": 0.01881964261 }, "Probability": { "Score": 0.349601002, "Standard Deviation": 0.05077455539 }, "Logical": { "Score": 0.331069242, "Standard Deviation": 0.02180827173 }, "Social": { "Score": 0.319991655, "Standard Deviation": 0.04502478724 } } }, { "config": { "model_name": "command-r-(08-2024)", "organization": "Cohere", "license": "CC-BY-NC-4.0", "knowledge_cutoff": "2024-08" }, "results": { "OVERALL": { "Score": 0.427605298, "Standard Deviation": 0.01747449163 }, "Geometry": { "Score": 0.448300727, "Standard Deviation": 0.04996362328 }, "Algebra": { "Score": 0.417519167, "Standard Deviation": 0.01822196902 }, "Probability": { "Score": 0.366336281, "Standard Deviation": 0.04716826942 }, "Logical": { "Score": 0.214657906, "Standard Deviation": 0.03003579835 }, "Social": { "Score": 0.276088379, "Standard Deviation": 0.03295234688 } } }, { "config": { "model_name": "gemma-1.1-7b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2023-11" }, "results": { "OVERALL": { "Score": 0.339506922, "Standard Deviation": 0.1066279108 }, "Geometry": { "Score": 0.324170977, "Standard Deviation": 0.04668553765 }, "Algebra": { "Score": 0.398684697, "Standard Deviation": 0.01982398259 }, "Probability": { "Score": 0.293253175, "Standard Deviation": 0.05126192191 }, "Logical": { "Score": 0.317750796, "Standard Deviation": 0.01101933543 }, "Social": { "Score": 0.179073276, "Standard Deviation": 0.02009658805 } } }, { "config": { "model_name": "llama3-8b-instruct", "organization": "Meta", "license": "Llama 3 Community", "knowledge_cutoff": "2024-01" }, "results": { "OVERALL": { "Score": 0.367722676, "Standard Deviation": 0.1071368221 }, "Geometry": { "Score": 0.367143758, "Standard Deviation": 0.04363680358 }, "Algebra": { "Score": 0.391480973, "Standard Deviation": 0.02757445266 }, "Probability": { "Score": 0.317616445, "Standard Deviation": 0.04300430361 }, "Logical": { "Score": 0.461607495, "Standard Deviation": 0.02185028842 }, "Social": { "Score": 0.336373622, "Standard Deviation": 0.05762408512 } } }, { "config": { "model_name": "gemma-2-2b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2023-12" }, "results": { "OVERALL": { "Score": 0.502167612, "Standard Deviation": 0.04389786763 }, "Geometry": { "Score": 0.395006676, "Standard Deviation": 0.05882607713 }, "Algebra": { "Score": 0.379391887, "Standard Deviation": 0.01722410785 }, "Probability": { "Score": 0.331231097, "Standard Deviation": 0.05392499987 }, "Logical": { "Score": 0.367687789, "Standard Deviation": 0.02547968808 }, "Social": { "Score": 0.393482094, "Standard Deviation": 0.06450214024 } } }, { "config": { "model_name": "starling-lm-7b-alpha", "organization": "Nexusflow", "license": "Apache-2.0", "knowledge_cutoff": "2023-12" }, "results": { "OVERALL": { "Score": 0.366628765, "Standard Deviation": 0.08405492929 }, "Geometry": { "Score": 0.336782578, "Standard Deviation": 0.04069449132 }, "Algebra": { "Score": 0.371551932, "Standard Deviation": 0.03367241745 }, "Probability": { "Score": 0.331472505, "Standard Deviation": 0.04833324282 }, "Logical": { "Score": 0.260869624, "Standard Deviation": 0.03562735237 }, "Social": { "Score": 0.271975534, "Standard Deviation": 0.04266753408 } } }, { "config": { "model_name": "qwen1.5-4b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024-02" }, "results": { "OVERALL": { "Score": 0.111876411, "Standard Deviation": 0.04241022785 }, "Geometry": { "Score": 0.215834522, "Standard Deviation": 0.0363766363 }, "Algebra": { "Score": 0.305589811, "Standard Deviation": 0.02354198912 }, "Probability": { "Score": 0.149365327, "Standard Deviation": 0.03489672675 }, "Logical": { "Score": 0.116210168, "Standard Deviation": 0.005927966496 }, "Social": { "Score": 0.18195615, "Standard Deviation": 0.02269805277 } } }, { "config": { "model_name": "command-r-(04-2024)", "organization": "Cohere", "license": "CC-BY-NC-4.0", "knowledge_cutoff": "2024-04" }, "results": { "OVERALL": { "Score": 0.388783887, "Standard Deviation": 0.07417186783 }, "Geometry": { "Score": 0.300416698, "Standard Deviation": 0.03485612736 }, "Algebra": { "Score": 0.293120231, "Standard Deviation": 0.032926484 }, "Probability": { "Score": 0.281271304, "Standard Deviation": 0.05697149867 }, "Logical": { "Score": 0.276189906, "Standard Deviation": 0.03562914754 }, "Social": { "Score": 0.283882949, "Standard Deviation": 0.03336901148 } } }, { "config": { "model_name": "vicuna-33b", "organization": "LMSYS", "license": "Non-commercial", "knowledge_cutoff": "2023-12" }, "results": { "OVERALL": { "Score": 0.316543555, "Standard Deviation": 0.08922095647 }, "Geometry": { "Score": 0.208284679, "Standard Deviation": 0.03937771461 }, "Algebra": { "Score": 0.248994048, "Standard Deviation": 0.02668175054 }, "Probability": { "Score": 0.222313995, "Standard Deviation": 0.03978859759 }, "Logical": { "Score": 0.180291222, "Standard Deviation": 0.021886267 }, "Social": { "Score": 0.257623798, "Standard Deviation": 0.02653724437 } } }, { "config": { "model_name": "gemma-7b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2023-12" }, "results": { "OVERALL": { "Score": 0.285077558, "Standard Deviation": 0.08871758453 }, "Geometry": { "Score": 0.244791417, "Standard Deviation": 0.0289612078 }, "Algebra": { "Score": 0.250614794, "Standard Deviation": 0.01991678295 }, "Probability": { "Score": 0.174313053, "Standard Deviation": 0.03765424728 }, "Logical": { "Score": 0.197505536, "Standard Deviation": 0.02050298885 }, "Social": { "Score": 0.202138025, "Standard Deviation": 0.02098346639 } } }, { "config": { "model_name": "mistral-7b-instruct-2", "organization": "Mistral", "license": "Apache 2.0", "knowledge_cutoff": "2023-12" }, "results": { "OVERALL": { "Score": 0.427513868, "Standard Deviation": 0.05553921135 }, "Geometry": { "Score": 0.216402626, "Standard Deviation": 0.03338414918 }, "Algebra": { "Score": 0.233777838, "Standard Deviation": 0.0155226054 }, "Probability": { "Score": 0.25118175, "Standard Deviation": 0.04065514593 }, "Logical": { "Score": 0.224469136, "Standard Deviation": 0.03404706752 }, "Social": { "Score": 0.209386782, "Standard Deviation": 0.02738569921 } } }, { "config": { "model_name": "mistral-7b-instruct-1", "organization": "Mistral", "license": "Apache 2.0", "knowledge_cutoff": "2023-12" }, "results": { "OVERALL": { "Score": 0.23016314, "Standard Deviation": 0.07137625271 }, "Geometry": { "Score": 0.161799938, "Standard Deviation": 0.03595278559 }, "Algebra": { "Score": 0.210341624, "Standard Deviation": 0.01736539119 }, "Probability": { "Score": 0.238417922, "Standard Deviation": 0.03744211933 }, "Logical": { "Score": 0.142636601, "Standard Deviation": 0.02080406365 }, "Social": { "Score": 0.117646827, "Standard Deviation": 0.009321202779 } } }, { "config": { "model_name": "vicuna-13b", "organization": "LMSYS", "license": "Non-commercial", "knowledge_cutoff": "2023-11" }, "results": { "OVERALL": { "Score": 0.201892849, "Standard Deviation": 0.06021749802 }, "Geometry": { "Score": 0.200941928, "Standard Deviation": 0.03366817781 }, "Algebra": { "Score": 0.196123323, "Standard Deviation": 0.0135715643 }, "Probability": { "Score": 0.141214079, "Standard Deviation": 0.02721328211 }, "Logical": { "Score": 0.148598631, "Standard Deviation": 0.02241523892 }, "Social": { "Score": 0.124655135, "Standard Deviation": 0.01122382671 } } }, { "config": { "model_name": "zephyr-7b-beta", "organization": "HuggingFace", "license": "MIT", "knowledge_cutoff": "2023-10" }, "results": { "OVERALL": { "Score": 0.102705119, "Standard Deviation": 0.03683757312 }, "Geometry": { "Score": 0.114005544, "Standard Deviation": 0.03144354365 }, "Algebra": { "Score": 0.141766633, "Standard Deviation": 0.03179520129 }, "Probability": { "Score": 0.089050714, "Standard Deviation": 0.002136754266 }, "Logical": { "Score": 0.069520789, "Standard Deviation": 0.004477840857 }, "Social": { "Score": 0.0, "Standard Deviation": 0.0 } } }, { "config": { "model_name": "gemma-1.1-2b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2023-12" }, "results": { "OVERALL": { "Score": 0.257700845, "Standard Deviation": 0.07369021445 }, "Geometry": { "Score": 0.183974034, "Standard Deviation": 0.0215548886 }, "Algebra": { "Score": 0.13422252, "Standard Deviation": 0.01922819511 }, "Probability": { "Score": 0.095628657, "Standard Deviation": 0.007536076456 }, "Logical": { "Score": 0.094965074, "Standard Deviation": 0.005019175487 }, "Social": { "Score": 0.167796727, "Standard Deviation": 0.01666541942 } } }, { "config": { "model_name": "llama2-7b-chat", "organization": "Meta", "license": "Llama 2 Community", "knowledge_cutoff": "2023-10" }, "results": { "OVERALL": { "Score": 0.260189428, "Standard Deviation": 0.08019299364 }, "Geometry": { "Score": 0.087067276, "Standard Deviation": 0.04274343402 }, "Algebra": { "Score": 0.12308805, "Standard Deviation": 0.01856053622 }, "Probability": { "Score": 0.087515438, "Standard Deviation": 0.006315053573 }, "Logical": { "Score": 0.17312827, "Standard Deviation": 0.01867044092 }, "Social": { "Score": 0.152905272, "Standard Deviation": 0.007166957097 } } }, { "config": { "model_name": "gemma-2b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2023-11" }, "results": { "OVERALL": { "Score": 0.234172069, "Standard Deviation": 0.06522685718 }, "Geometry": { "Score": 0.198571153, "Standard Deviation": 0.01699161031 }, "Algebra": { "Score": 0.109883009, "Standard Deviation": 0.01520005833 }, "Probability": { "Score": 0.06467432, "Standard Deviation": 0.002117497231 }, "Logical": { "Score": 0.039624492, "Standard Deviation": 0.007606972686 }, "Social": { "Score": 0.087452913, "Standard Deviation": 0.008170146562 } } }, { "config": { "model_name": "llama2-13b-chat", "organization": "Meta", "license": "Llama 2 Community", "knowledge_cutoff": "2023-12" }, "results": { "OVERALL": { "Score": 0.263305684, "Standard Deviation": 0.07283640689 }, "Geometry": { "Score": 0.072729954, "Standard Deviation": 0.02315988261 }, "Algebra": { "Score": 0.080371692, "Standard Deviation": 0.01277569453 }, "Probability": { "Score": 0.117757344, "Standard Deviation": 0.02418619619 }, "Logical": { "Score": 0.193149889, "Standard Deviation": 0.01776690764 }, "Social": { "Score": 0.149125922, "Standard Deviation": 0.01157416827 } } }, { "config": { "model_name": "vicuna-7b", "organization": "LMSYS", "license": "Non-commercial", "knowledge_cutoff": "2023-11" }, "results": { "OVERALL": { "Score": 0.198839786, "Standard Deviation": 0.05725381576 }, "Geometry": { "Score": 0.083457058, "Standard Deviation": 0.02520989111 }, "Algebra": { "Score": 0.070883882, "Standard Deviation": 0.007315853253 }, "Probability": { "Score": 0.080987673, "Standard Deviation": 0.005474288861 }, "Logical": { "Score": 0.100065588, "Standard Deviation": 0.003561886452 }, "Social": { "Score": 0.111076414, "Standard Deviation": 0.004805626512 } } }, { "config": { "model_name": "koala-13b", "organization": "UC Berkeley", "license": "Non-commercial", "knowledge_cutoff": "2023-10" }, "results": { "OVERALL": { "Score": 0.09387188, "Standard Deviation": 0.02642167489 }, "Geometry": { "Score": 0.017374001, "Standard Deviation": 0.01747053557 }, "Algebra": { "Score": 0.018129197, "Standard Deviation": 0.01054371383 }, "Probability": { "Score": 0.043654362, "Standard Deviation": 0.004288231886 }, "Logical": { "Score": 0.074694053, "Standard Deviation": 0.002674646998 }, "Social": { "Score": 0.096983835, "Standard Deviation": 0.007847059783 } } }, { "config": { "model_name": "openassistant-pythia-12b", "organization": "OpenAssistant", "license": "Non-commercial", "knowledge_cutoff": "2023-09" }, "results": { "OVERALL": { "Score": 0.0, "Standard Deviation": 0.0 }, "Geometry": { "Score": 0.0, "Standard Deviation": 0.0 }, "Algebra": { "Score": 0.0, "Standard Deviation": 0.0 }, "Probability": { "Score": 0.0, "Standard Deviation": 0.0 }, "Logical": { "Score": 0.0, "Standard Deviation": 0.0 }, "Social": { "Score": 0.030792528, "Standard Deviation": 0.007518796391 } } } ]