de-arena / src /results /models_2024-10-08-03:10:26.811832.jsonl
yzabc007's picture
Update space
190ad0c
raw
history blame
43.7 kB
[
{
"config": {
"model_name": "ChatGPT-4o-latest (2024-09-03)",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Score": 0.974329609,
"Standard Deviation": 0.005024959031
},
"Geometry": {
"Score": 0.976028578,
"Standard Deviation": 0.01507912373
},
"Algebra": {
"Score": 0.951199453,
"Standard Deviation": 0.08452452108
},
"Probability": {
"Score": 0.842116641,
"Standard Deviation": 0.006267759054
},
"Logical": {
"Score": 0.828490728,
"Standard Deviation": 0.009134213144
},
"Social": {
"Score": 0.815902987,
"Standard Deviation": 0.0196254222
}
}
},
{
"config": {
"model_name": "gpt-4o-2024-08-06",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Score": 0.846571548,
"Standard Deviation": 0.03394056554
},
"Geometry": {
"Score": 0.99773096,
"Standard Deviation": 0.002835555172
},
"Algebra": {
"Score": 1.0,
"Standard Deviation": 0.0
},
"Probability": {
"Score": 0.78855795,
"Standard Deviation": 0.008188675452
},
"Logical": {
"Score": 0.668635768,
"Standard Deviation": 0.03466314094
},
"Social": {
"Score": 0.680417314,
"Standard Deviation": 0.00656867063
}
}
},
{
"config": {
"model_name": "gpt-4o-2024-05-13",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Score": 0.846334477,
"Standard Deviation": 0.09377911572
},
"Geometry": {
"Score": 0.972472377,
"Standard Deviation": 0.01648274205
},
"Algebra": {
"Score": 0.995511298,
"Standard Deviation": 0.004097802515
},
"Probability": {
"Score": 0.812149974,
"Standard Deviation": 0.007669585485
},
"Logical": {
"Score": 0.755019692,
"Standard Deviation": 0.008149588572
},
"Social": {
"Score": 0.609875087,
"Standard Deviation": 0.038729239
}
}
},
{
"config": {
"model_name": "gpt-4-turbo-2024-04-09",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Score": 0.855357972,
"Standard Deviation": 0.1016986368
},
"Geometry": {
"Score": 0.95374588,
"Standard Deviation": 0.03109307166
},
"Algebra": {
"Score": 0.930945223,
"Standard Deviation": 0.06705136813
},
"Probability": {
"Score": 0.750705448,
"Standard Deviation": 0.05944483103
},
"Logical": {
"Score": 0.77906699,
"Standard Deviation": 0.007406734161
},
"Social": {
"Score": 0.715935163,
"Standard Deviation": 0.1209141409
}
}
},
{
"config": {
"model_name": "gemini-1.5-pro-001",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2024-01"
},
"results": {
"OVERALL": {
"Score": 0.797187842,
"Standard Deviation": 0.0272375249
},
"Geometry": {
"Score": 0.9947169,
"Standard Deviation": 0.009150597621
},
"Algebra": {
"Score": 0.857464301,
"Standard Deviation": 0.05014285338
},
"Probability": {
"Score": 0.651781767,
"Standard Deviation": 0.04156998547
},
"Logical": {
"Score": 0.739745471,
"Standard Deviation": 0.01631532019
},
"Social": {
"Score": 0.649601885,
"Standard Deviation": 0.104854889
}
}
},
{
"config": {
"model_name": "qwen2-72b-instruct",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024-02"
},
"results": {
"OVERALL": {
"Score": 0.737918558,
"Standard Deviation": 0.09069077339
},
"Geometry": {
"Score": 0.796870305,
"Standard Deviation": 0.0509025346
},
"Algebra": {
"Score": 0.836194231,
"Standard Deviation": 0.04517093028
},
"Probability": {
"Score": 0.788068004,
"Standard Deviation": 0.007288989044
},
"Logical": {
"Score": 0.619300904,
"Standard Deviation": 0.06377931612
},
"Social": {
"Score": 0.652578786,
"Standard Deviation": 0.04259293171
}
}
},
{
"config": {
"model_name": "gpt-4o-mini-2024-07-18",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2024-07"
},
"results": {
"OVERALL": {
"Score": 0.847694133,
"Standard Deviation": 0.02164304402
},
"Geometry": {
"Score": 0.946650435,
"Standard Deviation": 0.01831236482
},
"Algebra": {
"Score": 0.796243022,
"Standard Deviation": 0.05537539202
},
"Probability": {
"Score": 0.798402685,
"Standard Deviation": 0.009404491967
},
"Logical": {
"Score": 0.727009735,
"Standard Deviation": 0.02628110141
},
"Social": {
"Score": 0.691949855,
"Standard Deviation": 0.02072934333
}
}
},
{
"config": {
"model_name": "claude-3.5-sonnet",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2024-03"
},
"results": {
"OVERALL": {
"Score": 0.839004422,
"Standard Deviation": 0.1461079564
},
"Geometry": {
"Score": 0.95316419,
"Standard Deviation": 0.02081192856
},
"Algebra": {
"Score": 0.759789952,
"Standard Deviation": 0.02611765096
},
"Probability": {
"Score": 0.707730127,
"Standard Deviation": 0.0394436664
},
"Logical": {
"Score": 0.77342666,
"Standard Deviation": 0.002892426458
},
"Social": {
"Score": 0.790002247,
"Standard Deviation": 0.1007410022
}
}
},
{
"config": {
"model_name": "o1-mini",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2024-01"
},
"results": {
"OVERALL": {
"Score": 1.0,
"Standard Deviation": 0.0
},
"Geometry": {
"Score": "N/A",
"Standard Deviation": "N/A"
},
"Algebra": {
"Score": "N/A",
"Standard Deviation": "N/A"
},
"Probability": {
"Score": 1.0,
"Standard Deviation": 0.0
},
"Logical": {
"Score": 1.0,
"Standard Deviation": 0.0
},
"Social": {
"Score": 0.993974241,
"Standard Deviation": 0.001996882328
}
}
},
{
"config": {
"model_name": "o1-preview",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2024-01"
},
"results": {
"OVERALL": {
"Score": 0.945884589,
"Standard Deviation": 0.01059250762
},
"Geometry": {
"Score": "N/A",
"Standard Deviation": "N/A"
},
"Algebra": {
"Score": "N/A",
"Standard Deviation": "N/A"
},
"Probability": {
"Score": 0.964666392,
"Standard Deviation": 0.003139983398
},
"Logical": {
"Score": 0.987950057,
"Standard Deviation": 0.004881220327
},
"Social": {
"Score": 1.0,
"Standard Deviation": 0.0
}
}
},
{
"config": {
"model_name": "gemini-1.5-flash-001",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2024-02"
},
"results": {
"OVERALL": {
"Score": 0.726493401,
"Standard Deviation": 0.01113913725
},
"Geometry": {
"Score": 0.804144103,
"Standard Deviation": 0.1327142178
},
"Algebra": {
"Score": 0.731776765,
"Standard Deviation": 0.02594657111
},
"Probability": {
"Score": 0.614461891,
"Standard Deviation": 0.04690131826
},
"Logical": {
"Score": 0.630805991,
"Standard Deviation": 0.04871350612
},
"Social": {
"Score": 0.555933822,
"Standard Deviation": 0.1029934524
}
}
},
{
"config": {
"model_name": "gpt4-1106",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2024-04"
},
"results": {
"OVERALL": {
"Score": 0.816347784,
"Standard Deviation": 0.1566815755
},
"Geometry": {
"Score": 0.71843088,
"Standard Deviation": 0.04778038294
},
"Algebra": {
"Score": 0.712910417,
"Standard Deviation": 0.02581828898
},
"Probability": {
"Score": 0.623947619,
"Standard Deviation": 0.03502982933
},
"Logical": {
"Score": 0.637482274,
"Standard Deviation": 0.04158809888
},
"Social": {
"Score": 0.450609816,
"Standard Deviation": 0.05208655446
}
}
},
{
"config": {
"model_name": "gemma-2-27b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024-03"
},
"results": {
"OVERALL": {
"Score": 0.624169623,
"Standard Deviation": 0.1048365121
},
"Geometry": {
"Score": 0.60112744,
"Standard Deviation": 0.0469109952
},
"Algebra": {
"Score": 0.687955914,
"Standard Deviation": 0.01959958192
},
"Probability": {
"Score": 0.589524771,
"Standard Deviation": 0.03112689325
},
"Logical": {
"Score": 0.614978944,
"Standard Deviation": 0.05710657859
},
"Social": {
"Score": 0.487844257,
"Standard Deviation": 0.05857760809
}
}
},
{
"config": {
"model_name": "claude-3-opus",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2024-01"
},
"results": {
"OVERALL": {
"Score": 0.650636271,
"Standard Deviation": 0.1197773541
},
"Geometry": {
"Score": 0.7215743,
"Standard Deviation": 0.04712598358
},
"Algebra": {
"Score": 0.68777327,
"Standard Deviation": 0.02382683713
},
"Probability": {
"Score": 0.626471421,
"Standard Deviation": 0.02911817976
},
"Logical": {
"Score": 0.692346381,
"Standard Deviation": 0.03617185198
},
"Social": {
"Score": 0.663410854,
"Standard Deviation": 0.09540220876
}
}
},
{
"config": {
"model_name": "gemma-2-9b-it-simpo",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024-02"
},
"results": {
"OVERALL": {
"Score": "N/A",
"Standard Deviation": "N/A"
},
"Geometry": {
"Score": 0.582787508,
"Standard Deviation": 0.03965204074
},
"Algebra": {
"Score": 0.658648133,
"Standard Deviation": 0.02565919856
},
"Probability": {
"Score": 0.547861265,
"Standard Deviation": 0.02885209131
},
"Logical": {
"Score": 0.540720893,
"Standard Deviation": 0.01970134508
},
"Social": {
"Score": 0.635266187,
"Standard Deviation": 0.03620021751
}
}
},
{
"config": {
"model_name": "qwen1.5-72b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024-03"
},
"results": {
"OVERALL": {
"Score": 0.519549796,
"Standard Deviation": 0.00903634343
},
"Geometry": {
"Score": 0.543139301,
"Standard Deviation": 0.03425202326
},
"Algebra": {
"Score": 0.635228729,
"Standard Deviation": 0.01944043425
},
"Probability": {
"Score": 0.486948658,
"Standard Deviation": 0.06064655315
},
"Logical": {
"Score": 0.284069394,
"Standard Deviation": 0.02686608506
},
"Social": {
"Score": 0.415007627,
"Standard Deviation": 0.03920053159
}
}
},
{
"config": {
"model_name": "qwen1.5-32b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024-03"
},
"results": {
"OVERALL": {
"Score": 0.393789407,
"Standard Deviation": 0.05413770095
},
"Geometry": {
"Score": 0.51086835,
"Standard Deviation": 0.04052471998
},
"Algebra": {
"Score": 0.609003168,
"Standard Deviation": 0.04874143541
},
"Probability": {
"Score": 0.476300002,
"Standard Deviation": 0.05322403912
},
"Logical": {
"Score": 0.331781014,
"Standard Deviation": 0.004938997686
},
"Social": {
"Score": 0.380987334,
"Standard Deviation": 0.03762251776
}
}
},
{
"config": {
"model_name": "google-gemma-2-9b-it",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2024-01"
},
"results": {
"OVERALL": {
"Score": 0.489663449,
"Standard Deviation": 0.002595702019
},
"Geometry": {
"Score": 0.575371308,
"Standard Deviation": 0.03556220251
},
"Algebra": {
"Score": 0.597045661,
"Standard Deviation": 0.0313828123
},
"Probability": {
"Score": 0.589221807,
"Standard Deviation": 0.03110811656
},
"Logical": {
"Score": 0.587579897,
"Standard Deviation": 0.05512716783
},
"Social": {
"Score": 0.768337958,
"Standard Deviation": 0.04078610476
}
}
},
{
"config": {
"model_name": "yi-1.5-34b-chat",
"organization": "01 AI",
"license": "Proprietary",
"knowledge_cutoff": "2024-01"
},
"results": {
"OVERALL": {
"Score": 0.607812897,
"Standard Deviation": 0.1440881293
},
"Geometry": {
"Score": 0.566666724,
"Standard Deviation": 0.04001381658
},
"Algebra": {
"Score": 0.590997292,
"Standard Deviation": 0.03594087315
},
"Probability": {
"Score": 0.589524589,
"Standard Deviation": 0.03112618772
},
"Logical": {
"Score": 0.574105508,
"Standard Deviation": 0.03441737941
},
"Social": {
"Score": 0.516980832,
"Standard Deviation": 0.03369347985
}
}
},
{
"config": {
"model_name": "meta-llama-3.1-8b-instruct",
"organization": "Meta",
"license": "Llama 3.1 Community",
"knowledge_cutoff": "2024-02"
},
"results": {
"OVERALL": {
"Score": 0.505936324,
"Standard Deviation": 0.05286756493
},
"Geometry": {
"Score": 0.522442162,
"Standard Deviation": 0.03908236317
},
"Algebra": {
"Score": 0.582702645,
"Standard Deviation": 0.05002277711
},
"Probability": {
"Score": 0.495001149,
"Standard Deviation": 0.05244587037
},
"Logical": {
"Score": 0.443030561,
"Standard Deviation": 0.01343820628
},
"Social": {
"Score": 0.329195941,
"Standard Deviation": 0.03925019528
}
}
},
{
"config": {
"model_name": "gpt3.5-turbo-0125",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023-12"
},
"results": {
"OVERALL": {
"Score": 0.313398088,
"Standard Deviation": 0.09322528606
},
"Geometry": {
"Score": 0.678714519,
"Standard Deviation": 0.05926546762
},
"Algebra": {
"Score": 0.569296173,
"Standard Deviation": 0.05277281097
},
"Probability": {
"Score": 0.448460767,
"Standard Deviation": 0.05768095196
},
"Logical": {
"Score": 0.148521348,
"Standard Deviation": 0.04033712907
},
"Social": {
"Score": 0.235071541,
"Standard Deviation": 0.02632892457
}
}
},
{
"config": {
"model_name": "llama-3-70b-instruct",
"organization": "Meta",
"license": "Llama 3 Community",
"knowledge_cutoff": "2024-03"
},
"results": {
"OVERALL": {
"Score": 0.456689885,
"Standard Deviation": 0.01385989995
},
"Geometry": {
"Score": 0.516865529,
"Standard Deviation": 0.03858112564
},
"Algebra": {
"Score": 0.566756531,
"Standard Deviation": 0.03369826926
},
"Probability": {
"Score": 0.513857306,
"Standard Deviation": 0.05453699062
},
"Logical": {
"Score": 0.713796415,
"Standard Deviation": 0.02031215107
},
"Social": {
"Score": 0.45872939,
"Standard Deviation": 0.05347039576
}
}
},
{
"config": {
"model_name": "claude-3-sonnet",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2024-02"
},
"results": {
"OVERALL": {
"Score": 0.520010833,
"Standard Deviation": 0.005030563799
},
"Geometry": {
"Score": 0.675613638,
"Standard Deviation": 0.05275594408
},
"Algebra": {
"Score": 0.552025728,
"Standard Deviation": 0.04122192409
},
"Probability": {
"Score": 0.516192848,
"Standard Deviation": 0.04152293217
},
"Logical": {
"Score": 0.588545747,
"Standard Deviation": 0.06068211943
},
"Social": {
"Score": 0.570437582,
"Standard Deviation": 0.08607040862
}
}
},
{
"config": {
"model_name": "qwen1.5-14b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024-01"
},
"results": {
"OVERALL": {
"Score": 0.415328996,
"Standard Deviation": 0.0743938717
},
"Geometry": {
"Score": 0.452504016,
"Standard Deviation": 0.04225594393
},
"Algebra": {
"Score": 0.538655725,
"Standard Deviation": 0.03721542594
},
"Probability": {
"Score": 0.397185975,
"Standard Deviation": 0.05607695946
},
"Logical": {
"Score": 0.264573129,
"Standard Deviation": 0.03936133174
},
"Social": {
"Score": 0.287370142,
"Standard Deviation": 0.04264085315
}
}
},
{
"config": {
"model_name": "claude-3-haiku",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2024-01"
},
"results": {
"OVERALL": {
"Score": 0.453901163,
"Standard Deviation": 0.003604084261
},
"Geometry": {
"Score": 0.607993912,
"Standard Deviation": 0.05793460748
},
"Algebra": {
"Score": 0.520054055,
"Standard Deviation": 0.03333544511
},
"Probability": {
"Score": 0.474460688,
"Standard Deviation": 0.0446501933
},
"Logical": {
"Score": 0.512815976,
"Standard Deviation": 0.0163264281
},
"Social": {
"Score": 0.551083976,
"Standard Deviation": 0.05374722539
}
}
},
{
"config": {
"model_name": "claude-2.1",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2023-12"
},
"results": {
"OVERALL": {
"Score": 0.35814708,
"Standard Deviation": 0.09168134168
},
"Geometry": {
"Score": 0.62752395,
"Standard Deviation": 0.07232659398
},
"Algebra": {
"Score": 0.508849609,
"Standard Deviation": 0.0346897465
},
"Probability": {
"Score": 0.41477086,
"Standard Deviation": 0.05964060239
},
"Logical": {
"Score": 0.482923674,
"Standard Deviation": 0.01989147048
},
"Social": {
"Score": 0.333804568,
"Standard Deviation": 0.03775548253
}
}
},
{
"config": {
"model_name": "mistral-8x7b-instruct-v0.1",
"organization": "Mistral",
"license": "Apache 2.0",
"knowledge_cutoff": "2023-12"
},
"results": {
"OVERALL": {
"Score": 0.382659161,
"Standard Deviation": 0.07594496929
},
"Geometry": {
"Score": 0.432216097,
"Standard Deviation": 0.04747949254
},
"Algebra": {
"Score": 0.478314888,
"Standard Deviation": 0.01998797419
},
"Probability": {
"Score": 0.427144725,
"Standard Deviation": 0.0590923329
},
"Logical": {
"Score": 0.340041983,
"Standard Deviation": 0.008397574592
},
"Social": {
"Score": 0.251949622,
"Standard Deviation": 0.03346674405
}
}
},
{
"config": {
"model_name": "claude-2.0",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2023-10"
},
"results": {
"OVERALL": {
"Score": 0.322718057,
"Standard Deviation": 0.08369883584
},
"Geometry": {
"Score": 0.604141967,
"Standard Deviation": 0.05116441826
},
"Algebra": {
"Score": 0.474350734,
"Standard Deviation": 0.01510393066
},
"Probability": {
"Score": 0.437950412,
"Standard Deviation": 0.05985594317
},
"Logical": {
"Score": 0.445620646,
"Standard Deviation": 0.01812614805
},
"Social": {
"Score": 0.469422836,
"Standard Deviation": 0.05999901796
}
}
},
{
"config": {
"model_name": "starling-lm-7b-beta",
"organization": "Nexusflow",
"license": "Apache-2.0",
"knowledge_cutoff": "2024-01"
},
"results": {
"OVERALL": {
"Score": 0.479391856,
"Standard Deviation": 0.04199990887
},
"Geometry": {
"Score": 0.446654388,
"Standard Deviation": 0.05637864999
},
"Algebra": {
"Score": 0.473952749,
"Standard Deviation": 0.01584301288
},
"Probability": {
"Score": 0.395197837,
"Standard Deviation": 0.05814798892
},
"Logical": {
"Score": 0.39927199,
"Standard Deviation": 0.02125277518
},
"Social": {
"Score": 0.380021662,
"Standard Deviation": 0.04622452748
}
}
},
{
"config": {
"model_name": "gemini-1.0-pro-001",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2023-11"
},
"results": {
"OVERALL": {
"Score": 0.449040654,
"Standard Deviation": 0.0450610177
},
"Geometry": {
"Score": 0.578347959,
"Standard Deviation": 0.04242873607
},
"Algebra": {
"Score": 0.462417786,
"Standard Deviation": 0.01668313635
},
"Probability": {
"Score": 0.289836324,
"Standard Deviation": 0.05739831115
},
"Logical": {
"Score": 0.191140355,
"Standard Deviation": 0.03394652499
},
"Social": {
"Score": 0.130790863,
"Standard Deviation": 0.02800188173
}
}
},
{
"config": {
"model_name": "openchat-3.5-0106",
"organization": "OpenChat",
"license": "Apache-2.0",
"knowledge_cutoff": "2024-01"
},
"results": {
"OVERALL": {
"Score": 0.363929888,
"Standard Deviation": 0.08602347145
},
"Geometry": {
"Score": 0.38715246,
"Standard Deviation": 0.03701851946
},
"Algebra": {
"Score": 0.441233712,
"Standard Deviation": 0.01135753754
},
"Probability": {
"Score": 0.38802618,
"Standard Deviation": 0.05663879714
},
"Logical": {
"Score": 0.336754383,
"Standard Deviation": 0.01608478079
},
"Social": {
"Score": 0.250891608,
"Standard Deviation": 0.03253769914
}
}
},
{
"config": {
"model_name": "openchat-3.5",
"organization": "OpenChat",
"license": "Apache-2.0",
"knowledge_cutoff": "2023-12"
},
"results": {
"OVERALL": {
"Score": 0.361341296,
"Standard Deviation": 0.09034869493
},
"Geometry": {
"Score": 0.401699069,
"Standard Deviation": 0.03410726557
},
"Algebra": {
"Score": 0.414095336,
"Standard Deviation": 0.01881964261
},
"Probability": {
"Score": 0.349601002,
"Standard Deviation": 0.05077455539
},
"Logical": {
"Score": 0.331069242,
"Standard Deviation": 0.02180827173
},
"Social": {
"Score": 0.319991655,
"Standard Deviation": 0.04502478724
}
}
},
{
"config": {
"model_name": "command-r-(08-2024)",
"organization": "Cohere",
"license": "CC-BY-NC-4.0",
"knowledge_cutoff": "2024-08"
},
"results": {
"OVERALL": {
"Score": 0.427605298,
"Standard Deviation": 0.01747449163
},
"Geometry": {
"Score": 0.448300727,
"Standard Deviation": 0.04996362328
},
"Algebra": {
"Score": 0.417519167,
"Standard Deviation": 0.01822196902
},
"Probability": {
"Score": 0.366336281,
"Standard Deviation": 0.04716826942
},
"Logical": {
"Score": 0.214657906,
"Standard Deviation": 0.03003579835
},
"Social": {
"Score": 0.276088379,
"Standard Deviation": 0.03295234688
}
}
},
{
"config": {
"model_name": "gemma-1.1-7b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2023-11"
},
"results": {
"OVERALL": {
"Score": 0.339506922,
"Standard Deviation": 0.1066279108
},
"Geometry": {
"Score": 0.324170977,
"Standard Deviation": 0.04668553765
},
"Algebra": {
"Score": 0.398684697,
"Standard Deviation": 0.01982398259
},
"Probability": {
"Score": 0.293253175,
"Standard Deviation": 0.05126192191
},
"Logical": {
"Score": 0.317750796,
"Standard Deviation": 0.01101933543
},
"Social": {
"Score": 0.179073276,
"Standard Deviation": 0.02009658805
}
}
},
{
"config": {
"model_name": "llama3-8b-instruct",
"organization": "Meta",
"license": "Llama 3 Community",
"knowledge_cutoff": "2024-01"
},
"results": {
"OVERALL": {
"Score": 0.367722676,
"Standard Deviation": 0.1071368221
},
"Geometry": {
"Score": 0.367143758,
"Standard Deviation": 0.04363680358
},
"Algebra": {
"Score": 0.391480973,
"Standard Deviation": 0.02757445266
},
"Probability": {
"Score": 0.317616445,
"Standard Deviation": 0.04300430361
},
"Logical": {
"Score": 0.461607495,
"Standard Deviation": 0.02185028842
},
"Social": {
"Score": 0.336373622,
"Standard Deviation": 0.05762408512
}
}
},
{
"config": {
"model_name": "gemma-2-2b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2023-12"
},
"results": {
"OVERALL": {
"Score": 0.502167612,
"Standard Deviation": 0.04389786763
},
"Geometry": {
"Score": 0.395006676,
"Standard Deviation": 0.05882607713
},
"Algebra": {
"Score": 0.379391887,
"Standard Deviation": 0.01722410785
},
"Probability": {
"Score": 0.331231097,
"Standard Deviation": 0.05392499987
},
"Logical": {
"Score": 0.367687789,
"Standard Deviation": 0.02547968808
},
"Social": {
"Score": 0.393482094,
"Standard Deviation": 0.06450214024
}
}
},
{
"config": {
"model_name": "starling-lm-7b-alpha",
"organization": "Nexusflow",
"license": "Apache-2.0",
"knowledge_cutoff": "2023-12"
},
"results": {
"OVERALL": {
"Score": 0.366628765,
"Standard Deviation": 0.08405492929
},
"Geometry": {
"Score": 0.336782578,
"Standard Deviation": 0.04069449132
},
"Algebra": {
"Score": 0.371551932,
"Standard Deviation": 0.03367241745
},
"Probability": {
"Score": 0.331472505,
"Standard Deviation": 0.04833324282
},
"Logical": {
"Score": 0.260869624,
"Standard Deviation": 0.03562735237
},
"Social": {
"Score": 0.271975534,
"Standard Deviation": 0.04266753408
}
}
},
{
"config": {
"model_name": "qwen1.5-4b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024-02"
},
"results": {
"OVERALL": {
"Score": 0.111876411,
"Standard Deviation": 0.04241022785
},
"Geometry": {
"Score": 0.215834522,
"Standard Deviation": 0.0363766363
},
"Algebra": {
"Score": 0.305589811,
"Standard Deviation": 0.02354198912
},
"Probability": {
"Score": 0.149365327,
"Standard Deviation": 0.03489672675
},
"Logical": {
"Score": 0.116210168,
"Standard Deviation": 0.005927966496
},
"Social": {
"Score": 0.18195615,
"Standard Deviation": 0.02269805277
}
}
},
{
"config": {
"model_name": "command-r-(04-2024)",
"organization": "Cohere",
"license": "CC-BY-NC-4.0",
"knowledge_cutoff": "2024-04"
},
"results": {
"OVERALL": {
"Score": 0.388783887,
"Standard Deviation": 0.07417186783
},
"Geometry": {
"Score": 0.300416698,
"Standard Deviation": 0.03485612736
},
"Algebra": {
"Score": 0.293120231,
"Standard Deviation": 0.032926484
},
"Probability": {
"Score": 0.281271304,
"Standard Deviation": 0.05697149867
},
"Logical": {
"Score": 0.276189906,
"Standard Deviation": 0.03562914754
},
"Social": {
"Score": 0.283882949,
"Standard Deviation": 0.03336901148
}
}
},
{
"config": {
"model_name": "vicuna-33b",
"organization": "LMSYS",
"license": "Non-commercial",
"knowledge_cutoff": "2023-12"
},
"results": {
"OVERALL": {
"Score": 0.316543555,
"Standard Deviation": 0.08922095647
},
"Geometry": {
"Score": 0.208284679,
"Standard Deviation": 0.03937771461
},
"Algebra": {
"Score": 0.248994048,
"Standard Deviation": 0.02668175054
},
"Probability": {
"Score": 0.222313995,
"Standard Deviation": 0.03978859759
},
"Logical": {
"Score": 0.180291222,
"Standard Deviation": 0.021886267
},
"Social": {
"Score": 0.257623798,
"Standard Deviation": 0.02653724437
}
}
},
{
"config": {
"model_name": "gemma-7b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2023-12"
},
"results": {
"OVERALL": {
"Score": 0.285077558,
"Standard Deviation": 0.08871758453
},
"Geometry": {
"Score": 0.244791417,
"Standard Deviation": 0.0289612078
},
"Algebra": {
"Score": 0.250614794,
"Standard Deviation": 0.01991678295
},
"Probability": {
"Score": 0.174313053,
"Standard Deviation": 0.03765424728
},
"Logical": {
"Score": 0.197505536,
"Standard Deviation": 0.02050298885
},
"Social": {
"Score": 0.202138025,
"Standard Deviation": 0.02098346639
}
}
},
{
"config": {
"model_name": "mistral-7b-instruct-2",
"organization": "Mistral",
"license": "Apache 2.0",
"knowledge_cutoff": "2023-12"
},
"results": {
"OVERALL": {
"Score": 0.427513868,
"Standard Deviation": 0.05553921135
},
"Geometry": {
"Score": 0.216402626,
"Standard Deviation": 0.03338414918
},
"Algebra": {
"Score": 0.233777838,
"Standard Deviation": 0.0155226054
},
"Probability": {
"Score": 0.25118175,
"Standard Deviation": 0.04065514593
},
"Logical": {
"Score": 0.224469136,
"Standard Deviation": 0.03404706752
},
"Social": {
"Score": 0.209386782,
"Standard Deviation": 0.02738569921
}
}
},
{
"config": {
"model_name": "mistral-7b-instruct-1",
"organization": "Mistral",
"license": "Apache 2.0",
"knowledge_cutoff": "2023-12"
},
"results": {
"OVERALL": {
"Score": 0.23016314,
"Standard Deviation": 0.07137625271
},
"Geometry": {
"Score": 0.161799938,
"Standard Deviation": 0.03595278559
},
"Algebra": {
"Score": 0.210341624,
"Standard Deviation": 0.01736539119
},
"Probability": {
"Score": 0.238417922,
"Standard Deviation": 0.03744211933
},
"Logical": {
"Score": 0.142636601,
"Standard Deviation": 0.02080406365
},
"Social": {
"Score": 0.117646827,
"Standard Deviation": 0.009321202779
}
}
},
{
"config": {
"model_name": "vicuna-13b",
"organization": "LMSYS",
"license": "Non-commercial",
"knowledge_cutoff": "2023-11"
},
"results": {
"OVERALL": {
"Score": 0.201892849,
"Standard Deviation": 0.06021749802
},
"Geometry": {
"Score": 0.200941928,
"Standard Deviation": 0.03366817781
},
"Algebra": {
"Score": 0.196123323,
"Standard Deviation": 0.0135715643
},
"Probability": {
"Score": 0.141214079,
"Standard Deviation": 0.02721328211
},
"Logical": {
"Score": 0.148598631,
"Standard Deviation": 0.02241523892
},
"Social": {
"Score": 0.124655135,
"Standard Deviation": 0.01122382671
}
}
},
{
"config": {
"model_name": "zephyr-7b-beta",
"organization": "HuggingFace",
"license": "MIT",
"knowledge_cutoff": "2023-10"
},
"results": {
"OVERALL": {
"Score": 0.102705119,
"Standard Deviation": 0.03683757312
},
"Geometry": {
"Score": 0.114005544,
"Standard Deviation": 0.03144354365
},
"Algebra": {
"Score": 0.141766633,
"Standard Deviation": 0.03179520129
},
"Probability": {
"Score": 0.089050714,
"Standard Deviation": 0.002136754266
},
"Logical": {
"Score": 0.069520789,
"Standard Deviation": 0.004477840857
},
"Social": {
"Score": 0.0,
"Standard Deviation": 0.0
}
}
},
{
"config": {
"model_name": "gemma-1.1-2b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2023-12"
},
"results": {
"OVERALL": {
"Score": 0.257700845,
"Standard Deviation": 0.07369021445
},
"Geometry": {
"Score": 0.183974034,
"Standard Deviation": 0.0215548886
},
"Algebra": {
"Score": 0.13422252,
"Standard Deviation": 0.01922819511
},
"Probability": {
"Score": 0.095628657,
"Standard Deviation": 0.007536076456
},
"Logical": {
"Score": 0.094965074,
"Standard Deviation": 0.005019175487
},
"Social": {
"Score": 0.167796727,
"Standard Deviation": 0.01666541942
}
}
},
{
"config": {
"model_name": "llama2-7b-chat",
"organization": "Meta",
"license": "Llama 2 Community",
"knowledge_cutoff": "2023-10"
},
"results": {
"OVERALL": {
"Score": 0.260189428,
"Standard Deviation": 0.08019299364
},
"Geometry": {
"Score": 0.087067276,
"Standard Deviation": 0.04274343402
},
"Algebra": {
"Score": 0.12308805,
"Standard Deviation": 0.01856053622
},
"Probability": {
"Score": 0.087515438,
"Standard Deviation": 0.006315053573
},
"Logical": {
"Score": 0.17312827,
"Standard Deviation": 0.01867044092
},
"Social": {
"Score": 0.152905272,
"Standard Deviation": 0.007166957097
}
}
},
{
"config": {
"model_name": "gemma-2b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2023-11"
},
"results": {
"OVERALL": {
"Score": 0.234172069,
"Standard Deviation": 0.06522685718
},
"Geometry": {
"Score": 0.198571153,
"Standard Deviation": 0.01699161031
},
"Algebra": {
"Score": 0.109883009,
"Standard Deviation": 0.01520005833
},
"Probability": {
"Score": 0.06467432,
"Standard Deviation": 0.002117497231
},
"Logical": {
"Score": 0.039624492,
"Standard Deviation": 0.007606972686
},
"Social": {
"Score": 0.087452913,
"Standard Deviation": 0.008170146562
}
}
},
{
"config": {
"model_name": "llama2-13b-chat",
"organization": "Meta",
"license": "Llama 2 Community",
"knowledge_cutoff": "2023-12"
},
"results": {
"OVERALL": {
"Score": 0.263305684,
"Standard Deviation": 0.07283640689
},
"Geometry": {
"Score": 0.072729954,
"Standard Deviation": 0.02315988261
},
"Algebra": {
"Score": 0.080371692,
"Standard Deviation": 0.01277569453
},
"Probability": {
"Score": 0.117757344,
"Standard Deviation": 0.02418619619
},
"Logical": {
"Score": 0.193149889,
"Standard Deviation": 0.01776690764
},
"Social": {
"Score": 0.149125922,
"Standard Deviation": 0.01157416827
}
}
},
{
"config": {
"model_name": "vicuna-7b",
"organization": "LMSYS",
"license": "Non-commercial",
"knowledge_cutoff": "2023-11"
},
"results": {
"OVERALL": {
"Score": 0.198839786,
"Standard Deviation": 0.05725381576
},
"Geometry": {
"Score": 0.083457058,
"Standard Deviation": 0.02520989111
},
"Algebra": {
"Score": 0.070883882,
"Standard Deviation": 0.007315853253
},
"Probability": {
"Score": 0.080987673,
"Standard Deviation": 0.005474288861
},
"Logical": {
"Score": 0.100065588,
"Standard Deviation": 0.003561886452
},
"Social": {
"Score": 0.111076414,
"Standard Deviation": 0.004805626512
}
}
},
{
"config": {
"model_name": "koala-13b",
"organization": "UC Berkeley",
"license": "Non-commercial",
"knowledge_cutoff": "2023-10"
},
"results": {
"OVERALL": {
"Score": 0.09387188,
"Standard Deviation": 0.02642167489
},
"Geometry": {
"Score": 0.017374001,
"Standard Deviation": 0.01747053557
},
"Algebra": {
"Score": 0.018129197,
"Standard Deviation": 0.01054371383
},
"Probability": {
"Score": 0.043654362,
"Standard Deviation": 0.004288231886
},
"Logical": {
"Score": 0.074694053,
"Standard Deviation": 0.002674646998
},
"Social": {
"Score": 0.096983835,
"Standard Deviation": 0.007847059783
}
}
},
{
"config": {
"model_name": "openassistant-pythia-12b",
"organization": "OpenAssistant",
"license": "Non-commercial",
"knowledge_cutoff": "2023-09"
},
"results": {
"OVERALL": {
"Score": 0.0,
"Standard Deviation": 0.0
},
"Geometry": {
"Score": 0.0,
"Standard Deviation": 0.0
},
"Algebra": {
"Score": 0.0,
"Standard Deviation": 0.0
},
"Probability": {
"Score": 0.0,
"Standard Deviation": 0.0
},
"Logical": {
"Score": 0.0,
"Standard Deviation": 0.0
},
"Social": {
"Score": 0.030792528,
"Standard Deviation": 0.007518796391
}
}
}
]