de-arena / src /results /models_2024-10-20-23:34:57.242641.json
yzabc007's picture
update
8ef75a7
raw
history blame
87 kB
[
{
"config": {
"model_name": "ChatGPT-4o-latest (2024-09-03)",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 93.51557945652831,
"Standard Deviation": 3.1900396436407785,
"Rank": 4
},
"Geometry": {
"Average Score": 81.8536937387725,
"Standard Deviation": null,
"Rank": 5
},
"Algebra": {
"Average Score": 89.3642910524324,
"Standard Deviation": null,
"Rank": 3
},
"Probability": {
"Average Score": 86.55761073510537,
"Standard Deviation": null,
"Rank": 4
},
"Logical": {
"Average Score": 97.39734315785844,
"Standard Deviation": null,
"Rank": 2
},
"Social": {
"Average Score": 91.03727530739368,
"Standard Deviation": null,
"Rank": 7
},
"Chemistry": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"CPP": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
}
}
},
{
"config": {
"model_name": "gpt-4o-2024-08-06",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 79.7806321863411,
"Standard Deviation": 0.8302330946013555,
"Rank": 14
},
"Geometry": {
"Average Score": 86.29041459755453,
"Standard Deviation": null,
"Rank": 2
},
"Algebra": {
"Average Score": 88.53373721863113,
"Standard Deviation": null,
"Rank": 4
},
"Probability": {
"Average Score": 78.694360721361,
"Standard Deviation": null,
"Rank": 7
},
"Logical": {
"Average Score": 78.3116623496895,
"Standard Deviation": null,
"Rank": 12
},
"Social": {
"Average Score": 79.90944696263446,
"Standard Deviation": null,
"Rank": 11
},
"Chemistry": {
"Average Score": 86.96011263543132,
"Standard Deviation": null,
"Rank": 7
},
"CPP": {
"Average Score": 92.43090226400756,
"Standard Deviation": null,
"Rank": 2
}
}
},
{
"config": {
"model_name": "gpt-4o-2024-05-13",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 86.40675398236253,
"Standard Deviation": 6.473604235710212,
"Rank": 9
},
"Geometry": {
"Average Score": 82.42032988843268,
"Standard Deviation": null,
"Rank": 4
},
"Algebra": {
"Average Score": 83.51580675782952,
"Standard Deviation": null,
"Rank": 9
},
"Probability": {
"Average Score": 81.88434691830915,
"Standard Deviation": null,
"Rank": 5
},
"Logical": {
"Average Score": 87.92744931984977,
"Standard Deviation": null,
"Rank": 9
},
"Social": {
"Average Score": 76.12369632852445,
"Standard Deviation": null,
"Rank": 15
},
"Chemistry": {
"Average Score": 90.93459148149344,
"Standard Deviation": null,
"Rank": 4
},
"CPP": {
"Average Score": 79.1592634699295,
"Standard Deviation": null,
"Rank": 6
}
}
},
{
"config": {
"model_name": "gpt-4-turbo-2024-04-09",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 87.17581147282237,
"Standard Deviation": 8.716963621850567,
"Rank": 8
},
"Geometry": {
"Average Score": 78.76635545274637,
"Standard Deviation": null,
"Rank": 7
},
"Algebra": {
"Average Score": 79.96323615621023,
"Standard Deviation": null,
"Rank": 11
},
"Probability": {
"Average Score": 77.65333799733705,
"Standard Deviation": null,
"Rank": 9
},
"Logical": {
"Average Score": 89.33307138659873,
"Standard Deviation": null,
"Rank": 8
},
"Social": {
"Average Score": 76.86597570996584,
"Standard Deviation": null,
"Rank": 14
},
"Chemistry": {
"Average Score": 84.02855687506661,
"Standard Deviation": null,
"Rank": 9
},
"CPP": {
"Average Score": 70.73143363230263,
"Standard Deviation": null,
"Rank": 11
}
}
},
{
"config": {
"model_name": "gemini-1.5-pro-001",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2023/11"
},
"results": {
"OVERALL": {
"Average Score": 80.38345723548734,
"Standard Deviation": 2.4635699815143584,
"Rank": 13
},
"Geometry": {
"Average Score": 84.30455076458965,
"Standard Deviation": null,
"Rank": 3
},
"Algebra": {
"Average Score": 85.9212061409364,
"Standard Deviation": null,
"Rank": 6
},
"Probability": {
"Average Score": 73.11806712394745,
"Standard Deviation": null,
"Rank": 13
},
"Logical": {
"Average Score": 78.27369746632996,
"Standard Deviation": null,
"Rank": 12
},
"Social": {
"Average Score": 79.57606824531047,
"Standard Deviation": null,
"Rank": 13
}
}
},
{
"config": {
"model_name": "qwen2-72b-instruct",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/09"
},
"results": {
"OVERALL": {
"Average Score": 74.44059692248071,
"Standard Deviation": 2.3957041566666697,
"Rank": 16
},
"Geometry": {
"Average Score": 72.58490369919883,
"Standard Deviation": null,
"Rank": 11
},
"Algebra": {
"Average Score": 88.53359632761772,
"Standard Deviation": null,
"Rank": 4
},
"Probability": {
"Average Score": 80.19789976985243,
"Standard Deviation": null,
"Rank": 6
},
"Logical": {
"Average Score": 72.76843081200641,
"Standard Deviation": null,
"Rank": 17
},
"Social": {
"Average Score": 57.256064868444426,
"Standard Deviation": null,
"Rank": 19
},
"Chemistry": {
"Average Score": 75.47190401351077,
"Standard Deviation": null,
"Rank": 12
},
"CPP": {
"Average Score": 73.54037778797029,
"Standard Deviation": null,
"Rank": 7
}
}
},
{
"config": {
"model_name": "gpt-4o-mini-2024-07-18",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 82.82456893277315,
"Standard Deviation": 7.714840109805867,
"Rank": 12
},
"Geometry": {
"Average Score": 78.89323869622943,
"Standard Deviation": null,
"Rank": 6
},
"Algebra": {
"Average Score": 84.8722603687823,
"Standard Deviation": null,
"Rank": 8
},
"Probability": {
"Average Score": 78.6942843346463,
"Standard Deviation": null,
"Rank": 7
},
"Logical": {
"Average Score": 85.68921109829361,
"Standard Deviation": null,
"Rank": 10
},
"Social": {
"Average Score": 81.79892848722542,
"Standard Deviation": null,
"Rank": 10
},
"Chemistry": {
"Average Score": 81.46805623180109,
"Standard Deviation": null,
"Rank": 10
},
"CPP": {
"Average Score": 88.3877070580296,
"Standard Deviation": null,
"Rank": 3
}
}
},
{
"config": {
"model_name": "claude-3.5-sonnet",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2024/04"
},
"results": {
"OVERALL": {
"Average Score": 88.43557924843628,
"Standard Deviation": 5.680338106806327,
"Rank": 7
},
"Geometry": {
"Average Score": 76.26169400931595,
"Standard Deviation": null,
"Rank": 10
},
"Algebra": {
"Average Score": 77.15040433072186,
"Standard Deviation": null,
"Rank": 13
},
"Probability": {
"Average Score": 73.9942759783754,
"Standard Deviation": null,
"Rank": 11
},
"Logical": {
"Average Score": 89.70827617930533,
"Standard Deviation": null,
"Rank": 7
},
"Social": {
"Average Score": 97.3810636467068,
"Standard Deviation": null,
"Rank": 3
},
"Chemistry": {
"Average Score": 94.92819763202698,
"Standard Deviation": null,
"Rank": 3
},
"CPP": {
"Average Score": 82.37734076815008,
"Standard Deviation": null,
"Rank": 5
}
}
},
{
"config": {
"model_name": "o1-mini",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 96.12399889226096,
"Standard Deviation": 0.5674965705992511,
"Rank": 2
},
"Geometry": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Algebra": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Probability": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Logical": {
"Average Score": 96.52089445393929,
"Standard Deviation": null,
"Rank": 3
},
"Social": {
"Average Score": 95.00695256918654,
"Standard Deviation": null,
"Rank": 5
}
}
},
{
"config": {
"model_name": "o1-preview",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 91.08240629161766,
"Standard Deviation": 4.83378135710071,
"Rank": 5
},
"Geometry": {
"Average Score": "N/A",
"Standard Deviation": "N/A",
"Rank": "N/A"
},
"Algebra": {
"Average Score": 98.1870991822192,
"Standard Deviation": null,
"Rank": 2
},
"Probability": {
"Average Score": 94.12657646584134,
"Standard Deviation": null,
"Rank": 2
},
"Logical": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Social": {
"Average Score": 96.56802743955569,
"Standard Deviation": null,
"Rank": 4
}
}
},
{
"config": {
"model_name": "gemini-1.5-flash-001",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2023/11"
},
"results": {
"OVERALL": {
"Average Score": 66.25275609135964,
"Standard Deviation": 2.5314573702881438,
"Rank": 20
},
"Geometry": {
"Average Score": 66.8010242138006,
"Standard Deviation": null,
"Rank": 13
},
"Algebra": {
"Average Score": 78.24639082497596,
"Standard Deviation": null,
"Rank": 12
},
"Probability": {
"Average Score": 67.84602916736804,
"Standard Deviation": null,
"Rank": 15
},
"Logical": {
"Average Score": 72.76845749138818,
"Standard Deviation": null,
"Rank": 17
},
"Social": {
"Average Score": 68.57728479711058,
"Standard Deviation": null,
"Rank": 16
},
"Chemistry": {
"Average Score": 75.47188329078935,
"Standard Deviation": null,
"Rank": 12
},
"CPP": {
"Average Score": 72.1127762005651,
"Standard Deviation": null,
"Rank": 10
}
}
},
{
"config": {
"model_name": "gpt4-1106",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2024/04"
},
"results": {
"OVERALL": {
"Average Score": 85.660054434658,
"Standard Deviation": 7.392502344300497,
"Rank": 10
},
"Geometry": {
"Average Score": 63.36396165140893,
"Standard Deviation": null,
"Rank": 15
},
"Algebra": {
"Average Score": 74.67191687355754,
"Standard Deviation": null,
"Rank": 15
},
"Probability": {
"Average Score": 71.35141952665965,
"Standard Deviation": null,
"Rank": 14
},
"Logical": {
"Average Score": 76.34506017196868,
"Standard Deviation": null,
"Rank": 15
},
"Social": {
"Average Score": 46.00126575332808,
"Standard Deviation": null,
"Rank": 25
},
"Chemistry": {
"Average Score": 78.70156756289569,
"Standard Deviation": null,
"Rank": 11
},
"CPP": {
"Average Score": 69.11824072252848,
"Standard Deviation": null,
"Rank": 12
}
}
},
{
"config": {
"model_name": "gemma-2-27b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/06"
},
"results": {
"OVERALL": {
"Average Score": 70.82622192650408,
"Standard Deviation": 0.18962869075029884,
"Rank": 18
},
"Geometry": {
"Average Score": 58.25724467150374,
"Standard Deviation": null,
"Rank": 16
},
"Algebra": {
"Average Score": 73.71614711121721,
"Standard Deviation": null,
"Rank": 16
},
"Probability": {
"Average Score": 66.08200742339983,
"Standard Deviation": null,
"Rank": 17
},
"Logical": {
"Average Score": 72.76841354275011,
"Standard Deviation": null,
"Rank": 17
},
"Social": {
"Average Score": 53.736358144621576,
"Standard Deviation": null,
"Rank": 21
},
"Chemistry": {
"Average Score": 68.1178055540124,
"Standard Deviation": null,
"Rank": 17
},
"CPP": {
"Average Score": 63.28920072143611,
"Standard Deviation": null,
"Rank": 14
}
}
},
{
"config": {
"model_name": "claude-3-opus",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2023/08"
},
"results": {
"OVERALL": {
"Average Score": 82.28903171580336,
"Standard Deviation": 10.093273304495547,
"Rank": 11
},
"Geometry": {
"Average Score": 57.98602891013921,
"Standard Deviation": null,
"Rank": 17
},
"Algebra": {
"Average Score": 73.54334730242743,
"Standard Deviation": null,
"Rank": 18
},
"Probability": {
"Average Score": 67.8341594991468,
"Standard Deviation": null,
"Rank": 15
},
"Logical": {
"Average Score": 78.31155849680502,
"Standard Deviation": null,
"Rank": 12
},
"Social": {
"Average Score": 90.45833112761075,
"Standard Deviation": null,
"Rank": 8
},
"Chemistry": {
"Average Score": 85.97349470177741,
"Standard Deviation": null,
"Rank": 8
},
"CPP": {
"Average Score": 73.5404403567132,
"Standard Deviation": null,
"Rank": 8
}
}
},
{
"config": {
"model_name": "gemma-2-9b-it-simpo",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/07"
},
"results": {
"OVERALL": {
"Average Score": "N/A",
"Standard Deviation": "N/A",
"Rank": "N/A"
},
"Geometry": {
"Average Score": 52.80896798216458,
"Standard Deviation": null,
"Rank": 19
},
"Algebra": {
"Average Score": 69.60260038105677,
"Standard Deviation": null,
"Rank": 19
},
"Probability": {
"Average Score": 59.52630271491633,
"Standard Deviation": null,
"Rank": 21
},
"Logical": {
"Average Score": 63.57920031465781,
"Standard Deviation": null,
"Rank": 23
},
"Social": {
"Average Score": 79.90950201631269,
"Standard Deviation": null,
"Rank": 11
},
"Chemistry": {
"Average Score": 90.36508196626548,
"Standard Deviation": null,
"Rank": 5
},
"CPP": {
"Average Score": 73.43757596214863,
"Standard Deviation": null,
"Rank": 9
}
}
},
{
"config": {
"model_name": "qwen1.5-72b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/03"
},
"results": {
"OVERALL": {
"Average Score": 65.26710370586439,
"Standard Deviation": 9.198700753743012,
"Rank": 19
},
"Geometry": {
"Average Score": 48.52417714351894,
"Standard Deviation": null,
"Rank": 24
},
"Algebra": {
"Average Score": 68.55765479604507,
"Standard Deviation": null,
"Rank": 20
},
"Probability": {
"Average Score": 49.52382148131357,
"Standard Deviation": null,
"Rank": 26
},
"Logical": {
"Average Score": 37.33563924001827,
"Standard Deviation": null,
"Rank": 35
},
"Social": {
"Average Score": 46.00141195402727,
"Standard Deviation": null,
"Rank": 25
},
"Chemistry": {
"Average Score": 52.625823960166215,
"Standard Deviation": null,
"Rank": 23
},
"CPP": {
"Average Score": 48.69302376665551,
"Standard Deviation": null,
"Rank": 20
}
}
},
{
"config": {
"model_name": "qwen1.5-32b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/03"
},
"results": {
"OVERALL": {
"Average Score": 46.74335731441104,
"Standard Deviation": 4.096227849530709,
"Rank": 28
},
"Geometry": {
"Average Score": 44.96670224519297,
"Standard Deviation": null,
"Rank": 26
},
"Algebra": {
"Average Score": 63.19715848628476,
"Standard Deviation": null,
"Rank": 23
},
"Probability": {
"Average Score": 48.59873650270336,
"Standard Deviation": null,
"Rank": 27
},
"Logical": {
"Average Score": 42.028753105249216,
"Standard Deviation": null,
"Rank": 33
},
"Social": {
"Average Score": 43.183938768454986,
"Standard Deviation": null,
"Rank": 28
},
"Chemistry": {
"Average Score": 47.84488021045937,
"Standard Deviation": null,
"Rank": 26
},
"CPP": {
"Average Score": 45.14284028264288,
"Standard Deviation": null,
"Rank": 24
}
}
},
{
"config": {
"model_name": "google-gemma-2-9b-it",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2024/06"
},
"results": {
"OVERALL": {
"Average Score": 60.71065949101693,
"Standard Deviation": 0.12283018509137462,
"Rank": 23
},
"Geometry": {
"Average Score": 52.49270527783856,
"Standard Deviation": null,
"Rank": 20
},
"Algebra": {
"Average Score": 63.446032975128176,
"Standard Deviation": null,
"Rank": 21
},
"Probability": {
"Average Score": 63.95287475488081,
"Standard Deviation": null,
"Rank": 20
},
"Logical": {
"Average Score": 70.18644584116615,
"Standard Deviation": null,
"Rank": 20
},
"Social": {
"Average Score": 86.45401862572464,
"Standard Deviation": null,
"Rank": 9
},
"Chemistry": {
"Average Score": 57.56342217758078,
"Standard Deviation": null,
"Rank": 20
},
"CPP": {
"Average Score": 54.03167523687635,
"Standard Deviation": null,
"Rank": 17
}
}
},
{
"config": {
"model_name": "yi-1.5-34b-chat",
"organization": "01 AI",
"license": "Proprietary",
"knowledge_cutoff": "2024/05"
},
"results": {
"OVERALL": {
"Average Score": 71.53811567931923,
"Standard Deviation": 0.4838075734512934,
"Rank": 17
},
"Geometry": {
"Average Score": 53.98343904373819,
"Standard Deviation": null,
"Rank": 18
},
"Algebra": {
"Average Score": 63.317896075817885,
"Standard Deviation": null,
"Rank": 22
},
"Probability": {
"Average Score": 64.73492918491159,
"Standard Deviation": null,
"Rank": 19
},
"Logical": {
"Average Score": 66.39420245024361,
"Standard Deviation": null,
"Rank": 21
},
"Social": {
"Average Score": 53.73650350964252,
"Standard Deviation": null,
"Rank": 21
},
"Chemistry": {
"Average Score": 56.722360677914686,
"Standard Deviation": null,
"Rank": 21
},
"CPP": {
"Average Score": 52.148798061768964,
"Standard Deviation": null,
"Rank": 18
}
}
},
{
"config": {
"model_name": "meta-llama-3.1-70b-instruct",
"organization": "Meta",
"license": "Llama 3.1 Community",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 74.01502078434305,
"Standard Deviation": 0.24116839515156926,
"Rank": 15
},
"Geometry": {
"Average Score": 66.80097850274383,
"Standard Deviation": null,
"Rank": 13
},
"Algebra": {
"Average Score": 74.7667367179752,
"Standard Deviation": null,
"Rank": 14
},
"Probability": {
"Average Score": 66.0819470113051,
"Standard Deviation": null,
"Rank": 17
},
"Logical": {
"Average Score": 73.68238947162197,
"Standard Deviation": null,
"Rank": 16
},
"Social": {
"Average Score": 68.577541438994,
"Standard Deviation": null,
"Rank": 16
},
"Chemistry": {
"Average Score": 70.4019514562452,
"Standard Deviation": null,
"Rank": 15
},
"CPP": {
"Average Score": 84.36815192532764,
"Standard Deviation": null,
"Rank": 4
}
}
},
{
"config": {
"model_name": "meta-llama-3.1-8b-instruct",
"organization": "Meta",
"license": "Llama 3.1 Community",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 55.268736955905695,
"Standard Deviation": 7.060517225126177,
"Rank": 26
},
"Geometry": {
"Average Score": 42.44262022417502,
"Standard Deviation": null,
"Rank": 28
},
"Algebra": {
"Average Score": 60.632347391080486,
"Standard Deviation": null,
"Rank": 25
},
"Probability": {
"Average Score": 52.372362507453694,
"Standard Deviation": null,
"Rank": 24
},
"Logical": {
"Average Score": 54.17571378414435,
"Standard Deviation": null,
"Rank": 28
},
"Social": {
"Average Score": 39.07966801070027,
"Standard Deviation": null,
"Rank": 31
},
"Chemistry": {
"Average Score": 45.0170262190059,
"Standard Deviation": null,
"Rank": 29
},
"CPP": {
"Average Score": 44.41846841004584,
"Standard Deviation": null,
"Rank": 26
}
}
},
{
"config": {
"model_name": "gpt3.5-turbo-0125",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2021/09"
},
"results": {
"OVERALL": {
"Average Score": 29.17379433602279,
"Standard Deviation": 2.6813415847393878,
"Rank": 44
},
"Geometry": {
"Average Score": 51.47279337094397,
"Standard Deviation": null,
"Rank": 21
},
"Algebra": {
"Average Score": 59.03601450977881,
"Standard Deviation": null,
"Rank": 26
},
"Probability": {
"Average Score": 46.71541304474977,
"Standard Deviation": null,
"Rank": 28
},
"Logical": {
"Average Score": 20.82026871015984,
"Standard Deviation": null,
"Rank": 46
},
"Social": {
"Average Score": 28.31096293069848,
"Standard Deviation": null,
"Rank": 41
},
"Chemistry": {
"Average Score": 42.899594571904004,
"Standard Deviation": null,
"Rank": 31
},
"CPP": {
"Average Score": 40.46958736582551,
"Standard Deviation": null,
"Rank": 29
}
}
},
{
"config": {
"model_name": "llama-3-70b-instruct",
"organization": "Meta",
"license": "Llama 3 Community",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 65.90407336557487,
"Standard Deviation": 66.63940143516267,
"Rank": 24
},
"Geometry": {
"Average Score": 46.40555349958932,
"Standard Deviation": null,
"Rank": 25
},
"Algebra": {
"Average Score": 60.86276607976933,
"Standard Deviation": null,
"Rank": 24
},
"Probability": {
"Average Score": 55.0233135868055,
"Standard Deviation": null,
"Rank": 22
},
"Logical": {
"Average Score": 83.99546392889077,
"Standard Deviation": null,
"Rank": 11
},
"Social": {
"Average Score": 47.90189246663785,
"Standard Deviation": null,
"Rank": 23
},
"Chemistry": {
"Average Score": 70.40198909396582,
"Standard Deviation": null,
"Rank": 15
},
"CPP": {
"Average Score": 65.32140697218945,
"Standard Deviation": null,
"Rank": 13
}
}
},
{
"config": {
"model_name": "claude-3-sonnet",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2023/08"
},
"results": {
"OVERALL": {
"Average Score": 64.4278622266347,
"Standard Deviation": 3.089828107392469,
"Rank": 21
},
"Geometry": {
"Average Score": 51.4677627365698,
"Standard Deviation": null,
"Rank": 21
},
"Algebra": {
"Average Score": 57.157810499255426,
"Standard Deviation": null,
"Rank": 27
},
"Probability": {
"Average Score": 54.68761427070592,
"Standard Deviation": null,
"Rank": 23
},
"Logical": {
"Average Score": 65.8346271849297,
"Standard Deviation": null,
"Rank": 22
},
"Social": {
"Average Score": 62.842721798877186,
"Standard Deviation": null,
"Rank": 18
},
"Chemistry": {
"Average Score": 66.1914400411681,
"Standard Deviation": null,
"Rank": 18
},
"CPP": {
"Average Score": 61.33538592327427,
"Standard Deviation": null,
"Rank": 15
}
}
},
{
"config": {
"model_name": "qwen1.5-14b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 44.920016997055804,
"Standard Deviation": 0.3041914765974254,
"Rank": 30
},
"Geometry": {
"Average Score": 36.40735570120079,
"Standard Deviation": null,
"Rank": 30
},
"Algebra": {
"Average Score": 56.004717588310726,
"Standard Deviation": null,
"Rank": 28
},
"Probability": {
"Average Score": 39.24866255465088,
"Standard Deviation": null,
"Rank": 33
},
"Logical": {
"Average Score": 35.15462916949486,
"Standard Deviation": null,
"Rank": 38
},
"Social": {
"Average Score": 35.236185321936766,
"Standard Deviation": null,
"Rank": 34
},
"Chemistry": {
"Average Score": 40.803706763362605,
"Standard Deviation": null,
"Rank": 34
},
"CPP": {
"Average Score": 38.552779976347026,
"Standard Deviation": null,
"Rank": 31
}
}
},
{
"config": {
"model_name": "claude-3-haiku",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2023/08"
},
"results": {
"OVERALL": {
"Average Score": 53.46814061793852,
"Standard Deviation": 10.143567097006747,
"Rank": 25
},
"Geometry": {
"Average Score": 42.87542087805953,
"Standard Deviation": null,
"Rank": 27
},
"Algebra": {
"Average Score": 53.706856083803686,
"Standard Deviation": null,
"Rank": 30
},
"Probability": {
"Average Score": 49.80372052799326,
"Standard Deviation": null,
"Rank": 25
},
"Logical": {
"Average Score": 62.585349577709394,
"Standard Deviation": null,
"Rank": 24
},
"Social": {
"Average Score": 57.25601125762336,
"Standard Deviation": null,
"Rank": 19
},
"Chemistry": {
"Average Score": 60.48921113945562,
"Standard Deviation": null,
"Rank": 19
},
"CPP": {
"Average Score": 56.40200048817984,
"Standard Deviation": null,
"Rank": 16
}
}
},
{
"config": {
"model_name": "claude-2.1",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 39.855928282633364,
"Standard Deviation": 8.396129652430814,
"Rank": 35
},
"Geometry": {
"Average Score": 51.1749207092159,
"Standard Deviation": null,
"Rank": 23
},
"Algebra": {
"Average Score": 53.05386216145516,
"Standard Deviation": null,
"Rank": 31
},
"Probability": {
"Average Score": 44.42150447611455,
"Standard Deviation": null,
"Rank": 30
},
"Logical": {
"Average Score": 60.51381867118053,
"Standard Deviation": null,
"Rank": 25
},
"Social": {
"Average Score": 38.492280755756035,
"Standard Deviation": null,
"Rank": 32
},
"Chemistry": {
"Average Score": 50.66182745698702,
"Standard Deviation": null,
"Rank": 24
},
"CPP": {
"Average Score": 47.23672563994903,
"Standard Deviation": null,
"Rank": 21
}
}
},
{
"config": {
"model_name": "mistral-8x7b-instruct-v0.1",
"organization": "Mistral",
"license": "Apache 2.0",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 42.70451051343715,
"Standard Deviation": 9.965602920103015,
"Rank": 31
},
"Geometry": {
"Average Score": 33.473933494899164,
"Standard Deviation": null,
"Rank": 34
},
"Algebra": {
"Average Score": 48.99207852115047,
"Standard Deviation": null,
"Rank": 34
},
"Probability": {
"Average Score": 44.46936520340586,
"Standard Deviation": null,
"Rank": 30
},
"Logical": {
"Average Score": 42.656238987207246,
"Standard Deviation": null,
"Rank": 31
},
"Social": {
"Average Score": 30.32900110312259,
"Standard Deviation": null,
"Rank": 40
},
"Chemistry": {
"Average Score": 47.047104057571026,
"Standard Deviation": null,
"Rank": 27
},
"CPP": {
"Average Score": 44.533118241976666,
"Standard Deviation": null,
"Rank": 25
}
}
},
{
"config": {
"model_name": "claude-2.0",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 33.53990717968659,
"Standard Deviation": 7.640386327990536,
"Rank": 41
},
"Geometry": {
"Average Score": 38.40953902052666,
"Standard Deviation": null,
"Rank": 29
},
"Algebra": {
"Average Score": 49.07235259762855,
"Standard Deviation": null,
"Rank": 33
},
"Probability": {
"Average Score": 46.71546649299419,
"Standard Deviation": null,
"Rank": 28
},
"Logical": {
"Average Score": 56.26908965013192,
"Standard Deviation": null,
"Rank": 27
},
"Social": {
"Average Score": 47.84034165469707,
"Standard Deviation": null,
"Rank": 23
},
"Chemistry": {
"Average Score": 55.20362543510563,
"Standard Deviation": null,
"Rank": 22
},
"CPP": {
"Average Score": 50.773143448036464,
"Standard Deviation": null,
"Rank": 19
}
}
},
{
"config": {
"model_name": "starling-lm-7b-beta",
"organization": "Nexusflow",
"license": "Apache-2.0",
"knowledge_cutoff": "2024/03"
},
"results": {
"OVERALL": {
"Average Score": 50.90398580969381,
"Standard Deviation": 0.2839403187065694,
"Rank": 27
},
"Geometry": {
"Average Score": 34.653904247826965,
"Standard Deviation": null,
"Rank": 33
},
"Algebra": {
"Average Score": 49.66265150940668,
"Standard Deviation": null,
"Rank": 32
},
"Probability": {
"Average Score": 40.04695085773174,
"Standard Deviation": null,
"Rank": 32
},
"Logical": {
"Average Score": 48.02284849364292,
"Standard Deviation": null,
"Rank": 29
},
"Social": {
"Average Score": 42.82322308642107,
"Standard Deviation": null,
"Rank": 29
},
"Chemistry": {
"Average Score": 40.54467030566931,
"Standard Deviation": null,
"Rank": 35
},
"CPP": {
"Average Score": 38.27587102395908,
"Standard Deviation": null,
"Rank": 32
}
}
},
{
"config": {
"model_name": "gemini-1.0-pro-001",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2023/04"
},
"results": {
"OVERALL": {
"Average Score": 37.91102687366529,
"Standard Deviation": 15.15111885239772,
"Rank": 38
},
"Geometry": {
"Average Score": 35.480853719259684,
"Standard Deviation": null,
"Rank": 32
},
"Algebra": {
"Average Score": 48.08542847805497,
"Standard Deviation": null,
"Rank": 35
},
"Probability": {
"Average Score": 29.862669786973395,
"Standard Deviation": null,
"Rank": 42
},
"Logical": {
"Average Score": 24.141794297157134,
"Standard Deviation": null,
"Rank": 43
},
"Social": {
"Average Score": 15.062345665891504,
"Standard Deviation": null,
"Rank": 51
},
"Chemistry": {
"Average Score": 46.52522766257804,
"Standard Deviation": null,
"Rank": 28
},
"CPP": {
"Average Score": 45.22204471452975,
"Standard Deviation": null,
"Rank": 23
}
}
},
{
"config": {
"model_name": "openchat-3.5-0106",
"organization": "OpenChat",
"license": "Apache-2.0",
"knowledge_cutoff": "2024/01"
},
"results": {
"OVERALL": {
"Average Score": 41.34314082389491,
"Standard Deviation": 4.394481877390224,
"Rank": 32
},
"Geometry": {
"Average Score": 29.859015723426758,
"Standard Deviation": null,
"Rank": 36
},
"Algebra": {
"Average Score": 45.79428201943078,
"Standard Deviation": null,
"Rank": 36
},
"Probability": {
"Average Score": 38.766888608782956,
"Standard Deviation": null,
"Rank": 34
},
"Logical": {
"Average Score": 42.1345774485532,
"Standard Deviation": null,
"Rank": 32
},
"Social": {
"Average Score": 32.07155544930587,
"Standard Deviation": null,
"Rank": 39
},
"Chemistry": {
"Average Score": 35.28601797606463,
"Standard Deviation": null,
"Rank": 37
},
"CPP": {
"Average Score": 33.70639271807677,
"Standard Deviation": null,
"Rank": 33
}
}
},
{
"config": {
"model_name": "openchat-3.5",
"organization": "OpenChat",
"license": "Apache-2.0",
"knowledge_cutoff": "2023/11"
},
"results": {
"OVERALL": {
"Average Score": 39.60454188051808,
"Standard Deviation": 0.8232501722386516,
"Rank": 36
},
"Geometry": {
"Average Score": 30.77657388742533,
"Standard Deviation": null,
"Rank": 35
},
"Algebra": {
"Average Score": 42.13028451761782,
"Standard Deviation": null,
"Rank": 38
},
"Probability": {
"Average Score": 34.817635171077754,
"Standard Deviation": null,
"Rank": 37
},
"Logical": {
"Average Score": 36.21944706732088,
"Standard Deviation": null,
"Rank": 36
},
"Social": {
"Average Score": 37.59265084241427,
"Standard Deviation": null,
"Rank": 33
},
"Chemistry": {
"Average Score": 37.21911183748652,
"Standard Deviation": null,
"Rank": 36
},
"CPP": {
"Average Score": 33.020911255646965,
"Standard Deviation": null,
"Rank": 34
}
}
},
{
"config": {
"model_name": "command-r-(08-2024)",
"organization": "Cohere",
"license": "CC-BY-NC-4.0",
"knowledge_cutoff": "2024/08"
},
"results": {
"OVERALL": {
"Average Score": 45.84310421663912,
"Standard Deviation": 0.14535750785421472,
"Rank": 29
},
"Geometry": {
"Average Score": 36.33550343578038,
"Standard Deviation": null,
"Rank": 31
},
"Algebra": {
"Average Score": 41.87079446639028,
"Standard Deviation": null,
"Rank": 39
},
"Probability": {
"Average Score": 36.87662939858684,
"Standard Deviation": null,
"Rank": 36
},
"Logical": {
"Average Score": 26.22482921268266,
"Standard Deviation": null,
"Rank": 41
},
"Social": {
"Average Score": 35.11019761697373,
"Standard Deviation": null,
"Rank": 35
},
"Chemistry": {
"Average Score": 41.81772722027254,
"Standard Deviation": null,
"Rank": 33
},
"CPP": {
"Average Score": 39.61492485677676,
"Standard Deviation": null,
"Rank": 30
}
}
},
{
"config": {
"model_name": "gemma-1.1-7b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 35.873210924652795,
"Standard Deviation": 6.462625645064649,
"Rank": 37
},
"Geometry": {
"Average Score": 25.79207201693066,
"Standard Deviation": null,
"Rank": 40
},
"Algebra": {
"Average Score": 40.58046616460041,
"Standard Deviation": null,
"Rank": 40
},
"Probability": {
"Average Score": 29.581773053230897,
"Standard Deviation": null,
"Rank": 43
},
"Logical": {
"Average Score": 41.99821650962693,
"Standard Deviation": null,
"Rank": 33
},
"Social": {
"Average Score": 24.39015213949678,
"Standard Deviation": null,
"Rank": 43
},
"Chemistry": {
"Average Score": 45.01706482033765,
"Standard Deviation": null,
"Rank": 29
},
"CPP": {
"Average Score": 42.666504105798204,
"Standard Deviation": null,
"Rank": 27
}
}
},
{
"config": {
"model_name": "llama3-8b-instruct",
"organization": "Meta",
"license": "Llama 3 Community",
"knowledge_cutoff": "2023/03"
},
"results": {
"OVERALL": {
"Average Score": 39.00917270775336,
"Standard Deviation": 3.999506140299149,
"Rank": 39
},
"Geometry": {
"Average Score": 29.224089668837465,
"Standard Deviation": null,
"Rank": 38
},
"Algebra": {
"Average Score": 42.90961619082775,
"Standard Deviation": null,
"Rank": 37
},
"Probability": {
"Average Score": 34.15721355738147,
"Standard Deviation": null,
"Rank": 38
},
"Logical": {
"Average Score": 58.39773915370141,
"Standard Deviation": null,
"Rank": 26
},
"Social": {
"Average Score": 40.88535401371015,
"Standard Deviation": null,
"Rank": 30
},
"Chemistry": {
"Average Score": 49.70839372661025,
"Standard Deviation": null,
"Rank": 25
},
"CPP": {
"Average Score": 45.35392139264795,
"Standard Deviation": null,
"Rank": 22
}
}
},
{
"config": {
"model_name": "gemma-2-2b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/07"
},
"results": {
"OVERALL": {
"Average Score": 57.45780847204313,
"Standard Deviation": 16.310023687014333,
"Rank": 22
},
"Geometry": {
"Average Score": 29.820233374501843,
"Standard Deviation": null,
"Rank": 36
},
"Algebra": {
"Average Score": 39.873024674507214,
"Standard Deviation": null,
"Rank": 41
},
"Probability": {
"Average Score": 31.85692359301203,
"Standard Deviation": null,
"Rank": 40
},
"Logical": {
"Average Score": 43.93437465788311,
"Standard Deviation": null,
"Rank": 30
},
"Social": {
"Average Score": 44.689420554662476,
"Standard Deviation": null,
"Rank": 27
},
"Chemistry": {
"Average Score": 32.05704364512495,
"Standard Deviation": null,
"Rank": 40
},
"CPP": {
"Average Score": 30.53406933106768,
"Standard Deviation": null,
"Rank": 36
}
}
},
{
"config": {
"model_name": "starling-lm-7b-alpha",
"organization": "Nexusflow",
"license": "Apache-2.0",
"knowledge_cutoff": "2023/11"
},
"results": {
"OVERALL": {
"Average Score": 40.625443347641045,
"Standard Deviation": 3.0544259540377268,
"Rank": 34
},
"Geometry": {
"Average Score": 26.171147508308422,
"Standard Deviation": null,
"Rank": 39
},
"Algebra": {
"Average Score": 39.149463007523856,
"Standard Deviation": null,
"Rank": 42
},
"Probability": {
"Average Score": 32.36862021879827,
"Standard Deviation": null,
"Rank": 39
},
"Logical": {
"Average Score": 34.17344938419256,
"Standard Deviation": null,
"Rank": 39
},
"Social": {
"Average Score": 35.06966333212518,
"Standard Deviation": null,
"Rank": 35
},
"Chemistry": {
"Average Score": 32.15932739848045,
"Standard Deviation": null,
"Rank": 39
},
"CPP": {
"Average Score": 30.07926487356878,
"Standard Deviation": null,
"Rank": 37
}
}
},
{
"config": {
"model_name": "qwen1.5-4b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 11.723779019126527,
"Standard Deviation": 0.856230353584155,
"Rank": 53
},
"Geometry": {
"Average Score": 16.072772563608115,
"Standard Deviation": null,
"Rank": 45
},
"Algebra": {
"Average Score": 32.22626131587612,
"Standard Deviation": null,
"Rank": 44
},
"Probability": {
"Average Score": 13.98282712349133,
"Standard Deviation": null,
"Rank": 48
},
"Logical": {
"Average Score": 13.993097991375581,
"Standard Deviation": null,
"Rank": 51
},
"Social": {
"Average Score": 22.955898106386442,
"Standard Deviation": null,
"Rank": 45
},
"Chemistry": {
"Average Score": 13.907481529463642,
"Standard Deviation": null,
"Rank": 51
},
"CPP": {
"Average Score": 13.21208067122554,
"Standard Deviation": null,
"Rank": 47
}
}
},
{
"config": {
"model_name": "command-r-(04-2024)",
"organization": "Cohere",
"license": "CC-BY-NC-4.0",
"knowledge_cutoff": "2024/04"
},
"results": {
"OVERALL": {
"Average Score": 43.08187135994592,
"Standard Deviation": 0.7654553730614279,
"Rank": 33
},
"Geometry": {
"Average Score": 24.037084801508428,
"Standard Deviation": null,
"Rank": 41
},
"Algebra": {
"Average Score": 32.37474440275246,
"Standard Deviation": null,
"Rank": 43
},
"Probability": {
"Average Score": 31.014039425232298,
"Standard Deviation": null,
"Rank": 41
},
"Logical": {
"Average Score": 35.49507014348235,
"Standard Deviation": null,
"Rank": 37
},
"Social": {
"Average Score": 34.782695172510856,
"Standard Deviation": null,
"Rank": 37
},
"Chemistry": {
"Average Score": 42.46395478814961,
"Standard Deviation": null,
"Rank": 32
},
"CPP": {
"Average Score": 41.346336503003236,
"Standard Deviation": null,
"Rank": 28
}
}
},
{
"config": {
"model_name": "vicuna-33b",
"organization": "LMSYS",
"license": "Non-commercial",
"knowledge_cutoff": "2023/08"
},
"results": {
"OVERALL": {
"Average Score": 30.8582386682731,
"Standard Deviation": 2.3851186735858945,
"Rank": 42
},
"Geometry": {
"Average Score": 17.058968577112452,
"Standard Deviation": null,
"Rank": 44
},
"Algebra": {
"Average Score": 25.22004544023738,
"Standard Deviation": null,
"Rank": 45
},
"Probability": {
"Average Score": 21.097169680647767,
"Standard Deviation": null,
"Rank": 46
},
"Logical": {
"Average Score": 23.212667585279515,
"Standard Deviation": null,
"Rank": 45
},
"Social": {
"Average Score": 32.357116321848025,
"Standard Deviation": null,
"Rank": 38
},
"Chemistry": {
"Average Score": 29.376389899632898,
"Standard Deviation": null,
"Rank": 42
},
"CPP": {
"Average Score": 28.01838653090379,
"Standard Deviation": null,
"Rank": 38
}
}
},
{
"config": {
"model_name": "gemma-7b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 27.609692676933715,
"Standard Deviation": 5.8350892031427435,
"Rank": 45
},
"Geometry": {
"Average Score": 20.127802528542947,
"Standard Deviation": null,
"Rank": 42
},
"Algebra": {
"Average Score": 23.46400816161807,
"Standard Deviation": null,
"Rank": 47
},
"Probability": {
"Average Score": 17.139514453170445,
"Standard Deviation": null,
"Rank": 47
},
"Logical": {
"Average Score": 24.625290351028372,
"Standard Deviation": null,
"Rank": 42
},
"Social": {
"Average Score": 26.715025606557614,
"Standard Deviation": null,
"Rank": 42
},
"Chemistry": {
"Average Score": 29.383105099269972,
"Standard Deviation": null,
"Rank": 41
},
"CPP": {
"Average Score": 28.014658234926813,
"Standard Deviation": null,
"Rank": 39
}
}
},
{
"config": {
"model_name": "mistral-7b-instruct-2",
"organization": "Mistral",
"license": "Apache 2.0",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 32.583755237895794,
"Standard Deviation": 1.6860156811686553,
"Rank": 40
},
"Geometry": {
"Average Score": 17.27716649229315,
"Standard Deviation": null,
"Rank": 43
},
"Algebra": {
"Average Score": 23.58916877939791,
"Standard Deviation": null,
"Rank": 46
},
"Probability": {
"Average Score": 25.1012270940144,
"Standard Deviation": null,
"Rank": 44
},
"Logical": {
"Average Score": 29.07002036532878,
"Standard Deviation": null,
"Rank": 40
},
"Social": {
"Average Score": 24.39006275978174,
"Standard Deviation": null,
"Rank": 43
},
"Chemistry": {
"Average Score": 32.76096708662236,
"Standard Deviation": null,
"Rank": 38
},
"CPP": {
"Average Score": 31.382959631870822,
"Standard Deviation": null,
"Rank": 35
}
}
},
{
"config": {
"model_name": "mistral-7b-instruct-1",
"organization": "Mistral",
"license": "Apache 2.0",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 22.167930858422395,
"Standard Deviation": 3.328543828571604,
"Rank": 50
},
"Geometry": {
"Average Score": 11.300762460776488,
"Standard Deviation": null,
"Rank": 49
},
"Algebra": {
"Average Score": 21.016466430115493,
"Standard Deviation": null,
"Rank": 48
},
"Probability": {
"Average Score": 24.506863192031716,
"Standard Deviation": null,
"Rank": 45
},
"Logical": {
"Average Score": 17.0066100312336,
"Standard Deviation": null,
"Rank": 49
},
"Social": {
"Average Score": 14.049392081101905,
"Standard Deviation": null,
"Rank": 52
},
"Chemistry": {
"Average Score": 20.796521445473058,
"Standard Deviation": null,
"Rank": 45
},
"CPP": {
"Average Score": 18.929093202755805,
"Standard Deviation": null,
"Rank": 42
}
}
},
{
"config": {
"model_name": "vicuna-13b",
"organization": "LMSYS",
"license": "Non-commercial",
"knowledge_cutoff": "2023/07"
},
"results": {
"OVERALL": {
"Average Score": 20.105123059326157,
"Standard Deviation": 4.100609090750239,
"Rank": 51
},
"Geometry": {
"Average Score": 13.080654946737525,
"Standard Deviation": null,
"Rank": 48
},
"Algebra": {
"Average Score": 20.125194674408167,
"Standard Deviation": null,
"Rank": 49
},
"Probability": {
"Average Score": 13.125942598704368,
"Standard Deviation": null,
"Rank": 49
},
"Logical": {
"Average Score": 17.182300978389822,
"Standard Deviation": null,
"Rank": 48
},
"Social": {
"Average Score": 16.258399348520832,
"Standard Deviation": null,
"Rank": 50
},
"Chemistry": {
"Average Score": 23.79065696739089,
"Standard Deviation": null,
"Rank": 44
},
"CPP": {
"Average Score": 21.840013221590294,
"Standard Deviation": null,
"Rank": 40
}
}
},
{
"config": {
"model_name": "zephyr-7b-beta",
"organization": "HuggingFace",
"license": "MIT",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 11.581258432641418,
"Standard Deviation": 1.677081510212375,
"Rank": 54
},
"Geometry": {
"Average Score": 8.432624521698594,
"Standard Deviation": null,
"Rank": 50
},
"Algebra": {
"Average Score": 12.912859660357217,
"Standard Deviation": null,
"Rank": 51
},
"Probability": {
"Average Score": 7.643552619113196,
"Standard Deviation": null,
"Rank": 54
},
"Logical": {
"Average Score": 7.444095116649809,
"Standard Deviation": null,
"Rank": 55
},
"Social": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 57
},
"Chemistry": {
"Average Score": 16.150157007299235,
"Standard Deviation": null,
"Rank": 49
},
"CPP": {
"Average Score": 18.92902220864132,
"Standard Deviation": null,
"Rank": 43
}
}
},
{
"config": {
"model_name": "gemma-1.1-2b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 25.06653151900311,
"Standard Deviation": 5.340973431345662,
"Rank": 48
},
"Geometry": {
"Average Score": 13.161686218568628,
"Standard Deviation": null,
"Rank": 47
},
"Algebra": {
"Average Score": 15.592205919293873,
"Standard Deviation": null,
"Rank": 50
},
"Probability": {
"Average Score": 8.305764696120711,
"Standard Deviation": null,
"Rank": 51
},
"Logical": {
"Average Score": 10.940766703849592,
"Standard Deviation": null,
"Rank": 53
},
"Social": {
"Average Score": 21.925546766366356,
"Standard Deviation": null,
"Rank": 46
},
"Chemistry": {
"Average Score": 18.700936936742952,
"Standard Deviation": null,
"Rank": 46
},
"CPP": {
"Average Score": 20.724691953843916,
"Standard Deviation": null,
"Rank": 41
}
}
},
{
"config": {
"model_name": "llama2-7b-chat",
"organization": "Meta",
"license": "Llama 2 Community",
"knowledge_cutoff": "2023/07"
},
"results": {
"OVERALL": {
"Average Score": 25.633612357313762,
"Standard Deviation": 2.805639153654191,
"Rank": 46
},
"Geometry": {
"Average Score": 5.825877827672446,
"Standard Deviation": null,
"Rank": 51
},
"Algebra": {
"Average Score": 8.58657284915635,
"Standard Deviation": null,
"Rank": 53
},
"Probability": {
"Average Score": 8.164826137672431,
"Standard Deviation": null,
"Rank": 53
},
"Logical": {
"Average Score": 20.697630462723275,
"Standard Deviation": null,
"Rank": 47
},
"Social": {
"Average Score": 18.13821609304045,
"Standard Deviation": null,
"Rank": 47
},
"Chemistry": {
"Average Score": 17.065363968846427,
"Standard Deviation": null,
"Rank": 47
},
"CPP": {
"Average Score": 15.730513733660898,
"Standard Deviation": null,
"Rank": 45
}
}
},
{
"config": {
"model_name": "gemma-2b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 22.935122315202772,
"Standard Deviation": 1.9451357494738446,
"Rank": 49
},
"Geometry": {
"Average Score": 15.523844579555126,
"Standard Deviation": null,
"Rank": 46
},
"Algebra": {
"Average Score": 8.997563653883809,
"Standard Deviation": null,
"Rank": 52
},
"Probability": {
"Average Score": 6.750305898269558,
"Standard Deviation": null,
"Rank": 55
},
"Logical": {
"Average Score": 5.354222904092569,
"Standard Deviation": null,
"Rank": 56
},
"Social": {
"Average Score": 10.938132042877358,
"Standard Deviation": null,
"Rank": 54
},
"Chemistry": {
"Average Score": 17.06532733699507,
"Standard Deviation": null,
"Rank": 47
},
"CPP": {
"Average Score": 17.2715657115764,
"Standard Deviation": null,
"Rank": 44
}
}
},
{
"config": {
"model_name": "llama2-13b-chat",
"organization": "Meta",
"license": "Llama 2 Community",
"knowledge_cutoff": "2023/07"
},
"results": {
"OVERALL": {
"Average Score": 25.828530292775856,
"Standard Deviation": 3.2503558704879296,
"Rank": 47
},
"Geometry": {
"Average Score": 4.119943280135397,
"Standard Deviation": null,
"Rank": 53
},
"Algebra": {
"Average Score": 6.355347828676415,
"Standard Deviation": null,
"Rank": 54
},
"Probability": {
"Average Score": 11.5585998384148,
"Standard Deviation": null,
"Rank": 50
},
"Logical": {
"Average Score": 24.172674067890938,
"Standard Deviation": null,
"Rank": 43
},
"Social": {
"Average Score": 17.850287642446094,
"Standard Deviation": null,
"Rank": 49
},
"Chemistry": {
"Average Score": 13.887442704655687,
"Standard Deviation": null,
"Rank": 52
},
"CPP": {
"Average Score": 13.17258252933903,
"Standard Deviation": null,
"Rank": 48
}
}
},
{
"config": {
"model_name": "vicuna-7b",
"organization": "LMSYS",
"license": "Non-commercial",
"knowledge_cutoff": "2023/07"
},
"results": {
"OVERALL": {
"Average Score": 19.78471384913738,
"Standard Deviation": 3.7936645273402276,
"Rank": 52
},
"Geometry": {
"Average Score": 5.434763675792798,
"Standard Deviation": null,
"Rank": 52
},
"Algebra": {
"Average Score": 5.925959137419872,
"Standard Deviation": null,
"Rank": 55
},
"Probability": {
"Average Score": 8.30566475354697,
"Standard Deviation": null,
"Rank": 51
},
"Logical": {
"Average Score": 11.881223740003346,
"Standard Deviation": null,
"Rank": 52
},
"Social": {
"Average Score": 12.864677350128595,
"Standard Deviation": null,
"Rank": 53
},
"Chemistry": {
"Average Score": 14.187574975522333,
"Standard Deviation": null,
"Rank": 50
},
"CPP": {
"Average Score": 14.255194156624162,
"Standard Deviation": null,
"Rank": 46
}
}
},
{
"config": {
"model_name": "koala-13b",
"organization": "UC Berkeley",
"license": "Non-commercial",
"knowledge_cutoff": "2023/04"
},
"results": {
"OVERALL": {
"Average Score": 10.216910767982592,
"Standard Deviation": 2.0597606260293655,
"Rank": 55
},
"Geometry": {
"Average Score": 0.1600118163292883,
"Standard Deviation": null,
"Rank": 54
},
"Algebra": {
"Average Score": 2.2219841274068948,
"Standard Deviation": null,
"Rank": 56
},
"Probability": {
"Average Score": 3.353938470588142,
"Standard Deviation": null,
"Rank": 56
},
"Logical": {
"Average Score": 8.24436273551765,
"Standard Deviation": null,
"Rank": 54
},
"Social": {
"Average Score": 10.96000067573448,
"Standard Deviation": null,
"Rank": 54
},
"Chemistry": {
"Average Score": 6.272570799004611,
"Standard Deviation": null,
"Rank": 53
},
"CPP": {
"Average Score": 6.36433272373514,
"Standard Deviation": null,
"Rank": 49
}
}
},
{
"config": {
"model_name": "openassistant-pythia-12b",
"organization": "OpenAssistant",
"license": "Non-commercial",
"knowledge_cutoff": "2023/04"
},
"results": {
"OVERALL": {
"Average Score": 0.0,
"Standard Deviation": 0.0,
"Rank": 56
},
"Geometry": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 55
},
"Algebra": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 57
},
"Probability": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 57
},
"Logical": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 57
},
"Social": {
"Average Score": 1.859688217710296,
"Standard Deviation": null,
"Rank": 56
},
"Chemistry": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 54
},
"CPP": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 50
}
}
},
{
"config": {
"model_name": "nemotron-70b",
"organization": "NVIDIA",
"license": "Unknown",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 100.0,
"Standard Deviation": 0.0,
"Rank": 1
},
"Geometry": {
"Average Score": 68.72757963233221,
"Standard Deviation": null,
"Rank": 12
},
"Algebra": {
"Average Score": 73.71625129267943,
"Standard Deviation": null,
"Rank": 16
},
"Chemistry": {
"Average Score": 72.48678626772566,
"Standard Deviation": null,
"Rank": 14
},
"Logical": {
"Average Score": 92.57864400540329,
"Standard Deviation": null,
"Rank": 5
},
"Social": {
"Average Score": 99.63342284899149,
"Standard Deviation": null,
"Rank": 2
},
"Probability": {
"Average Score": 75.30735899300154,
"Standard Deviation": null,
"Rank": 10
}
}
},
{
"config": {
"model_name": "llama-3.2-3b-it",
"organization": "Meta",
"license": "Llama 3 Community",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 29.47099904114387,
"Standard Deviation": 1.6836027650802912,
"Rank": 43
},
"Geometry": {
"Average Score": 0.0,
"Standard Deviation": 0.0,
"Rank": 50
},
"Algebra": {
"Average Score": 55.31592410564261,
"Standard Deviation": null,
"Rank": 29
},
"Chemistry": {
"Average Score": 28.667640602193643,
"Standard Deviation": null,
"Rank": 43
},
"Logical": {
"Average Score": 15.35430947415723,
"Standard Deviation": null,
"Rank": 49
},
"Social": {
"Average Score": 18.087938295545133,
"Standard Deviation": null,
"Rank": 48
},
"Probability": {
"Average Score": 37.84631410688676,
"Standard Deviation": null,
"Rank": 35
}
}
},
{
"config": {
"model_name": "yi-lightning",
"organization": "01 AI",
"license": "Proprietary",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 96.10303362688546,
"Standard Deviation": 0.5365246195716372,
"Rank": 3
},
"Geometry": {
"Average Score": 77.09570683128703,
"Standard Deviation": null,
"Rank": 8
},
"Algebra": {
"Average Score": 85.92132293392635,
"Standard Deviation": null,
"Rank": 6
},
"Chemistry": {
"Average Score": 95.7205664118507,
"Standard Deviation": null,
"Rank": 2
},
"Logical": {
"Average Score": 94.60171867702756,
"Standard Deviation": null,
"Rank": 4
},
"Social": {
"Average Score": 93.93680225135506,
"Standard Deviation": null,
"Rank": 6
},
"Probability": {
"Average Score": 90.23858748317501,
"Standard Deviation": null,
"Rank": 3
}
}
},
{
"config": {
"model_name": "glm-4-plus",
"organization": "Zhipu AI",
"license": "Proprietary",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 90.50303579501356,
"Standard Deviation": 5.202472970969946,
"Rank": 6
},
"Geometry": {
"Average Score": 76.37543021571776,
"Standard Deviation": null,
"Rank": 9
},
"Algebra": {
"Average Score": 81.39859078752944,
"Standard Deviation": null,
"Rank": 10
},
"Chemistry": {
"Average Score": 90.15506569759444,
"Standard Deviation": null,
"Rank": 6
},
"Logical": {
"Average Score": 92.26403821208403,
"Standard Deviation": null,
"Rank": 6
},
"Social": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Probability": {
"Average Score": 73.99418447190348,
"Standard Deviation": null,
"Rank": 11
}
}
}
]