|
[ |
|
{ |
|
"config": { |
|
"model_name": "ChatGPT-4o-latest (2024-09-03)", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/10" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 93.51557945652831, |
|
"Standard Deviation": 3.1900396436407785, |
|
"Rank": 4 |
|
}, |
|
"Geometry": { |
|
"Average Score": 81.8536937387725, |
|
"Standard Deviation": null, |
|
"Rank": 5 |
|
}, |
|
"Algebra": { |
|
"Average Score": 89.3642910524324, |
|
"Standard Deviation": null, |
|
"Rank": 3 |
|
}, |
|
"Probability": { |
|
"Average Score": 86.55761073510537, |
|
"Standard Deviation": null, |
|
"Rank": 4 |
|
}, |
|
"Logical": { |
|
"Average Score": 97.39734315785844, |
|
"Standard Deviation": null, |
|
"Rank": 2 |
|
}, |
|
"Social": { |
|
"Average Score": 91.03727530739368, |
|
"Standard Deviation": null, |
|
"Rank": 7 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 100.0, |
|
"Standard Deviation": null, |
|
"Rank": 1 |
|
}, |
|
"CPP": { |
|
"Average Score": 100.0, |
|
"Standard Deviation": null, |
|
"Rank": 1 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gpt-4o-2024-08-06", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/10" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 79.7806321863411, |
|
"Standard Deviation": 0.8302330946013555, |
|
"Rank": 14 |
|
}, |
|
"Geometry": { |
|
"Average Score": 86.29041459755453, |
|
"Standard Deviation": null, |
|
"Rank": 2 |
|
}, |
|
"Algebra": { |
|
"Average Score": 88.53373721863113, |
|
"Standard Deviation": null, |
|
"Rank": 4 |
|
}, |
|
"Probability": { |
|
"Average Score": 78.694360721361, |
|
"Standard Deviation": null, |
|
"Rank": 7 |
|
}, |
|
"Logical": { |
|
"Average Score": 78.3116623496895, |
|
"Standard Deviation": null, |
|
"Rank": 12 |
|
}, |
|
"Social": { |
|
"Average Score": 79.90944696263446, |
|
"Standard Deviation": null, |
|
"Rank": 11 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 86.96011263543132, |
|
"Standard Deviation": null, |
|
"Rank": 7 |
|
}, |
|
"CPP": { |
|
"Average Score": 92.43090226400756, |
|
"Standard Deviation": null, |
|
"Rank": 2 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gpt-4o-2024-05-13", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/10" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 86.40675398236253, |
|
"Standard Deviation": 6.473604235710212, |
|
"Rank": 9 |
|
}, |
|
"Geometry": { |
|
"Average Score": 82.42032988843268, |
|
"Standard Deviation": null, |
|
"Rank": 4 |
|
}, |
|
"Algebra": { |
|
"Average Score": 83.51580675782952, |
|
"Standard Deviation": null, |
|
"Rank": 9 |
|
}, |
|
"Probability": { |
|
"Average Score": 81.88434691830915, |
|
"Standard Deviation": null, |
|
"Rank": 5 |
|
}, |
|
"Logical": { |
|
"Average Score": 87.92744931984977, |
|
"Standard Deviation": null, |
|
"Rank": 9 |
|
}, |
|
"Social": { |
|
"Average Score": 76.12369632852445, |
|
"Standard Deviation": null, |
|
"Rank": 15 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 90.93459148149344, |
|
"Standard Deviation": null, |
|
"Rank": 4 |
|
}, |
|
"CPP": { |
|
"Average Score": 79.1592634699295, |
|
"Standard Deviation": null, |
|
"Rank": 6 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gpt-4-turbo-2024-04-09", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/12" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 87.17581147282237, |
|
"Standard Deviation": 8.716963621850567, |
|
"Rank": 8 |
|
}, |
|
"Geometry": { |
|
"Average Score": 78.76635545274637, |
|
"Standard Deviation": null, |
|
"Rank": 7 |
|
}, |
|
"Algebra": { |
|
"Average Score": 79.96323615621023, |
|
"Standard Deviation": null, |
|
"Rank": 11 |
|
}, |
|
"Probability": { |
|
"Average Score": 77.65333799733705, |
|
"Standard Deviation": null, |
|
"Rank": 9 |
|
}, |
|
"Logical": { |
|
"Average Score": 89.33307138659873, |
|
"Standard Deviation": null, |
|
"Rank": 8 |
|
}, |
|
"Social": { |
|
"Average Score": 76.86597570996584, |
|
"Standard Deviation": null, |
|
"Rank": 14 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 84.02855687506661, |
|
"Standard Deviation": null, |
|
"Rank": 9 |
|
}, |
|
"CPP": { |
|
"Average Score": 70.73143363230263, |
|
"Standard Deviation": null, |
|
"Rank": 11 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemini-1.5-pro-001", |
|
"organization": "Google", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/11" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 80.38345723548734, |
|
"Standard Deviation": 2.4635699815143584, |
|
"Rank": 13 |
|
}, |
|
"Geometry": { |
|
"Average Score": 84.30455076458965, |
|
"Standard Deviation": null, |
|
"Rank": 3 |
|
}, |
|
"Algebra": { |
|
"Average Score": 85.9212061409364, |
|
"Standard Deviation": null, |
|
"Rank": 6 |
|
}, |
|
"Probability": { |
|
"Average Score": 73.11806712394745, |
|
"Standard Deviation": null, |
|
"Rank": 13 |
|
}, |
|
"Logical": { |
|
"Average Score": 78.27369746632996, |
|
"Standard Deviation": null, |
|
"Rank": 12 |
|
}, |
|
"Social": { |
|
"Average Score": 79.57606824531047, |
|
"Standard Deviation": null, |
|
"Rank": 13 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "qwen2-72b-instruct", |
|
"organization": "Alibaba", |
|
"license": "Qianwen LICENSE", |
|
"knowledge_cutoff": "2024/09" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 74.44059692248071, |
|
"Standard Deviation": 2.3957041566666697, |
|
"Rank": 16 |
|
}, |
|
"Geometry": { |
|
"Average Score": 72.58490369919883, |
|
"Standard Deviation": null, |
|
"Rank": 11 |
|
}, |
|
"Algebra": { |
|
"Average Score": 88.53359632761772, |
|
"Standard Deviation": null, |
|
"Rank": 4 |
|
}, |
|
"Probability": { |
|
"Average Score": 80.19789976985243, |
|
"Standard Deviation": null, |
|
"Rank": 6 |
|
}, |
|
"Logical": { |
|
"Average Score": 72.76843081200641, |
|
"Standard Deviation": null, |
|
"Rank": 17 |
|
}, |
|
"Social": { |
|
"Average Score": 57.256064868444426, |
|
"Standard Deviation": null, |
|
"Rank": 19 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 75.47190401351077, |
|
"Standard Deviation": null, |
|
"Rank": 12 |
|
}, |
|
"CPP": { |
|
"Average Score": 73.54037778797029, |
|
"Standard Deviation": null, |
|
"Rank": 7 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gpt-4o-mini-2024-07-18", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/10" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 82.82456893277315, |
|
"Standard Deviation": 7.714840109805867, |
|
"Rank": 12 |
|
}, |
|
"Geometry": { |
|
"Average Score": 78.89323869622943, |
|
"Standard Deviation": null, |
|
"Rank": 6 |
|
}, |
|
"Algebra": { |
|
"Average Score": 84.8722603687823, |
|
"Standard Deviation": null, |
|
"Rank": 8 |
|
}, |
|
"Probability": { |
|
"Average Score": 78.6942843346463, |
|
"Standard Deviation": null, |
|
"Rank": 7 |
|
}, |
|
"Logical": { |
|
"Average Score": 85.68921109829361, |
|
"Standard Deviation": null, |
|
"Rank": 10 |
|
}, |
|
"Social": { |
|
"Average Score": 81.79892848722542, |
|
"Standard Deviation": null, |
|
"Rank": 10 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 81.46805623180109, |
|
"Standard Deviation": null, |
|
"Rank": 10 |
|
}, |
|
"CPP": { |
|
"Average Score": 88.3877070580296, |
|
"Standard Deviation": null, |
|
"Rank": 3 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "claude-3.5-sonnet", |
|
"organization": "Anthropic", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024/04" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 88.43557924843628, |
|
"Standard Deviation": 5.680338106806327, |
|
"Rank": 7 |
|
}, |
|
"Geometry": { |
|
"Average Score": 76.26169400931595, |
|
"Standard Deviation": null, |
|
"Rank": 10 |
|
}, |
|
"Algebra": { |
|
"Average Score": 77.15040433072186, |
|
"Standard Deviation": null, |
|
"Rank": 13 |
|
}, |
|
"Probability": { |
|
"Average Score": 73.9942759783754, |
|
"Standard Deviation": null, |
|
"Rank": 11 |
|
}, |
|
"Logical": { |
|
"Average Score": 89.70827617930533, |
|
"Standard Deviation": null, |
|
"Rank": 7 |
|
}, |
|
"Social": { |
|
"Average Score": 97.3810636467068, |
|
"Standard Deviation": null, |
|
"Rank": 3 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 94.92819763202698, |
|
"Standard Deviation": null, |
|
"Rank": 3 |
|
}, |
|
"CPP": { |
|
"Average Score": 82.37734076815008, |
|
"Standard Deviation": null, |
|
"Rank": 5 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "o1-mini", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/10" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 96.12399889226096, |
|
"Standard Deviation": 0.5674965705992511, |
|
"Rank": 2 |
|
}, |
|
"Geometry": { |
|
"Average Score": 100.0, |
|
"Standard Deviation": null, |
|
"Rank": 1 |
|
}, |
|
"Algebra": { |
|
"Average Score": 100.0, |
|
"Standard Deviation": null, |
|
"Rank": 1 |
|
}, |
|
"Probability": { |
|
"Average Score": 100.0, |
|
"Standard Deviation": null, |
|
"Rank": 1 |
|
}, |
|
"Logical": { |
|
"Average Score": 96.52089445393929, |
|
"Standard Deviation": null, |
|
"Rank": 3 |
|
}, |
|
"Social": { |
|
"Average Score": 95.00695256918654, |
|
"Standard Deviation": null, |
|
"Rank": 5 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "o1-preview", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/10" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 91.08240629161766, |
|
"Standard Deviation": 4.83378135710071, |
|
"Rank": 5 |
|
}, |
|
"Geometry": { |
|
"Average Score": "N/A", |
|
"Standard Deviation": "N/A", |
|
"Rank": "N/A" |
|
}, |
|
"Algebra": { |
|
"Average Score": 98.1870991822192, |
|
"Standard Deviation": null, |
|
"Rank": 2 |
|
}, |
|
"Probability": { |
|
"Average Score": 94.12657646584134, |
|
"Standard Deviation": null, |
|
"Rank": 2 |
|
}, |
|
"Logical": { |
|
"Average Score": 100.0, |
|
"Standard Deviation": null, |
|
"Rank": 1 |
|
}, |
|
"Social": { |
|
"Average Score": 96.56802743955569, |
|
"Standard Deviation": null, |
|
"Rank": 4 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemini-1.5-flash-001", |
|
"organization": "Google", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/11" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 66.25275609135964, |
|
"Standard Deviation": 2.5314573702881438, |
|
"Rank": 20 |
|
}, |
|
"Geometry": { |
|
"Average Score": 66.8010242138006, |
|
"Standard Deviation": null, |
|
"Rank": 13 |
|
}, |
|
"Algebra": { |
|
"Average Score": 78.24639082497596, |
|
"Standard Deviation": null, |
|
"Rank": 12 |
|
}, |
|
"Probability": { |
|
"Average Score": 67.84602916736804, |
|
"Standard Deviation": null, |
|
"Rank": 15 |
|
}, |
|
"Logical": { |
|
"Average Score": 72.76845749138818, |
|
"Standard Deviation": null, |
|
"Rank": 17 |
|
}, |
|
"Social": { |
|
"Average Score": 68.57728479711058, |
|
"Standard Deviation": null, |
|
"Rank": 16 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 75.47188329078935, |
|
"Standard Deviation": null, |
|
"Rank": 12 |
|
}, |
|
"CPP": { |
|
"Average Score": 72.1127762005651, |
|
"Standard Deviation": null, |
|
"Rank": 10 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gpt4-1106", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024/04" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 85.660054434658, |
|
"Standard Deviation": 7.392502344300497, |
|
"Rank": 10 |
|
}, |
|
"Geometry": { |
|
"Average Score": 63.36396165140893, |
|
"Standard Deviation": null, |
|
"Rank": 15 |
|
}, |
|
"Algebra": { |
|
"Average Score": 74.67191687355754, |
|
"Standard Deviation": null, |
|
"Rank": 15 |
|
}, |
|
"Probability": { |
|
"Average Score": 71.35141952665965, |
|
"Standard Deviation": null, |
|
"Rank": 14 |
|
}, |
|
"Logical": { |
|
"Average Score": 76.34506017196868, |
|
"Standard Deviation": null, |
|
"Rank": 15 |
|
}, |
|
"Social": { |
|
"Average Score": 46.00126575332808, |
|
"Standard Deviation": null, |
|
"Rank": 25 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 78.70156756289569, |
|
"Standard Deviation": null, |
|
"Rank": 11 |
|
}, |
|
"CPP": { |
|
"Average Score": 69.11824072252848, |
|
"Standard Deviation": null, |
|
"Rank": 12 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemma-2-27b-it", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2024/06" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 70.82622192650408, |
|
"Standard Deviation": 0.18962869075029884, |
|
"Rank": 18 |
|
}, |
|
"Geometry": { |
|
"Average Score": 58.25724467150374, |
|
"Standard Deviation": null, |
|
"Rank": 16 |
|
}, |
|
"Algebra": { |
|
"Average Score": 73.71614711121721, |
|
"Standard Deviation": null, |
|
"Rank": 16 |
|
}, |
|
"Probability": { |
|
"Average Score": 66.08200742339983, |
|
"Standard Deviation": null, |
|
"Rank": 17 |
|
}, |
|
"Logical": { |
|
"Average Score": 72.76841354275011, |
|
"Standard Deviation": null, |
|
"Rank": 17 |
|
}, |
|
"Social": { |
|
"Average Score": 53.736358144621576, |
|
"Standard Deviation": null, |
|
"Rank": 21 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 68.1178055540124, |
|
"Standard Deviation": null, |
|
"Rank": 17 |
|
}, |
|
"CPP": { |
|
"Average Score": 63.28920072143611, |
|
"Standard Deviation": null, |
|
"Rank": 14 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "claude-3-opus", |
|
"organization": "Anthropic", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/08" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 82.28903171580336, |
|
"Standard Deviation": 10.093273304495547, |
|
"Rank": 11 |
|
}, |
|
"Geometry": { |
|
"Average Score": 57.98602891013921, |
|
"Standard Deviation": null, |
|
"Rank": 17 |
|
}, |
|
"Algebra": { |
|
"Average Score": 73.54334730242743, |
|
"Standard Deviation": null, |
|
"Rank": 18 |
|
}, |
|
"Probability": { |
|
"Average Score": 67.8341594991468, |
|
"Standard Deviation": null, |
|
"Rank": 15 |
|
}, |
|
"Logical": { |
|
"Average Score": 78.31155849680502, |
|
"Standard Deviation": null, |
|
"Rank": 12 |
|
}, |
|
"Social": { |
|
"Average Score": 90.45833112761075, |
|
"Standard Deviation": null, |
|
"Rank": 8 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 85.97349470177741, |
|
"Standard Deviation": null, |
|
"Rank": 8 |
|
}, |
|
"CPP": { |
|
"Average Score": 73.5404403567132, |
|
"Standard Deviation": null, |
|
"Rank": 8 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemma-2-9b-it-simpo", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2024/07" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": "N/A", |
|
"Standard Deviation": "N/A", |
|
"Rank": "N/A" |
|
}, |
|
"Geometry": { |
|
"Average Score": 52.80896798216458, |
|
"Standard Deviation": null, |
|
"Rank": 19 |
|
}, |
|
"Algebra": { |
|
"Average Score": 69.60260038105677, |
|
"Standard Deviation": null, |
|
"Rank": 19 |
|
}, |
|
"Probability": { |
|
"Average Score": 59.52630271491633, |
|
"Standard Deviation": null, |
|
"Rank": 21 |
|
}, |
|
"Logical": { |
|
"Average Score": 63.57920031465781, |
|
"Standard Deviation": null, |
|
"Rank": 23 |
|
}, |
|
"Social": { |
|
"Average Score": 79.90950201631269, |
|
"Standard Deviation": null, |
|
"Rank": 11 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 90.36508196626548, |
|
"Standard Deviation": null, |
|
"Rank": 5 |
|
}, |
|
"CPP": { |
|
"Average Score": 73.43757596214863, |
|
"Standard Deviation": null, |
|
"Rank": 9 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "qwen1.5-72b-chat", |
|
"organization": "Alibaba", |
|
"license": "Qianwen LICENSE", |
|
"knowledge_cutoff": "2024/03" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 65.26710370586439, |
|
"Standard Deviation": 9.198700753743012, |
|
"Rank": 19 |
|
}, |
|
"Geometry": { |
|
"Average Score": 48.52417714351894, |
|
"Standard Deviation": null, |
|
"Rank": 24 |
|
}, |
|
"Algebra": { |
|
"Average Score": 68.55765479604507, |
|
"Standard Deviation": null, |
|
"Rank": 20 |
|
}, |
|
"Probability": { |
|
"Average Score": 49.52382148131357, |
|
"Standard Deviation": null, |
|
"Rank": 26 |
|
}, |
|
"Logical": { |
|
"Average Score": 37.33563924001827, |
|
"Standard Deviation": null, |
|
"Rank": 35 |
|
}, |
|
"Social": { |
|
"Average Score": 46.00141195402727, |
|
"Standard Deviation": null, |
|
"Rank": 25 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 52.625823960166215, |
|
"Standard Deviation": null, |
|
"Rank": 23 |
|
}, |
|
"CPP": { |
|
"Average Score": 48.69302376665551, |
|
"Standard Deviation": null, |
|
"Rank": 20 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "qwen1.5-32b-chat", |
|
"organization": "Alibaba", |
|
"license": "Qianwen LICENSE", |
|
"knowledge_cutoff": "2024/03" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 46.74335731441104, |
|
"Standard Deviation": 4.096227849530709, |
|
"Rank": 28 |
|
}, |
|
"Geometry": { |
|
"Average Score": 44.96670224519297, |
|
"Standard Deviation": null, |
|
"Rank": 26 |
|
}, |
|
"Algebra": { |
|
"Average Score": 63.19715848628476, |
|
"Standard Deviation": null, |
|
"Rank": 23 |
|
}, |
|
"Probability": { |
|
"Average Score": 48.59873650270336, |
|
"Standard Deviation": null, |
|
"Rank": 27 |
|
}, |
|
"Logical": { |
|
"Average Score": 42.028753105249216, |
|
"Standard Deviation": null, |
|
"Rank": 33 |
|
}, |
|
"Social": { |
|
"Average Score": 43.183938768454986, |
|
"Standard Deviation": null, |
|
"Rank": 28 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 47.84488021045937, |
|
"Standard Deviation": null, |
|
"Rank": 26 |
|
}, |
|
"CPP": { |
|
"Average Score": 45.14284028264288, |
|
"Standard Deviation": null, |
|
"Rank": 24 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "google-gemma-2-9b-it", |
|
"organization": "Google", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024/06" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 60.71065949101693, |
|
"Standard Deviation": 0.12283018509137462, |
|
"Rank": 23 |
|
}, |
|
"Geometry": { |
|
"Average Score": 52.49270527783856, |
|
"Standard Deviation": null, |
|
"Rank": 20 |
|
}, |
|
"Algebra": { |
|
"Average Score": 63.446032975128176, |
|
"Standard Deviation": null, |
|
"Rank": 21 |
|
}, |
|
"Probability": { |
|
"Average Score": 63.95287475488081, |
|
"Standard Deviation": null, |
|
"Rank": 20 |
|
}, |
|
"Logical": { |
|
"Average Score": 70.18644584116615, |
|
"Standard Deviation": null, |
|
"Rank": 20 |
|
}, |
|
"Social": { |
|
"Average Score": 86.45401862572464, |
|
"Standard Deviation": null, |
|
"Rank": 9 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 57.56342217758078, |
|
"Standard Deviation": null, |
|
"Rank": 20 |
|
}, |
|
"CPP": { |
|
"Average Score": 54.03167523687635, |
|
"Standard Deviation": null, |
|
"Rank": 17 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "yi-1.5-34b-chat", |
|
"organization": "01 AI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024/05" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 71.53811567931923, |
|
"Standard Deviation": 0.4838075734512934, |
|
"Rank": 17 |
|
}, |
|
"Geometry": { |
|
"Average Score": 53.98343904373819, |
|
"Standard Deviation": null, |
|
"Rank": 18 |
|
}, |
|
"Algebra": { |
|
"Average Score": 63.317896075817885, |
|
"Standard Deviation": null, |
|
"Rank": 22 |
|
}, |
|
"Probability": { |
|
"Average Score": 64.73492918491159, |
|
"Standard Deviation": null, |
|
"Rank": 19 |
|
}, |
|
"Logical": { |
|
"Average Score": 66.39420245024361, |
|
"Standard Deviation": null, |
|
"Rank": 21 |
|
}, |
|
"Social": { |
|
"Average Score": 53.73650350964252, |
|
"Standard Deviation": null, |
|
"Rank": 21 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 56.722360677914686, |
|
"Standard Deviation": null, |
|
"Rank": 21 |
|
}, |
|
"CPP": { |
|
"Average Score": 52.148798061768964, |
|
"Standard Deviation": null, |
|
"Rank": 18 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "meta-llama-3.1-70b-instruct", |
|
"organization": "Meta", |
|
"license": "Llama 3.1 Community", |
|
"knowledge_cutoff": "2023/12" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 74.01502078434305, |
|
"Standard Deviation": 0.24116839515156926, |
|
"Rank": 15 |
|
}, |
|
"Geometry": { |
|
"Average Score": 66.80097850274383, |
|
"Standard Deviation": null, |
|
"Rank": 13 |
|
}, |
|
"Algebra": { |
|
"Average Score": 74.7667367179752, |
|
"Standard Deviation": null, |
|
"Rank": 14 |
|
}, |
|
"Probability": { |
|
"Average Score": 66.0819470113051, |
|
"Standard Deviation": null, |
|
"Rank": 17 |
|
}, |
|
"Logical": { |
|
"Average Score": 73.68238947162197, |
|
"Standard Deviation": null, |
|
"Rank": 16 |
|
}, |
|
"Social": { |
|
"Average Score": 68.577541438994, |
|
"Standard Deviation": null, |
|
"Rank": 16 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 70.4019514562452, |
|
"Standard Deviation": null, |
|
"Rank": 15 |
|
}, |
|
"CPP": { |
|
"Average Score": 84.36815192532764, |
|
"Standard Deviation": null, |
|
"Rank": 4 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "meta-llama-3.1-8b-instruct", |
|
"organization": "Meta", |
|
"license": "Llama 3.1 Community", |
|
"knowledge_cutoff": "2023/12" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 55.268736955905695, |
|
"Standard Deviation": 7.060517225126177, |
|
"Rank": 26 |
|
}, |
|
"Geometry": { |
|
"Average Score": 42.44262022417502, |
|
"Standard Deviation": null, |
|
"Rank": 28 |
|
}, |
|
"Algebra": { |
|
"Average Score": 60.632347391080486, |
|
"Standard Deviation": null, |
|
"Rank": 25 |
|
}, |
|
"Probability": { |
|
"Average Score": 52.372362507453694, |
|
"Standard Deviation": null, |
|
"Rank": 24 |
|
}, |
|
"Logical": { |
|
"Average Score": 54.17571378414435, |
|
"Standard Deviation": null, |
|
"Rank": 28 |
|
}, |
|
"Social": { |
|
"Average Score": 39.07966801070027, |
|
"Standard Deviation": null, |
|
"Rank": 31 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 45.0170262190059, |
|
"Standard Deviation": null, |
|
"Rank": 29 |
|
}, |
|
"CPP": { |
|
"Average Score": 44.41846841004584, |
|
"Standard Deviation": null, |
|
"Rank": 26 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gpt3.5-turbo-0125", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2021/09" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 29.17379433602279, |
|
"Standard Deviation": 2.6813415847393878, |
|
"Rank": 44 |
|
}, |
|
"Geometry": { |
|
"Average Score": 51.47279337094397, |
|
"Standard Deviation": null, |
|
"Rank": 21 |
|
}, |
|
"Algebra": { |
|
"Average Score": 59.03601450977881, |
|
"Standard Deviation": null, |
|
"Rank": 26 |
|
}, |
|
"Probability": { |
|
"Average Score": 46.71541304474977, |
|
"Standard Deviation": null, |
|
"Rank": 28 |
|
}, |
|
"Logical": { |
|
"Average Score": 20.82026871015984, |
|
"Standard Deviation": null, |
|
"Rank": 46 |
|
}, |
|
"Social": { |
|
"Average Score": 28.31096293069848, |
|
"Standard Deviation": null, |
|
"Rank": 41 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 42.899594571904004, |
|
"Standard Deviation": null, |
|
"Rank": 31 |
|
}, |
|
"CPP": { |
|
"Average Score": 40.46958736582551, |
|
"Standard Deviation": null, |
|
"Rank": 29 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "llama-3-70b-instruct", |
|
"organization": "Meta", |
|
"license": "Llama 3 Community", |
|
"knowledge_cutoff": "2023/12" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 65.90407336557487, |
|
"Standard Deviation": 66.63940143516267, |
|
"Rank": 24 |
|
}, |
|
"Geometry": { |
|
"Average Score": 46.40555349958932, |
|
"Standard Deviation": null, |
|
"Rank": 25 |
|
}, |
|
"Algebra": { |
|
"Average Score": 60.86276607976933, |
|
"Standard Deviation": null, |
|
"Rank": 24 |
|
}, |
|
"Probability": { |
|
"Average Score": 55.0233135868055, |
|
"Standard Deviation": null, |
|
"Rank": 22 |
|
}, |
|
"Logical": { |
|
"Average Score": 83.99546392889077, |
|
"Standard Deviation": null, |
|
"Rank": 11 |
|
}, |
|
"Social": { |
|
"Average Score": 47.90189246663785, |
|
"Standard Deviation": null, |
|
"Rank": 23 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 70.40198909396582, |
|
"Standard Deviation": null, |
|
"Rank": 15 |
|
}, |
|
"CPP": { |
|
"Average Score": 65.32140697218945, |
|
"Standard Deviation": null, |
|
"Rank": 13 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "claude-3-sonnet", |
|
"organization": "Anthropic", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/08" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 64.4278622266347, |
|
"Standard Deviation": 3.089828107392469, |
|
"Rank": 21 |
|
}, |
|
"Geometry": { |
|
"Average Score": 51.4677627365698, |
|
"Standard Deviation": null, |
|
"Rank": 21 |
|
}, |
|
"Algebra": { |
|
"Average Score": 57.157810499255426, |
|
"Standard Deviation": null, |
|
"Rank": 27 |
|
}, |
|
"Probability": { |
|
"Average Score": 54.68761427070592, |
|
"Standard Deviation": null, |
|
"Rank": 23 |
|
}, |
|
"Logical": { |
|
"Average Score": 65.8346271849297, |
|
"Standard Deviation": null, |
|
"Rank": 22 |
|
}, |
|
"Social": { |
|
"Average Score": 62.842721798877186, |
|
"Standard Deviation": null, |
|
"Rank": 18 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 66.1914400411681, |
|
"Standard Deviation": null, |
|
"Rank": 18 |
|
}, |
|
"CPP": { |
|
"Average Score": 61.33538592327427, |
|
"Standard Deviation": null, |
|
"Rank": 15 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "qwen1.5-14b-chat", |
|
"organization": "Alibaba", |
|
"license": "Qianwen LICENSE", |
|
"knowledge_cutoff": "2024/02" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 44.920016997055804, |
|
"Standard Deviation": 0.3041914765974254, |
|
"Rank": 30 |
|
}, |
|
"Geometry": { |
|
"Average Score": 36.40735570120079, |
|
"Standard Deviation": null, |
|
"Rank": 30 |
|
}, |
|
"Algebra": { |
|
"Average Score": 56.004717588310726, |
|
"Standard Deviation": null, |
|
"Rank": 28 |
|
}, |
|
"Probability": { |
|
"Average Score": 39.24866255465088, |
|
"Standard Deviation": null, |
|
"Rank": 33 |
|
}, |
|
"Logical": { |
|
"Average Score": 35.15462916949486, |
|
"Standard Deviation": null, |
|
"Rank": 38 |
|
}, |
|
"Social": { |
|
"Average Score": 35.236185321936766, |
|
"Standard Deviation": null, |
|
"Rank": 34 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 40.803706763362605, |
|
"Standard Deviation": null, |
|
"Rank": 34 |
|
}, |
|
"CPP": { |
|
"Average Score": 38.552779976347026, |
|
"Standard Deviation": null, |
|
"Rank": 31 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "claude-3-haiku", |
|
"organization": "Anthropic", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/08" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 53.46814061793852, |
|
"Standard Deviation": 10.143567097006747, |
|
"Rank": 25 |
|
}, |
|
"Geometry": { |
|
"Average Score": 42.87542087805953, |
|
"Standard Deviation": null, |
|
"Rank": 27 |
|
}, |
|
"Algebra": { |
|
"Average Score": 53.706856083803686, |
|
"Standard Deviation": null, |
|
"Rank": 30 |
|
}, |
|
"Probability": { |
|
"Average Score": 49.80372052799326, |
|
"Standard Deviation": null, |
|
"Rank": 25 |
|
}, |
|
"Logical": { |
|
"Average Score": 62.585349577709394, |
|
"Standard Deviation": null, |
|
"Rank": 24 |
|
}, |
|
"Social": { |
|
"Average Score": 57.25601125762336, |
|
"Standard Deviation": null, |
|
"Rank": 19 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 60.48921113945562, |
|
"Standard Deviation": null, |
|
"Rank": 19 |
|
}, |
|
"CPP": { |
|
"Average Score": 56.40200048817984, |
|
"Standard Deviation": null, |
|
"Rank": 16 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "claude-2.1", |
|
"organization": "Anthropic", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "Unknown" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 39.855928282633364, |
|
"Standard Deviation": 8.396129652430814, |
|
"Rank": 35 |
|
}, |
|
"Geometry": { |
|
"Average Score": 51.1749207092159, |
|
"Standard Deviation": null, |
|
"Rank": 23 |
|
}, |
|
"Algebra": { |
|
"Average Score": 53.05386216145516, |
|
"Standard Deviation": null, |
|
"Rank": 31 |
|
}, |
|
"Probability": { |
|
"Average Score": 44.42150447611455, |
|
"Standard Deviation": null, |
|
"Rank": 30 |
|
}, |
|
"Logical": { |
|
"Average Score": 60.51381867118053, |
|
"Standard Deviation": null, |
|
"Rank": 25 |
|
}, |
|
"Social": { |
|
"Average Score": 38.492280755756035, |
|
"Standard Deviation": null, |
|
"Rank": 32 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 50.66182745698702, |
|
"Standard Deviation": null, |
|
"Rank": 24 |
|
}, |
|
"CPP": { |
|
"Average Score": 47.23672563994903, |
|
"Standard Deviation": null, |
|
"Rank": 21 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "mistral-8x7b-instruct-v0.1", |
|
"organization": "Mistral", |
|
"license": "Apache 2.0", |
|
"knowledge_cutoff": "2023/12" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 42.70451051343715, |
|
"Standard Deviation": 9.965602920103015, |
|
"Rank": 31 |
|
}, |
|
"Geometry": { |
|
"Average Score": 33.473933494899164, |
|
"Standard Deviation": null, |
|
"Rank": 34 |
|
}, |
|
"Algebra": { |
|
"Average Score": 48.99207852115047, |
|
"Standard Deviation": null, |
|
"Rank": 34 |
|
}, |
|
"Probability": { |
|
"Average Score": 44.46936520340586, |
|
"Standard Deviation": null, |
|
"Rank": 30 |
|
}, |
|
"Logical": { |
|
"Average Score": 42.656238987207246, |
|
"Standard Deviation": null, |
|
"Rank": 31 |
|
}, |
|
"Social": { |
|
"Average Score": 30.32900110312259, |
|
"Standard Deviation": null, |
|
"Rank": 40 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 47.047104057571026, |
|
"Standard Deviation": null, |
|
"Rank": 27 |
|
}, |
|
"CPP": { |
|
"Average Score": 44.533118241976666, |
|
"Standard Deviation": null, |
|
"Rank": 25 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "claude-2.0", |
|
"organization": "Anthropic", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "Unknown" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 33.53990717968659, |
|
"Standard Deviation": 7.640386327990536, |
|
"Rank": 41 |
|
}, |
|
"Geometry": { |
|
"Average Score": 38.40953902052666, |
|
"Standard Deviation": null, |
|
"Rank": 29 |
|
}, |
|
"Algebra": { |
|
"Average Score": 49.07235259762855, |
|
"Standard Deviation": null, |
|
"Rank": 33 |
|
}, |
|
"Probability": { |
|
"Average Score": 46.71546649299419, |
|
"Standard Deviation": null, |
|
"Rank": 28 |
|
}, |
|
"Logical": { |
|
"Average Score": 56.26908965013192, |
|
"Standard Deviation": null, |
|
"Rank": 27 |
|
}, |
|
"Social": { |
|
"Average Score": 47.84034165469707, |
|
"Standard Deviation": null, |
|
"Rank": 23 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 55.20362543510563, |
|
"Standard Deviation": null, |
|
"Rank": 22 |
|
}, |
|
"CPP": { |
|
"Average Score": 50.773143448036464, |
|
"Standard Deviation": null, |
|
"Rank": 19 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "starling-lm-7b-beta", |
|
"organization": "Nexusflow", |
|
"license": "Apache-2.0", |
|
"knowledge_cutoff": "2024/03" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 50.90398580969381, |
|
"Standard Deviation": 0.2839403187065694, |
|
"Rank": 27 |
|
}, |
|
"Geometry": { |
|
"Average Score": 34.653904247826965, |
|
"Standard Deviation": null, |
|
"Rank": 33 |
|
}, |
|
"Algebra": { |
|
"Average Score": 49.66265150940668, |
|
"Standard Deviation": null, |
|
"Rank": 32 |
|
}, |
|
"Probability": { |
|
"Average Score": 40.04695085773174, |
|
"Standard Deviation": null, |
|
"Rank": 32 |
|
}, |
|
"Logical": { |
|
"Average Score": 48.02284849364292, |
|
"Standard Deviation": null, |
|
"Rank": 29 |
|
}, |
|
"Social": { |
|
"Average Score": 42.82322308642107, |
|
"Standard Deviation": null, |
|
"Rank": 29 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 40.54467030566931, |
|
"Standard Deviation": null, |
|
"Rank": 35 |
|
}, |
|
"CPP": { |
|
"Average Score": 38.27587102395908, |
|
"Standard Deviation": null, |
|
"Rank": 32 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemini-1.0-pro-001", |
|
"organization": "Google", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/04" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 37.91102687366529, |
|
"Standard Deviation": 15.15111885239772, |
|
"Rank": 38 |
|
}, |
|
"Geometry": { |
|
"Average Score": 35.480853719259684, |
|
"Standard Deviation": null, |
|
"Rank": 32 |
|
}, |
|
"Algebra": { |
|
"Average Score": 48.08542847805497, |
|
"Standard Deviation": null, |
|
"Rank": 35 |
|
}, |
|
"Probability": { |
|
"Average Score": 29.862669786973395, |
|
"Standard Deviation": null, |
|
"Rank": 42 |
|
}, |
|
"Logical": { |
|
"Average Score": 24.141794297157134, |
|
"Standard Deviation": null, |
|
"Rank": 43 |
|
}, |
|
"Social": { |
|
"Average Score": 15.062345665891504, |
|
"Standard Deviation": null, |
|
"Rank": 51 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 46.52522766257804, |
|
"Standard Deviation": null, |
|
"Rank": 28 |
|
}, |
|
"CPP": { |
|
"Average Score": 45.22204471452975, |
|
"Standard Deviation": null, |
|
"Rank": 23 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "openchat-3.5-0106", |
|
"organization": "OpenChat", |
|
"license": "Apache-2.0", |
|
"knowledge_cutoff": "2024/01" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 41.34314082389491, |
|
"Standard Deviation": 4.394481877390224, |
|
"Rank": 32 |
|
}, |
|
"Geometry": { |
|
"Average Score": 29.859015723426758, |
|
"Standard Deviation": null, |
|
"Rank": 36 |
|
}, |
|
"Algebra": { |
|
"Average Score": 45.79428201943078, |
|
"Standard Deviation": null, |
|
"Rank": 36 |
|
}, |
|
"Probability": { |
|
"Average Score": 38.766888608782956, |
|
"Standard Deviation": null, |
|
"Rank": 34 |
|
}, |
|
"Logical": { |
|
"Average Score": 42.1345774485532, |
|
"Standard Deviation": null, |
|
"Rank": 32 |
|
}, |
|
"Social": { |
|
"Average Score": 32.07155544930587, |
|
"Standard Deviation": null, |
|
"Rank": 39 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 35.28601797606463, |
|
"Standard Deviation": null, |
|
"Rank": 37 |
|
}, |
|
"CPP": { |
|
"Average Score": 33.70639271807677, |
|
"Standard Deviation": null, |
|
"Rank": 33 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "openchat-3.5", |
|
"organization": "OpenChat", |
|
"license": "Apache-2.0", |
|
"knowledge_cutoff": "2023/11" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 39.60454188051808, |
|
"Standard Deviation": 0.8232501722386516, |
|
"Rank": 36 |
|
}, |
|
"Geometry": { |
|
"Average Score": 30.77657388742533, |
|
"Standard Deviation": null, |
|
"Rank": 35 |
|
}, |
|
"Algebra": { |
|
"Average Score": 42.13028451761782, |
|
"Standard Deviation": null, |
|
"Rank": 38 |
|
}, |
|
"Probability": { |
|
"Average Score": 34.817635171077754, |
|
"Standard Deviation": null, |
|
"Rank": 37 |
|
}, |
|
"Logical": { |
|
"Average Score": 36.21944706732088, |
|
"Standard Deviation": null, |
|
"Rank": 36 |
|
}, |
|
"Social": { |
|
"Average Score": 37.59265084241427, |
|
"Standard Deviation": null, |
|
"Rank": 33 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 37.21911183748652, |
|
"Standard Deviation": null, |
|
"Rank": 36 |
|
}, |
|
"CPP": { |
|
"Average Score": 33.020911255646965, |
|
"Standard Deviation": null, |
|
"Rank": 34 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "command-r-(08-2024)", |
|
"organization": "Cohere", |
|
"license": "CC-BY-NC-4.0", |
|
"knowledge_cutoff": "2024/08" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 45.84310421663912, |
|
"Standard Deviation": 0.14535750785421472, |
|
"Rank": 29 |
|
}, |
|
"Geometry": { |
|
"Average Score": 36.33550343578038, |
|
"Standard Deviation": null, |
|
"Rank": 31 |
|
}, |
|
"Algebra": { |
|
"Average Score": 41.87079446639028, |
|
"Standard Deviation": null, |
|
"Rank": 39 |
|
}, |
|
"Probability": { |
|
"Average Score": 36.87662939858684, |
|
"Standard Deviation": null, |
|
"Rank": 36 |
|
}, |
|
"Logical": { |
|
"Average Score": 26.22482921268266, |
|
"Standard Deviation": null, |
|
"Rank": 41 |
|
}, |
|
"Social": { |
|
"Average Score": 35.11019761697373, |
|
"Standard Deviation": null, |
|
"Rank": 35 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 41.81772722027254, |
|
"Standard Deviation": null, |
|
"Rank": 33 |
|
}, |
|
"CPP": { |
|
"Average Score": 39.61492485677676, |
|
"Standard Deviation": null, |
|
"Rank": 30 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemma-1.1-7b-it", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2024/02" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 35.873210924652795, |
|
"Standard Deviation": 6.462625645064649, |
|
"Rank": 37 |
|
}, |
|
"Geometry": { |
|
"Average Score": 25.79207201693066, |
|
"Standard Deviation": null, |
|
"Rank": 40 |
|
}, |
|
"Algebra": { |
|
"Average Score": 40.58046616460041, |
|
"Standard Deviation": null, |
|
"Rank": 40 |
|
}, |
|
"Probability": { |
|
"Average Score": 29.581773053230897, |
|
"Standard Deviation": null, |
|
"Rank": 43 |
|
}, |
|
"Logical": { |
|
"Average Score": 41.99821650962693, |
|
"Standard Deviation": null, |
|
"Rank": 33 |
|
}, |
|
"Social": { |
|
"Average Score": 24.39015213949678, |
|
"Standard Deviation": null, |
|
"Rank": 43 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 45.01706482033765, |
|
"Standard Deviation": null, |
|
"Rank": 29 |
|
}, |
|
"CPP": { |
|
"Average Score": 42.666504105798204, |
|
"Standard Deviation": null, |
|
"Rank": 27 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "llama3-8b-instruct", |
|
"organization": "Meta", |
|
"license": "Llama 3 Community", |
|
"knowledge_cutoff": "2023/03" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 39.00917270775336, |
|
"Standard Deviation": 3.999506140299149, |
|
"Rank": 39 |
|
}, |
|
"Geometry": { |
|
"Average Score": 29.224089668837465, |
|
"Standard Deviation": null, |
|
"Rank": 38 |
|
}, |
|
"Algebra": { |
|
"Average Score": 42.90961619082775, |
|
"Standard Deviation": null, |
|
"Rank": 37 |
|
}, |
|
"Probability": { |
|
"Average Score": 34.15721355738147, |
|
"Standard Deviation": null, |
|
"Rank": 38 |
|
}, |
|
"Logical": { |
|
"Average Score": 58.39773915370141, |
|
"Standard Deviation": null, |
|
"Rank": 26 |
|
}, |
|
"Social": { |
|
"Average Score": 40.88535401371015, |
|
"Standard Deviation": null, |
|
"Rank": 30 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 49.70839372661025, |
|
"Standard Deviation": null, |
|
"Rank": 25 |
|
}, |
|
"CPP": { |
|
"Average Score": 45.35392139264795, |
|
"Standard Deviation": null, |
|
"Rank": 22 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemma-2-2b-it", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2024/07" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 57.45780847204313, |
|
"Standard Deviation": 16.310023687014333, |
|
"Rank": 22 |
|
}, |
|
"Geometry": { |
|
"Average Score": 29.820233374501843, |
|
"Standard Deviation": null, |
|
"Rank": 36 |
|
}, |
|
"Algebra": { |
|
"Average Score": 39.873024674507214, |
|
"Standard Deviation": null, |
|
"Rank": 41 |
|
}, |
|
"Probability": { |
|
"Average Score": 31.85692359301203, |
|
"Standard Deviation": null, |
|
"Rank": 40 |
|
}, |
|
"Logical": { |
|
"Average Score": 43.93437465788311, |
|
"Standard Deviation": null, |
|
"Rank": 30 |
|
}, |
|
"Social": { |
|
"Average Score": 44.689420554662476, |
|
"Standard Deviation": null, |
|
"Rank": 27 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 32.05704364512495, |
|
"Standard Deviation": null, |
|
"Rank": 40 |
|
}, |
|
"CPP": { |
|
"Average Score": 30.53406933106768, |
|
"Standard Deviation": null, |
|
"Rank": 36 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "starling-lm-7b-alpha", |
|
"organization": "Nexusflow", |
|
"license": "Apache-2.0", |
|
"knowledge_cutoff": "2023/11" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 40.625443347641045, |
|
"Standard Deviation": 3.0544259540377268, |
|
"Rank": 34 |
|
}, |
|
"Geometry": { |
|
"Average Score": 26.171147508308422, |
|
"Standard Deviation": null, |
|
"Rank": 39 |
|
}, |
|
"Algebra": { |
|
"Average Score": 39.149463007523856, |
|
"Standard Deviation": null, |
|
"Rank": 42 |
|
}, |
|
"Probability": { |
|
"Average Score": 32.36862021879827, |
|
"Standard Deviation": null, |
|
"Rank": 39 |
|
}, |
|
"Logical": { |
|
"Average Score": 34.17344938419256, |
|
"Standard Deviation": null, |
|
"Rank": 39 |
|
}, |
|
"Social": { |
|
"Average Score": 35.06966333212518, |
|
"Standard Deviation": null, |
|
"Rank": 35 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 32.15932739848045, |
|
"Standard Deviation": null, |
|
"Rank": 39 |
|
}, |
|
"CPP": { |
|
"Average Score": 30.07926487356878, |
|
"Standard Deviation": null, |
|
"Rank": 37 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "qwen1.5-4b-chat", |
|
"organization": "Alibaba", |
|
"license": "Qianwen LICENSE", |
|
"knowledge_cutoff": "2024/02" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 11.723779019126527, |
|
"Standard Deviation": 0.856230353584155, |
|
"Rank": 53 |
|
}, |
|
"Geometry": { |
|
"Average Score": 16.072772563608115, |
|
"Standard Deviation": null, |
|
"Rank": 45 |
|
}, |
|
"Algebra": { |
|
"Average Score": 32.22626131587612, |
|
"Standard Deviation": null, |
|
"Rank": 44 |
|
}, |
|
"Probability": { |
|
"Average Score": 13.98282712349133, |
|
"Standard Deviation": null, |
|
"Rank": 48 |
|
}, |
|
"Logical": { |
|
"Average Score": 13.993097991375581, |
|
"Standard Deviation": null, |
|
"Rank": 51 |
|
}, |
|
"Social": { |
|
"Average Score": 22.955898106386442, |
|
"Standard Deviation": null, |
|
"Rank": 45 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 13.907481529463642, |
|
"Standard Deviation": null, |
|
"Rank": 51 |
|
}, |
|
"CPP": { |
|
"Average Score": 13.21208067122554, |
|
"Standard Deviation": null, |
|
"Rank": 47 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "command-r-(04-2024)", |
|
"organization": "Cohere", |
|
"license": "CC-BY-NC-4.0", |
|
"knowledge_cutoff": "2024/04" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 43.08187135994592, |
|
"Standard Deviation": 0.7654553730614279, |
|
"Rank": 33 |
|
}, |
|
"Geometry": { |
|
"Average Score": 24.037084801508428, |
|
"Standard Deviation": null, |
|
"Rank": 41 |
|
}, |
|
"Algebra": { |
|
"Average Score": 32.37474440275246, |
|
"Standard Deviation": null, |
|
"Rank": 43 |
|
}, |
|
"Probability": { |
|
"Average Score": 31.014039425232298, |
|
"Standard Deviation": null, |
|
"Rank": 41 |
|
}, |
|
"Logical": { |
|
"Average Score": 35.49507014348235, |
|
"Standard Deviation": null, |
|
"Rank": 37 |
|
}, |
|
"Social": { |
|
"Average Score": 34.782695172510856, |
|
"Standard Deviation": null, |
|
"Rank": 37 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 42.46395478814961, |
|
"Standard Deviation": null, |
|
"Rank": 32 |
|
}, |
|
"CPP": { |
|
"Average Score": 41.346336503003236, |
|
"Standard Deviation": null, |
|
"Rank": 28 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "vicuna-33b", |
|
"organization": "LMSYS", |
|
"license": "Non-commercial", |
|
"knowledge_cutoff": "2023/08" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 30.8582386682731, |
|
"Standard Deviation": 2.3851186735858945, |
|
"Rank": 42 |
|
}, |
|
"Geometry": { |
|
"Average Score": 17.058968577112452, |
|
"Standard Deviation": null, |
|
"Rank": 44 |
|
}, |
|
"Algebra": { |
|
"Average Score": 25.22004544023738, |
|
"Standard Deviation": null, |
|
"Rank": 45 |
|
}, |
|
"Probability": { |
|
"Average Score": 21.097169680647767, |
|
"Standard Deviation": null, |
|
"Rank": 46 |
|
}, |
|
"Logical": { |
|
"Average Score": 23.212667585279515, |
|
"Standard Deviation": null, |
|
"Rank": 45 |
|
}, |
|
"Social": { |
|
"Average Score": 32.357116321848025, |
|
"Standard Deviation": null, |
|
"Rank": 38 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 29.376389899632898, |
|
"Standard Deviation": null, |
|
"Rank": 42 |
|
}, |
|
"CPP": { |
|
"Average Score": 28.01838653090379, |
|
"Standard Deviation": null, |
|
"Rank": 38 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemma-7b-it", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2024/02" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 27.609692676933715, |
|
"Standard Deviation": 5.8350892031427435, |
|
"Rank": 45 |
|
}, |
|
"Geometry": { |
|
"Average Score": 20.127802528542947, |
|
"Standard Deviation": null, |
|
"Rank": 42 |
|
}, |
|
"Algebra": { |
|
"Average Score": 23.46400816161807, |
|
"Standard Deviation": null, |
|
"Rank": 47 |
|
}, |
|
"Probability": { |
|
"Average Score": 17.139514453170445, |
|
"Standard Deviation": null, |
|
"Rank": 47 |
|
}, |
|
"Logical": { |
|
"Average Score": 24.625290351028372, |
|
"Standard Deviation": null, |
|
"Rank": 42 |
|
}, |
|
"Social": { |
|
"Average Score": 26.715025606557614, |
|
"Standard Deviation": null, |
|
"Rank": 42 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 29.383105099269972, |
|
"Standard Deviation": null, |
|
"Rank": 41 |
|
}, |
|
"CPP": { |
|
"Average Score": 28.014658234926813, |
|
"Standard Deviation": null, |
|
"Rank": 39 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "mistral-7b-instruct-2", |
|
"organization": "Mistral", |
|
"license": "Apache 2.0", |
|
"knowledge_cutoff": "2023/12" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 32.583755237895794, |
|
"Standard Deviation": 1.6860156811686553, |
|
"Rank": 40 |
|
}, |
|
"Geometry": { |
|
"Average Score": 17.27716649229315, |
|
"Standard Deviation": null, |
|
"Rank": 43 |
|
}, |
|
"Algebra": { |
|
"Average Score": 23.58916877939791, |
|
"Standard Deviation": null, |
|
"Rank": 46 |
|
}, |
|
"Probability": { |
|
"Average Score": 25.1012270940144, |
|
"Standard Deviation": null, |
|
"Rank": 44 |
|
}, |
|
"Logical": { |
|
"Average Score": 29.07002036532878, |
|
"Standard Deviation": null, |
|
"Rank": 40 |
|
}, |
|
"Social": { |
|
"Average Score": 24.39006275978174, |
|
"Standard Deviation": null, |
|
"Rank": 43 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 32.76096708662236, |
|
"Standard Deviation": null, |
|
"Rank": 38 |
|
}, |
|
"CPP": { |
|
"Average Score": 31.382959631870822, |
|
"Standard Deviation": null, |
|
"Rank": 35 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "mistral-7b-instruct-1", |
|
"organization": "Mistral", |
|
"license": "Apache 2.0", |
|
"knowledge_cutoff": "2023/12" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 22.167930858422395, |
|
"Standard Deviation": 3.328543828571604, |
|
"Rank": 50 |
|
}, |
|
"Geometry": { |
|
"Average Score": 11.300762460776488, |
|
"Standard Deviation": null, |
|
"Rank": 49 |
|
}, |
|
"Algebra": { |
|
"Average Score": 21.016466430115493, |
|
"Standard Deviation": null, |
|
"Rank": 48 |
|
}, |
|
"Probability": { |
|
"Average Score": 24.506863192031716, |
|
"Standard Deviation": null, |
|
"Rank": 45 |
|
}, |
|
"Logical": { |
|
"Average Score": 17.0066100312336, |
|
"Standard Deviation": null, |
|
"Rank": 49 |
|
}, |
|
"Social": { |
|
"Average Score": 14.049392081101905, |
|
"Standard Deviation": null, |
|
"Rank": 52 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 20.796521445473058, |
|
"Standard Deviation": null, |
|
"Rank": 45 |
|
}, |
|
"CPP": { |
|
"Average Score": 18.929093202755805, |
|
"Standard Deviation": null, |
|
"Rank": 42 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "vicuna-13b", |
|
"organization": "LMSYS", |
|
"license": "Non-commercial", |
|
"knowledge_cutoff": "2023/07" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 20.105123059326157, |
|
"Standard Deviation": 4.100609090750239, |
|
"Rank": 51 |
|
}, |
|
"Geometry": { |
|
"Average Score": 13.080654946737525, |
|
"Standard Deviation": null, |
|
"Rank": 48 |
|
}, |
|
"Algebra": { |
|
"Average Score": 20.125194674408167, |
|
"Standard Deviation": null, |
|
"Rank": 49 |
|
}, |
|
"Probability": { |
|
"Average Score": 13.125942598704368, |
|
"Standard Deviation": null, |
|
"Rank": 49 |
|
}, |
|
"Logical": { |
|
"Average Score": 17.182300978389822, |
|
"Standard Deviation": null, |
|
"Rank": 48 |
|
}, |
|
"Social": { |
|
"Average Score": 16.258399348520832, |
|
"Standard Deviation": null, |
|
"Rank": 50 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 23.79065696739089, |
|
"Standard Deviation": null, |
|
"Rank": 44 |
|
}, |
|
"CPP": { |
|
"Average Score": 21.840013221590294, |
|
"Standard Deviation": null, |
|
"Rank": 40 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "zephyr-7b-beta", |
|
"organization": "HuggingFace", |
|
"license": "MIT", |
|
"knowledge_cutoff": "2023/10" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 11.581258432641418, |
|
"Standard Deviation": 1.677081510212375, |
|
"Rank": 54 |
|
}, |
|
"Geometry": { |
|
"Average Score": 8.432624521698594, |
|
"Standard Deviation": null, |
|
"Rank": 50 |
|
}, |
|
"Algebra": { |
|
"Average Score": 12.912859660357217, |
|
"Standard Deviation": null, |
|
"Rank": 51 |
|
}, |
|
"Probability": { |
|
"Average Score": 7.643552619113196, |
|
"Standard Deviation": null, |
|
"Rank": 54 |
|
}, |
|
"Logical": { |
|
"Average Score": 7.444095116649809, |
|
"Standard Deviation": null, |
|
"Rank": 55 |
|
}, |
|
"Social": { |
|
"Average Score": 0.0, |
|
"Standard Deviation": null, |
|
"Rank": 57 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 16.150157007299235, |
|
"Standard Deviation": null, |
|
"Rank": 49 |
|
}, |
|
"CPP": { |
|
"Average Score": 18.92902220864132, |
|
"Standard Deviation": null, |
|
"Rank": 43 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemma-1.1-2b-it", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2024/02" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 25.06653151900311, |
|
"Standard Deviation": 5.340973431345662, |
|
"Rank": 48 |
|
}, |
|
"Geometry": { |
|
"Average Score": 13.161686218568628, |
|
"Standard Deviation": null, |
|
"Rank": 47 |
|
}, |
|
"Algebra": { |
|
"Average Score": 15.592205919293873, |
|
"Standard Deviation": null, |
|
"Rank": 50 |
|
}, |
|
"Probability": { |
|
"Average Score": 8.305764696120711, |
|
"Standard Deviation": null, |
|
"Rank": 51 |
|
}, |
|
"Logical": { |
|
"Average Score": 10.940766703849592, |
|
"Standard Deviation": null, |
|
"Rank": 53 |
|
}, |
|
"Social": { |
|
"Average Score": 21.925546766366356, |
|
"Standard Deviation": null, |
|
"Rank": 46 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 18.700936936742952, |
|
"Standard Deviation": null, |
|
"Rank": 46 |
|
}, |
|
"CPP": { |
|
"Average Score": 20.724691953843916, |
|
"Standard Deviation": null, |
|
"Rank": 41 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "llama2-7b-chat", |
|
"organization": "Meta", |
|
"license": "Llama 2 Community", |
|
"knowledge_cutoff": "2023/07" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 25.633612357313762, |
|
"Standard Deviation": 2.805639153654191, |
|
"Rank": 46 |
|
}, |
|
"Geometry": { |
|
"Average Score": 5.825877827672446, |
|
"Standard Deviation": null, |
|
"Rank": 51 |
|
}, |
|
"Algebra": { |
|
"Average Score": 8.58657284915635, |
|
"Standard Deviation": null, |
|
"Rank": 53 |
|
}, |
|
"Probability": { |
|
"Average Score": 8.164826137672431, |
|
"Standard Deviation": null, |
|
"Rank": 53 |
|
}, |
|
"Logical": { |
|
"Average Score": 20.697630462723275, |
|
"Standard Deviation": null, |
|
"Rank": 47 |
|
}, |
|
"Social": { |
|
"Average Score": 18.13821609304045, |
|
"Standard Deviation": null, |
|
"Rank": 47 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 17.065363968846427, |
|
"Standard Deviation": null, |
|
"Rank": 47 |
|
}, |
|
"CPP": { |
|
"Average Score": 15.730513733660898, |
|
"Standard Deviation": null, |
|
"Rank": 45 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "gemma-2b-it", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2024/02" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 22.935122315202772, |
|
"Standard Deviation": 1.9451357494738446, |
|
"Rank": 49 |
|
}, |
|
"Geometry": { |
|
"Average Score": 15.523844579555126, |
|
"Standard Deviation": null, |
|
"Rank": 46 |
|
}, |
|
"Algebra": { |
|
"Average Score": 8.997563653883809, |
|
"Standard Deviation": null, |
|
"Rank": 52 |
|
}, |
|
"Probability": { |
|
"Average Score": 6.750305898269558, |
|
"Standard Deviation": null, |
|
"Rank": 55 |
|
}, |
|
"Logical": { |
|
"Average Score": 5.354222904092569, |
|
"Standard Deviation": null, |
|
"Rank": 56 |
|
}, |
|
"Social": { |
|
"Average Score": 10.938132042877358, |
|
"Standard Deviation": null, |
|
"Rank": 54 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 17.06532733699507, |
|
"Standard Deviation": null, |
|
"Rank": 47 |
|
}, |
|
"CPP": { |
|
"Average Score": 17.2715657115764, |
|
"Standard Deviation": null, |
|
"Rank": 44 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "llama2-13b-chat", |
|
"organization": "Meta", |
|
"license": "Llama 2 Community", |
|
"knowledge_cutoff": "2023/07" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 25.828530292775856, |
|
"Standard Deviation": 3.2503558704879296, |
|
"Rank": 47 |
|
}, |
|
"Geometry": { |
|
"Average Score": 4.119943280135397, |
|
"Standard Deviation": null, |
|
"Rank": 53 |
|
}, |
|
"Algebra": { |
|
"Average Score": 6.355347828676415, |
|
"Standard Deviation": null, |
|
"Rank": 54 |
|
}, |
|
"Probability": { |
|
"Average Score": 11.5585998384148, |
|
"Standard Deviation": null, |
|
"Rank": 50 |
|
}, |
|
"Logical": { |
|
"Average Score": 24.172674067890938, |
|
"Standard Deviation": null, |
|
"Rank": 43 |
|
}, |
|
"Social": { |
|
"Average Score": 17.850287642446094, |
|
"Standard Deviation": null, |
|
"Rank": 49 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 13.887442704655687, |
|
"Standard Deviation": null, |
|
"Rank": 52 |
|
}, |
|
"CPP": { |
|
"Average Score": 13.17258252933903, |
|
"Standard Deviation": null, |
|
"Rank": 48 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "vicuna-7b", |
|
"organization": "LMSYS", |
|
"license": "Non-commercial", |
|
"knowledge_cutoff": "2023/07" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 19.78471384913738, |
|
"Standard Deviation": 3.7936645273402276, |
|
"Rank": 52 |
|
}, |
|
"Geometry": { |
|
"Average Score": 5.434763675792798, |
|
"Standard Deviation": null, |
|
"Rank": 52 |
|
}, |
|
"Algebra": { |
|
"Average Score": 5.925959137419872, |
|
"Standard Deviation": null, |
|
"Rank": 55 |
|
}, |
|
"Probability": { |
|
"Average Score": 8.30566475354697, |
|
"Standard Deviation": null, |
|
"Rank": 51 |
|
}, |
|
"Logical": { |
|
"Average Score": 11.881223740003346, |
|
"Standard Deviation": null, |
|
"Rank": 52 |
|
}, |
|
"Social": { |
|
"Average Score": 12.864677350128595, |
|
"Standard Deviation": null, |
|
"Rank": 53 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 14.187574975522333, |
|
"Standard Deviation": null, |
|
"Rank": 50 |
|
}, |
|
"CPP": { |
|
"Average Score": 14.255194156624162, |
|
"Standard Deviation": null, |
|
"Rank": 46 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "koala-13b", |
|
"organization": "UC Berkeley", |
|
"license": "Non-commercial", |
|
"knowledge_cutoff": "2023/04" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 10.216910767982592, |
|
"Standard Deviation": 2.0597606260293655, |
|
"Rank": 55 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.1600118163292883, |
|
"Standard Deviation": null, |
|
"Rank": 54 |
|
}, |
|
"Algebra": { |
|
"Average Score": 2.2219841274068948, |
|
"Standard Deviation": null, |
|
"Rank": 56 |
|
}, |
|
"Probability": { |
|
"Average Score": 3.353938470588142, |
|
"Standard Deviation": null, |
|
"Rank": 56 |
|
}, |
|
"Logical": { |
|
"Average Score": 8.24436273551765, |
|
"Standard Deviation": null, |
|
"Rank": 54 |
|
}, |
|
"Social": { |
|
"Average Score": 10.96000067573448, |
|
"Standard Deviation": null, |
|
"Rank": 54 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 6.272570799004611, |
|
"Standard Deviation": null, |
|
"Rank": 53 |
|
}, |
|
"CPP": { |
|
"Average Score": 6.36433272373514, |
|
"Standard Deviation": null, |
|
"Rank": 49 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "openassistant-pythia-12b", |
|
"organization": "OpenAssistant", |
|
"license": "Non-commercial", |
|
"knowledge_cutoff": "2023/04" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 0.0, |
|
"Standard Deviation": 0.0, |
|
"Rank": 56 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.0, |
|
"Standard Deviation": null, |
|
"Rank": 55 |
|
}, |
|
"Algebra": { |
|
"Average Score": 0.0, |
|
"Standard Deviation": null, |
|
"Rank": 57 |
|
}, |
|
"Probability": { |
|
"Average Score": 0.0, |
|
"Standard Deviation": null, |
|
"Rank": 57 |
|
}, |
|
"Logical": { |
|
"Average Score": 0.0, |
|
"Standard Deviation": null, |
|
"Rank": 57 |
|
}, |
|
"Social": { |
|
"Average Score": 1.859688217710296, |
|
"Standard Deviation": null, |
|
"Rank": 56 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 0.0, |
|
"Standard Deviation": null, |
|
"Rank": 54 |
|
}, |
|
"CPP": { |
|
"Average Score": 0.0, |
|
"Standard Deviation": null, |
|
"Rank": 50 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "nemotron-70b", |
|
"organization": "NVIDIA", |
|
"license": "Unknown", |
|
"knowledge_cutoff": "Unknown" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 100.0, |
|
"Standard Deviation": 0.0, |
|
"Rank": 1 |
|
}, |
|
"Geometry": { |
|
"Average Score": 68.72757963233221, |
|
"Standard Deviation": null, |
|
"Rank": 12 |
|
}, |
|
"Algebra": { |
|
"Average Score": 73.71625129267943, |
|
"Standard Deviation": null, |
|
"Rank": 16 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 72.48678626772566, |
|
"Standard Deviation": null, |
|
"Rank": 14 |
|
}, |
|
"Logical": { |
|
"Average Score": 92.57864400540329, |
|
"Standard Deviation": null, |
|
"Rank": 5 |
|
}, |
|
"Social": { |
|
"Average Score": 99.63342284899149, |
|
"Standard Deviation": null, |
|
"Rank": 2 |
|
}, |
|
"Probability": { |
|
"Average Score": 75.30735899300154, |
|
"Standard Deviation": null, |
|
"Rank": 10 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "llama-3.2-3b-it", |
|
"organization": "Meta", |
|
"license": "Llama 3 Community", |
|
"knowledge_cutoff": "Unknown" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 29.47099904114387, |
|
"Standard Deviation": 1.6836027650802912, |
|
"Rank": 43 |
|
}, |
|
"Geometry": { |
|
"Average Score": 0.0, |
|
"Standard Deviation": 0.0, |
|
"Rank": 50 |
|
}, |
|
"Algebra": { |
|
"Average Score": 55.31592410564261, |
|
"Standard Deviation": null, |
|
"Rank": 29 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 28.667640602193643, |
|
"Standard Deviation": null, |
|
"Rank": 43 |
|
}, |
|
"Logical": { |
|
"Average Score": 15.35430947415723, |
|
"Standard Deviation": null, |
|
"Rank": 49 |
|
}, |
|
"Social": { |
|
"Average Score": 18.087938295545133, |
|
"Standard Deviation": null, |
|
"Rank": 48 |
|
}, |
|
"Probability": { |
|
"Average Score": 37.84631410688676, |
|
"Standard Deviation": null, |
|
"Rank": 35 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "yi-lightning", |
|
"organization": "01 AI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "Unknown" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 96.10303362688546, |
|
"Standard Deviation": 0.5365246195716372, |
|
"Rank": 3 |
|
}, |
|
"Geometry": { |
|
"Average Score": 77.09570683128703, |
|
"Standard Deviation": null, |
|
"Rank": 8 |
|
}, |
|
"Algebra": { |
|
"Average Score": 85.92132293392635, |
|
"Standard Deviation": null, |
|
"Rank": 6 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 95.7205664118507, |
|
"Standard Deviation": null, |
|
"Rank": 2 |
|
}, |
|
"Logical": { |
|
"Average Score": 94.60171867702756, |
|
"Standard Deviation": null, |
|
"Rank": 4 |
|
}, |
|
"Social": { |
|
"Average Score": 93.93680225135506, |
|
"Standard Deviation": null, |
|
"Rank": 6 |
|
}, |
|
"Probability": { |
|
"Average Score": 90.23858748317501, |
|
"Standard Deviation": null, |
|
"Rank": 3 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "glm-4-plus", |
|
"organization": "Zhipu AI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "Unknown" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 90.50303579501356, |
|
"Standard Deviation": 5.202472970969946, |
|
"Rank": 6 |
|
}, |
|
"Geometry": { |
|
"Average Score": 76.37543021571776, |
|
"Standard Deviation": null, |
|
"Rank": 9 |
|
}, |
|
"Algebra": { |
|
"Average Score": 81.39859078752944, |
|
"Standard Deviation": null, |
|
"Rank": 10 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 90.15506569759444, |
|
"Standard Deviation": null, |
|
"Rank": 6 |
|
}, |
|
"Logical": { |
|
"Average Score": 92.26403821208403, |
|
"Standard Deviation": null, |
|
"Rank": 6 |
|
}, |
|
"Social": { |
|
"Average Score": 100.0, |
|
"Standard Deviation": null, |
|
"Rank": 1 |
|
}, |
|
"Probability": { |
|
"Average Score": 73.99418447190348, |
|
"Standard Deviation": null, |
|
"Rank": 11 |
|
} |
|
} |
|
} |
|
] |