diff --git "a/src/results/models_2024-11-08-08:36:00.464224.json" "b/src/results/models_2024-11-08-08:36:00.464224.json" new file mode 100644--- /dev/null +++ "b/src/results/models_2024-11-08-08:36:00.464224.json" @@ -0,0 +1,3637 @@ +[ + { + "config": { + "model_name": "ChatGPT-4o-latest (2024-09-03)", + "organization": "OpenAI", + "license": "Proprietary", + "knowledge_cutoff": "2023/10" + }, + "results": { + "OVERALL": { + "Average Score": 94.49771627042422, + "Standard Deviation": 0.251607817784525, + "Rank": 4 + }, + "Geometry": { + "Average Score": 81.11505705795187, + "Standard Deviation": null, + "Rank": 6 + }, + "Algebra": { + "Average Score": 91.79122001491199, + "Standard Deviation": null, + "Rank": 8 + }, + "Probability": { + "Average Score": 88.00190397870577, + "Standard Deviation": null, + "Rank": 4 + }, + "Logical": { + "Average Score": 97.47223448912972, + "Standard Deviation": null, + "Rank": 2 + }, + "Social": { + "Average Score": 89.73262585993845, + "Standard Deviation": null, + "Rank": 7 + }, + "Chemistry": { + "Average Score": 90.48070030738856, + "Standard Deviation": null, + "Rank": 3 + }, + "CPP": { + "Average Score": 100.0, + "Standard Deviation": null, + "Rank": 1 + }, + "Physics": { + "Average Score": 99.7043774383865, + "Standard Deviation": null, + "Rank": 2 + }, + "Biology": { + "Average Score": 95.98449860487872, + "Standard Deviation": null, + "Rank": 3 + } + } + }, + { + "config": { + "model_name": "gpt-4o-2024-08-06", + "organization": "OpenAI", + "license": "Proprietary", + "knowledge_cutoff": "2023/10" + }, + "results": { + "OVERALL": { + "Average Score": 83.33484787198124, + "Standard Deviation": 3.0334254138998893, + "Rank": 12 + }, + "Geometry": { + "Average Score": 85.73211137938175, + "Standard Deviation": null, + "Rank": 2 + }, + "Algebra": { + "Average Score": 95.29454759516874, + "Standard Deviation": null, + "Rank": 5 + }, + "Probability": { + "Average Score": 80.9483280228488, + "Standard Deviation": null, + "Rank": 7 + }, + "Logical": { + "Average Score": 78.93507998348575, + "Standard Deviation": null, + "Rank": 12 + }, + "Social": { + "Average Score": 78.21553692695771, + "Standard Deviation": null, + "Rank": 11 + }, + "Chemistry": { + "Average Score": 79.46337310221962, + "Standard Deviation": null, + "Rank": 9 + }, + "CPP": { + "Average Score": 92.43090226400756, + "Standard Deviation": null, + "Rank": 2 + }, + "Physics": { + "Average Score": 92.63882355350016, + "Standard Deviation": null, + "Rank": 6 + }, + "Biology": { + "Average Score": 79.88713500945879, + "Standard Deviation": null, + "Rank": 14 + } + } + }, + { + "config": { + "model_name": "gpt-4o-2024-05-13", + "organization": "OpenAI", + "license": "Proprietary", + "knowledge_cutoff": "2023/10" + }, + "results": { + "OVERALL": { + "Average Score": 89.31218128337491, + "Standard Deviation": 0.5511990686487255, + "Rank": 8 + }, + "Geometry": { + "Average Score": 81.70458958633901, + "Standard Deviation": null, + "Rank": 4 + }, + "Algebra": { + "Average Score": 90.16488595415144, + "Standard Deviation": null, + "Rank": 9 + }, + "Probability": { + "Average Score": 83.8098272382245, + "Standard Deviation": null, + "Rank": 5 + }, + "Logical": { + "Average Score": 88.2742970015626, + "Standard Deviation": null, + "Rank": 9 + }, + "Social": { + "Average Score": 71.51855733216095, + "Standard Deviation": null, + "Rank": 15 + }, + "Chemistry": { + "Average Score": 84.0147961443266, + "Standard Deviation": null, + "Rank": 7 + }, + "CPP": { + "Average Score": 79.1592634699295, + "Standard Deviation": null, + "Rank": 6 + }, + "Physics": { + "Average Score": 96.44583156689123, + "Standard Deviation": null, + "Rank": 3 + }, + "Biology": { + "Average Score": 86.17947030919935, + "Standard Deviation": null, + "Rank": 10 + } + } + }, + { + "config": { + "model_name": "gpt-4-turbo-2024-04-09", + "organization": "OpenAI", + "license": "Proprietary", + "knowledge_cutoff": "2023/12" + }, + "results": { + "OVERALL": { + "Average Score": 89.20222265636137, + "Standard Deviation": 0.9498836008363539, + "Rank": 9 + }, + "Geometry": { + "Average Score": 77.90202019775627, + "Standard Deviation": null, + "Rank": 8 + }, + "Algebra": { + "Average Score": 84.83537307564205, + "Standard Deviation": null, + "Rank": 12 + }, + "Probability": { + "Average Score": 80.01448545719413, + "Standard Deviation": null, + "Rank": 9 + }, + "Logical": { + "Average Score": 89.63955736396734, + "Standard Deviation": null, + "Rank": 8 + }, + "Social": { + "Average Score": 77.25088451567024, + "Standard Deviation": null, + "Rank": 12 + }, + "Chemistry": { + "Average Score": 78.97054235015905, + "Standard Deviation": null, + "Rank": 11 + }, + "CPP": { + "Average Score": 70.73143363230263, + "Standard Deviation": null, + "Rank": 11 + }, + "Physics": { + "Average Score": 90.33497346058968, + "Standard Deviation": null, + "Rank": 7 + }, + "Biology": { + "Average Score": 86.17949760404831, + "Standard Deviation": null, + "Rank": 9 + } + } + }, + { + "config": { + "model_name": "gemini-1.5-pro-001", + "organization": "Google", + "license": "Proprietary", + "knowledge_cutoff": "2023/11" + }, + "results": { + "OVERALL": { + "Average Score": 82.91139866415075, + "Standard Deviation": 3.013751980804677, + "Rank": 13 + }, + "Geometry": { + "Average Score": 83.6654007694722, + "Standard Deviation": null, + "Rank": 3 + }, + "Algebra": { + "Average Score": 98.84487439119522, + "Standard Deviation": null, + "Rank": 3 + }, + "Probability": { + "Average Score": 75.94594518060929, + "Standard Deviation": null, + "Rank": 13 + }, + "Logical": { + "Average Score": 78.89834475831927, + "Standard Deviation": null, + "Rank": 14 + }, + "Social": { + "Average Score": 78.21569899283614, + "Standard Deviation": null, + "Rank": 10 + }, + "Physics": { + "Average Score": 88.41290613720335, + "Standard Deviation": null, + "Rank": 10 + }, + "Biology": { + "Average Score": 86.45347978614136, + "Standard Deviation": null, + "Rank": 8 + } + } + }, + { + "config": { + "model_name": "qwen2-72b-instruct", + "organization": "Alibaba", + "license": "Qianwen LICENSE", + "knowledge_cutoff": "2024/09" + }, + "results": { + "OVERALL": { + "Average Score": 80.51855735113782, + "Standard Deviation": 2.389693257324127, + "Rank": 15 + }, + "Geometry": { + "Average Score": 68.80768467173304, + "Standard Deviation": null, + "Rank": 13 + }, + "Algebra": { + "Average Score": 95.86210030199506, + "Standard Deviation": null, + "Rank": 4 + }, + "Probability": { + "Average Score": 82.29702731445691, + "Standard Deviation": null, + "Rank": 6 + }, + "Logical": { + "Average Score": 73.55135235722557, + "Standard Deviation": null, + "Rank": 19 + }, + "Social": { + "Average Score": 57.41502695932332, + "Standard Deviation": null, + "Rank": 19 + }, + "Chemistry": { + "Average Score": 75.8879803782176, + "Standard Deviation": null, + "Rank": 13 + }, + "CPP": { + "Average Score": 73.54037778797029, + "Standard Deviation": null, + "Rank": 7 + }, + "Physics": { + "Average Score": 82.02738090295061, + "Standard Deviation": null, + "Rank": 16 + }, + "Biology": { + "Average Score": 66.99838962851355, + "Standard Deviation": null, + "Rank": 22 + } + } + }, + { + "config": { + "model_name": "gpt-4o-mini-2024-07-18", + "organization": "OpenAI", + "license": "Proprietary", + "knowledge_cutoff": "2023/10" + }, + "results": { + "OVERALL": { + "Average Score": 85.79551424780102, + "Standard Deviation": 2.25059599602412, + "Rank": 11 + }, + "Geometry": { + "Average Score": 78.03415885586699, + "Standard Deviation": null, + "Rank": 7 + }, + "Algebra": { + "Average Score": 90.10621818673319, + "Standard Deviation": null, + "Rank": 10 + }, + "Probability": { + "Average Score": 80.94824796859724, + "Standard Deviation": null, + "Rank": 8 + }, + "Logical": { + "Average Score": 86.1004659652016, + "Standard Deviation": null, + "Rank": 10 + }, + "Social": { + "Average Score": 74.20253943841105, + "Standard Deviation": null, + "Rank": 13 + }, + "Chemistry": { + "Average Score": 75.44768883899778, + "Standard Deviation": null, + "Rank": 15 + }, + "CPP": { + "Average Score": 88.3877070580296, + "Standard Deviation": null, + "Rank": 3 + }, + "Physics": { + "Average Score": 90.33492089386435, + "Standard Deviation": null, + "Rank": 8 + }, + "Biology": { + "Average Score": 79.03781031583883, + "Standard Deviation": null, + "Rank": 15 + } + } + }, + { + "config": { + "model_name": "claude-3.5-sonnet", + "organization": "Anthropic", + "license": "Proprietary", + "knowledge_cutoff": "2024/04" + }, + "results": { + "OVERALL": { + "Average Score": 90.30644459276833, + "Standard Deviation": 0.6105034066546057, + "Rank": 7 + }, + "Geometry": { + "Average Score": 72.63402106402285, + "Standard Deviation": null, + "Rank": 12 + }, + "Algebra": { + "Average Score": 83.32075177480141, + "Standard Deviation": null, + "Rank": 14 + }, + "Probability": { + "Average Score": 76.7319625254773, + "Standard Deviation": null, + "Rank": 11 + }, + "Logical": { + "Average Score": 90.00404188010565, + "Standard Deviation": null, + "Rank": 7 + }, + "Social": { + "Average Score": 99.89849499454823, + "Standard Deviation": null, + "Rank": 2 + }, + "Chemistry": { + "Average Score": 85.86402884262867, + "Standard Deviation": null, + "Rank": 4 + }, + "CPP": { + "Average Score": 82.37734076815008, + "Standard Deviation": null, + "Rank": 5 + }, + "Physics": { + "Average Score": 92.83215449096147, + "Standard Deviation": null, + "Rank": 5 + }, + "Biology": { + "Average Score": 85.76627192038262, + "Standard Deviation": null, + "Rank": 11 + } + } + }, + { + "config": { + "model_name": "claude-3.5-sonnet-20241022", + "organization": "Anthropic", + "license": "Proprietary", + "knowledge_cutoff": "UNKNOW" + }, + "results": { + "OVERALL": { + "Average Score": 81.7399750668719, + "Standard Deviation": 6.158375141726245, + "Rank": 14 + }, + "Geometry": { + "Average Score": 72.63581025178527, + "Standard Deviation": null, + "Rank": 11 + }, + "Algebra": { + "Average Score": 89.50323347048936, + "Standard Deviation": null, + "Rank": 11 + }, + "Probability": { + "Average Score": 73.919, + "Standard Deviation": null, + "Rank": 13 + }, + "Logical": { + "Average Score": 90.514, + "Standard Deviation": null, + "Rank": 7 + }, + "Social": { + "Average Score": 84.505, + "Standard Deviation": null, + "Rank": 7 + }, + "Chemistry": { + "Average Score": 85.15970597010583, + "Standard Deviation": null, + "Rank": 6 + }, + "Physics": { + "Average Score": 100.0, + "Standard Deviation": null, + "Rank": 1 + }, + "Biology": { + "Average Score": 85.56526806360797, + "Standard Deviation": null, + "Rank": 12 + } + } + }, + { + "config": { + "model_name": "o1-mini", + "organization": "OpenAI", + "license": "Proprietary", + "knowledge_cutoff": "2023/10" + }, + "results": { + "OVERALL": { + "Average Score": 97.50448224920098, + "Standard Deviation": 0.18820973784944708, + "Rank": 2 + }, + "Geometry": { + "Average Score": 100.0, + "Standard Deviation": null, + "Rank": 1 + }, + "Algebra": { + "Average Score": 100.0, + "Standard Deviation": null, + "Rank": 1 + }, + "Probability": { + "Average Score": 100.0, + "Standard Deviation": null, + "Rank": 1 + }, + "Logical": { + "Average Score": 96.62093396445893, + "Standard Deviation": null, + "Rank": 3 + }, + "Social": { + "Average Score": 98.93701302706319, + "Standard Deviation": null, + "Rank": 4 + }, + "Chemistry": { + "Average Score": 93.52027415963765, + "Standard Deviation": null, + "Rank": 2 + }, + "Biology": { + "Average Score": 99.9210788257773, + "Standard Deviation": null, + "Rank": 2 + } + } + }, + { + "config": { + "model_name": "o1-preview", + "organization": "OpenAI", + "license": "Proprietary", + "knowledge_cutoff": "2023/10" + }, + "results": { + "OVERALL": { + "Average Score": 92.95670511909181, + "Standard Deviation": 0.26193636312885404, + "Rank": 5 + }, + "Geometry": { + "Average Score": 81.70453162182778, + "Standard Deviation": null, + "Rank": 5 + }, + "Algebra": { + "Average Score": 99.2204666813678, + "Standard Deviation": null, + "Rank": 2 + }, + "Probability": { + "Average Score": 96.11141903959506, + "Standard Deviation": null, + "Rank": 2 + }, + "Logical": { + "Average Score": 100.0, + "Standard Deviation": null, + "Rank": 1 + }, + "Social": { + "Average Score": 99.35681400812317, + "Standard Deviation": null, + "Rank": 3 + }, + "Biology": { + "Average Score": 100.0, + "Standard Deviation": null, + "Rank": 1 + } + } + }, + { + "config": { + "model_name": "gemini-1.5-flash-001", + "organization": "Google", + "license": "Proprietary", + "knowledge_cutoff": "2023/11" + }, + "results": { + "OVERALL": { + "Average Score": 63.90738369106308, + "Standard Deviation": 2.5840022803072342, + "Rank": 20 + }, + "Geometry": { + "Average Score": 62.78784730869374, + "Standard Deviation": null, + "Rank": 16 + }, + "Algebra": { + "Average Score": 84.4516255656167, + "Standard Deviation": null, + "Rank": 13 + }, + "Probability": { + "Average Score": 71.21668893483972, + "Standard Deviation": null, + "Rank": 15 + }, + "Logical": { + "Average Score": 73.55137041991937, + "Standard Deviation": null, + "Rank": 17 + }, + "Social": { + "Average Score": 71.51839473022034, + "Standard Deviation": null, + "Rank": 16 + }, + "Chemistry": { + "Average Score": 78.9281328399534, + "Standard Deviation": null, + "Rank": 12 + }, + "CPP": { + "Average Score": 72.1127762005651, + "Standard Deviation": null, + "Rank": 10 + }, + "Physics": { + "Average Score": 86.21163726768592, + "Standard Deviation": null, + "Rank": 14 + }, + "Biology": { + "Average Score": 77.50881946688955, + "Standard Deviation": null, + "Rank": 16 + } + } + }, + { + "config": { + "model_name": "gpt4-1106", + "organization": "OpenAI", + "license": "Proprietary", + "knowledge_cutoff": "2024/04" + }, + "results": { + "OVERALL": { + "Average Score": 88.08481721079524, + "Standard Deviation": 1.4421920877285703, + "Rank": 10 + }, + "Geometry": { + "Average Score": 59.2110329866853, + "Standard Deviation": null, + "Rank": 17 + }, + "Algebra": { + "Average Score": 80.79050620153212, + "Standard Deviation": null, + "Rank": 15 + }, + "Probability": { + "Average Score": 74.36123524515216, + "Standard Deviation": null, + "Rank": 14 + }, + "Logical": { + "Average Score": 77.02518347398768, + "Standard Deviation": null, + "Rank": 15 + }, + "Social": { + "Average Score": 51.13078063545894, + "Standard Deviation": null, + "Rank": 25 + }, + "Chemistry": { + "Average Score": 72.4125941071821, + "Standard Deviation": null, + "Rank": 16 + }, + "CPP": { + "Average Score": 69.11824072252848, + "Standard Deviation": null, + "Rank": 12 + }, + "Physics": { + "Average Score": 87.0543996394885, + "Standard Deviation": null, + "Rank": 13 + }, + "Biology": { + "Average Score": 82.36213636857161, + "Standard Deviation": null, + "Rank": 13 + } + } + }, + { + "config": { + "model_name": "gemma-2-27b-it", + "organization": "Google", + "license": "Gemma License", + "knowledge_cutoff": "2024/06" + }, + "results": { + "OVERALL": { + "Average Score": 70.59188609288081, + "Standard Deviation": 8.717841670213112, + "Rank": 19 + }, + "Geometry": { + "Average Score": 58.00008857041582, + "Standard Deviation": null, + "Rank": 19 + }, + "Algebra": { + "Average Score": 77.82927803658924, + "Standard Deviation": null, + "Rank": 19 + }, + "Probability": { + "Average Score": 69.63382706259532, + "Standard Deviation": null, + "Rank": 18 + }, + "Logical": { + "Average Score": 73.55136762438677, + "Standard Deviation": null, + "Rank": 18 + }, + "Social": { + "Average Score": 57.17847568664103, + "Standard Deviation": null, + "Rank": 20 + }, + "Chemistry": { + "Average Score": 68.65449070488427, + "Standard Deviation": null, + "Rank": 20 + }, + "CPP": { + "Average Score": 63.28920072143611, + "Standard Deviation": null, + "Rank": 14 + }, + "Physics": { + "Average Score": 76.8395150041688, + "Standard Deviation": null, + "Rank": 19 + }, + "Biology": { + "Average Score": 66.99846220210911, + "Standard Deviation": null, + "Rank": 21 + } + } + }, + { + "config": { + "model_name": "claude-3-opus", + "organization": "Anthropic", + "license": "Proprietary", + "knowledge_cutoff": "2023/08" + }, + "results": { + "OVERALL": { + "Average Score": 79.77338364506384, + "Standard Deviation": 2.32886155429398, + "Rank": 16 + }, + "Geometry": { + "Average Score": 57.5200576513199, + "Standard Deviation": null, + "Rank": 20 + }, + "Algebra": { + "Average Score": 76.89230078890219, + "Standard Deviation": null, + "Rank": 20 + }, + "Probability": { + "Average Score": 71.20578106177237, + "Standard Deviation": null, + "Rank": 16 + }, + "Logical": { + "Average Score": 78.93505058041774, + "Standard Deviation": null, + "Rank": 13 + }, + "Social": { + "Average Score": 88.40491896661747, + "Standard Deviation": null, + "Rank": 8 + }, + "Chemistry": { + "Average Score": 79.0571776580065, + "Standard Deviation": null, + "Rank": 10 + }, + "CPP": { + "Average Score": 73.5404403567132, + "Standard Deviation": null, + "Rank": 8 + }, + "Physics": { + "Average Score": 87.28118117714033, + "Standard Deviation": null, + "Rank": 12 + }, + "Biology": { + "Average Score": 71.23527633371832, + "Standard Deviation": null, + "Rank": 20 + } + } + }, + { + "config": { + "model_name": "gemma-2-9b-it-simpo", + "organization": "Google", + "license": "Gemma License", + "knowledge_cutoff": "2024/07" + }, + "results": { + "OVERALL": { + "Average Score": "N/A", + "Standard Deviation": "N/A", + "Rank": "N/A" + }, + "Geometry": { + "Average Score": 57.520011750672175, + "Standard Deviation": null, + "Rank": 21 + }, + "Algebra": { + "Average Score": 72.3731046476544, + "Standard Deviation": null, + "Rank": 21 + }, + "Probability": { + "Average Score": 61.79614379365174, + "Standard Deviation": null, + "Rank": 22 + }, + "Logical": { + "Average Score": 64.62661472571767, + "Standard Deviation": null, + "Rank": 23 + }, + "Social": { + "Average Score": 87.65488278831526, + "Standard Deviation": null, + "Rank": 9 + }, + "Chemistry": { + "Average Score": 85.36850564169866, + "Standard Deviation": null, + "Rank": 5 + }, + "CPP": { + "Average Score": 73.43757596214863, + "Standard Deviation": null, + "Rank": 9 + }, + "Physics": { + "Average Score": 82.02727994935249, + "Standard Deviation": null, + "Rank": 17 + }, + "Biology": { + "Average Score": 88.80821937078267, + "Standard Deviation": null, + "Rank": 7 + } + } + }, + { + "config": { + "model_name": "qwen1.5-72b-chat", + "organization": "Alibaba", + "license": "Qianwen LICENSE", + "knowledge_cutoff": "2024/03" + }, + "results": { + "OVERALL": { + "Average Score": 61.57517122936127, + "Standard Deviation": 5.01096656930536, + "Rank": 21 + }, + "Geometry": { + "Average Score": 49.36591842356095, + "Standard Deviation": null, + "Rank": 28 + }, + "Algebra": { + "Average Score": 71.12615153442515, + "Standard Deviation": null, + "Rank": 22 + }, + "Probability": { + "Average Score": 51.76027345875035, + "Standard Deviation": null, + "Rank": 28 + }, + "Logical": { + "Average Score": 34.74438889550426, + "Standard Deviation": null, + "Rank": 39 + }, + "Social": { + "Average Score": 47.47112348597555, + "Standard Deviation": null, + "Rank": 27 + }, + "Chemistry": { + "Average Score": 51.65772092991593, + "Standard Deviation": null, + "Rank": 25 + }, + "CPP": { + "Average Score": 48.69302376665551, + "Standard Deviation": null, + "Rank": 20 + }, + "Physics": { + "Average Score": 62.45893584822384, + "Standard Deviation": null, + "Rank": 27 + }, + "Biology": { + "Average Score": 56.96571500324531, + "Standard Deviation": null, + "Rank": 27 + } + } + }, + { + "config": { + "model_name": "qwen1.5-32b-chat", + "organization": "Alibaba", + "license": "Qianwen LICENSE", + "knowledge_cutoff": "2024/03" + }, + "results": { + "OVERALL": { + "Average Score": 45.3199699974334, + "Standard Deviation": 3.7527776450894996, + "Rank": 31 + }, + "Geometry": { + "Average Score": 45.66389348479106, + "Standard Deviation": null, + "Rank": 30 + }, + "Algebra": { + "Average Score": 64.9403510842088, + "Standard Deviation": null, + "Rank": 25 + }, + "Probability": { + "Average Score": 51.99376831114535, + "Standard Deviation": null, + "Rank": 27 + }, + "Logical": { + "Average Score": 39.30230377209954, + "Standard Deviation": null, + "Rank": 36 + }, + "Social": { + "Average Score": 45.679222078247186, + "Standard Deviation": null, + "Rank": 28 + }, + "Chemistry": { + "Average Score": 46.41262433996582, + "Standard Deviation": null, + "Rank": 28 + }, + "CPP": { + "Average Score": 45.14284028264288, + "Standard Deviation": null, + "Rank": 24 + }, + "Physics": { + "Average Score": 65.80533740982938, + "Standard Deviation": null, + "Rank": 25 + }, + "Biology": { + "Average Score": 50.767985684362536, + "Standard Deviation": null, + "Rank": 33 + } + } + }, + { + "config": { + "model_name": "google-gemma-2-9b-it", + "organization": "Google", + "license": "Proprietary", + "knowledge_cutoff": "2024/06" + }, + "results": { + "OVERALL": { + "Average Score": 59.024943267290716, + "Standard Deviation": 3.979239820929726, + "Rank": 23 + }, + "Geometry": { + "Average Score": 53.495866814128156, + "Standard Deviation": null, + "Rank": 24 + }, + "Algebra": { + "Average Score": 65.98776390439404, + "Standard Deviation": null, + "Rank": 23 + }, + "Probability": { + "Average Score": 65.76699220336998, + "Standard Deviation": null, + "Rank": 21 + }, + "Logical": { + "Average Score": 71.04386923330611, + "Standard Deviation": null, + "Rank": 20 + }, + "Social": { + "Average Score": 73.74087367208867, + "Standard Deviation": null, + "Rank": 14 + }, + "Chemistry": { + "Average Score": 57.074735438190935, + "Standard Deviation": null, + "Rank": 22 + }, + "CPP": { + "Average Score": 54.03167523687635, + "Standard Deviation": null, + "Rank": 17 + }, + "Physics": { + "Average Score": 63.03919029129539, + "Standard Deviation": null, + "Rank": 26 + }, + "Biology": { + "Average Score": 63.18363754826406, + "Standard Deviation": null, + "Rank": 23 + } + } + }, + { + "config": { + "model_name": "yi-1.5-34b-chat", + "organization": "01 AI", + "license": "Proprietary", + "knowledge_cutoff": "2024/05" + }, + "results": { + "OVERALL": { + "Average Score": 71.78031967728624, + "Standard Deviation": 12.994861744386325, + "Rank": 18 + }, + "Geometry": { + "Average Score": 54.06826621860964, + "Standard Deviation": null, + "Rank": 23 + }, + "Algebra": { + "Average Score": 65.66679210942144, + "Standard Deviation": null, + "Rank": 24 + }, + "Probability": { + "Average Score": 66.46858903563573, + "Standard Deviation": null, + "Rank": 20 + }, + "Logical": { + "Average Score": 67.36081192984079, + "Standard Deviation": null, + "Rank": 21 + }, + "Social": { + "Average Score": 53.898293694371446, + "Standard Deviation": null, + "Rank": 22 + }, + "Chemistry": { + "Average Score": 56.1520167017115, + "Standard Deviation": null, + "Rank": 23 + }, + "CPP": { + "Average Score": 52.148798061768964, + "Standard Deviation": null, + "Rank": 18 + }, + "Physics": { + "Average Score": 73.06547347263036, + "Standard Deviation": null, + "Rank": 21 + }, + "Biology": { + "Average Score": 72.47949036617567, + "Standard Deviation": null, + "Rank": 18 + } + } + }, + { + "config": { + "model_name": "meta-llama-3.1-70b-instruct", + "organization": "Meta", + "license": "Llama 3.1 Community", + "knowledge_cutoff": "2023/12" + }, + "results": { + "OVERALL": { + "Average Score": 72.88379857527117, + "Standard Deviation": 3.7053577253028176, + "Rank": 17 + }, + "Geometry": { + "Average Score": 62.78788327507421, + "Standard Deviation": null, + "Rank": 15 + }, + "Algebra": { + "Average Score": 80.79028754890449, + "Standard Deviation": null, + "Rank": 16 + }, + "Probability": { + "Average Score": 69.6338691921361, + "Standard Deviation": null, + "Rank": 17 + }, + "Logical": { + "Average Score": 74.43905975120572, + "Standard Deviation": null, + "Rank": 16 + }, + "Social": { + "Average Score": 61.22534257022315, + "Standard Deviation": null, + "Rank": 18 + }, + "Chemistry": { + "Average Score": 70.9160725889497, + "Standard Deviation": null, + "Rank": 18 + }, + "CPP": { + "Average Score": 84.36815192532764, + "Standard Deviation": null, + "Rank": 4 + }, + "Physics": { + "Average Score": 82.02759904132307, + "Standard Deviation": null, + "Rank": 15 + }, + "Biology": { + "Average Score": 72.47948013923437, + "Standard Deviation": null, + "Rank": 19 + } + } + }, + { + "config": { + "model_name": "meta-llama-3.1-8b-instruct", + "organization": "Meta", + "license": "Llama 3.1 Community", + "knowledge_cutoff": "2023/12" + }, + "results": { + "OVERALL": { + "Average Score": 52.21824740443002, + "Standard Deviation": 3.7833302779202937, + "Rank": 27 + }, + "Geometry": { + "Average Score": 43.03691891008171, + "Standard Deviation": null, + "Rank": 32 + }, + "Algebra": { + "Average Score": 64.13661497122277, + "Standard Deviation": null, + "Rank": 26 + }, + "Probability": { + "Average Score": 55.37882298464668, + "Standard Deviation": null, + "Rank": 25 + }, + "Logical": { + "Average Score": 53.843773408414144, + "Standard Deviation": null, + "Rank": 29 + }, + "Social": { + "Average Score": 44.993575656549545, + "Standard Deviation": null, + "Rank": 30 + }, + "Chemistry": { + "Average Score": 43.98798267082055, + "Standard Deviation": null, + "Rank": 31 + }, + "CPP": { + "Average Score": 44.41846841004584, + "Standard Deviation": null, + "Rank": 26 + }, + "Physics": { + "Average Score": 49.65976817230991, + "Standard Deviation": null, + "Rank": 37 + }, + "Biology": { + "Average Score": 52.132998637966764, + "Standard Deviation": null, + "Rank": 32 + } + } + }, + { + "config": { + "model_name": "gpt3.5-turbo-0125", + "organization": "OpenAI", + "license": "Proprietary", + "knowledge_cutoff": "2021/09" + }, + "results": { + "OVERALL": { + "Average Score": 32.61987548870099, + "Standard Deviation": 7.421068133219178, + "Rank": 41 + }, + "Geometry": { + "Average Score": 52.43446046073764, + "Standard Deviation": null, + "Rank": 25 + }, + "Algebra": { + "Average Score": 62.62345918733465, + "Standard Deviation": null, + "Rank": 27 + }, + "Probability": { + "Average Score": 46.778615832700474, + "Standard Deviation": null, + "Rank": 30 + }, + "Logical": { + "Average Score": 20.161483818418485, + "Standard Deviation": null, + "Rank": 48 + }, + "Social": { + "Average Score": 36.005021312700556, + "Standard Deviation": null, + "Rank": 43 + }, + "Chemistry": { + "Average Score": 41.27375172990709, + "Standard Deviation": null, + "Rank": 34 + }, + "CPP": { + "Average Score": 40.46958736582551, + "Standard Deviation": null, + "Rank": 29 + }, + "Physics": { + "Average Score": 53.13517938912883, + "Standard Deviation": null, + "Rank": 33 + }, + "Biology": { + "Average Score": 40.750963952571375, + "Standard Deviation": null, + "Rank": 43 + } + } + }, + { + "config": { + "model_name": "llama-3-70b-instruct", + "organization": "Meta", + "license": "Llama 3 Community", + "knowledge_cutoff": "2023/12" + }, + "results": { + "OVERALL": { + "Average Score": 58.67095788786492, + "Standard Deviation": 3.916500171452786, + "Rank": 25 + }, + "Geometry": { + "Average Score": 47.16123420770543, + "Standard Deviation": null, + "Rank": 29 + }, + "Algebra": { + "Average Score": 62.38398769226985, + "Standard Deviation": null, + "Rank": 28 + }, + "Probability": { + "Average Score": 57.7568005808253, + "Standard Deviation": null, + "Rank": 23 + }, + "Logical": { + "Average Score": 84.45551822980201, + "Standard Deviation": null, + "Rank": 11 + }, + "Social": { + "Average Score": 52.450283668620365, + "Standard Deviation": null, + "Rank": 23 + }, + "Chemistry": { + "Average Score": 70.91630635362482, + "Standard Deviation": null, + "Rank": 17 + }, + "CPP": { + "Average Score": 65.32140697218945, + "Standard Deviation": null, + "Rank": 13 + }, + "Physics": { + "Average Score": 78.08120808341037, + "Standard Deviation": null, + "Rank": 18 + }, + "Biology": { + "Average Score": 60.6111504865126, + "Standard Deviation": null, + "Rank": 25 + } + } + }, + { + "config": { + "model_name": "claude-3-sonnet", + "organization": "Anthropic", + "license": "Proprietary", + "knowledge_cutoff": "2023/08" + }, + "results": { + "OVERALL": { + "Average Score": 61.25499749085383, + "Standard Deviation": 5.012226129836105, + "Rank": 22 + }, + "Geometry": { + "Average Score": 52.4291917862642, + "Standard Deviation": null, + "Rank": 26 + }, + "Algebra": { + "Average Score": 60.40928261066776, + "Standard Deviation": null, + "Rank": 29 + }, + "Probability": { + "Average Score": 57.4556182999398, + "Standard Deviation": null, + "Rank": 24 + }, + "Logical": { + "Average Score": 66.81740129837053, + "Standard Deviation": null, + "Rank": 22 + }, + "Social": { + "Average Score": 69.99747730347514, + "Standard Deviation": null, + "Rank": 17 + }, + "Chemistry": { + "Average Score": 68.8316074174692, + "Standard Deviation": null, + "Rank": 19 + }, + "CPP": { + "Average Score": 61.33538592327427, + "Standard Deviation": null, + "Rank": 15 + }, + "Physics": { + "Average Score": 75.18056969699853, + "Standard Deviation": null, + "Rank": 20 + }, + "Biology": { + "Average Score": 77.09610271458331, + "Standard Deviation": null, + "Rank": 17 + } + } + }, + { + "config": { + "model_name": "qwen1.5-14b-chat", + "organization": "Alibaba", + "license": "Qianwen LICENSE", + "knowledge_cutoff": "2024/02" + }, + "results": { + "OVERALL": { + "Average Score": 44.55620746942043, + "Standard Deviation": 3.997156497824947, + "Rank": 32 + }, + "Geometry": { + "Average Score": 36.7560037779628, + "Standard Deviation": null, + "Rank": 34 + }, + "Algebra": { + "Average Score": 59.50136116119945, + "Standard Deviation": null, + "Rank": 30 + }, + "Probability": { + "Average Score": 40.080049006314795, + "Standard Deviation": null, + "Rank": 35 + }, + "Logical": { + "Average Score": 34.744529623515994, + "Standard Deviation": null, + "Rank": 38 + }, + "Social": { + "Average Score": 40.62146960769885, + "Standard Deviation": null, + "Rank": 36 + }, + "Chemistry": { + "Average Score": 38.9739127306118, + "Standard Deviation": null, + "Rank": 37 + }, + "CPP": { + "Average Score": 38.552779976347026, + "Standard Deviation": null, + "Rank": 31 + }, + "Physics": { + "Average Score": 57.98313138991904, + "Standard Deviation": null, + "Rank": 31 + }, + "Biology": { + "Average Score": 45.732215792439575, + "Standard Deviation": null, + "Rank": 40 + } + } + }, + { + "config": { + "model_name": "claude-3-haiku", + "organization": "Anthropic", + "license": "Proprietary", + "knowledge_cutoff": "2023/08" + }, + "results": { + "OVERALL": { + "Average Score": 54.96475677538885, + "Standard Deviation": 5.908641649857827, + "Rank": 26 + }, + "Geometry": { + "Average Score": 43.48740351644307, + "Standard Deviation": null, + "Rank": 31 + }, + "Algebra": { + "Average Score": 55.72045911130164, + "Standard Deviation": null, + "Rank": 33 + }, + "Probability": { + "Average Score": 53.07470665022828, + "Standard Deviation": null, + "Rank": 26 + }, + "Logical": { + "Average Score": 63.661198382201675, + "Standard Deviation": null, + "Rank": 24 + }, + "Social": { + "Average Score": 56.49297908205363, + "Standard Deviation": null, + "Rank": 21 + }, + "Chemistry": { + "Average Score": 60.28485867590517, + "Standard Deviation": null, + "Rank": 21 + }, + "CPP": { + "Average Score": 56.40200048817984, + "Standard Deviation": null, + "Rank": 16 + }, + "Physics": { + "Average Score": 67.69802411023282, + "Standard Deviation": null, + "Rank": 24 + }, + "Biology": { + "Average Score": 60.63801358326118, + "Standard Deviation": null, + "Rank": 24 + } + } + }, + { + "config": { + "model_name": "claude-2.1", + "organization": "Anthropic", + "license": "Proprietary", + "knowledge_cutoff": "Unknown" + }, + "results": { + "OVERALL": { + "Average Score": 39.436633770824685, + "Standard Deviation": 1.0979568551024126, + "Rank": 36 + }, + "Geometry": { + "Average Score": 52.12445910303711, + "Standard Deviation": null, + "Rank": 27 + }, + "Algebra": { + "Average Score": 55.51421646167608, + "Standard Deviation": null, + "Rank": 34 + }, + "Probability": { + "Average Score": 44.720527688076, + "Standard Deviation": null, + "Rank": 33 + }, + "Logical": { + "Average Score": 61.64930710809233, + "Standard Deviation": null, + "Rank": 25 + }, + "Social": { + "Average Score": 41.24714538607354, + "Standard Deviation": null, + "Rank": 35 + }, + "Chemistry": { + "Average Score": 49.503134730071984, + "Standard Deviation": null, + "Rank": 26 + }, + "CPP": { + "Average Score": 47.23672563994903, + "Standard Deviation": null, + "Rank": 21 + }, + "Physics": { + "Average Score": 71.80748688814478, + "Standard Deviation": null, + "Rank": 22 + }, + "Biology": { + "Average Score": 56.35051024959833, + "Standard Deviation": null, + "Rank": 28 + } + } + }, + { + "config": { + "model_name": "mistral-8x7b-instruct-v0.1", + "organization": "Mistral", + "license": "Apache 2.0", + "knowledge_cutoff": "2023/12" + }, + "results": { + "OVERALL": { + "Average Score": 41.89229040550289, + "Standard Deviation": 1.0093122675555612, + "Rank": 33 + }, + "Geometry": { + "Average Score": 33.703560702831055, + "Standard Deviation": null, + "Rank": 38 + }, + "Algebra": { + "Average Score": 50.89266418264096, + "Standard Deviation": null, + "Rank": 37 + }, + "Probability": { + "Average Score": 44.763608895327415, + "Standard Deviation": null, + "Rank": 32 + }, + "Logical": { + "Average Score": 40.32090734088309, + "Standard Deviation": null, + "Rank": 35 + }, + "Social": { + "Average Score": 36.25120096194333, + "Standard Deviation": null, + "Rank": 42 + }, + "Chemistry": { + "Average Score": 45.537417249801685, + "Standard Deviation": null, + "Rank": 29 + }, + "CPP": { + "Average Score": 44.533118241976666, + "Standard Deviation": null, + "Rank": 25 + }, + "Physics": { + "Average Score": 59.27177919021739, + "Standard Deviation": null, + "Rank": 29 + }, + "Biology": { + "Average Score": 53.73577835290789, + "Standard Deviation": null, + "Rank": 29 + } + } + }, + { + "config": { + "model_name": "claude-2.0", + "organization": "Anthropic", + "license": "Proprietary", + "knowledge_cutoff": "Unknown" + }, + "results": { + "OVERALL": { + "Average Score": 29.746629448410072, + "Standard Deviation": 2.904279782741168, + "Rank": 44 + }, + "Geometry": { + "Average Score": 38.83959305205546, + "Standard Deviation": null, + "Rank": 33 + }, + "Algebra": { + "Average Score": 50.95581898913443, + "Standard Deviation": null, + "Rank": 36 + }, + "Probability": { + "Average Score": 46.77856061078482, + "Standard Deviation": null, + "Rank": 31 + }, + "Logical": { + "Average Score": 55.87663184155831, + "Standard Deviation": null, + "Rank": 28 + }, + "Social": { + "Average Score": 52.418630462591864, + "Standard Deviation": null, + "Rank": 24 + }, + "Chemistry": { + "Average Score": 54.485802241006866, + "Standard Deviation": null, + "Rank": 24 + }, + "CPP": { + "Average Score": 50.773143448036464, + "Standard Deviation": null, + "Rank": 19 + }, + "Physics": { + "Average Score": 70.21815140033613, + "Standard Deviation": null, + "Rank": 23 + }, + "Biology": { + "Average Score": 58.06960426451617, + "Standard Deviation": null, + "Rank": 26 + } + } + }, + { + "config": { + "model_name": "starling-lm-7b-beta", + "organization": "Nexusflow", + "license": "Apache-2.0", + "knowledge_cutoff": "2024/03" + }, + "results": { + "OVERALL": { + "Average Score": 49.37320778476737, + "Standard Deviation": 3.6745696228749076, + "Rank": 28 + }, + "Geometry": { + "Average Score": 34.931531551032506, + "Standard Deviation": null, + "Rank": 37 + }, + "Algebra": { + "Average Score": 51.66718360952931, + "Standard Deviation": null, + "Rank": 35 + }, + "Probability": { + "Average Score": 40.79623349276488, + "Standard Deviation": null, + "Rank": 34 + }, + "Logical": { + "Average Score": 47.86775375284415, + "Standard Deviation": null, + "Rank": 30 + }, + "Social": { + "Average Score": 42.30631821350664, + "Standard Deviation": null, + "Rank": 33 + }, + "Chemistry": { + "Average Score": 38.68957842968336, + "Standard Deviation": null, + "Rank": 38 + }, + "CPP": { + "Average Score": 38.27587102395908, + "Standard Deviation": null, + "Rank": 32 + }, + "Physics": { + "Average Score": 43.122496379867655, + "Standard Deviation": null, + "Rank": 40 + }, + "Biology": { + "Average Score": 49.80517713841127, + "Standard Deviation": null, + "Rank": 35 + } + } + }, + { + "config": { + "model_name": "gemini-1.0-pro-001", + "organization": "Google", + "license": "Proprietary", + "knowledge_cutoff": "2023/04" + }, + "results": { + "OVERALL": { + "Average Score": 37.757029496159134, + "Standard Deviation": 2.4871563947325797, + "Rank": 38 + }, + "Geometry": { + "Average Score": 35.792088134579124, + "Standard Deviation": null, + "Rank": 36 + }, + "Algebra": { + "Average Score": 50.157930404365224, + "Standard Deviation": null, + "Rank": 38 + }, + "Probability": { + "Average Score": 25.033769367203313, + "Standard Deviation": null, + "Rank": 47 + }, + "Logical": { + "Average Score": 23.38732786204667, + "Standard Deviation": null, + "Rank": 46 + }, + "Social": { + "Average Score": 26.25171796810704, + "Standard Deviation": null, + "Rank": 51 + }, + "Chemistry": { + "Average Score": 43.59712830576298, + "Standard Deviation": null, + "Rank": 32 + }, + "CPP": { + "Average Score": 45.22204471452975, + "Standard Deviation": null, + "Rank": 23 + }, + "Physics": { + "Average Score": 62.1145967631314, + "Standard Deviation": null, + "Rank": 28 + }, + "Biology": { + "Average Score": 38.93328880463975, + "Standard Deviation": null, + "Rank": 46 + } + } + }, + { + "config": { + "model_name": "openchat-3.5-0106", + "organization": "OpenChat", + "license": "Apache-2.0", + "knowledge_cutoff": "2024/01" + }, + "results": { + "OVERALL": { + "Average Score": 39.892305843585234, + "Standard Deviation": 2.147396504115797, + "Rank": 35 + }, + "Geometry": { + "Average Score": 29.941588970091672, + "Standard Deviation": null, + "Rank": 40 + }, + "Algebra": { + "Average Score": 47.48449168554534, + "Standard Deviation": null, + "Rank": 39 + }, + "Probability": { + "Average Score": 39.64777697224284, + "Standard Deviation": null, + "Rank": 36 + }, + "Logical": { + "Average Score": 41.361836834955504, + "Standard Deviation": null, + "Rank": 33 + }, + "Social": { + "Average Score": 36.716597579856675, + "Standard Deviation": null, + "Rank": 41 + }, + "Chemistry": { + "Average Score": 32.618034432282414, + "Standard Deviation": null, + "Rank": 41 + }, + "CPP": { + "Average Score": 33.70639271807677, + "Standard Deviation": null, + "Rank": 33 + }, + "Physics": { + "Average Score": 41.117269227834775, + "Standard Deviation": null, + "Rank": 42 + }, + "Biology": { + "Average Score": 46.46694211682319, + "Standard Deviation": null, + "Rank": 38 + } + } + }, + { + "config": { + "model_name": "openchat-3.5", + "organization": "OpenChat", + "license": "Apache-2.0", + "knowledge_cutoff": "2023/11" + }, + "results": { + "OVERALL": { + "Average Score": 38.241198423073044, + "Standard Deviation": 0.5484943791516782, + "Rank": 37 + }, + "Geometry": { + "Average Score": 30.89638678506991, + "Standard Deviation": null, + "Rank": 39 + }, + "Algebra": { + "Average Score": 41.83128388520244, + "Standard Deviation": null, + "Rank": 42 + }, + "Probability": { + "Average Score": 36.10478976665624, + "Standard Deviation": null, + "Rank": 39 + }, + "Logical": { + "Average Score": 40.320934300651516, + "Standard Deviation": null, + "Rank": 34 + }, + "Social": { + "Average Score": 43.49055300551458, + "Standard Deviation": null, + "Rank": 31 + }, + "Chemistry": { + "Average Score": 34.73882038803731, + "Standard Deviation": null, + "Rank": 40 + }, + "CPP": { + "Average Score": 33.020911255646965, + "Standard Deviation": null, + "Rank": 34 + }, + "Physics": { + "Average Score": 43.28671808104924, + "Standard Deviation": null, + "Rank": 39 + }, + "Biology": { + "Average Score": 37.18520956253795, + "Standard Deviation": null, + "Rank": 47 + } + } + }, + { + "config": { + "model_name": "command-r-(08-2024)", + "organization": "Cohere", + "license": "CC-BY-NC-4.0", + "knowledge_cutoff": "2024/08" + }, + "results": { + "OVERALL": { + "Average Score": 45.419599943563604, + "Standard Deviation": 3.867586763039621, + "Rank": 30 + }, + "Geometry": { + "Average Score": 36.68143035371426, + "Standard Deviation": null, + "Rank": 35 + }, + "Algebra": { + "Average Score": 41.64517540472657, + "Standard Deviation": null, + "Rank": 43 + }, + "Probability": { + "Average Score": 37.95189112967414, + "Standard Deviation": null, + "Rank": 38 + }, + "Logical": { + "Average Score": 25.409088658564166, + "Standard Deviation": null, + "Rank": 43 + }, + "Social": { + "Average Score": 40.389393367109264, + "Standard Deviation": null, + "Rank": 37 + }, + "Chemistry": { + "Average Score": 40.08660883479598, + "Standard Deviation": null, + "Rank": 36 + }, + "CPP": { + "Average Score": 39.61492485677676, + "Standard Deviation": null, + "Rank": 30 + }, + "Physics": { + "Average Score": 49.51833550380945, + "Standard Deviation": null, + "Rank": 38 + }, + "Biology": { + "Average Score": 46.55085862120477, + "Standard Deviation": null, + "Rank": 37 + } + } + }, + { + "config": { + "model_name": "gemma-1.1-7b-it", + "organization": "Google", + "license": "Gemma License", + "knowledge_cutoff": "2024/02" + }, + "results": { + "OVERALL": { + "Average Score": 31.46481370727848, + "Standard Deviation": 5.403408635399989, + "Rank": 42 + }, + "Geometry": { + "Average Score": 26.078500005143134, + "Standard Deviation": null, + "Rank": 45 + }, + "Algebra": { + "Average Score": 40.92453155837702, + "Standard Deviation": null, + "Rank": 44 + }, + "Probability": { + "Average Score": 31.502661407350192, + "Standard Deviation": null, + "Rank": 44 + }, + "Logical": { + "Average Score": 39.27282391466396, + "Standard Deviation": null, + "Rank": 37 + }, + "Social": { + "Average Score": 31.639615427886643, + "Standard Deviation": null, + "Rank": 46 + }, + "Chemistry": { + "Average Score": 43.59704806585925, + "Standard Deviation": null, + "Rank": 33 + }, + "CPP": { + "Average Score": 42.666504105798204, + "Standard Deviation": null, + "Rank": 27 + }, + "Physics": { + "Average Score": 49.845369349755345, + "Standard Deviation": null, + "Rank": 36 + }, + "Biology": { + "Average Score": 45.813201684684124, + "Standard Deviation": null, + "Rank": 39 + } + } + }, + { + "config": { + "model_name": "llama3-8b-instruct", + "organization": "Meta", + "license": "Llama 3 Community", + "knowledge_cutoff": "2023/03" + }, + "results": { + "OVERALL": { + "Average Score": 36.30010331322555, + "Standard Deviation": 2.6021295258334334, + "Rank": 40 + }, + "Geometry": { + "Average Score": 28.61237715170709, + "Standard Deviation": null, + "Rank": 42 + }, + "Algebra": { + "Average Score": 42.6394310988214, + "Standard Deviation": null, + "Rank": 41 + }, + "Probability": { + "Average Score": 35.51226405104781, + "Standard Deviation": null, + "Rank": 40 + }, + "Logical": { + "Average Score": 59.594410427422616, + "Standard Deviation": null, + "Rank": 26 + }, + "Social": { + "Average Score": 42.58469219441349, + "Standard Deviation": null, + "Rank": 32 + }, + "Chemistry": { + "Average Score": 48.45708298495634, + "Standard Deviation": null, + "Rank": 27 + }, + "CPP": { + "Average Score": 45.35392139264795, + "Standard Deviation": null, + "Rank": 22 + }, + "Physics": { + "Average Score": 58.61979255906953, + "Standard Deviation": null, + "Rank": 30 + }, + "Biology": { + "Average Score": 50.39755478099045, + "Standard Deviation": null, + "Rank": 34 + } + } + }, + { + "config": { + "model_name": "gemma-2-2b-it", + "organization": "Google", + "license": "Gemma License", + "knowledge_cutoff": "2024/07" + }, + "results": { + "OVERALL": { + "Average Score": 58.76741528626868, + "Standard Deviation": 5.683174110350625, + "Rank": 24 + }, + "Geometry": { + "Average Score": 29.901411513695468, + "Standard Deviation": null, + "Rank": 41 + }, + "Algebra": { + "Average Score": 40.60048971047775, + "Standard Deviation": null, + "Rank": 45 + }, + "Probability": { + "Average Score": 33.448597365831304, + "Standard Deviation": null, + "Rank": 42 + }, + "Logical": { + "Average Score": 43.89688208707135, + "Standard Deviation": null, + "Rank": 31 + }, + "Social": { + "Average Score": 48.769368715100335, + "Standard Deviation": null, + "Rank": 26 + }, + "Chemistry": { + "Average Score": 28.982153819366474, + "Standard Deviation": null, + "Rank": 44 + }, + "CPP": { + "Average Score": 30.53406933106768, + "Standard Deviation": null, + "Rank": 36 + }, + "Physics": { + "Average Score": 22.78354134298823, + "Standard Deviation": null, + "Rank": 49 + }, + "Biology": { + "Average Score": 53.59359459245764, + "Standard Deviation": null, + "Rank": 30 + } + } + }, + { + "config": { + "model_name": "starling-lm-7b-alpha", + "organization": "Nexusflow", + "license": "Apache-2.0", + "knowledge_cutoff": "2023/11" + }, + "results": { + "OVERALL": { + "Average Score": 36.98646367219327, + "Standard Deviation": 0.5488180472607256, + "Rank": 39 + }, + "Geometry": { + "Average Score": 26.472892835994372, + "Standard Deviation": null, + "Rank": 44 + }, + "Algebra": { + "Average Score": 38.4553696839335, + "Standard Deviation": null, + "Rank": 47 + }, + "Probability": { + "Average Score": 33.907837077924526, + "Standard Deviation": null, + "Rank": 41 + }, + "Logical": { + "Average Score": 33.129169647630114, + "Standard Deviation": null, + "Rank": 41 + }, + "Social": { + "Average Score": 39.97855588617487, + "Standard Deviation": null, + "Rank": 38 + }, + "Chemistry": { + "Average Score": 29.187364253387454, + "Standard Deviation": null, + "Rank": 43 + }, + "CPP": { + "Average Score": 30.07926487356878, + "Standard Deviation": null, + "Rank": 37 + }, + "Physics": { + "Average Score": 32.39068796677421, + "Standard Deviation": null, + "Rank": 43 + }, + "Biology": { + "Average Score": 40.884001946009214, + "Standard Deviation": null, + "Rank": 41 + } + } + }, + { + "config": { + "model_name": "qwen1.5-4b-chat", + "organization": "Alibaba", + "license": "Qianwen LICENSE", + "knowledge_cutoff": "2024/02" + }, + "results": { + "OVERALL": { + "Average Score": 9.87888465860545, + "Standard Deviation": 0.8496756485041839, + "Rank": 58 + }, + "Geometry": { + "Average Score": 16.727214095722648, + "Standard Deviation": null, + "Rank": 51 + }, + "Algebra": { + "Average Score": 30.868954326245674, + "Standard Deviation": null, + "Rank": 48 + }, + "Probability": { + "Average Score": 12.542151831707827, + "Standard Deviation": null, + "Rank": 52 + }, + "Logical": { + "Average Score": 13.591142976589552, + "Standard Deviation": null, + "Rank": 55 + }, + "Social": { + "Average Score": 29.86221951671923, + "Standard Deviation": null, + "Rank": 47 + }, + "Chemistry": { + "Average Score": 15.258365841050109, + "Standard Deviation": null, + "Rank": 57 + }, + "CPP": { + "Average Score": 13.21208067122554, + "Standard Deviation": null, + "Rank": 47 + }, + "Physics": { + "Average Score": 12.8962411286233, + "Standard Deviation": null, + "Rank": 56 + }, + "Biology": { + "Average Score": 8.598267308776672, + "Standard Deviation": null, + "Rank": 61 + } + } + }, + { + "config": { + "model_name": "command-r-(04-2024)", + "organization": "Cohere", + "license": "CC-BY-NC-4.0", + "knowledge_cutoff": "2024/04" + }, + "results": { + "OVERALL": { + "Average Score": 41.52933196050375, + "Standard Deviation": 2.241081240676662, + "Rank": 34 + }, + "Geometry": { + "Average Score": 25.015789717085156, + "Standard Deviation": null, + "Rank": 47 + }, + "Algebra": { + "Average Score": 30.86273392294722, + "Standard Deviation": null, + "Rank": 49 + }, + "Probability": { + "Average Score": 32.69230455171987, + "Standard Deviation": null, + "Rank": 43 + }, + "Logical": { + "Average Score": 34.412636294090625, + "Standard Deviation": null, + "Rank": 40 + }, + "Social": { + "Average Score": 41.24738365139523, + "Standard Deviation": null, + "Rank": 34 + }, + "Chemistry": { + "Average Score": 40.79571212108303, + "Standard Deviation": null, + "Rank": 35 + }, + "CPP": { + "Average Score": 41.346336503003236, + "Standard Deviation": null, + "Rank": 28 + }, + "Physics": { + "Average Score": 52.309001772076435, + "Standard Deviation": null, + "Rank": 34 + }, + "Biology": { + "Average Score": 49.100219607909104, + "Standard Deviation": null, + "Rank": 36 + } + } + }, + { + "config": { + "model_name": "vicuna-33b", + "organization": "LMSYS", + "license": "Non-commercial", + "knowledge_cutoff": "2023/08" + }, + "results": { + "OVERALL": { + "Average Score": 26.771867469042252, + "Standard Deviation": 2.2628124527776685, + "Rank": 45 + }, + "Geometry": { + "Average Score": 17.75361072083444, + "Standard Deviation": null, + "Rank": 50 + }, + "Algebra": { + "Average Score": 24.801410292720103, + "Standard Deviation": null, + "Rank": 50 + }, + "Probability": { + "Average Score": 18.923598681430988, + "Standard Deviation": null, + "Rank": 50 + }, + "Logical": { + "Average Score": 22.485046383293895, + "Standard Deviation": null, + "Rank": 47 + }, + "Social": { + "Average Score": 37.63057970959196, + "Standard Deviation": null, + "Rank": 40 + }, + "Chemistry": { + "Average Score": 28.982029986253178, + "Standard Deviation": null, + "Rank": 45 + }, + "CPP": { + "Average Score": 28.01838653090379, + "Standard Deviation": null, + "Rank": 38 + }, + "Physics": { + "Average Score": 28.904101398112875, + "Standard Deviation": null, + "Rank": 45 + }, + "Biology": { + "Average Score": 40.66824421437282, + "Standard Deviation": null, + "Rank": 44 + } + } + }, + { + "config": { + "model_name": "gemma-7b-it", + "organization": "Google", + "license": "Gemma License", + "knowledge_cutoff": "2024/02" + }, + "results": { + "OVERALL": { + "Average Score": 23.946098797294113, + "Standard Deviation": 1.882540513317503, + "Rank": 48 + }, + "Geometry": { + "Average Score": 20.947476737376597, + "Standard Deviation": null, + "Rank": 48 + }, + "Algebra": { + "Average Score": 23.018014851651127, + "Standard Deviation": null, + "Rank": 52 + }, + "Probability": { + "Average Score": 15.37360248124904, + "Standard Deviation": null, + "Rank": 51 + }, + "Logical": { + "Average Score": 23.856001036256362, + "Standard Deviation": null, + "Rank": 44 + }, + "Social": { + "Average Score": 33.803173718782276, + "Standard Deviation": null, + "Rank": 44 + }, + "Chemistry": { + "Average Score": 28.96403210090221, + "Standard Deviation": null, + "Rank": 46 + }, + "CPP": { + "Average Score": 28.014658234926813, + "Standard Deviation": null, + "Rank": 39 + }, + "Physics": { + "Average Score": 31.52560551567879, + "Standard Deviation": null, + "Rank": 44 + }, + "Biology": { + "Average Score": 33.30740831237261, + "Standard Deviation": null, + "Rank": 48 + } + } + }, + { + "config": { + "model_name": "mistral-7b-instruct-2", + "organization": "Mistral", + "license": "Apache 2.0", + "knowledge_cutoff": "2023/12" + }, + "results": { + "OVERALL": { + "Average Score": 30.425212839239084, + "Standard Deviation": 3.2420324833230745, + "Rank": 43 + }, + "Geometry": { + "Average Score": 17.98077256453581, + "Standard Deviation": null, + "Rank": 49 + }, + "Algebra": { + "Average Score": 23.03227606898818, + "Standard Deviation": null, + "Rank": 51 + }, + "Probability": { + "Average Score": 22.515548503444595, + "Standard Deviation": null, + "Rank": 48 + }, + "Logical": { + "Average Score": 28.172299674407935, + "Standard Deviation": null, + "Rank": 42 + }, + "Social": { + "Average Score": 32.34681006422513, + "Standard Deviation": null, + "Rank": 45 + }, + "Chemistry": { + "Average Score": 29.847754052571794, + "Standard Deviation": null, + "Rank": 42 + }, + "CPP": { + "Average Score": 31.382959631870822, + "Standard Deviation": null, + "Rank": 35 + }, + "Physics": { + "Average Score": 42.179522893964496, + "Standard Deviation": null, + "Rank": 41 + }, + "Biology": { + "Average Score": 40.80741758174906, + "Standard Deviation": null, + "Rank": 42 + } + } + }, + { + "config": { + "model_name": "mistral-7b-instruct-1", + "organization": "Mistral", + "license": "Apache 2.0", + "knowledge_cutoff": "2023/12" + }, + "results": { + "OVERALL": { + "Average Score": 19.00770440704137, + "Standard Deviation": 2.5108129577834823, + "Rank": 55 + }, + "Geometry": { + "Average Score": 11.76124122331528, + "Standard Deviation": null, + "Rank": 55 + }, + "Algebra": { + "Average Score": 20.16800788676758, + "Standard Deviation": null, + "Rank": 53 + }, + "Probability": { + "Average Score": 21.982214302316194, + "Standard Deviation": null, + "Rank": 49 + }, + "Logical": { + "Average Score": 16.458119477880455, + "Standard Deviation": null, + "Rank": 51 + }, + "Social": { + "Average Score": 11.83909143203254, + "Standard Deviation": null, + "Rank": 56 + }, + "Chemistry": { + "Average Score": 20.227175038540732, + "Standard Deviation": null, + "Rank": 52 + }, + "CPP": { + "Average Score": 18.929093202755805, + "Standard Deviation": null, + "Rank": 42 + }, + "Physics": { + "Average Score": 16.942666711550366, + "Standard Deviation": null, + "Rank": 53 + }, + "Biology": { + "Average Score": 14.862055999215585, + "Standard Deviation": null, + "Rank": 56 + } + } + }, + { + "config": { + "model_name": "vicuna-13b", + "organization": "LMSYS", + "license": "Non-commercial", + "knowledge_cutoff": "2023/07" + }, + "results": { + "OVERALL": { + "Average Score": 17.596440211877606, + "Standard Deviation": 2.1378036693126887, + "Rank": 56 + }, + "Geometry": { + "Average Score": 13.613562588758793, + "Standard Deviation": null, + "Rank": 54 + }, + "Algebra": { + "Average Score": 17.777580357601646, + "Standard Deviation": null, + "Rank": 54 + }, + "Probability": { + "Average Score": 11.773651220819335, + "Standard Deviation": null, + "Rank": 53 + }, + "Logical": { + "Average Score": 16.62840722654711, + "Standard Deviation": null, + "Rank": 50 + }, + "Social": { + "Average Score": 12.015284814277452, + "Standard Deviation": null, + "Rank": 54 + }, + "Chemistry": { + "Average Score": 22.59071707495557, + "Standard Deviation": null, + "Rank": 49 + }, + "CPP": { + "Average Score": 21.840013221590294, + "Standard Deviation": null, + "Rank": 40 + }, + "Physics": { + "Average Score": 23.12484986614339, + "Standard Deviation": null, + "Rank": 48 + }, + "Biology": { + "Average Score": 32.46475144310054, + "Standard Deviation": null, + "Rank": 49 + } + } + }, + { + "config": { + "model_name": "zephyr-7b-beta", + "organization": "HuggingFace", + "license": "MIT", + "knowledge_cutoff": "2023/10" + }, + "results": { + "OVERALL": { + "Average Score": 9.430771900746599, + "Standard Deviation": 0.5392686957469028, + "Rank": 59 + }, + "Geometry": { + "Average Score": 8.776172464719641, + "Standard Deviation": null, + "Rank": 56 + }, + "Algebra": { + "Average Score": 12.864251022808256, + "Standard Deviation": null, + "Rank": 55 + }, + "Probability": { + "Average Score": 6.856387198441145, + "Standard Deviation": null, + "Rank": 58 + }, + "Logical": { + "Average Score": 7.23067331414496, + "Standard Deviation": null, + "Rank": 59 + }, + "Social": { + "Average Score": 0.0, + "Standard Deviation": null, + "Rank": 61 + }, + "Chemistry": { + "Average Score": 16.809164907349935, + "Standard Deviation": null, + "Rank": 54 + }, + "CPP": { + "Average Score": 18.92902220864132, + "Standard Deviation": null, + "Rank": 43 + }, + "Physics": { + "Average Score": 17.655293480361614, + "Standard Deviation": null, + "Rank": 52 + }, + "Biology": { + "Average Score": 12.415097886994968, + "Standard Deviation": null, + "Rank": 58 + } + } + }, + { + "config": { + "model_name": "gemma-1.1-2b-it", + "organization": "Google", + "license": "Gemma License", + "knowledge_cutoff": "2024/02" + }, + "results": { + "OVERALL": { + "Average Score": 21.90250655573766, + "Standard Deviation": 1.9871388098125085, + "Rank": 52 + }, + "Geometry": { + "Average Score": 13.697788759430225, + "Standard Deviation": null, + "Rank": 53 + }, + "Algebra": { + "Average Score": 12.157310639752737, + "Standard Deviation": null, + "Rank": 56 + }, + "Probability": { + "Average Score": 7.449868080506948, + "Standard Deviation": null, + "Rank": 56 + }, + "Logical": { + "Average Score": 10.62657710416428, + "Standard Deviation": null, + "Rank": 57 + }, + "Social": { + "Average Score": 29.175325965898267, + "Standard Deviation": null, + "Rank": 48 + }, + "Chemistry": { + "Average Score": 21.740619629476075, + "Standard Deviation": null, + "Rank": 50 + }, + "CPP": { + "Average Score": 20.724691953843916, + "Standard Deviation": null, + "Rank": 41 + }, + "Physics": { + "Average Score": 23.632640386132042, + "Standard Deviation": null, + "Rank": 47 + }, + "Biology": { + "Average Score": 29.750661487753543, + "Standard Deviation": null, + "Rank": 50 + } + } + }, + { + "config": { + "model_name": "llama2-7b-chat", + "organization": "Meta", + "license": "Llama 2 Community", + "knowledge_cutoff": "2023/07" + }, + "results": { + "OVERALL": { + "Average Score": 23.15262700172829, + "Standard Deviation": 1.5180515912969421, + "Rank": 50 + }, + "Geometry": { + "Average Score": 6.062981955604592, + "Standard Deviation": null, + "Rank": 57 + }, + "Algebra": { + "Average Score": 9.702442741719038, + "Standard Deviation": null, + "Rank": 58 + }, + "Probability": { + "Average Score": 7.323764901851239, + "Standard Deviation": null, + "Rank": 57 + }, + "Logical": { + "Average Score": 20.042615636879354, + "Standard Deviation": null, + "Rank": 49 + }, + "Social": { + "Average Score": 28.003092092497983, + "Standard Deviation": null, + "Rank": 49 + }, + "Chemistry": { + "Average Score": 20.22732766050842, + "Standard Deviation": null, + "Rank": 51 + }, + "CPP": { + "Average Score": 15.730513733660898, + "Standard Deviation": null, + "Rank": 45 + }, + "Physics": { + "Average Score": 12.866623115939365, + "Standard Deviation": null, + "Rank": 57 + }, + "Biology": { + "Average Score": 29.435323133887913, + "Standard Deviation": null, + "Rank": 51 + } + } + }, + { + "config": { + "model_name": "gemma-2b-it", + "organization": "Google", + "license": "Gemma License", + "knowledge_cutoff": "2024/02" + }, + "results": { + "OVERALL": { + "Average Score": 20.296640473489866, + "Standard Deviation": 2.333666507610861, + "Rank": 53 + }, + "Geometry": { + "Average Score": 16.155982788407485, + "Standard Deviation": null, + "Rank": 52 + }, + "Algebra": { + "Average Score": 9.997670449242714, + "Standard Deviation": null, + "Rank": 57 + }, + "Probability": { + "Average Score": 6.055292262170126, + "Standard Deviation": null, + "Rank": 59 + }, + "Logical": { + "Average Score": 5.200573121259635, + "Standard Deviation": null, + "Rank": 60 + }, + "Social": { + "Average Score": 9.560337024016134, + "Standard Deviation": null, + "Rank": 58 + }, + "Chemistry": { + "Average Score": 16.613881599313693, + "Standard Deviation": null, + "Rank": 55 + }, + "CPP": { + "Average Score": 17.2715657115764, + "Standard Deviation": null, + "Rank": 44 + }, + "Physics": { + "Average Score": 17.72258050873005, + "Standard Deviation": null, + "Rank": 51 + }, + "Biology": { + "Average Score": 10.891363209321185, + "Standard Deviation": null, + "Rank": 59 + } + } + }, + { + "config": { + "model_name": "llama2-13b-chat", + "organization": "Meta", + "license": "Llama 2 Community", + "knowledge_cutoff": "2023/07" + }, + "results": { + "OVERALL": { + "Average Score": 22.40246822660458, + "Standard Deviation": 1.5744155926563603, + "Rank": 51 + }, + "Geometry": { + "Average Score": 4.287260426268335, + "Standard Deviation": null, + "Rank": 59 + }, + "Algebra": { + "Average Score": 7.122650832792122, + "Standard Deviation": null, + "Rank": 59 + }, + "Probability": { + "Average Score": 10.367779885088286, + "Standard Deviation": null, + "Rank": 54 + }, + "Logical": { + "Average Score": 23.416885515011753, + "Standard Deviation": null, + "Rank": 45 + }, + "Social": { + "Average Score": 26.251837552806705, + "Standard Deviation": null, + "Rank": 50 + }, + "Chemistry": { + "Average Score": 15.236408439765913, + "Standard Deviation": null, + "Rank": 58 + }, + "CPP": { + "Average Score": 13.17258252933903, + "Standard Deviation": null, + "Rank": 48 + }, + "Physics": { + "Average Score": 9.756032013938237, + "Standard Deviation": null, + "Rank": 58 + }, + "Biology": { + "Average Score": 14.373926163839833, + "Standard Deviation": null, + "Rank": 57 + } + } + }, + { + "config": { + "model_name": "vicuna-7b", + "organization": "LMSYS", + "license": "Non-commercial", + "knowledge_cutoff": "2023/07" + }, + "results": { + "OVERALL": { + "Average Score": 16.947504584923095, + "Standard Deviation": 2.1935303160759494, + "Rank": 57 + }, + "Geometry": { + "Average Score": 5.6556788835908565, + "Standard Deviation": null, + "Rank": 58 + }, + "Algebra": { + "Average Score": 6.937810777972691, + "Standard Deviation": null, + "Rank": 60 + }, + "Probability": { + "Average Score": 7.449902539116639, + "Standard Deviation": null, + "Rank": 55 + }, + "Logical": { + "Average Score": 11.53991650872671, + "Standard Deviation": null, + "Rank": 56 + }, + "Social": { + "Average Score": 10.510431618145562, + "Standard Deviation": null, + "Rank": 57 + }, + "Chemistry": { + "Average Score": 15.565621989451936, + "Standard Deviation": null, + "Rank": 56 + }, + "CPP": { + "Average Score": 14.255194156624162, + "Standard Deviation": null, + "Rank": 46 + }, + "Physics": { + "Average Score": 13.654470501928998, + "Standard Deviation": null, + "Rank": 55 + }, + "Biology": { + "Average Score": 16.31264249867034, + "Standard Deviation": null, + "Rank": 55 + } + } + }, + { + "config": { + "model_name": "koala-13b", + "organization": "UC Berkeley", + "license": "Non-commercial", + "knowledge_cutoff": "2023/04" + }, + "results": { + "OVERALL": { + "Average Score": 8.83755726181737, + "Standard Deviation": 0.6967904064276641, + "Rank": 60 + }, + "Geometry": { + "Average Score": 0.16630617078665783, + "Standard Deviation": null, + "Rank": 60 + }, + "Algebra": { + "Average Score": 2.2176438662182405, + "Standard Deviation": null, + "Rank": 61 + }, + "Probability": { + "Average Score": 3.0086045641099886, + "Standard Deviation": null, + "Rank": 60 + }, + "Logical": { + "Average Score": 8.007902379487398, + "Standard Deviation": null, + "Rank": 58 + }, + "Social": { + "Average Score": 9.267400643797334, + "Standard Deviation": null, + "Rank": 59 + }, + "Chemistry": { + "Average Score": 6.881971917535636, + "Standard Deviation": null, + "Rank": 59 + }, + "CPP": { + "Average Score": 6.36433272373514, + "Standard Deviation": null, + "Rank": 49 + }, + "Physics": { + "Average Score": 1.4745736403582252, + "Standard Deviation": null, + "Rank": 59 + }, + "Biology": { + "Average Score": 10.173901160370301, + "Standard Deviation": null, + "Rank": 60 + } + } + }, + { + "config": { + "model_name": "openassistant-pythia-12b", + "organization": "OpenAssistant", + "license": "Non-commercial", + "knowledge_cutoff": "2023/04" + }, + "results": { + "OVERALL": { + "Average Score": 0.0, + "Standard Deviation": 0.0, + "Rank": 61 + }, + "Geometry": { + "Average Score": 0.0, + "Standard Deviation": null, + "Rank": 61 + }, + "Algebra": { + "Average Score": 0.0, + "Standard Deviation": null, + "Rank": 62 + }, + "Probability": { + "Average Score": 0.0, + "Standard Deviation": null, + "Rank": 61 + }, + "Logical": { + "Average Score": 0.0, + "Standard Deviation": null, + "Rank": 61 + }, + "Social": { + "Average Score": 1.5648937446490145, + "Standard Deviation": null, + "Rank": 60 + }, + "Chemistry": { + "Average Score": 0.0, + "Standard Deviation": null, + "Rank": 60 + }, + "CPP": { + "Average Score": 0.0, + "Standard Deviation": null, + "Rank": 50 + }, + "Physics": { + "Average Score": 0.0, + "Standard Deviation": null, + "Rank": 60 + }, + "Biology": { + "Average Score": 0.0, + "Standard Deviation": null, + "Rank": 62 + } + } + }, + { + "config": { + "model_name": "nemotron-70b", + "organization": "NVIDIA", + "license": "Unknown", + "knowledge_cutoff": "Unknown" + }, + "results": { + "OVERALL": { + "Average Score": 100.0, + "Standard Deviation": 0.0, + "Rank": 1 + }, + "Algebra": { + "Average Score": 80.66812253661826, + "Standard Deviation": null, + "Rank": 17 + }, + "Geometry": { + "Average Score": 64.79317124458657, + "Standard Deviation": null, + "Rank": 14 + }, + "Probability": { + "Average Score": 77.90998100977566, + "Standard Deviation": null, + "Rank": 10 + }, + "Logical": { + "Average Score": 92.79205249453312, + "Standard Deviation": null, + "Rank": 5 + }, + "Social": { + "Average Score": 100.0, + "Standard Deviation": null, + "Rank": 1 + }, + "Chemistry": { + "Average Score": 75.51792600714916, + "Standard Deviation": null, + "Rank": 14 + }, + "Physics": { + "Average Score": 87.87343018217607, + "Standard Deviation": null, + "Rank": 11 + }, + "Biology": { + "Average Score": 89.70989044405452, + "Standard Deviation": null, + "Rank": 6 + } + } + }, + { + "config": { + "model_name": "llama-3.2-3b-it", + "organization": "Meta", + "license": "Llama 3 Community", + "knowledge_cutoff": "Unknown" + }, + "results": { + "OVERALL": { + "Average Score": 24.55648638012998, + "Standard Deviation": 2.7438328116042396, + "Rank": 47 + }, + "Algebra": { + "Average Score": 58.282081682035965, + "Standard Deviation": null, + "Rank": 32 + }, + "Probability": { + "Average Score": 38.82178804612166, + "Standard Deviation": null, + "Rank": 37 + }, + "Logical": { + "Average Score": 14.284884351545829, + "Standard Deviation": null, + "Rank": 53 + }, + "Social": { + "Average Score": 12.015170971293347, + "Standard Deviation": null, + "Rank": 55 + }, + "Chemistry": { + "Average Score": 28.594555260782386, + "Standard Deviation": null, + "Rank": 47 + }, + "Physics": { + "Average Score": 28.49646725691165, + "Standard Deviation": null, + "Rank": 46 + }, + "Biology": { + "Average Score": 19.26616886675504, + "Standard Deviation": null, + "Rank": 54 + } + } + }, + { + "config": { + "model_name": "glm-4-plus", + "organization": "Unknown", + "license": "Unknown", + "knowledge_cutoff": "Unknown" + }, + "results": { + "Physics": { + "Average Score": 93.38486963586884, + "Standard Deviation": null, + "Rank": 4 + }, + "Biology": { + "Average Score": 92.22645537080881, + "Standard Deviation": null, + "Rank": 4 + }, + "Chemistry": { + "Average Score": 83.011021476943, + "Standard Deviation": null, + "Rank": 8 + }, + "Social": { + "Average Score": 96.10166232633848, + "Standard Deviation": null, + "Rank": 5 + }, + "Logical": { + "Average Score": 92.48639421432455, + "Standard Deviation": null, + "Rank": 6 + }, + "Algebra": { + "Average Score": 91.79128700104991, + "Standard Deviation": null, + "Rank": 7 + }, + "Geometry": { + "Average Score": 75.41344471165868, + "Standard Deviation": null, + "Rank": 10 + }, + "Probability": { + "Average Score": 76.73191937524591, + "Standard Deviation": null, + "Rank": 12 + }, + "OVERALL": { + "Average Score": 92.39089671677698, + "Standard Deviation": 0.5005865827133669, + "Rank": 6 + } + } + }, + { + "config": { + "model_name": "yi-lightning", + "organization": "Unknown", + "license": "Unknown", + "knowledge_cutoff": "Unknown" + }, + "results": { + "Physics": { + "Average Score": 88.49402753650628, + "Standard Deviation": null, + "Rank": 9 + }, + "Biology": { + "Average Score": 90.37891957676416, + "Standard Deviation": null, + "Rank": 5 + }, + "Chemistry": { + "Average Score": 100.0, + "Standard Deviation": null, + "Rank": 1 + }, + "Social": { + "Average Score": 92.14580653902937, + "Standard Deviation": null, + "Rank": 6 + }, + "Logical": { + "Average Score": 94.75701503537329, + "Standard Deviation": null, + "Rank": 4 + }, + "Algebra": { + "Average Score": 93.3186019721947, + "Standard Deviation": null, + "Rank": 6 + }, + "Geometry": { + "Average Score": 76.16313216563569, + "Standard Deviation": null, + "Rank": 9 + }, + "Probability": { + "Average Score": 92.54460354742838, + "Standard Deviation": null, + "Rank": 3 + }, + "OVERALL": { + "Average Score": 96.802929532644, + "Standard Deviation": 0.27491691197906704, + "Rank": 3 + } + } + }, + { + "config": { + "model_name": "ministral-8b-it", + "organization": "Unknown", + "license": "Unknown", + "knowledge_cutoff": "Unknown" + }, + "results": { + "Physics": { + "Average Score": 57.14492748742418, + "Standard Deviation": null, + "Rank": 32 + }, + "Biology": { + "Average Score": 53.5479824847229, + "Standard Deviation": null, + "Rank": 31 + }, + "Chemistry": { + "Average Score": 45.51400153833142, + "Standard Deviation": null, + "Rank": 30 + }, + "Social": { + "Average Score": 45.54025353861784, + "Standard Deviation": null, + "Rank": 29 + }, + "Logical": { + "Average Score": 59.25000685096734, + "Standard Deviation": null, + "Rank": 27 + }, + "Algebra": { + "Average Score": 58.56021213895309, + "Standard Deviation": null, + "Rank": 31 + }, + "Geometry": { + "Average Score": 54.902884398306554, + "Standard Deviation": null, + "Rank": 22 + }, + "Probability": { + "Average Score": 49.69358274321923, + "Standard Deviation": null, + "Rank": 29 + }, + "OVERALL": { + "Average Score": 45.88665474541969, + "Standard Deviation": 4.242263667629549, + "Rank": 29 + } + } + }, + { + "config": { + "model_name": "qwen2.5-1.5b", + "organization": "Unknown", + "license": "Unknown", + "knowledge_cutoff": "Unknown" + }, + "results": { + "Physics": { + "Average Score": 50.38291508013627, + "Standard Deviation": null, + "Rank": 35 + }, + "Biology": { + "Average Score": 40.134558844170826, + "Standard Deviation": null, + "Rank": 45 + }, + "Chemistry": { + "Average Score": 34.891253153439166, + "Standard Deviation": null, + "Rank": 39 + }, + "Social": { + "Average Score": 39.812806552940735, + "Standard Deviation": null, + "Rank": 39 + }, + "Logical": { + "Average Score": 42.70305684307474, + "Standard Deviation": null, + "Rank": 32 + }, + "Algebra": { + "Average Score": 79.30455838359877, + "Standard Deviation": null, + "Rank": 18 + }, + "Geometry": { + "Average Score": 58.56739922365014, + "Standard Deviation": null, + "Rank": 18 + }, + "Probability": { + "Average Score": 68.07725566867765, + "Standard Deviation": null, + "Rank": 19 + }, + "OVERALL": { + "Average Score": 23.25904934716627, + "Standard Deviation": 1.5089621200216172, + "Rank": 49 + } + } + }, + { + "config": { + "model_name": "smollm2-1.7b", + "organization": "Unknown", + "license": "Unknown", + "knowledge_cutoff": "Unknown" + }, + "results": { + "Physics": { + "Average Score": 20.328651604714242, + "Standard Deviation": null, + "Rank": 50 + }, + "Biology": { + "Average Score": 23.55167655906088, + "Standard Deviation": null, + "Rank": 53 + }, + "Chemistry": { + "Average Score": 17.90654461263675, + "Standard Deviation": null, + "Rank": 53 + }, + "Social": { + "Average Score": 18.586981509149783, + "Standard Deviation": null, + "Rank": 53 + }, + "Logical": { + "Average Score": 13.753294179366819, + "Standard Deviation": null, + "Rank": 54 + }, + "Algebra": { + "Average Score": 38.86009773073664, + "Standard Deviation": null, + "Rank": 46 + }, + "Geometry": { + "Average Score": 26.65205080537627, + "Standard Deviation": null, + "Rank": 43 + }, + "Probability": { + "Average Score": 28.77646355213561, + "Standard Deviation": null, + "Rank": 45 + }, + "OVERALL": { + "Average Score": 20.14565641258473, + "Standard Deviation": 2.3679638882398857, + "Rank": 54 + } + } + }, + { + "config": { + "model_name": "llama-3.2-1b-it", + "organization": "Unknown", + "license": "Unknown", + "knowledge_cutoff": "Unknown" + }, + "results": { + "Physics": { + "Average Score": 13.730639722217427, + "Standard Deviation": null, + "Rank": 54 + }, + "Biology": { + "Average Score": 25.09504378386352, + "Standard Deviation": null, + "Rank": 52 + }, + "Chemistry": { + "Average Score": 22.71076097859151, + "Standard Deviation": null, + "Rank": 48 + }, + "Social": { + "Average Score": 20.34042449083379, + "Standard Deviation": null, + "Rank": 52 + }, + "Logical": { + "Average Score": 15.338736069283176, + "Standard Deviation": null, + "Rank": 52 + }, + "Algebra": { + "Average Score": 43.69053020706735, + "Standard Deviation": null, + "Rank": 40 + }, + "Geometry": { + "Average Score": 25.35058286701741, + "Standard Deviation": null, + "Rank": 46 + }, + "Probability": { + "Average Score": 28.620674481486535, + "Standard Deviation": null, + "Rank": 46 + }, + "OVERALL": { + "Average Score": 24.93401522355894, + "Standard Deviation": 2.6710490374694014, + "Rank": 46 + } + } + } +] \ No newline at end of file