[
  {
    "config": { "model_name": "ChatGPT-4o-latest (2024-09-03)", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" },
    "results": {
      "OVERALL": { "Average Score": 0.974329609, "Standard Deviation": 0.005024959031, "Rank": 2 },
      "Geometry": { "Average Score": 0.976028578, "Standard Deviation": 0.01507912373, "Rank": 3 },
      "Algebra": { "Average Score": 0.951199453, "Standard Deviation": 0.08452452108, "Rank": 3 },
      "Probability": { "Average Score": 0.842116641, "Standard Deviation": 0.006267759054, "Rank": 3 },
      "Logical": { "Average Score": 0.828490728, "Standard Deviation": 0.009134213144, "Rank": 3 },
      "Social": { "Average Score": 0.815902987, "Standard Deviation": 0.0196254222, "Rank": 3 },
      "Chemistry": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 },
      "CPP": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }
    }
  },
  {
    "config": { "model_name": "gpt-4o-2024-08-06", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" },
    "results": {
      "OVERALL": { "Average Score": 0.846571548, "Standard Deviation": 0.03394056554, "Rank": 6 },
      "Geometry": { "Average Score": 0.99773096, "Standard Deviation": 0.002835555172, "Rank": 1 },
      "Algebra": { "Average Score": 1.0, "Standard Deviation": 0.0, "Rank": 1 },
      "Probability": { "Average Score": 0.78855795, "Standard Deviation": 0.008188675452, "Rank": 6 },
      "Logical": { "Average Score": 0.668635768, "Standard Deviation": 0.03466314094, "Rank": 11 },
      "Social": { "Average Score": 0.680417314, "Standard Deviation": 0.00656867063, "Rank": 8 },
      "Chemistry": { "Average Score": 92.43090226400756, "Standard Deviation": null, "Rank": 2 },
      "CPP": { "Average Score": 92.43090226400756, "Standard Deviation": null, "Rank": 2 }
    }
  },
  {
    "config": { "model_name": "gpt-4o-2024-05-13", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" },
    "results": {
      "OVERALL": { "Average Score": 0.846334477, "Standard Deviation": 0.09377911572, "Rank": 7 },
      "Geometry": { "Average Score": 0.972472377, "Standard Deviation": 0.01648274205, "Rank": 4 },
      "Algebra": { "Average Score": 0.995511298, "Standard Deviation": 0.004097802515, "Rank": 2 },
      "Probability": { "Average Score": 0.812149974, "Standard Deviation": 0.007669585485, "Rank": 4 },
      "Logical": { "Average Score": 0.755019692, "Standard Deviation": 0.008149588572, "Rank": 6 },
      "Social": { "Average Score": 0.609875087, "Standard Deviation": 0.038729239, "Rank": 13 },
      "Chemistry": { "Average Score": 79.1592634699295, "Standard Deviation": null, "Rank": 5 },
      "CPP": { "Average Score": 79.1592634699295, "Standard Deviation": null, "Rank": 6 }
    }
  },
  {
    "config": { "model_name": "gpt-4-turbo-2024-04-09", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/12" },
    "results": {
      "OVERALL": { "Average Score": 0.855357972, "Standard Deviation": 0.1016986368, "Rank": 4 },
      "Geometry": { "Average Score": 0.95374588, "Standard Deviation": 0.03109307166, "Rank": 5 },
      "Algebra": { "Average Score": 0.930945223, "Standard Deviation": 0.06705136813, "Rank": 4 },
      "Probability": { "Average Score": 0.750705448, "Standard Deviation": 0.05944483103, "Rank": 8 },
      "Logical": { "Average Score": 0.77906699, "Standard Deviation": 0.007406734161, "Rank": 4 },
      "Social": { "Average Score": 0.715935163, "Standard Deviation": 0.1209141409, "Rank": 6 },
      "Chemistry": { "Average Score": 70.73143363230263, "Standard Deviation": null, "Rank": 10 },
      "CPP": { "Average Score": 70.73143363230263, "Standard Deviation": null, "Rank": 11 }
    }
  },
  {
    "config": { "model_name": "gemini-1.5-pro-001", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2023/11" },
    "results": {
      "OVERALL": { "Average Score": 0.797187842, "Standard Deviation": 0.0272375249, "Rank": 10 },
      "Geometry": { "Average Score": 0.9947169, "Standard Deviation": 0.009150597621, "Rank": 2 },
      "Algebra": { "Average Score": 0.857464301, "Standard Deviation": 0.05014285338, "Rank": 5 },
      "Probability": { "Average Score": 0.651781767, "Standard Deviation": 0.04156998547, "Rank": 10 },
      "Logical": { "Average Score": 0.739745471, "Standard Deviation": 0.01631532019, "Rank": 7 },
      "Social": { "Average Score": 0.649601885, "Standard Deviation": 0.104854889, "Rank": 11 }
    }
  },
  {
    "config": { "model_name": "qwen2-72b-instruct", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/09" },
    "results": {
      "OVERALL": { "Average Score": 0.737918558, "Standard Deviation": 0.09069077339, "Rank": 11 },
      "Geometry": { "Average Score": 0.796870305, "Standard Deviation": 0.0509025346, "Rank": 9 },
      "Algebra": { "Average Score": 0.836194231, "Standard Deviation": 0.04517093028, "Rank": 6 },
      "Probability": { "Average Score": 0.788068004, "Standard Deviation": 0.007288989044, "Rank": 7 },
      "Logical": { "Average Score": 0.619300904, "Standard Deviation": 0.06377931612, "Rank": 14 },
      "Social": { "Average Score": 0.652578786, "Standard Deviation": 0.04259293171, "Rank": 10 },
      "Chemistry": { "Average Score": 73.54037778797029, "Standard Deviation": null, "Rank": 7 },
      "CPP": { "Average Score": 73.54037778797029, "Standard Deviation": null, "Rank": 7 }
    }
  },
  {
    "config": { "model_name": "gpt-4o-mini-2024-07-18", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" },
    "results": {
      "OVERALL": { "Average Score": 0.847694133, "Standard Deviation": 0.02164304402, "Rank": 5 },
      "Geometry": { "Average Score": 0.946650435, "Standard Deviation": 0.01831236482, "Rank": 7 },
      "Algebra": { "Average Score": 0.796243022, "Standard Deviation": 0.05537539202, "Rank": 7 },
      "Probability": { "Average Score": 0.798402685, "Standard Deviation": 0.009404491967, "Rank": 5 },
      "Logical": { "Average Score": 0.727009735, "Standard Deviation": 0.02628110141, "Rank": 8 },
      "Social": { "Average Score": 0.691949855, "Standard Deviation": 0.02072934333, "Rank": 7 },
      "Chemistry": { "Average Score": 88.3877070580296, "Standard Deviation": null, "Rank": 3 },
      "CPP": { "Average Score": 88.3877070580296, "Standard Deviation": null, "Rank": 3 }
    }
  },
  {
    "config": { "model_name": "claude-3.5-sonnet", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2024/04" },
    "results": {
      "OVERALL": { "Average Score": 0.839004422, "Standard Deviation": 0.1461079564, "Rank": 8 },
      "Geometry": { "Average Score": 0.95316419, "Standard Deviation": 0.02081192856, "Rank": 6 },
      "Algebra": { "Average Score": 0.759789952, "Standard Deviation": 0.02611765096, "Rank": 8 },
      "Probability": { "Average Score": 0.707730127, "Standard Deviation": 0.0394436664, "Rank": 9 },
      "Logical": { "Average Score": 0.77342666, "Standard Deviation": 0.002892426458, "Rank": 5 },
      "Social": { "Average Score": 0.790002247, "Standard Deviation": 0.1007410022, "Rank": 4 },
      "Chemistry": { "Average Score": 82.37734076815008, "Standard Deviation": null, "Rank": 4 },
      "CPP": { "Average Score": 82.37734076815008, "Standard Deviation": null, "Rank": 5 }
    }
  },
  {
    "config": { "model_name": "o1-mini", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" },
    "results": {
      "OVERALL": { "Average Score": 1.0, "Standard Deviation": 0.0, "Rank": 1 },
      "Geometry": { "Average Score": "N/A", "Standard Deviation": "N/A", "Rank": "N/A" },
      "Algebra": { "Average Score": "N/A", "Standard Deviation": "N/A", "Rank": "N/A" },
      "Probability": { "Average Score": 1.0, "Standard Deviation": 0.0, "Rank": 1 },
      "Logical": { "Average Score": 1.0, "Standard Deviation": 0.0, "Rank": 1 },
      "Social": { "Average Score": 0.993974241, "Standard Deviation": 0.001996882328, "Rank": 2 }
    }
  },
  {
    "config": { "model_name": "o1-preview", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" },
    "results": {
      "OVERALL": { "Average Score": 0.945884589, "Standard Deviation": 0.01059250762, "Rank": 3 },
      "Geometry": { "Average Score": "N/A", "Standard Deviation": "N/A", "Rank": "N/A" },
      "Algebra": { "Average Score": "N/A", "Standard Deviation": "N/A", "Rank": "N/A" },
      "Probability": { "Average Score": 0.964666392, "Standard Deviation": 0.003139983398, "Rank": 2 },
      "Logical": { "Average Score": 0.987950057, "Standard Deviation": 0.004881220327, "Rank": 2 },
      "Social": { "Average Score": 1.0, "Standard Deviation": 0.0, "Rank": 1 }
    }
  },
  {
    "config": { "model_name": "gemini-1.5-flash-001", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2023/11" },
    "results": {
      "OVERALL": { "Average Score": 0.726493401, "Standard Deviation": 0.01113913725, "Rank": 12 },
      "Geometry": { "Average Score": 0.804144103, "Standard Deviation": 0.1327142178, "Rank": 8 },
      "Algebra": { "Average Score": 0.731776765, "Standard Deviation": 0.02594657111, "Rank": 9 },
      "Probability": { "Average Score": 0.614461891, "Standard Deviation": 0.04690131826, "Rank": 13 },
      "Logical": { "Average Score": 0.630805991, "Standard Deviation": 0.04871350612, "Rank": 13 },
      "Social": { "Average Score": 0.555933822, "Standard Deviation": 0.1029934524, "Rank": 15 },
      "Chemistry": { "Average Score": 72.1127762005651, "Standard Deviation": null, "Rank": 9 },
      "CPP": { "Average Score": 72.1127762005651, "Standard Deviation": null, "Rank": 10 }
    }
  },
  {
    "config": { "model_name": "gpt4-1106", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2024/04" },
    "results": {
      "OVERALL": { "Average Score": 0.816347784, "Standard Deviation": 0.1566815755, "Rank": 9 },
      "Geometry": { "Average Score": 0.71843088, "Standard Deviation": 0.04778038294, "Rank": 11 },
      "Algebra": { "Average Score": 0.712910417, "Standard Deviation": 0.02581828898, "Rank": 10 },
      "Probability": { "Average Score": 0.623947619, "Standard Deviation": 0.03502982933, "Rank": 12 },
      "Logical": { "Average Score": 0.637482274, "Standard Deviation": 0.04158809888, "Rank": 12 },
      "Social": { "Average Score": 0.450609816, "Standard Deviation": 0.05208655446, "Rank": 21 },
      "Chemistry": { "Average Score": 69.11824072252848, "Standard Deviation": null, "Rank": 11 },
      "CPP": { "Average Score": 69.11824072252848, "Standard Deviation": null, "Rank": 12 }
    }
  },
  {
    "config": { "model_name": "gemma-2-27b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/06" },
    "results": {
      "OVERALL": { "Average Score": 0.624169623, "Standard Deviation": 0.1048365121, "Rank": 14 },
      "Geometry": { "Average Score": 0.60112744, "Standard Deviation": 0.0469109952, "Rank": 17 },
      "Algebra": { "Average Score": 0.687955914, "Standard Deviation": 0.01959958192, "Rank": 11 },
      "Probability": { "Average Score": 0.589524771, "Standard Deviation": 0.03112689325, "Rank": 14 },
      "Logical": { "Average Score": 0.614978944, "Standard Deviation": 0.05710657859, "Rank": 15 },
      "Social": { "Average Score": 0.487844257, "Standard Deviation": 0.05857760809, "Rank": 18 },
      "Chemistry": { "Average Score": 63.28920072143611, "Standard Deviation": null, "Rank": 13 },
      "CPP": { "Average Score": 63.28920072143611, "Standard Deviation": null, "Rank": 14 }
    }
  },
  {
    "config": { "model_name": "claude-3-opus", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2023/08" },
    "results": {
      "OVERALL": { "Average Score": 0.650636271, "Standard Deviation": 0.1197773541, "Rank": 13 },
      "Geometry": { "Average Score": 0.7215743, "Standard Deviation": 0.04712598358, "Rank": 10 },
      "Algebra": { "Average Score": 0.68777327, "Standard Deviation": 0.02382683713, "Rank": 12 },
      "Probability": { "Average Score": 0.626471421, "Standard Deviation": 0.02911817976, "Rank": 11 },
      "Logical": { "Average Score": 0.692346381, "Standard Deviation": 0.03617185198, "Rank": 10 },
      "Social": { "Average Score": 0.663410854, "Standard Deviation": 0.09540220876, "Rank": 9 },
      "Chemistry": { "Average Score": 73.5404403567132, "Standard Deviation": null, "Rank": 6 },
      "CPP": { "Average Score": 73.5404403567132, "Standard Deviation": null, "Rank": 8 }
    }
  },
  {
    "config": { "model_name": "gemma-2-9b-it-simpo", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/07" },
    "results": {
      "OVERALL": { "Average Score": "N/A", "Standard Deviation": "N/A", "Rank": "N/A" },
      "Geometry": { "Average Score": 0.582787508, "Standard Deviation": 0.03965204074, "Rank": 18 },
      "Algebra": { "Average Score": 0.658648133, "Standard Deviation": 0.02565919856, "Rank": 13 },
      "Probability": { "Average Score": 0.547861265, "Standard Deviation": 0.02885209131, "Rank": 17 },
      "Logical": { "Average Score": 0.540720893, "Standard Deviation": 0.01970134508, "Rank": 19 },
      "Social": { "Average Score": 0.635266187, "Standard Deviation": 0.03620021751, "Rank": 12 },
      "Chemistry": { "Average Score": 73.43757596214863, "Standard Deviation": null, "Rank": 8 },
      "CPP": { "Average Score": 73.43757596214863, "Standard Deviation": null, "Rank": 9 }
    }
  },
  {
    "config": { "model_name": "qwen1.5-72b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/03" },
    "results": {
      "OVERALL": { "Average Score": 0.519549796, "Standard Deviation": 0.00903634343, "Rank": 17 },
      "Geometry": { "Average Score": 0.543139301, "Standard Deviation": 0.03425202326, "Rank": 22 },
      "Algebra": { "Average Score": 0.635228729, "Standard Deviation": 0.01944043425, "Rank": 14 },
      "Probability": { "Average Score": 0.486948658, "Standard Deviation": 0.06064655315, "Rank": 21 },
      "Logical": { "Average Score": 0.284069394, "Standard Deviation": 0.02686608506, "Rank": 32 },
      "Social": { "Average Score": 0.415007627, "Standard Deviation": 0.03920053159, "Rank": 22 },
      "Chemistry": { "Average Score": 48.69302376665551, "Standard Deviation": null, "Rank": 19 },
      "CPP": { "Average Score": 48.69302376665551, "Standard Deviation": null, "Rank": 20 }
    }
  },
  {
    "config": { "model_name": "qwen1.5-32b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/03" },
    "results": {
      "OVERALL": { "Average Score": 0.393789407, "Standard Deviation": 0.05413770095, "Rank": 28 },
      "Geometry": { "Average Score": 0.51086835, "Standard Deviation": 0.04052471998, "Rank": 25 },
      "Algebra": { "Average Score": 0.609003168, "Standard Deviation": 0.04874143541, "Rank": 15 },
      "Probability": { "Average Score": 0.476300002, "Standard Deviation": 0.05322403912, "Rank": 22 },
      "Logical": { "Average Score": 0.331781014, "Standard Deviation": 0.004938997686, "Rank": 29 },
      "Social": { "Average Score": 0.380987334, "Standard Deviation": 0.03762251776, "Rank": 24 },
      "Chemistry": { "Average Score": 45.14284028264288, "Standard Deviation": null, "Rank": 23 },
      "CPP": { "Average Score": 45.14284028264288, "Standard Deviation": null, "Rank": 24 }
    }
  },
  {
    "config": { "model_name": "google-gemma-2-9b-it", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2024/06" },
    "results": {
      "OVERALL": { "Average Score": 0.489663449, "Standard Deviation": 0.002595702019, "Rank": 20 },
      "Geometry": { "Average Score": 0.575371308, "Standard Deviation": 0.03556220251, "Rank": 20 },
      "Algebra": { "Average Score": 0.597045661, "Standard Deviation": 0.0313828123, "Rank": 16 },
      "Probability": { "Average Score": 0.589221807, "Standard Deviation": 0.03110811656, "Rank": 16 },
      "Logical": { "Average Score": 0.587579897, "Standard Deviation": 0.05512716783, "Rank": 17 },
      "Social": { "Average Score": 0.768337958, "Standard Deviation": 0.04078610476, "Rank": 5 },
      "Chemistry": { "Average Score": 54.03167523687635, "Standard Deviation": null, "Rank": 16 },
      "CPP": { "Average Score": 54.03167523687635, "Standard Deviation": null, "Rank": 17 }
    }
  },
  {
    "config": { "model_name": "yi-1.5-34b-chat", "organization": "01 AI", "license": "Proprietary", "knowledge_cutoff": "2024/05" },
    "results": {
      "OVERALL": { "Average Score": 0.607812897, "Standard Deviation": 0.1440881293, "Rank": 15 },
      "Geometry": { "Average Score": 0.566666724, "Standard Deviation": 0.04001381658, "Rank": 21 },
      "Algebra": { "Average Score": 0.590997292, "Standard Deviation": 0.03594087315, "Rank": 17 },
      "Probability": { "Average Score": 0.589524589, "Standard Deviation": 0.03112618772, "Rank": 15 },
      "Logical": { "Average Score": 0.574105508, "Standard Deviation": 0.03441737941, "Rank": 18 },
      "Social": { "Average Score": 0.516980832, "Standard Deviation": 0.03369347985, "Rank": 17 },
      "Chemistry": { "Average Score": 52.148798061768964, "Standard Deviation": null, "Rank": 17 },
      "CPP": { "Average Score": 52.148798061768964, "Standard Deviation": null, "Rank": 18 }
    }
  },
  {
    "config": { "model_name": "meta-llama-3.1-70b-instruct", "organization": "Meta", "license": "Llama 3.1 Community", "knowledge_cutoff": "2023/12" },
    "results": {
      "OVERALL": { "Average Score": 0.708874896, "Standard Deviation": 0.1315111956, "Rank": 13 },
      "Geometry": { "Average Score": 0.76184398, "Standard Deviation": 0.01790377984, "Rank": 10 },
      "Algebra": { "Average Score": 0.732041699, "Standard Deviation": 0.02621439062, "Rank": 9 },
      "Probability": { "Average Score": 0.676208383, "Standard Deviation": 0.05131201636, "Rank": 10 },
      "Logical": { "Average Score": 0.620018631, "Standard Deviation": 0.02518873821, "Rank": 14 },
      "Social": { "Average Score": 0.45872939, "Standard Deviation": 0.05347039576, "Rank": 20 },
      "Chemistry": { "Average Score": 84.36815192532764, "Standard Deviation": null, "Rank": 4 },
      "CPP": { "Average Score": 84.36815192532764, "Standard Deviation": null, "Rank": 4 }
    }
  },
  {
    "config": { "model_name": "meta-llama-3.1-8b-instruct", "organization": "Meta", "license": "Llama 3.1 Community", "knowledge_cutoff": "2023/12" },
    "results": {
      "OVERALL": { "Average Score": 0.505936324, "Standard Deviation": 0.05286756493, "Rank": 18 },
      "Geometry": { "Average Score": 0.522442162, "Standard Deviation": 0.03908236317, "Rank": 23 },
      "Algebra": { "Average Score": 0.582702645, "Standard Deviation": 0.05002277711, "Rank": 18 },
      "Probability": { "Average Score": 0.495001149, "Standard Deviation": 0.05244587037, "Rank": 20 },
      "Logical": { "Average Score": 0.443030561, "Standard Deviation": 0.01343820628, "Rank": 24 },
      "Social": { "Average Score": 0.329195941, "Standard Deviation": 0.03925019528, "Rank": 28 },
      "Chemistry": { "Average Score": 44.41846841004584, "Standard Deviation": null, "Rank": 25 },
      "CPP": { "Average Score": 44.41846841004584, "Standard Deviation": null, "Rank": 26 }
    }
  },
  {
    "config": { "model_name": "gpt3.5-turbo-0125", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2021/09" },
    "results": {
      "OVERALL": { "Average Score": 0.313398088, "Standard Deviation": 0.09322528606, "Rank": 39 },
      "Geometry": { "Average Score": 0.678714519, "Standard Deviation": 0.05926546762, "Rank": 12 },
      "Algebra": { "Average Score": 0.569296173, "Standard Deviation": 0.05277281097, "Rank": 19 },
      "Probability": { "Average Score": 0.448460767, "Standard Deviation": 0.05768095196, "Rank": 24 },
      "Logical": { "Average Score": 0.148521348, "Standard Deviation": 0.04033712907, "Rank": 44 },
      "Social": { "Average Score": 0.235071541, "Standard Deviation": 0.02632892457, "Rank": 37 },
      "Chemistry": { "Average Score": 40.46958736582551, "Standard Deviation": null, "Rank": 28 },
      "CPP": { "Average Score": 40.46958736582551, "Standard Deviation": null, "Rank": 29 }
    }
  },
  {
    "config": { "model_name": "llama-3-70b-instruct", "organization": "Meta", "license": "Llama 3 Community", "knowledge_cutoff": "2023/12" },
    "results": {
      "OVERALL": { "Average Score": 0.456689885, "Standard Deviation": 0.01385989995, "Rank": 22 },
      "Geometry": { "Average Score": 0.516865529, "Standard Deviation": 0.03858112564, "Rank": 24 },
      "Algebra": { "Average Score": 0.566756531, "Standard Deviation": 0.03369826926, "Rank": 20 },
      "Probability": { "Average Score": 0.513857306, "Standard Deviation": 0.05453699062, "Rank": 19 },
      "Logical": { "Average Score": 0.713796415, "Standard Deviation": 0.02031215107, "Rank": 9 },
      "Social": { "Average Score": 0.45872939, "Standard Deviation": 0.05347039576, "Rank": 20 },
      "Chemistry": { "Average Score": 65.32140697218945, "Standard Deviation": null, "Rank": 12 },
      "CPP": { "Average Score": 65.32140697218945, "Standard Deviation": null, "Rank": 13 }
    }
  },
  {
    "config": { "model_name": "claude-3-sonnet", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2023/08" },
    "results": {
      "OVERALL": { "Average Score": 0.520010833, "Standard Deviation": 0.005030563799, "Rank": 16 },
      "Geometry": { "Average Score": 0.675613638, "Standard Deviation": 0.05275594408, "Rank": 13 },
      "Algebra": { "Average Score": 0.552025728, "Standard Deviation": 0.04122192409, "Rank": 21 },
      "Probability": { "Average Score": 0.516192848, "Standard Deviation": 0.04152293217, "Rank": 18 },
      "Logical": { "Average Score": 0.588545747, "Standard Deviation": 0.06068211943, "Rank": 16 },
      "Social": { "Average Score": 0.570437582, "Standard Deviation": 0.08607040862, "Rank": 14 },
      "Chemistry": { "Average Score": 61.33538592327427, "Standard Deviation": null, "Rank": 14 },
      "CPP": { "Average Score": 61.33538592327427, "Standard Deviation": null, "Rank": 15 }
    }
  },
  {
    "config": { "model_name": "qwen1.5-14b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/02" },
    "results": {
      "OVERALL": { "Average Score": 0.415328996, "Standard Deviation": 0.0743938717, "Rank": 27 },
      "Geometry": { "Average Score": 0.452504016, "Standard Deviation": 0.04225594393, "Rank": 26 },
      "Algebra": { "Average Score": 0.538655725, "Standard Deviation": 0.03721542594, "Rank": 22 },
      "Probability": { "Average Score": 0.397185975, "Standard Deviation": 0.05607695946, "Rank": 28 },
      "Logical": { "Average Score": 0.264573129, "Standard Deviation": 0.03936133174, "Rank": 34 },
      "Social": { "Average Score": 0.287370142, "Standard Deviation": 0.04264085315, "Rank": 30 },
      "Chemistry": { "Average Score": 38.552779976347026, "Standard Deviation": null, "Rank": 30 },
      "CPP": { "Average Score": 38.552779976347026, "Standard Deviation": null, "Rank": 31 }
    }
  },
  {
    "config": { "model_name": "claude-3-haiku", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2023/08" },
    "results": {
      "OVERALL": { "Average Score": 0.453901163, "Standard Deviation": 0.003604084261, "Rank": 23 },
      "Geometry": { "Average Score": 0.607993912, "Standard Deviation": 0.05793460748, "Rank": 15 },
      "Algebra": { "Average Score": 0.520054055, "Standard Deviation": 0.03333544511, "Rank": 23 },
      "Probability": { "Average Score": 0.474460688, "Standard Deviation": 0.0446501933, "Rank": 23 },
      "Logical": { "Average Score": 0.512815976, "Standard Deviation": 0.0163264281, "Rank": 20 },
      "Social": { "Average Score": 0.551083976, "Standard Deviation": 0.05374722539, "Rank": 16 },
      "Chemistry": { "Average Score": 56.40200048817984, "Standard Deviation": null, "Rank": 15 },
      "CPP": { "Average Score": 56.40200048817984, "Standard Deviation": null, "Rank": 16 }
    }
  },
  {
    "config": { "model_name": "claude-2.1", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "Unknown" },
    "results": {
      "OVERALL": { "Average Score": 0.35814708, "Standard Deviation": 0.09168134168, "Rank": 35 },
      "Geometry": { "Average Score": 0.62752395, "Standard Deviation": 0.07232659398, "Rank": 14 },
      "Algebra": { "Average Score": 0.508849609, "Standard Deviation": 0.0346897465, "Rank": 24 },
      "Probability": { "Average Score": 0.41477086, "Standard Deviation": 0.05964060239, "Rank": 27 },
      "Logical": { "Average Score": 0.482923674, "Standard Deviation": 0.01989147048, "Rank": 21 },
      "Social": { "Average Score": 0.333804568, "Standard Deviation": 0.03775548253, "Rank": 27 },
      "Chemistry": { "Average Score": 47.23672563994903, "Standard Deviation": null, "Rank": 20 },
      "CPP": { "Average Score": 47.23672563994903, "Standard Deviation": null, "Rank": 21 }
    }
  },
  {
    "config": { "model_name": "mistral-8x7b-instruct-v0.1", "organization": "Mistral", "license": "Apache 2.0", "knowledge_cutoff": "2023/12" },
    "results": {
      "OVERALL": { "Average Score": 0.382659161, "Standard Deviation": 0.07594496929, "Rank": 30 },
      "Geometry": { "Average Score": 0.432216097, "Standard Deviation": 0.04747949254, "Rank": 29 },
      "Algebra": { "Average Score": 0.478314888, "Standard Deviation": 0.01998797419, "Rank": 25 },
      "Probability": { "Average Score": 0.427144725, "Standard Deviation": 0.0590923329, "Rank": 26 },
      "Logical": { "Average Score": 0.340041983, "Standard Deviation": 0.008397574592, "Rank": 27 },
      "Social": { "Average Score": 0.251949622, "Standard Deviation": 0.03346674405, "Rank": 35 },
      "Chemistry": { "Average Score": 44.533118241976666, "Standard Deviation": null, "Rank": 24 },
      "CPP": { "Average Score": 44.533118241976666, "Standard Deviation": null, "Rank": 25 }
    }
  },
  {
    "config": { "model_name": "claude-2.0", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "Unknown" },
    "results": {
      "OVERALL": { "Average Score": 0.322718057, "Standard Deviation": 0.08369883584, "Rank": 37 },
      "Geometry": { "Average Score": 0.604141967, "Standard Deviation": 0.05116441826, "Rank": 16 },
      "Algebra": { "Average Score": 0.474350734, "Standard Deviation": 0.01510393066, "Rank": 26 },
      "Probability": { "Average Score": 0.437950412, "Standard Deviation": 0.05985594317, "Rank": 25 },
      "Logical": { "Average Score": 0.445620646, "Standard Deviation": 0.01812614805, "Rank": 23 },
      "Social": { "Average Score": 0.469422836, "Standard Deviation": 0.05999901796, "Rank": 19 },
      "Chemistry": { "Average Score": 50.773143448036464, "Standard Deviation": null, "Rank": 18 },
      "CPP": { "Average Score": 50.773143448036464, "Standard Deviation": null, "Rank": 19 }
    }
  },
  {
    "config": { "model_name": "starling-lm-7b-beta", "organization": "Nexusflow", "license": "Apache-2.0", "knowledge_cutoff": "2024/03" },
    "results": {
      "OVERALL": { "Average Score": 0.479391856, "Standard Deviation": 0.04199990887, "Rank": 21 },
      "Geometry": { "Average Score": 0.446654388, "Standard Deviation": 0.05637864999, "Rank": 28 },
      "Algebra": { "Average Score": 0.473952749, "Standard Deviation": 0.01584301288, "Rank": 27 },
      "Probability": { "Average Score": 0.395197837, "Standard Deviation": 0.05814798892, "Rank": 29 },
      "Logical": { "Average Score": 0.39927199, "Standard Deviation": 0.02125277518, "Rank": 25 },
      "Social": { "Average Score": 0.380021662, "Standard Deviation": 0.04622452748, "Rank": 25 },
      "Chemistry": { "Average Score": 38.27587102395908, "Standard Deviation": null, "Rank": 31 },
      "CPP": { "Average Score": 38.27587102395908, "Standard Deviation": null, "Rank": 32 }
    }
  },
{
|
|
"config": {
|
|
"model_name": "gemini-1.0-pro-001",
|
|
"organization": "Google",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/04"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.449040654,
|
|
"Standard Deviation": 0.0450610177,
|
|
"Rank": 24
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.578347959,
|
|
"Standard Deviation": 0.04242873607,
|
|
"Rank": 19
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.462417786,
|
|
"Standard Deviation": 0.01668313635,
|
|
"Rank": 28
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.289836324,
|
|
"Standard Deviation": 0.05739831115,
|
|
"Rank": 37
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.191140355,
|
|
"Standard Deviation": 0.03394652499,
|
|
"Rank": 40
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.130790863,
|
|
"Standard Deviation": 0.02800188173,
|
|
"Rank": 45
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 45.22204471452975,
|
|
"Standard Deviation": null,
|
|
"Rank": 22
|
|
},
|
|
"CPP": {
|
|
"Average Score": 45.22204471452975,
|
|
"Standard Deviation": null,
|
|
"Rank": 23
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "openchat-3.5-0106",
|
|
"organization": "OpenChat",
|
|
"license": "Apache-2.0",
|
|
"knowledge_cutoff": "2024/01"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.363929888,
|
|
"Standard Deviation": 0.08602347145,
|
|
"Rank": 33
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.38715246,
|
|
"Standard Deviation": 0.03701851946,
|
|
"Rank": 32
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.441233712,
|
|
"Standard Deviation": 0.01135753754,
|
|
"Rank": 29
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.38802618,
|
|
"Standard Deviation": 0.05663879714,
|
|
"Rank": 30
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.336754383,
|
|
"Standard Deviation": 0.01608478079,
|
|
"Rank": 28
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.250891608,
|
|
"Standard Deviation": 0.03253769914,
|
|
"Rank": 36
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 33.70639271807677,
|
|
"Standard Deviation": null,
|
|
"Rank": 32
|
|
},
|
|
"CPP": {
|
|
"Average Score": 33.70639271807677,
|
|
"Standard Deviation": null,
|
|
"Rank": 33
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "openchat-3.5",
|
|
"organization": "OpenChat",
|
|
"license": "Apache-2.0",
|
|
"knowledge_cutoff": "2023/11"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.361341296,
|
|
"Standard Deviation": 0.09034869493,
|
|
"Rank": 34
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.401699069,
|
|
"Standard Deviation": 0.03410726557,
|
|
"Rank": 30
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.414095336,
|
|
"Standard Deviation": 0.01881964261,
|
|
"Rank": 31
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.349601002,
|
|
"Standard Deviation": 0.05077455539,
|
|
"Rank": 32
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.331069242,
|
|
"Standard Deviation": 0.02180827173,
|
|
"Rank": 30
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.319991655,
|
|
"Standard Deviation": 0.04502478724,
|
|
"Rank": 29
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 33.020911255646965,
|
|
"Standard Deviation": null,
|
|
"Rank": 33
|
|
},
|
|
"CPP": {
|
|
"Average Score": 33.020911255646965,
|
|
"Standard Deviation": null,
|
|
"Rank": 34
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "command-r-(08-2024)",
|
|
"organization": "Cohere",
|
|
"license": "CC-BY-NC-4.0",
|
|
"knowledge_cutoff": "2024/08"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.427605298,
|
|
"Standard Deviation": 0.01747449163,
|
|
"Rank": 25
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.448300727,
|
|
"Standard Deviation": 0.04996362328,
|
|
"Rank": 27
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.417519167,
|
|
"Standard Deviation": 0.01822196902,
|
|
"Rank": 30
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.366336281,
|
|
"Standard Deviation": 0.04716826942,
|
|
"Rank": 31
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.214657906,
|
|
"Standard Deviation": 0.03003579835,
|
|
"Rank": 37
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.276088379,
|
|
"Standard Deviation": 0.03295234688,
|
|
"Rank": 32
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 39.61492485677676,
|
|
"Standard Deviation": null,
|
|
"Rank": 29
|
|
},
|
|
"CPP": {
|
|
"Average Score": 39.61492485677676,
|
|
"Standard Deviation": null,
|
|
"Rank": 30
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-1.1-7b-it",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/02"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.339506922,
|
|
"Standard Deviation": 0.1066279108,
|
|
"Rank": 36
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.324170977,
|
|
"Standard Deviation": 0.04668553765,
|
|
"Rank": 35
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.398684697,
|
|
"Standard Deviation": 0.01982398259,
|
|
"Rank": 32
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.293253175,
|
|
"Standard Deviation": 0.05126192191,
|
|
"Rank": 36
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.317750796,
|
|
"Standard Deviation": 0.01101933543,
|
|
"Rank": 31
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.179073276,
|
|
"Standard Deviation": 0.02009658805,
|
|
"Rank": 41
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 42.666504105798204,
|
|
"Standard Deviation": null,
|
|
"Rank": 26
|
|
},
|
|
"CPP": {
|
|
"Average Score": 42.666504105798204,
|
|
"Standard Deviation": null,
|
|
"Rank": 27
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "llama3-8b-instruct",
|
|
"organization": "Meta",
|
|
"license": "Llama 3 Community",
|
|
"knowledge_cutoff": "2023/03"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.367722676,
|
|
"Standard Deviation": 0.1071368221,
|
|
"Rank": 31
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.367143758,
|
|
"Standard Deviation": 0.04363680358,
|
|
"Rank": 33
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.391480973,
|
|
"Standard Deviation": 0.02757445266,
|
|
"Rank": 33
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.317616445,
|
|
"Standard Deviation": 0.04300430361,
|
|
"Rank": 35
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.461607495,
|
|
"Standard Deviation": 0.02185028842,
|
|
"Rank": 22
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.336373622,
|
|
"Standard Deviation": 0.05762408512,
|
|
"Rank": 26
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 45.35392139264795,
|
|
"Standard Deviation": null,
|
|
"Rank": 21
|
|
},
|
|
"CPP": {
|
|
"Average Score": 45.35392139264795,
|
|
"Standard Deviation": null,
|
|
"Rank": 22
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-2-2b-it",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/07"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.502167612,
|
|
"Standard Deviation": 0.04389786763,
|
|
"Rank": 19
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.395006676,
|
|
"Standard Deviation": 0.05882607713,
|
|
"Rank": 31
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.379391887,
|
|
"Standard Deviation": 0.01722410785,
|
|
"Rank": 34
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.331231097,
|
|
"Standard Deviation": 0.05392499987,
|
|
"Rank": 34
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.367687789,
|
|
"Standard Deviation": 0.02547968808,
|
|
"Rank": 26
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.393482094,
|
|
"Standard Deviation": 0.06450214024,
|
|
"Rank": 23
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 30.53406933106768,
|
|
"Standard Deviation": null,
|
|
"Rank": 35
|
|
},
|
|
"CPP": {
|
|
"Average Score": 30.53406933106768,
|
|
"Standard Deviation": null,
|
|
"Rank": 36
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "starling-lm-7b-alpha",
|
|
"organization": "Nexusflow",
|
|
"license": "Apache-2.0",
|
|
"knowledge_cutoff": "2023/11"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.366628765,
|
|
"Standard Deviation": 0.08405492929,
|
|
"Rank": 32
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.336782578,
|
|
"Standard Deviation": 0.04069449132,
|
|
"Rank": 34
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.371551932,
|
|
"Standard Deviation": 0.03367241745,
|
|
"Rank": 35
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.331472505,
|
|
"Standard Deviation": 0.04833324282,
|
|
"Rank": 33
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.260869624,
|
|
"Standard Deviation": 0.03562735237,
|
|
"Rank": 35
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.271975534,
|
|
"Standard Deviation": 0.04266753408,
|
|
"Rank": 33
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 30.07926487356878,
|
|
"Standard Deviation": null,
|
|
"Rank": 36
|
|
},
|
|
"CPP": {
|
|
"Average Score": 30.07926487356878,
|
|
"Standard Deviation": null,
|
|
"Rank": 37
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "qwen1.5-4b-chat",
|
|
"organization": "Alibaba",
|
|
"license": "Qianwen LICENSE",
|
|
"knowledge_cutoff": "2024/02"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.111876411,
|
|
"Standard Deviation": 0.04241022785,
|
|
"Rank": 48
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.215834522,
|
|
"Standard Deviation": 0.0363766363,
|
|
"Rank": 39
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.305589811,
|
|
"Standard Deviation": 0.02354198912,
|
|
"Rank": 36
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.149365327,
|
|
"Standard Deviation": 0.03489672675,
|
|
"Rank": 43
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.116210168,
|
|
"Standard Deviation": 0.005927966496,
|
|
"Rank": 46
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.18195615,
|
|
"Standard Deviation": 0.02269805277,
|
|
"Rank": 40
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 13.21208067122554,
|
|
"Standard Deviation": null,
|
|
"Rank": 46
|
|
},
|
|
"CPP": {
|
|
"Average Score": 13.21208067122554,
|
|
"Standard Deviation": null,
|
|
"Rank": 47
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "command-r-(04-2024)",
|
|
"organization": "Cohere",
|
|
"license": "CC-BY-NC-4.0",
|
|
"knowledge_cutoff": "2024/04"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.388783887,
|
|
"Standard Deviation": 0.07417186783,
|
|
"Rank": 29
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.300416698,
|
|
"Standard Deviation": 0.03485612736,
|
|
"Rank": 36
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.293120231,
|
|
"Standard Deviation": 0.032926484,
|
|
"Rank": 37
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.281271304,
|
|
"Standard Deviation": 0.05697149867,
|
|
"Rank": 38
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.276189906,
|
|
"Standard Deviation": 0.03562914754,
|
|
"Rank": 33
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.283882949,
|
|
"Standard Deviation": 0.03336901148,
|
|
"Rank": 31
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 41.346336503003236,
|
|
"Standard Deviation": null,
|
|
"Rank": 27
|
|
},
|
|
"CPP": {
|
|
"Average Score": 41.346336503003236,
|
|
"Standard Deviation": null,
|
|
"Rank": 28
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "vicuna-33b",
|
|
"organization": "LMSYS",
|
|
"license": "Non-commercial",
|
|
"knowledge_cutoff": "2023/08"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.316543555,
|
|
"Standard Deviation": 0.08922095647,
|
|
"Rank": 38
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.208284679,
|
|
"Standard Deviation": 0.03937771461,
|
|
"Rank": 40
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.248994048,
|
|
"Standard Deviation": 0.02668175054,
|
|
"Rank": 39
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.222313995,
|
|
"Standard Deviation": 0.03978859759,
|
|
"Rank": 41
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.180291222,
|
|
"Standard Deviation": 0.021886267,
|
|
"Rank": 41
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.257623798,
|
|
"Standard Deviation": 0.02653724437,
|
|
"Rank": 34
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 28.01838653090379,
|
|
"Standard Deviation": null,
|
|
"Rank": 37
|
|
},
|
|
"CPP": {
|
|
"Average Score": 28.01838653090379,
|
|
"Standard Deviation": null,
|
|
"Rank": 38
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-7b-it",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/02"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.285077558,
|
|
"Standard Deviation": 0.08871758453,
|
|
"Rank": 40
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.244791417,
|
|
"Standard Deviation": 0.0289612078,
|
|
"Rank": 37
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.250614794,
|
|
"Standard Deviation": 0.01991678295,
|
|
"Rank": 38
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.174313053,
|
|
"Standard Deviation": 0.03765424728,
|
|
"Rank": 42
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.197505536,
|
|
"Standard Deviation": 0.02050298885,
|
|
"Rank": 38
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.202138025,
|
|
"Standard Deviation": 0.02098346639,
|
|
"Rank": 39
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 28.014658234926813,
|
|
"Standard Deviation": null,
|
|
"Rank": 38
|
|
},
|
|
"CPP": {
|
|
"Average Score": 28.014658234926813,
|
|
"Standard Deviation": null,
|
|
"Rank": 39
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "mistral-7b-instruct-2",
|
|
"organization": "Mistral",
|
|
"license": "Apache 2.0",
|
|
"knowledge_cutoff": "2023/12"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.427513868,
|
|
"Standard Deviation": 0.05553921135,
|
|
"Rank": 26
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.216402626,
|
|
"Standard Deviation": 0.03338414918,
|
|
"Rank": 38
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.233777838,
|
|
"Standard Deviation": 0.0155226054,
|
|
"Rank": 40
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.25118175,
|
|
"Standard Deviation": 0.04065514593,
|
|
"Rank": 39
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.224469136,
|
|
"Standard Deviation": 0.03404706752,
|
|
"Rank": 36
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.209386782,
|
|
"Standard Deviation": 0.02738569921,
|
|
"Rank": 38
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 31.382959631870822,
|
|
"Standard Deviation": null,
|
|
"Rank": 34
|
|
},
|
|
"CPP": {
|
|
"Average Score": 31.382959631870822,
|
|
"Standard Deviation": null,
|
|
"Rank": 35
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "mistral-7b-instruct-1",
|
|
"organization": "Mistral",
|
|
"license": "Apache 2.0",
|
|
"knowledge_cutoff": "2023/12"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.23016314,
|
|
"Standard Deviation": 0.07137625271,
|
|
"Rank": 45
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.161799938,
|
|
"Standard Deviation": 0.03595278559,
|
|
"Rank": 44
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.210341624,
|
|
"Standard Deviation": 0.01736539119,
|
|
"Rank": 41
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.238417922,
|
|
"Standard Deviation": 0.03744211933,
|
|
"Rank": 40
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.142636601,
|
|
"Standard Deviation": 0.02080406365,
|
|
"Rank": 45
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.117646827,
|
|
"Standard Deviation": 0.009321202779,
|
|
"Rank": 47
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 18.929093202755805,
|
|
"Standard Deviation": null,
|
|
"Rank": 41
|
|
},
|
|
"CPP": {
|
|
"Average Score": 18.929093202755805,
|
|
"Standard Deviation": null,
|
|
"Rank": 42
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "vicuna-13b",
|
|
"organization": "LMSYS",
|
|
"license": "Non-commercial",
|
|
"knowledge_cutoff": "2023/07"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.201892849,
|
|
"Standard Deviation": 0.06021749802,
|
|
"Rank": 46
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.200941928,
|
|
"Standard Deviation": 0.03366817781,
|
|
"Rank": 41
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.196123323,
|
|
"Standard Deviation": 0.0135715643,
|
|
"Rank": 42
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.141214079,
|
|
"Standard Deviation": 0.02721328211,
|
|
"Rank": 44
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.148598631,
|
|
"Standard Deviation": 0.02241523892,
|
|
"Rank": 43
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.124655135,
|
|
"Standard Deviation": 0.01122382671,
|
|
"Rank": 46
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 21.840013221590294,
|
|
"Standard Deviation": null,
|
|
"Rank": 39
|
|
},
|
|
"CPP": {
|
|
"Average Score": 21.840013221590294,
|
|
"Standard Deviation": null,
|
|
"Rank": 40
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "zephyr-7b-beta",
|
|
"organization": "HuggingFace",
|
|
"license": "MIT",
|
|
"knowledge_cutoff": "2023/10"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.102705119,
|
|
"Standard Deviation": 0.03683757312,
|
|
"Rank": 49
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.114005544,
|
|
"Standard Deviation": 0.03144354365,
|
|
"Rank": 45
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.141766633,
|
|
"Standard Deviation": 0.03179520129,
|
|
"Rank": 43
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.089050714,
|
|
"Standard Deviation": 0.002136754266,
|
|
"Rank": 47
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.069520789,
|
|
"Standard Deviation": 0.004477840857,
|
|
"Rank": 50
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": 0.0,
|
|
"Rank": 52
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 18.92902220864132,
|
|
"Standard Deviation": null,
|
|
"Rank": 42
|
|
},
|
|
"CPP": {
|
|
"Average Score": 18.92902220864132,
|
|
"Standard Deviation": null,
|
|
"Rank": 43
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-1.1-2b-it",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/02"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.257700845,
|
|
"Standard Deviation": 0.07369021445,
|
|
"Rank": 43
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.183974034,
|
|
"Standard Deviation": 0.0215548886,
|
|
"Rank": 43
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.13422252,
|
|
"Standard Deviation": 0.01922819511,
|
|
"Rank": 44
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.095628657,
|
|
"Standard Deviation": 0.007536076456,
|
|
"Rank": 46
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.094965074,
|
|
"Standard Deviation": 0.005019175487,
|
|
"Rank": 48
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.167796727,
|
|
"Standard Deviation": 0.01666541942,
|
|
"Rank": 42
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 20.724691953843916,
|
|
"Standard Deviation": null,
|
|
"Rank": 40
|
|
},
|
|
"CPP": {
|
|
"Average Score": 20.724691953843916,
|
|
"Standard Deviation": null,
|
|
"Rank": 41
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "llama2-7b-chat",
|
|
"organization": "Meta",
|
|
"license": "Llama 2 Community",
|
|
"knowledge_cutoff": "2023/07"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.260189428,
|
|
"Standard Deviation": 0.08019299364,
|
|
"Rank": 42
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.087067276,
|
|
"Standard Deviation": 0.04274343402,
|
|
"Rank": 46
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.12308805,
|
|
"Standard Deviation": 0.01856053622,
|
|
"Rank": 45
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.087515438,
|
|
"Standard Deviation": 0.006315053573,
|
|
"Rank": 48
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.17312827,
|
|
"Standard Deviation": 0.01867044092,
|
|
"Rank": 42
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.152905272,
|
|
"Standard Deviation": 0.007166957097,
|
|
"Rank": 43
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 15.730513733660898,
|
|
"Standard Deviation": null,
|
|
"Rank": 44
|
|
},
|
|
"CPP": {
|
|
"Average Score": 15.730513733660898,
|
|
"Standard Deviation": null,
|
|
"Rank": 45
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-2b-it",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/02"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.234172069,
|
|
"Standard Deviation": 0.06522685718,
|
|
"Rank": 44
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.198571153,
|
|
"Standard Deviation": 0.01699161031,
|
|
"Rank": 42
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.109883009,
|
|
"Standard Deviation": 0.01520005833,
|
|
"Rank": 46
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.06467432,
|
|
"Standard Deviation": 0.002117497231,
|
|
"Rank": 50
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.039624492,
|
|
"Standard Deviation": 0.007606972686,
|
|
"Rank": 51
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.087452913,
|
|
"Standard Deviation": 0.008170146562,
|
|
"Rank": 50
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 17.2715657115764,
|
|
"Standard Deviation": null,
|
|
"Rank": 43
|
|
},
|
|
"CPP": {
|
|
"Average Score": 17.2715657115764,
|
|
"Standard Deviation": null,
|
|
"Rank": 44
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "llama2-13b-chat",
|
|
"organization": "Meta",
|
|
"license": "Llama 2 Community",
|
|
"knowledge_cutoff": "2023/07"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.263305684,
|
|
"Standard Deviation": 0.07283640689,
|
|
"Rank": 41
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.072729954,
|
|
"Standard Deviation": 0.02315988261,
|
|
"Rank": 48
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.080371692,
|
|
"Standard Deviation": 0.01277569453,
|
|
"Rank": 47
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.117757344,
|
|
"Standard Deviation": 0.02418619619,
|
|
"Rank": 45
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.193149889,
|
|
"Standard Deviation": 0.01776690764,
|
|
"Rank": 39
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.149125922,
|
|
"Standard Deviation": 0.01157416827,
|
|
"Rank": 44
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 13.17258252933903,
|
|
"Standard Deviation": null,
|
|
"Rank": 47
|
|
},
|
|
"CPP": {
|
|
"Average Score": 13.17258252933903,
|
|
"Standard Deviation": null,
|
|
"Rank": 48
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "vicuna-7b",
|
|
"organization": "LMSYS",
|
|
"license": "Non-commercial",
|
|
"knowledge_cutoff": "2023/07"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.198839786,
|
|
"Standard Deviation": 0.05725381576,
|
|
"Rank": 47
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.083457058,
|
|
"Standard Deviation": 0.02520989111,
|
|
"Rank": 47
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.070883882,
|
|
"Standard Deviation": 0.007315853253,
|
|
"Rank": 48
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.080987673,
|
|
"Standard Deviation": 0.005474288861,
|
|
"Rank": 49
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.100065588,
|
|
"Standard Deviation": 0.003561886452,
|
|
"Rank": 47
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.111076414,
|
|
"Standard Deviation": 0.004805626512,
|
|
"Rank": 48
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 14.255194156624162,
|
|
"Standard Deviation": null,
|
|
"Rank": 45
|
|
},
|
|
"CPP": {
|
|
"Average Score": 14.255194156624162,
|
|
"Standard Deviation": null,
|
|
"Rank": 46
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "koala-13b",
|
|
"organization": "UC Berkeley",
|
|
"license": "Non-commercial",
|
|
"knowledge_cutoff": "2023/04"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.09387188,
|
|
"Standard Deviation": 0.02642167489,
|
|
"Rank": 50
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.017374001,
|
|
"Standard Deviation": 0.01747053557,
|
|
"Rank": 49
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.018129197,
|
|
"Standard Deviation": 0.01054371383,
|
|
"Rank": 49
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.043654362,
|
|
"Standard Deviation": 0.004288231886,
|
|
"Rank": 51
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.074694053,
|
|
"Standard Deviation": 0.002674646998,
|
|
"Rank": 49
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.096983835,
|
|
"Standard Deviation": 0.007847059783,
|
|
"Rank": 49
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 6.36433272373514,
|
|
"Standard Deviation": null,
|
|
"Rank": 48
|
|
},
|
|
"CPP": {
|
|
"Average Score": 6.36433272373514,
|
|
"Standard Deviation": null,
|
|
"Rank": 49
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "openassistant-pythia-12b",
|
|
"organization": "OpenAssistant",
|
|
"license": "Non-commercial",
|
|
"knowledge_cutoff": "2023/04"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": 0.0,
|
|
"Rank": 51
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": 0.0,
|
|
"Rank": 50
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": 0.0,
|
|
"Rank": 50
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": 0.0,
|
|
"Rank": 52
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": 0.0,
|
|
"Rank": 52
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.030792528,
|
|
"Standard Deviation": 0.007518796391,
|
|
"Rank": 51
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 49
|
|
},
|
|
"CPP": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 50
|
|
}
|
|
}
|
|
}
|
|
] |