[
{
"config": {
"model_name": "ChatGPT-4o-latest (2024-09-03)",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 87.33082346779815,
"Standard Deviation": 1.4853337406399776,
"Rank": 3
},
"Geometry": {
"Average Score": 0.976028578,
"Standard Deviation": 0.01507912373,
"Rank": 3
},
"Algebra": {
"Average Score": 0.951199453,
"Standard Deviation": 0.08452452108,
"Rank": 3
},
"Probability": {
"Average Score": 80.1332207690739,
"Standard Deviation": null,
"Rank": 7
},
"Logical": {
"Average Score": 84.12975867250425,
"Standard Deviation": 0.21211547702245045,
"Rank": 6
},
"Social": {
"Average Score": 0.815902987,
"Standard Deviation": 0.0196254222,
"Rank": 3
},
"Chemistry": {
"Average Score": 89.92480228064885,
"Standard Deviation": null,
"Rank": 4
},
"CPP": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
}
}
},
{
"config": {
"model_name": "gpt-4o-2024-08-06",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 77.7818546246671,
"Standard Deviation": 2.7097581088879505,
"Rank": 5
},
"Geometry": {
"Average Score": 0.99773096,
"Standard Deviation": 0.002835555172,
"Rank": 1
},
"Algebra": {
"Average Score": 1.0,
"Standard Deviation": 0.0,
"Rank": 1
},
"Probability": {
"Average Score": 74.97136205481755,
"Standard Deviation": null,
"Rank": 11
},
"Logical": {
"Average Score": 66.0597109743056,
"Standard Deviation": 1.5021351704575163,
"Rank": 14
},
"Social": {
"Average Score": 0.680417314,
"Standard Deviation": 0.00656867063,
"Rank": 8
},
"Chemistry": {
"Average Score": 82.55189735524202,
"Standard Deviation": null,
"Rank": 7
},
"CPP": {
"Average Score": 92.43090226400756,
"Standard Deviation": null,
"Rank": 2
}
}
},
{
"config": {
"model_name": "gpt-4o-2024-05-13",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 72.6093654197998,
"Standard Deviation": 13.515345690976028,
"Rank": 10
},
"Geometry": {
"Average Score": 0.972472377,
"Standard Deviation": 0.01648274205,
"Rank": 4
},
"Algebra": {
"Average Score": 0.995511298,
"Standard Deviation": 0.004097802515,
"Rank": 2
},
"Probability": {
"Average Score": 77.97816201050715,
"Standard Deviation": null,
"Rank": 8
},
"Logical": {
"Average Score": 75.65058939137873,
"Standard Deviation": 0.07522785572103825,
"Rank": 9
},
"Social": {
"Average Score": 0.609875087,
"Standard Deviation": 0.038729239,
"Rank": 13
},
"Chemistry": {
"Average Score": 76.03377031297643,
"Standard Deviation": null,
"Rank": 9
},
"CPP": {
"Average Score": 79.1592634699295,
"Standard Deviation": null,
"Rank": 6
}
}
},
{
"config": {
"model_name": "gpt-4-turbo-2024-04-09",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 73.32308543749606,
"Standard Deviation": 6.562777844134629,
"Rank": 9
},
"Geometry": {
"Average Score": 0.95374588,
"Standard Deviation": 0.03109307166,
"Rank": 5
},
"Algebra": {
"Average Score": 0.930945223,
"Standard Deviation": 0.06705136813,
"Rank": 4
},
"Probability": {
"Average Score": 74.97144205445957,
"Standard Deviation": null,
"Rank": 12
},
"Logical": {
"Average Score": 76.82291715624933,
"Standard Deviation": 0.03462548327631355,
"Rank": 7
},
"Social": {
"Average Score": 0.715935163,
"Standard Deviation": 0.1209141409,
"Rank": 6
},
"Chemistry": {
"Average Score": 70.44329321394066,
"Standard Deviation": null,
"Rank": 12
},
"CPP": {
"Average Score": 70.73143363230263,
"Standard Deviation": null,
"Rank": 11
}
}
},
{
"config": {
"model_name": "gemini-1.5-pro-001",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2023/11"
},
"results": {
"OVERALL": {
"Average Score": 74.27365448117855,
"Standard Deviation": 3.9515447172901847,
"Rank": 8
},
"Geometry": {
"Average Score": 0.9947169,
"Standard Deviation": 0.009150597621,
"Rank": 2
},
"Algebra": {
"Average Score": 0.857464301,
"Standard Deviation": 0.05014285338,
"Rank": 5
},
"Probability": {
"Average Score": 64.77713215500482,
"Standard Deviation": null,
"Rank": 15
},
"Logical": {
"Average Score": 74.3275461555815,
"Standard Deviation": 0.8092355737847541,
"Rank": 10
},
"Social": {
"Average Score": 0.649601885,
"Standard Deviation": 0.104854889,
"Rank": 11
}
}
},
{
"config": {
"model_name": "qwen2-72b-instruct",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/09"
},
"results": {
"OVERALL": {
"Average Score": 71.00423311357184,
"Standard Deviation": 1.6189609141983887,
"Rank": 12
},
"Geometry": {
"Average Score": 0.796870305,
"Standard Deviation": 0.0509025346,
"Rank": 9
},
"Algebra": {
"Average Score": 0.836194231,
"Standard Deviation": 0.04517093028,
"Rank": 6
},
"Probability": {
"Average Score": 76.33751777233937,
"Standard Deviation": null,
"Rank": 10
},
"Logical": {
"Average Score": 61.22020517318166,
"Standard Deviation": 10.241399997578569,
"Rank": 17
},
"Social": {
"Average Score": 0.652578786,
"Standard Deviation": 0.04259293171,
"Rank": 10
},
"Chemistry": {
"Average Score": 70.44342338869497,
"Standard Deviation": null,
"Rank": 12
},
"CPP": {
"Average Score": 73.54037778797029,
"Standard Deviation": null,
"Rank": 7
}
}
},
{
"config": {
"model_name": "gpt-4o-mini-2024-07-18",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 77.35427394420829,
"Standard Deviation": 3.162321541714492,
"Rank": 6
},
"Geometry": {
"Average Score": 0.946650435,
"Standard Deviation": 0.01831236482,
"Rank": 7
},
"Algebra": {
"Average Score": 0.796243022,
"Standard Deviation": 0.05537539202,
"Rank": 7
},
"Probability": {
"Average Score": 77.63972720989734,
"Standard Deviation": null,
"Rank": 9
},
"Logical": {
"Average Score": 71.81267717239906,
"Standard Deviation": 0.3393593163824375,
"Rank": 11
},
"Social": {
"Average Score": 0.691949855,
"Standard Deviation": 0.02072934333,
"Rank": 7
},
"Chemistry": {
"Average Score": 78.10636943659426,
"Standard Deviation": null,
"Rank": 8
},
"CPP": {
"Average Score": 88.3877070580296,
"Standard Deviation": null,
"Rank": 3
}
}
},
{
"config": {
"model_name": "claude-3.5-sonnet",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2024/04"
},
"results": {
"OVERALL": {
"Average Score": 75.97534774560863,
"Standard Deviation": 9.237316832705584,
"Rank": 7
},
"Geometry": {
"Average Score": 0.95316419,
"Standard Deviation": 0.02081192856,
"Rank": 6
},
"Algebra": {
"Average Score": 0.759789952,
"Standard Deviation": 0.02611765096,
"Rank": 8
},
"Probability": {
"Average Score": 65.4531881044298,
"Standard Deviation": null,
"Rank": 14
},
"Logical": {
"Average Score": 76.47424588300288,
"Standard Deviation": 0.07699328617321737,
"Rank": 8
},
"Social": {
"Average Score": 0.790002247,
"Standard Deviation": 0.1007410022,
"Rank": 4
},
"Chemistry": {
"Average Score": 85.17654674052096,
"Standard Deviation": null,
"Rank": 6
},
"CPP": {
"Average Score": 82.37734076815008,
"Standard Deviation": null,
"Rank": 5
}
}
},
{
"config": {
"model_name": "o1-mini",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 87.92989248183513,
"Standard Deviation": 1.3401058431409953,
"Rank": 2
},
"Geometry": {
"Average Score": "N/A",
"Standard Deviation": "N/A",
"Rank": "N/A"
},
"Algebra": {
"Average Score": "N/A",
"Standard Deviation": "N/A",
"Rank": "N/A"
},
"Probability": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Logical": {
"Average Score": 99.15920225407733,
"Standard Deviation": 0.49801294410288666,
"Rank": 2
},
"Social": {
"Average Score": 0.993974241,
"Standard Deviation": 0.001996882328,
"Rank": 2
}
}
},
{
"config": {
"model_name": "o1-preview",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 85.40247108906188,
"Standard Deviation": 1.5796898764998464,
"Rank": 4
},
"Geometry": {
"Average Score": "N/A",
"Standard Deviation": "N/A",
"Rank": "N/A"
},
"Algebra": {
"Average Score": "N/A",
"Standard Deviation": "N/A",
"Rank": "N/A"
},
"Probability": {
"Average Score": 90.32625019320989,
"Standard Deviation": null,
"Rank": 5
},
"Logical": {
"Average Score": 98.18241651273537,
"Standard Deviation": 0.16231417987288874,
"Rank": 4
},
"Social": {
"Average Score": 1.0,
"Standard Deviation": 0.0,
"Rank": 1
}
}
},
{
"config": {
"model_name": "gemini-1.5-flash-001",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2023/11"
},
"results": {
"OVERALL": {
"Average Score": 67.67997467963976,
"Standard Deviation": 2.624276751646549,
"Rank": 13
},
"Geometry": {
"Average Score": 0.804144103,
"Standard Deviation": 0.1327142178,
"Rank": 8
},
"Algebra": {
"Average Score": 0.731776765,
"Standard Deviation": 0.02594657111,
"Rank": 9
},
"Probability": {
"Average Score": 61.17190439316032,
"Standard Deviation": null,
"Rank": 19
},
"Logical": {
"Average Score": 62.284381466778335,
"Standard Deviation": 3.9592476945909674,
"Rank": 16
},
"Social": {
"Average Score": 0.555933822,
"Standard Deviation": 0.1029934524,
"Rank": 15
},
"Chemistry": {
"Average Score": 70.24726462490831,
"Standard Deviation": null,
"Rank": 15
},
"CPP": {
"Average Score": 72.1127762005651,
"Standard Deviation": null,
"Rank": 10
}
}
},
{
"config": {
"model_name": "gpt4-1106",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2024/04"
},
"results": {
"OVERALL": {
"Average Score": 72.24829405851214,
"Standard Deviation": 13.633826990442946,
"Rank": 11
},
"Geometry": {
"Average Score": 0.71843088,
"Standard Deviation": 0.04778038294,
"Rank": 11
},
"Algebra": {
"Average Score": 0.712910417,
"Standard Deviation": 0.02581828898,
"Rank": 10
},
"Probability": {
"Average Score": 63.29462909293814,
"Standard Deviation": null,
"Rank": 16
},
"Logical": {
"Average Score": 62.987098158883875,
"Standard Deviation": 4.027795425350514,
"Rank": 15
},
"Social": {
"Average Score": 0.450609816,
"Standard Deviation": 0.05208655446,
"Rank": 21
},
"Chemistry": {
"Average Score": 67.34047237109209,
"Standard Deviation": null,
"Rank": 16
},
"CPP": {
"Average Score": 69.11824072252848,
"Standard Deviation": null,
"Rank": 12
}
}
},
{
"config": {
"model_name": "gemma-2-27b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/06"
},
"results": {
"OVERALL": {
"Average Score": 62.70975283121063,
"Standard Deviation": 6.376450054715319,
"Rank": 15
},
"Geometry": {
"Average Score": 0.60112744,
"Standard Deviation": 0.0469109952,
"Rank": 17
},
"Algebra": {
"Average Score": 0.687955914,
"Standard Deviation": 0.01959958192,
"Rank": 11
},
"Probability": {
"Average Score": 60.04180799425261,
"Standard Deviation": null,
"Rank": 20
},
"Logical": {
"Average Score": 60.77082327163094,
"Standard Deviation": 7.2164902432618625,
"Rank": 19
},
"Social": {
"Average Score": 0.487844257,
"Standard Deviation": 0.05857760809,
"Rank": 18
},
"Chemistry": {
"Average Score": 61.68181926111706,
"Standard Deviation": null,
"Rank": 18
},
"CPP": {
"Average Score": 63.28920072143611,
"Standard Deviation": null,
"Rank": 14
}
}
},
{
"config": {
"model_name": "claude-3-opus",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2023/08"
},
"results": {
"OVERALL": {
"Average Score": 60.56449573632771,
"Standard Deviation": 8.485936885427277,
"Rank": 17
},
"Geometry": {
"Average Score": 0.7215743,
"Standard Deviation": 0.04712598358,
"Rank": 10
},
"Algebra": {
"Average Score": 0.68777327,
"Standard Deviation": 0.02382683713,
"Rank": 12
},
"Probability": {
"Average Score": 62.296041016641176,
"Standard Deviation": null,
"Rank": 17
},
"Logical": {
"Average Score": 68.36295609287292,
"Standard Deviation": 1.6558271236588655,
"Rank": 13
},
"Social": {
"Average Score": 0.663410854,
"Standard Deviation": 0.09540220876,
"Rank": 9
},
"Chemistry": {
"Average Score": 70.44337273504232,
"Standard Deviation": null,
"Rank": 12
},
"CPP": {
"Average Score": 73.5404403567132,
"Standard Deviation": null,
"Rank": 8
}
}
},
{
"config": {
"model_name": "gemma-2-9b-it-simpo",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/07"
},
"results": {
"OVERALL": {
"Average Score": "N/A",
"Standard Deviation": "N/A",
"Rank": "N/A"
},
"Geometry": {
"Average Score": 0.582787508,
"Standard Deviation": 0.03965204074,
"Rank": 18
},
"Algebra": {
"Average Score": 0.658648133,
"Standard Deviation": 0.02565919856,
"Rank": 13
},
"Probability": {
"Average Score": 57.545408188912894,
"Standard Deviation": null,
"Rank": 23
},
"Logical": {
"Average Score": 53.1996479262466,
"Standard Deviation": 2.690106544431167,
"Rank": 23
},
"Social": {
"Average Score": 0.635266187,
"Standard Deviation": 0.03620021751,
"Rank": 12
},
"Chemistry": {
"Average Score": 74.44267231381626,
"Standard Deviation": null,
"Rank": 11
},
"CPP": {
"Average Score": 73.43757596214863,
"Standard Deviation": null,
"Rank": 9
}
}
},
{
"config": {
"model_name": "qwen1.5-72b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/03"
},
"results": {
"OVERALL": {
"Average Score": 52.983715751652085,
"Standard Deviation": 3.097613966427763,
"Rank": 18
},
"Geometry": {
"Average Score": 0.543139301,
"Standard Deviation": 0.03425202326,
"Rank": 22
},
"Algebra": {
"Average Score": 0.635228729,
"Standard Deviation": 0.01944043425,
"Rank": 14
},
"Probability": {
"Average Score": 52.650033879924905,
"Standard Deviation": null,
"Rank": 26
},
"Logical": {
"Average Score": 32.628853250402074,
"Standard Deviation": 3.227745519436025,
"Rank": 37
},
"Social": {
"Average Score": 0.415007627,
"Standard Deviation": 0.03920053159,
"Rank": 22
},
"Chemistry": {
"Average Score": 47.5126781973184,
"Standard Deviation": null,
"Rank": 24
},
"CPP": {
"Average Score": 48.69302376665551,
"Standard Deviation": null,
"Rank": 20
}
}
},
{
"config": {
"model_name": "qwen1.5-32b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/03"
},
"results": {
"OVERALL": {
"Average Score": 26.978561942890224,
"Standard Deviation": 1.575986887925592,
"Rank": 32
},
"Geometry": {
"Average Score": 0.51086835,
"Standard Deviation": 0.04052471998,
"Rank": 25
},
"Algebra": {
"Average Score": 0.609003168,
"Standard Deviation": 0.04874143541,
"Rank": 15
},
"Probability": {
"Average Score": 49.50617919486678,
"Standard Deviation": null,
"Rank": 29
},
"Logical": {
"Average Score": 34.07387941414556,
"Standard Deviation": 4.616974831074921,
"Rank": 34
},
"Social": {
"Average Score": 0.380987334,
"Standard Deviation": 0.03762251776,
"Rank": 24
},
"Chemistry": {
"Average Score": 44.06627265183811,
"Standard Deviation": null,
"Rank": 28
},
"CPP": {
"Average Score": 45.14284028264288,
"Standard Deviation": null,
"Rank": 24
}
}
},
{
"config": {
"model_name": "google-gemma-2-9b-it",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2024/06"
},
"results": {
"OVERALL": {
"Average Score": 52.23013018580635,
"Standard Deviation": 3.3939236141078495,
"Rank": 19
},
"Geometry": {
"Average Score": 0.575371308,
"Standard Deviation": 0.03556220251,
"Rank": 20
},
"Algebra": {
"Average Score": 0.597045661,
"Standard Deviation": 0.0313828123,
"Rank": 16
},
"Probability": {
"Average Score": 58.73062101843859,
"Standard Deviation": null,
"Rank": 21
},
"Logical": {
"Average Score": 58.01791397899675,
"Standard Deviation": 5.751983660134971,
"Rank": 21
},
"Social": {
"Average Score": 0.768337958,
"Standard Deviation": 0.04078610476,
"Rank": 5
},
"Chemistry": {
"Average Score": 52.69494515004607,
"Standard Deviation": null,
"Rank": 21
},
"CPP": {
"Average Score": 54.03167523687635,
"Standard Deviation": null,
"Rank": 17
}
}
},
{
"config": {
"model_name": "yi-1.5-34b-chat",
"organization": "01 AI",
"license": "Proprietary",
"knowledge_cutoff": "2024/05"
},
"results": {
"OVERALL": {
"Average Score": 62.568637878216464,
"Standard Deviation": 8.554205798418673,
"Rank": 16
},
"Geometry": {
"Average Score": 0.566666724,
"Standard Deviation": 0.04001381658,
"Rank": 21
},
"Algebra": {
"Average Score": 0.590997292,
"Standard Deviation": 0.03594087315,
"Rank": 17
},
"Probability": {
"Average Score": 57.545207891104354,
"Standard Deviation": null,
"Rank": 22
},
"Logical": {
"Average Score": 56.598158131627194,
"Standard Deviation": 1.1072821075127297,
"Rank": 22
},
"Social": {
"Average Score": 0.516980832,
"Standard Deviation": 0.03369347985,
"Rank": 17
},
"Chemistry": {
"Average Score": 50.867343712131174,
"Standard Deviation": null,
"Rank": 22
},
"CPP": {
"Average Score": 52.148798061768964,
"Standard Deviation": null,
"Rank": 18
}
}
},
{
"config": {
"model_name": "meta-llama-3.1-70b-instruct",
"organization": "Meta",
"license": "Llama 3.1 Community",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 65.61302047306724,
"Standard Deviation": 7.113338386318571,
"Rank": 14
},
"Geometry": {
"Average Score": 0.76184398,
"Standard Deviation": 0.01790377984,
"Rank": 10
},
"Algebra": {
"Average Score": 0.732041699,
"Standard Deviation": 0.02621439062,
"Rank": 9
},
"Probability": {
"Average Score": 65.4531285887158,
"Standard Deviation": null,
"Rank": 13
},
"Logical": {
"Average Score": 61.16321386785366,
"Standard Deviation": 0.8920966760646541,
"Rank": 18
},
"Social": {
"Average Score": 0.45872939,
"Standard Deviation": 0.05347039576,
"Rank": 20
},
"Chemistry": {
"Average Score": 76.03374498429748,
"Standard Deviation": null,
"Rank": 9
},
"CPP": {
"Average Score": 84.36815192532764,
"Standard Deviation": null,
"Rank": 4
}
}
},
{
"config": {
"model_name": "meta-llama-3.1-8b-instruct",
"organization": "Meta",
"license": "Llama 3.1 Community",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 48.86242501618216,
"Standard Deviation": 3.7761459978540257,
"Rank": 21
},
"Geometry": {
"Average Score": 0.522442162,
"Standard Deviation": 0.03908236317,
"Rank": 23
},
"Algebra": {
"Average Score": 0.582702645,
"Standard Deviation": 0.05002277711,
"Rank": 18
},
"Probability": {
"Average Score": 52.44179989233465,
"Standard Deviation": null,
"Rank": 27
},
"Logical": {
"Average Score": 43.3706774850582,
"Standard Deviation": 2.820707319899787,
"Rank": 28
},
"Social": {
"Average Score": 0.329195941,
"Standard Deviation": 0.03925019528,
"Rank": 28
},
"Chemistry": {
"Average Score": 43.36264580455019,
"Standard Deviation": null,
"Rank": 30
},
"CPP": {
"Average Score": 44.41846841004584,
"Standard Deviation": null,
"Rank": 26
}
}
},
{
"config": {
"model_name": "gpt3.5-turbo-0125",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2021/09"
},
"results": {
"OVERALL": {
"Average Score": 18.951737690142235,
"Standard Deviation": 0.7967088395458379,
"Rank": 42
},
"Geometry": {
"Average Score": 0.678714519,
"Standard Deviation": 0.05926546762,
"Rank": 12
},
"Algebra": {
"Average Score": 0.569296173,
"Standard Deviation": 0.05277281097,
"Rank": 19
},
"Probability": {
"Average Score": 45.77959177088119,
"Standard Deviation": null,
"Rank": 30
},
"Logical": {
"Average Score": 17.159084771200394,
"Standard Deviation": 2.5845422782742546,
"Rank": 48
},
"Social": {
"Average Score": 0.235071541,
"Standard Deviation": 0.02632892457,
"Rank": 37
},
"Chemistry": {
"Average Score": 39.52885225927276,
"Standard Deviation": null,
"Rank": 33
},
"CPP": {
"Average Score": 40.46958736582551,
"Standard Deviation": null,
"Rank": 29
}
}
},
{
"config": {
"model_name": "llama-3-70b-instruct",
"organization": "Meta",
"license": "Llama 3 Community",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 40.57810915454436,
"Standard Deviation": 1.3134243733127455,
"Rank": 26
},
"Geometry": {
"Average Score": 0.516865529,
"Standard Deviation": 0.03858112564,
"Rank": 24
},
"Algebra": {
"Average Score": 0.566756531,
"Standard Deviation": 0.03369826926,
"Rank": 20
},
"Probability": {
"Average Score": 52.64997876875813,
"Standard Deviation": null,
"Rank": 25
},
"Logical": {
"Average Score": 70.51651844158742,
"Standard Deviation": 0.12355022869457871,
"Rank": 12
},
"Social": {
"Average Score": 0.45872939,
"Standard Deviation": 0.05347039576,
"Rank": 20
},
"Chemistry": {
"Average Score": 63.65476403379996,
"Standard Deviation": null,
"Rank": 17
},
"CPP": {
"Average Score": 65.32140697218945,
"Standard Deviation": null,
"Rank": 13
}
}
},
{
"config": {
"model_name": "claude-3-sonnet",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2023/08"
},
"results": {
"OVERALL": {
"Average Score": 52.19088595402735,
"Standard Deviation": 3.743258734262917,
"Rank": 20
},
"Geometry": {
"Average Score": 0.675613638,
"Standard Deviation": 0.05275594408,
"Rank": 13
},
"Algebra": {
"Average Score": 0.552025728,
"Standard Deviation": 0.04122192409,
"Rank": 21
},
"Probability": {
"Average Score": 54.0284459891417,
"Standard Deviation": null,
"Rank": 24
},
"Logical": {
"Average Score": 58.099761779812475,
"Standard Deviation": 7.815595203680491,
"Rank": 20
},
"Social": {
"Average Score": 0.570437582,
"Standard Deviation": 0.08607040862,
"Rank": 14
},
"Chemistry": {
"Average Score": 59.784958090634056,
"Standard Deviation": null,
"Rank": 19
},
"CPP": {
"Average Score": 61.33538592327427,
"Standard Deviation": null,
"Rank": 15
}
}
},
{
"config": {
"model_name": "qwen1.5-14b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 31.56999734729493,
"Standard Deviation": 5.42704987916441,
"Rank": 29
},
"Geometry": {
"Average Score": 0.452504016,
"Standard Deviation": 0.04225594393,
"Rank": 26
},
"Algebra": {
"Average Score": 0.538655725,
"Standard Deviation": 0.03721542594,
"Rank": 22
},
"Probability": {
"Average Score": 41.027908758027046,
"Standard Deviation": null,
"Rank": 35
},
"Logical": {
"Average Score": 31.638560769720616,
"Standard Deviation": 3.175225377796435,
"Rank": 38
},
"Social": {
"Average Score": 0.287370142,
"Standard Deviation": 0.04264085315,
"Rank": 30
},
"Chemistry": {
"Average Score": 37.667977565724996,
"Standard Deviation": null,
"Rank": 35
},
"CPP": {
"Average Score": 38.552779976347026,
"Standard Deviation": null,
"Rank": 31
}
}
},
{
"config": {
"model_name": "claude-3-haiku",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2023/08"
},
"results": {
"OVERALL": {
"Average Score": 42.975259650014074,
"Standard Deviation": 2.248602505751528,
"Rank": 25
},
"Geometry": {
"Average Score": 0.607993912,
"Standard Deviation": 0.05793460748,
"Rank": 15
},
"Algebra": {
"Average Score": 0.520054055,
"Standard Deviation": 0.03333544511,
"Rank": 23
},
"Probability": {
"Average Score": 52.44184603289214,
"Standard Deviation": null,
"Rank": 28
},
"Logical": {
"Average Score": 50.38523351226464,
"Standard Deviation": 1.9928131873345676,
"Rank": 24
},
"Social": {
"Average Score": 0.551083976,
"Standard Deviation": 0.05374722539,
"Rank": 16
},
"Chemistry": {
"Average Score": 54.99584531372778,
"Standard Deviation": null,
"Rank": 20
},
"CPP": {
"Average Score": 56.40200048817984,
"Standard Deviation": null,
"Rank": 16
}
}
},
{
"config": {
"model_name": "claude-2.1",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 23.82704986290717,
"Standard Deviation": 1.6337262681919007,
"Rank": 37
},
"Geometry": {
"Average Score": 0.62752395,
"Standard Deviation": 0.07232659398,
"Rank": 14
},
"Algebra": {
"Average Score": 0.508849609,
"Standard Deviation": 0.0346897465,
"Rank": 24
},
"Probability": {
"Average Score": 42.82280874207299,
"Standard Deviation": null,
"Rank": 32
},
"Logical": {
"Average Score": 47.40647506260718,
"Standard Deviation": 3.5140099122016686,
"Rank": 25
},
"Social": {
"Average Score": 0.333804568,
"Standard Deviation": 0.03775548253,
"Rank": 27
},
"Chemistry": {
"Average Score": 46.09889239661357,
"Standard Deviation": null,
"Rank": 25
},
"CPP": {
"Average Score": 47.23672563994903,
"Standard Deviation": null,
"Rank": 21
}
}
},
{
"config": {
"model_name": "mistral-8x7b-instruct-v0.1",
"organization": "Mistral",
"license": "Apache 2.0",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 26.279729527476174,
"Standard Deviation": 1.7823676900027476,
"Rank": 33
},
"Geometry": {
"Average Score": 0.432216097,
"Standard Deviation": 0.04747949254,
"Rank": 29
},
"Algebra": {
"Average Score": 0.478314888,
"Standard Deviation": 0.01998797419,
"Rank": 25
},
"Probability": {
"Average Score": 42.27303178662447,
"Standard Deviation": null,
"Rank": 33
},
"Logical": {
"Average Score": 34.58281320758576,
"Standard Deviation": 2.5548927504271073,
"Rank": 33
},
"Social": {
"Average Score": 0.251949622,
"Standard Deviation": 0.03346674405,
"Rank": 35
},
"Chemistry": {
"Average Score": 43.47423835615602,
"Standard Deviation": null,
"Rank": 29
},
"CPP": {
"Average Score": 44.533118241976666,
"Standard Deviation": null,
"Rank": 25
}
}
},
{
"config": {
"model_name": "claude-2.0",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 20.490629074737296,
"Standard Deviation": 0.4821482730133453,
"Rank": 40
},
"Geometry": {
"Average Score": 0.604141967,
"Standard Deviation": 0.05116441826,
"Rank": 16
},
"Algebra": {
"Average Score": 0.474350734,
"Standard Deviation": 0.01510393066,
"Rank": 26
},
"Probability": {
"Average Score": 45.15580067803421,
"Standard Deviation": null,
"Rank": 31
},
"Logical": {
"Average Score": 43.65660021552717,
"Standard Deviation": 4.959029305063026,
"Rank": 27
},
"Social": {
"Average Score": 0.469422836,
"Standard Deviation": 0.05999901796,
"Rank": 19
},
"Chemistry": {
"Average Score": 49.53201090067431,
"Standard Deviation": null,
"Rank": 23
},
"CPP": {
"Average Score": 50.773143448036464,
"Standard Deviation": null,
"Rank": 19
}
}
},
{
"config": {
"model_name": "starling-lm-7b-beta",
"organization": "Nexusflow",
"license": "Apache-2.0",
"knowledge_cutoff": "2024/03"
},
"results": {
"OVERALL": {
"Average Score": 43.0415265396966,
"Standard Deviation": 0.8770524316858576,
"Rank": 24
},
"Geometry": {
"Average Score": 0.446654388,
"Standard Deviation": 0.05637864999,
"Rank": 28
},
"Algebra": {
"Average Score": 0.473952749,
"Standard Deviation": 0.01584301288,
"Rank": 27
},
"Probability": {
"Average Score": 41.320066911500234,
"Standard Deviation": null,
"Rank": 34
},
"Logical": {
"Average Score": 39.79665241383638,
"Standard Deviation": 3.4711628274016544,
"Rank": 30
},
"Social": {
"Average Score": 0.380021662,
"Standard Deviation": 0.04622452748,
"Rank": 25
},
"Chemistry": {
"Average Score": 37.39896886078588,
"Standard Deviation": null,
"Rank": 36
},
"CPP": {
"Average Score": 38.27587102395908,
"Standard Deviation": null,
"Rank": 32
}
}
},
{
"config": {
"model_name": "gemini-1.0-pro-001",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2023/04"
},
"results": {
"OVERALL": {
"Average Score": 45.78126809517331,
"Standard Deviation": 3.7275133674569783,
"Rank": 23
},
"Geometry": {
"Average Score": 0.578347959,
"Standard Deviation": 0.04242873607,
"Rank": 19
},
"Algebra": {
"Average Score": 0.462417786,
"Standard Deviation": 0.01668313635,
"Rank": 28
},
"Probability": {
"Average Score": 31.410607001114293,
"Standard Deviation": null,
"Rank": 42
},
"Logical": {
"Average Score": 21.717362428653246,
"Standard Deviation": 4.392290522642325,
"Rank": 44
},
"Social": {
"Average Score": 0.130790863,
"Standard Deviation": 0.02800188173,
"Rank": 45
},
"Chemistry": {
"Average Score": 44.14314678087462,
"Standard Deviation": null,
"Rank": 27
},
"CPP": {
"Average Score": 45.22204471452975,
"Standard Deviation": null,
"Rank": 23
}
}
},
{
"config": {
"model_name": "openchat-3.5-0106",
"organization": "OpenChat",
"license": "Apache-2.0",
"knowledge_cutoff": "2024/01"
},
"results": {
"OVERALL": {
"Average Score": 23.85666609339201,
"Standard Deviation": 1.341285455536348,
"Rank": 36
},
"Geometry": {
"Average Score": 0.38715246,
"Standard Deviation": 0.03701851946,
"Rank": 32
},
"Algebra": {
"Average Score": 0.441233712,
"Standard Deviation": 0.01135753754,
"Rank": 29
},
"Probability": {
"Average Score": 40.37790468557232,
"Standard Deviation": null,
"Rank": 36
},
"Logical": {
"Average Score": 35.1573373260624,
"Standard Deviation": 2.485128777146724,
"Rank": 32
},
"Social": {
"Average Score": 0.250891608,
"Standard Deviation": 0.03253769914,
"Rank": 36
},
"Chemistry": {
"Average Score": 32.96322247853182,
"Standard Deviation": null,
"Rank": 37
},
"CPP": {
"Average Score": 33.70639271807677,
"Standard Deviation": null,
"Rank": 33
}
}
},
{
"config": {
"model_name": "openchat-3.5",
"organization": "OpenChat",
"license": "Apache-2.0",
"knowledge_cutoff": "2023/11"
},
"results": {
"OVERALL": {
"Average Score": 23.63538251797928,
"Standard Deviation": 2.0516295921862095,
"Rank": 38
},
"Geometry": {
"Average Score": 0.401699069,
"Standard Deviation": 0.03410726557,
"Rank": 30
},
"Algebra": {
"Average Score": 0.414095336,
"Standard Deviation": 0.01881964261,
"Rank": 31
},
"Probability": {
"Average Score": 36.00454588244476,
"Standard Deviation": null,
"Rank": 38
},
"Logical": {
"Average Score": 34.029859502735654,
"Standard Deviation": 3.354098427500673,
"Rank": 35
},
"Social": {
"Average Score": 0.319991655,
"Standard Deviation": 0.04502478724,
"Rank": 29
},
"Chemistry": {
"Average Score": 32.29778226319944,
"Standard Deviation": null,
"Rank": 38
},
"CPP": {
"Average Score": 33.020911255646965,
"Standard Deviation": null,
"Rank": 34
}
}
},
{
"config": {
"model_name": "command-r-(08-2024)",
"organization": "Cohere",
"license": "CC-BY-NC-4.0",
"knowledge_cutoff": "2024/08"
},
"results": {
"OVERALL": {
"Average Score": 38.783798277856995,
"Standard Deviation": 1.1948096596199191,
"Rank": 27
},
"Geometry": {
"Average Score": 0.448300727,
"Standard Deviation": 0.04996362328,
"Rank": 27
},
"Algebra": {
"Average Score": 0.417519167,
"Standard Deviation": 0.01822196902,
"Rank": 30
},
"Probability": {
"Average Score": 38.019523941917335,
"Standard Deviation": null,
"Rank": 37
},
"Logical": {
"Average Score": 23.408826179018206,
"Standard Deviation": 0.9355701468205376,
"Rank": 42
},
"Social": {
"Average Score": 0.276088379,
"Standard Deviation": 0.03295234688,
"Rank": 32
},
"Chemistry": {
"Average Score": 38.699171059988636,
"Standard Deviation": null,
"Rank": 34
},
"CPP": {
"Average Score": 39.61492485677676,
"Standard Deviation": null,
"Rank": 30
}
}
},
{
"config": {
"model_name": "gemma-1.1-7b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 20.965269549151657,
"Standard Deviation": 0.6031600560715249,
"Rank": 39
},
"Geometry": {
"Average Score": 0.324170977,
"Standard Deviation": 0.04668553765,
"Rank": 35
},
"Algebra": {
"Average Score": 0.398684697,
"Standard Deviation": 0.01982398259,
"Rank": 32
},
"Probability": {
"Average Score": 30.98345832281905,
"Standard Deviation": null,
"Rank": 43
},
"Logical": {
"Average Score": 33.36570116785516,
"Standard Deviation": 3.8824795120929765,
"Rank": 36
},
"Social": {
"Average Score": 0.179073276,
"Standard Deviation": 0.02009658805,
"Rank": 41
},
"Chemistry": {
"Average Score": 41.66173653808921,
"Standard Deviation": null,
"Rank": 31
},
"CPP": {
"Average Score": 42.666504105798204,
"Standard Deviation": null,
"Rank": 27
}
}
},
{
"config": {
"model_name": "llama3-8b-instruct",
"organization": "Meta",
"license": "Llama 3 Community",
"knowledge_cutoff": "2023/03"
},
"results": {
"OVERALL": {
"Average Score": 30.183633696164936,
"Standard Deviation": 3.5901082045571266,
"Rank": 31
},
"Geometry": {
"Average Score": 0.367143758,
"Standard Deviation": 0.04363680358,
"Rank": 33
},
"Algebra": {
"Average Score": 0.391480973,
"Standard Deviation": 0.02757445266,
"Rank": 33
},
"Probability": {
"Average Score": 34.51621975866105,
"Standard Deviation": null,
"Rank": 39
},
"Logical": {
"Average Score": 45.27560737491475,
"Standard Deviation": 4.639305724878496,
"Rank": 26
},
"Social": {
"Average Score": 0.336373622,
"Standard Deviation": 0.05762408512,
"Rank": 26
},
"Chemistry": {
"Average Score": 44.271144265487514,
"Standard Deviation": null,
"Rank": 26
},
"CPP": {
"Average Score": 45.35392139264795,
"Standard Deviation": null,
"Rank": 22
}
}
},
{
"config": {
"model_name": "gemma-2-2b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/07"
},
"results": {
"OVERALL": {
"Average Score": 47.37377937645159,
"Standard Deviation": 2.72420190928707,
"Rank": 22
},
"Geometry": {
"Average Score": 0.395006676,
"Standard Deviation": 0.05882607713,
"Rank": 31
},
"Algebra": {
"Average Score": 0.379391887,
"Standard Deviation": 0.01722410785,
"Rank": 34
},
"Probability": {
"Average Score": 33.90530403382374,
"Standard Deviation": null,
"Rank": 41
},
"Logical": {
"Average Score": 37.64262561604027,
"Standard Deviation": 3.0627256408495804,
"Rank": 31
},
"Social": {
"Average Score": 0.393482094,
"Standard Deviation": 0.06450214024,
"Rank": 23
},
"Chemistry": {
"Average Score": 29.883648650177584,
"Standard Deviation": null,
"Rank": 40
},
"CPP": {
"Average Score": 30.53406933106768,
"Standard Deviation": null,
"Rank": 36
}
}
},
{
"config": {
"model_name": "starling-lm-7b-alpha",
"organization": "Nexusflow",
"license": "Apache-2.0",
"knowledge_cutoff": "2023/11"
},
"results": {
"OVERALL": {
"Average Score": 24.34505731078066,
"Standard Deviation": 1.4660872513914562,
"Rank": 35
},
"Geometry": {
"Average Score": 0.336782578,
"Standard Deviation": 0.04069449132,
"Rank": 34
},
"Algebra": {
"Average Score": 0.371551932,
"Standard Deviation": 0.03367241745,
"Rank": 35
},
"Probability": {
"Average Score": 34.51613212227484,
"Standard Deviation": null,
"Rank": 40
},
"Logical": {
"Average Score": 29.88612695085449,
"Standard Deviation": 2.4070524024678672,
"Rank": 40
},
"Social": {
"Average Score": 0.271975534,
"Standard Deviation": 0.04266753408,
"Rank": 33
},
"Chemistry": {
"Average Score": 29.442057363491365,
"Standard Deviation": null,
"Rank": 41
},
"CPP": {
"Average Score": 30.07926487356878,
"Standard Deviation": null,
"Rank": 37
}
}
},
{
"config": {
"model_name": "qwen1.5-4b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 7.19753150259024,
"Standard Deviation": 0.6175113365944395,
"Rank": 52
},
"Geometry": {
"Average Score": 0.215834522,
"Standard Deviation": 0.0363766363,
"Rank": 39
},
"Algebra": {
"Average Score": 0.305589811,
"Standard Deviation": 0.02354198912,
"Rank": 36
},
"Probability": {
"Average Score": 15.124506890648007,
"Standard Deviation": null,
"Rank": 49
},
"Logical": {
"Average Score": 11.67206257803879,
"Standard Deviation": 1.140401009846497,
"Rank": 51
},
"Social": {
"Average Score": 0.18195615,
"Standard Deviation": 0.02269805277,
"Rank": 40
},
"Chemistry": {
"Average Score": 12.825435835657133,
"Standard Deviation": null,
"Rank": 52
},
"CPP": {
"Average Score": 13.21208067122554,
"Standard Deviation": null,
"Rank": 47
}
}
},
{
"config": {
"model_name": "command-r-(04-2024)",
"organization": "Cohere",
"license": "CC-BY-NC-4.0",
"knowledge_cutoff": "2024/04"
},
"results": {
"OVERALL": {
"Average Score": 26.20787727166716,
"Standard Deviation": 1.6793980036057201,
"Rank": 34
},
"Geometry": {
"Average Score": 0.300416698,
"Standard Deviation": 0.03485612736,
"Rank": 36
},
"Algebra": {
"Average Score": 0.293120231,
"Standard Deviation": 0.032926484,
"Rank": 37
},
"Probability": {
"Average Score": 28.551833516483626,
"Standard Deviation": null,
"Rank": 44
},
"Logical": {
"Average Score": 30.83782425033377,
"Standard Deviation": 3.4266833154577383,
"Rank": 39
},
"Social": {
"Average Score": 0.283882949,
"Standard Deviation": 0.03336901148,
"Rank": 31
},
"Chemistry": {
"Average Score": 40.38004181614496,
"Standard Deviation": null,
"Rank": 32
},
"CPP": {
"Average Score": 41.346336503003236,
"Standard Deviation": null,
"Rank": 28
}
}
},
{
"config": {
"model_name": "vicuna-33b",
"organization": "LMSYS",
"license": "Non-commercial",
"knowledge_cutoff": "2023/08"
},
"results": {
"OVERALL": {
"Average Score": 19.726298678709266,
"Standard Deviation": 1.0771354692793496,
"Rank": 41
},
"Geometry": {
"Average Score": 0.208284679,
"Standard Deviation": 0.03937771461,
"Rank": 40
},
"Algebra": {
"Average Score": 0.248994048,
"Standard Deviation": 0.02668175054,
"Rank": 39
},
"Probability": {
"Average Score": 23.2308538772627,
"Standard Deviation": null,
"Rank": 47
},
"Logical": {
"Average Score": 19.488409585540122,
"Standard Deviation": 0.7913465863319494,
"Rank": 46
},
"Social": {
"Average Score": 0.257623798,
"Standard Deviation": 0.02653724437,
"Rank": 34
},
"Chemistry": {
"Average Score": 27.198874596635843,
"Standard Deviation": null,
"Rank": 43
},
"CPP": {
"Average Score": 28.01838653090379,
"Standard Deviation": null,
"Rank": 38
}
}
},
{
"config": {
"model_name": "gemma-7b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 18.339626858215343,
"Standard Deviation": 0.1553156123023995,
"Rank": 43
},
"Geometry": {
"Average Score": 0.244791417,
"Standard Deviation": 0.0289612078,
"Rank": 37
},
"Algebra": {
"Average Score": 0.250614794,
"Standard Deviation": 0.01991678295,
"Rank": 38
},
"Probability": {
"Average Score": 18.066869704202595,
"Standard Deviation": null,
"Rank": 48
},
"Logical": {
"Average Score": 22.446113532575186,
"Standard Deviation": 1.1759308097806727,
"Rank": 43
},
"Social": {
"Average Score": 0.202138025,
"Standard Deviation": 0.02098346639,
"Rank": 39
},
"Chemistry": {
"Average Score": 27.195166540671735,
"Standard Deviation": null,
"Rank": 43
},
"CPP": {
"Average Score": 28.014658234926813,
"Standard Deviation": null,
"Rank": 39
}
}
},
{
"config": {
"model_name": "mistral-7b-instruct-2",
"organization": "Mistral",
"license": "Apache 2.0",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 32.27919528900069,
"Standard Deviation": 2.070593349377193,
"Rank": 28
},
"Geometry": {
"Average Score": 0.216402626,
"Standard Deviation": 0.03338414918,
"Rank": 38
},
"Algebra": {
"Average Score": 0.233777838,
"Standard Deviation": 0.0155226054,
"Rank": 40
},
"Probability": {
"Average Score": 25.70261650740474,
"Standard Deviation": null,
"Rank": 45
},
"Logical": {
"Average Score": 26.165635051797608,
"Standard Deviation": 1.5009510944001014,
"Rank": 41
},
"Social": {
"Average Score": 0.209386782,
"Standard Deviation": 0.02738569921,
"Rank": 38
},
"Chemistry": {
"Average Score": 30.70773868184025,
"Standard Deviation": null,
"Rank": 39
},
"CPP": {
"Average Score": 31.382959631870822,
"Standard Deviation": null,
"Rank": 35
}
}
},
{
"config": {
"model_name": "mistral-7b-instruct-1",
"organization": "Mistral",
"license": "Apache 2.0",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 14.750363553682964,
"Standard Deviation": 0.442399072321264,
"Rank": 48
},
"Geometry": {
"Average Score": 0.161799938,
"Standard Deviation": 0.03595278559,
"Rank": 44
},
"Algebra": {
"Average Score": 0.210341624,
"Standard Deviation": 0.01736539119,
"Rank": 41
},
"Probability": {
"Average Score": 24.69501890202338,
"Standard Deviation": null,
"Rank": 46
},
"Logical": {
"Average Score": 15.957706802740889,
"Standard Deviation": 2.080778273455708,
"Rank": 50
},
"Social": {
"Average Score": 0.117646827,
"Standard Deviation": 0.009321202779,
"Rank": 47
},
"Chemistry": {
"Average Score": 18.375111202411667,
"Standard Deviation": null,
"Rank": 47
},
"CPP": {
"Average Score": 18.929093202755805,
"Standard Deviation": null,
"Rank": 42
}
}
},
{
"config": {
"model_name": "vicuna-13b",
"organization": "LMSYS",
"license": "Non-commercial",
"knowledge_cutoff": "2023/07"
},
"results": {
"OVERALL": {
"Average Score": 13.302607436757697,
"Standard Deviation": 0.570272227659312,
"Rank": 50
},
"Geometry": {
"Average Score": 0.200941928,
"Standard Deviation": 0.03366817781,
"Rank": 41
},
"Algebra": {
"Average Score": 0.196123323,
"Standard Deviation": 0.0135715643,
"Rank": 42
},
"Probability": {
"Average Score": 15.08476669604627,
"Standard Deviation": null,
"Rank": 50
},
"Logical": {
"Average Score": 16.548339412104294,
"Standard Deviation": 3.443370777556759,
"Rank": 49
},
"Social": {
"Average Score": 0.124655135,
"Standard Deviation": 0.01122382671,
"Rank": 46
},
"Chemistry": {
"Average Score": 21.201173318496842,
"Standard Deviation": null,
"Rank": 45
},
"CPP": {
"Average Score": 21.840013221590294,
"Standard Deviation": null,
"Rank": 40
}
}
},
{
"config": {
"model_name": "zephyr-7b-beta",
"organization": "HuggingFace",
"license": "MIT",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 7.378234886105356,
"Standard Deviation": 1.1456147261693999,
"Rank": 51
},
"Geometry": {
"Average Score": 0.114005544,
"Standard Deviation": 0.03144354365,
"Rank": 45
},
"Algebra": {
"Average Score": 0.141766633,
"Standard Deviation": 0.03179520129,
"Rank": 43
},
"Probability": {
"Average Score": 8.92696070171298,
"Standard Deviation": null,
"Rank": 53
},
"Logical": {
"Average Score": 6.971377981442089,
"Standard Deviation": 0.31669853263737413,
"Rank": 55
},
"Social": {
"Average Score": 0.0,
"Standard Deviation": 0.0,
"Rank": 52
},
"Chemistry": {
"Average Score": 18.374948840997902,
"Standard Deviation": null,
"Rank": 47
},
"CPP": {
"Average Score": 18.92902220864132,
"Standard Deviation": null,
"Rank": 43
}
}
},
{
"config": {
"model_name": "gemma-1.1-2b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 16.083251992757752,
"Standard Deviation": 0.7340624884005772,
"Rank": 46
},
"Geometry": {
"Average Score": 0.183974034,
"Standard Deviation": 0.0215548886,
"Rank": 43
},
"Algebra": {
"Average Score": 0.13422252,
"Standard Deviation": 0.01922819511,
"Rank": 44
},
"Probability": {
"Average Score": 9.992136776217318,
"Standard Deviation": null,
"Rank": 52
},
"Logical": {
"Average Score": 9.537233946101678,
"Standard Deviation": 0.7567112693269967,
"Rank": 53
},
"Social": {
"Average Score": 0.167796727,
"Standard Deviation": 0.01666541942,
"Rank": 42
},
"Chemistry": {
"Average Score": 20.11834233400297,
"Standard Deviation": null,
"Rank": 46
},
"CPP": {
"Average Score": 20.724691953843916,
"Standard Deviation": null,
"Rank": 41
}
}
},
{
"config": {
"model_name": "llama2-7b-chat",
"organization": "Meta",
"license": "Llama 2 Community",
"knowledge_cutoff": "2023/07"
},
"results": {
"OVERALL": {
"Average Score": 17.319161859655946,
"Standard Deviation": 0.495520710612214,
"Rank": 45
},
"Geometry": {
"Average Score": 0.087067276,
"Standard Deviation": 0.04274343402,
"Rank": 46
},
"Algebra": {
"Average Score": 0.12308805,
"Standard Deviation": 0.01856053622,
"Rank": 45
},
"Probability": {
"Average Score": 8.860911732515305,
"Standard Deviation": null,
"Rank": 54
},
"Logical": {
"Average Score": 18.812132126028335,
"Standard Deviation": 3.0846832107977433,
"Rank": 47
},
"Social": {
"Average Score": 0.152905272,
"Standard Deviation": 0.007166957097,
"Rank": 43
},
"Chemistry": {
"Average Score": 15.270334671133512,
"Standard Deviation": null,
"Rank": 50
},
"CPP": {
"Average Score": 15.730513733660898,
"Standard Deviation": null,
"Rank": 45
}
}
},
{
"config": {
"model_name": "gemma-2b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 15.029602991101632,
"Standard Deviation": 0.4529017602377039,
"Rank": 47
},
"Geometry": {
"Average Score": 0.198571153,
"Standard Deviation": 0.01699161031,
"Rank": 42
},
"Algebra": {
"Average Score": 0.109883009,
"Standard Deviation": 0.01520005833,
"Rank": 46
},
"Probability": {
"Average Score": 6.561360414966015,
"Standard Deviation": null,
"Rank": 56
},
"Logical": {
"Average Score": 3.9858662356708785,
"Standard Deviation": 0.5609499073366407,
"Rank": 56
},
"Social": {
"Average Score": 0.087452913,
"Standard Deviation": 0.008170146562,
"Rank": 50
},
"Chemistry": {
"Average Score": 16.766144078336097,
"Standard Deviation": null,
"Rank": 49
},
"CPP": {
"Average Score": 17.2715657115764,
"Standard Deviation": null,
"Rank": 44
}
}
},
{
"config": {
"model_name": "llama2-13b-chat",
"organization": "Meta",
"license": "Llama 2 Community",
"knowledge_cutoff": "2023/07"
},
"results": {
"OVERALL": {
"Average Score": 17.47902371074294,
"Standard Deviation": 0.4047581815962028,
"Rank": 44
},
"Geometry": {
"Average Score": 0.072729954,
"Standard Deviation": 0.02315988261,
"Rank": 48
},
"Algebra": {
"Average Score": 0.080371692,
"Standard Deviation": 0.01277569453,
"Rank": 47
},
"Probability": {
"Average Score": 12.738302754764042,
"Standard Deviation": null,
"Rank": 51
},
"Logical": {
"Average Score": 21.708359515217182,
"Standard Deviation": 1.4862481594434973,
"Rank": 45
},
"Social": {
"Average Score": 0.149125922,
"Standard Deviation": 0.01157416827,
"Rank": 44
},
"Chemistry": {
"Average Score": 12.786967781868814,
"Standard Deviation": null,
"Rank": 53
},
"CPP": {
"Average Score": 13.17258252933903,
"Standard Deviation": null,
"Rank": 48
}
}
},
{
"config": {
"model_name": "vicuna-7b",
"organization": "LMSYS",
"license": "Non-commercial",
"knowledge_cutoff": "2023/07"
},
"results": {
"OVERALL": {
"Average Score": 13.31896682669754,
"Standard Deviation": 0.30441157156016124,
"Rank": 49
},
"Geometry": {
"Average Score": 0.083457058,
"Standard Deviation": 0.02520989111,
"Rank": 47
},
"Algebra": {
"Average Score": 0.070883882,
"Standard Deviation": 0.007315853253,
"Rank": 48
},
"Probability": {
"Average Score": 8.255246380068842,
"Standard Deviation": null,
"Rank": 55
},
"Logical": {
"Average Score": 10.046676845257544,
"Standard Deviation": 0.6816182835206797,
"Rank": 52
},
"Social": {
"Average Score": 0.111076414,
"Standard Deviation": 0.004805626512,
"Rank": 48
},
"Chemistry": {
"Average Score": 13.838150481781991,
"Standard Deviation": null,
"Rank": 51
},
"CPP": {
"Average Score": 14.255194156624162,
"Standard Deviation": null,
"Rank": 46
}
}
},
{
"config": {
"model_name": "koala-13b",
"organization": "UC Berkeley",
"license": "Non-commercial",
"knowledge_cutoff": "2023/04"
},
"results": {
"OVERALL": {
"Average Score": 6.419305623111718,
"Standard Deviation": 0.19611070515647736,
"Rank": 53
},
"Geometry": {
"Average Score": 0.017374001,
"Standard Deviation": 0.01747053557,
"Rank": 49
},
"Algebra": {
"Average Score": 0.018129197,
"Standard Deviation": 0.01054371383,
"Rank": 49
},
"Probability": {
"Average Score": 4.1717283559090035,
"Standard Deviation": null,
"Rank": 57
},
"Logical": {
"Average Score": 7.484701131693112,
"Standard Deviation": 0.172417770163525,
"Rank": 54
},
"Social": {
"Average Score": 0.096983835,
"Standard Deviation": 0.007847059783,
"Rank": 49
},
"Chemistry": {
"Average Score": 6.177985738164252,
"Standard Deviation": null,
"Rank": 54
},
"CPP": {
"Average Score": 6.36433272373514,
"Standard Deviation": null,
"Rank": 49
}
}
},
{
"config": {
"model_name": "openassistant-pythia-12b",
"organization": "OpenAssistant",
"license": "Non-commercial",
"knowledge_cutoff": "2023/04"
},
"results": {
"OVERALL": {
"Average Score": 0.0,
"Standard Deviation": 0.0,
"Rank": 54
},
"Geometry": {
"Average Score": 0.0,
"Standard Deviation": 0.0,
"Rank": 50
},
"Algebra": {
"Average Score": 0.0,
"Standard Deviation": 0.0,
"Rank": 50
},
"Probability": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 58
},
"Logical": {
"Average Score": 0.0,
"Standard Deviation": 0.0,
"Rank": 57
},
"Social": {
"Average Score": 0.030792528,
"Standard Deviation": 0.007518796391,
"Rank": 51
},
"Chemistry": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 55
},
"CPP": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 50
}
}
},
{
"config": {
"model_name": "nemotron-70b",
"organization": "NVIDIA",
"license": "Unknown",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 100.0,
"Standard Deviation": 0.0,
"Rank": 1
},
"Chemistry": {
"Average Score": 96.00601450276388,
"Standard Deviation": null,
"Rank": 3
},
"Logical": {
"Average Score": 98.08807085219765,
"Standard Deviation": 0.832489959144682,
"Rank": 5
},
"Probability": {
"Average Score": 91.16755514126538,
"Standard Deviation": null,
"Rank": 4
}
}
},
{
"config": {
"model_name": "llama-3.2-3b-it",
"organization": "Meta",
"license": "Llama 3 Community",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 30.40742747938681,
"Standard Deviation": 1.6816556668351852,
"Rank": 30
},
"Chemistry": {
"Average Score": 27.43049468475638,
"Standard Deviation": null,
"Rank": 42
},
"Logical": {
"Average Score": 41.58905844173492,
"Standard Deviation": 5.2798221527591,
"Rank": 29
},
"Probability": {
"Average Score": 62.02868227997844,
"Standard Deviation": null,
"Rank": 18
}
}
},
{
"config": {
"model_name": "yi-lightning",
"organization": "01 AI",
"license": "Proprietary",
"knowledge_cutoff": "Unknown"
},
"results": {
"Chemistry": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Logical": {
"Average Score": 98.816765663456,
"Standard Deviation": 0.3271335810663529,
"Rank": 3
},
"Probability": {
"Average Score": 95.8842044402052,
"Standard Deviation": null,
"Rank": 2
}
}
},
{
"config": {
"model_name": "glm-4-plus",
"organization": "Zhipu AI",
"license": "Proprietary",
"knowledge_cutoff": "Unknown"
},
"results": {
"Chemistry": {
"Average Score": 99.05822908668402,
"Standard Deviation": null,
"Rank": 2
},
"Logical": {
"Average Score": 99.45307787995229,
"Standard Deviation": 0.5982476107949444,
"Rank": 1
},
"Probability": {
"Average Score": 92.04426702796823,
"Standard Deviation": null,
"Rank": 3
}
}
}
]