File size: 14,345 Bytes
8553d06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
{
    "GPT_4o": {
        "core_noncot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.5187898818829914,
            "micro_mean_score": 0.5127977300993917
        },
        "core_cot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.5251654337401854,
            "micro_mean_score": 0.522332974147119
        },
        "open": {
            "num_eval_tasks": 65,
            "num_eval_samples": 1163,
            "num_total_samples": 2448,
            "macro_mean_score": 0.6478225794744895,
            "micro_mean_score": 0.665391229578676
        },
        "overall_score": 0.5409529871515315
    },
    "Gemini_1.5_pro_002": {
        "core_noncot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.46887846869580546,
            "micro_mean_score": 0.46403536258864253
        },
        "core_cot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.481393687771543,
            "micro_mean_score": 0.4756661334397647
        },
        "open": {
            "num_eval_tasks": 65,
            "num_eval_samples": 1163,
            "num_total_samples": 2448,
            "macro_mean_score": 0.5858190649927173,
            "micro_mean_score": 0.6104901117798793
        },
        "overall_score": 0.4948345779089219
    },
    "Gemini_1.5_flash_002": {
        "core_noncot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.4183865592515826,
            "micro_mean_score": 0.41216971462683855
        },
        "core_cot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.4183865592515826,
            "micro_mean_score": 0.41216971462683855
        },
        "open": {
            "num_eval_tasks": 65,
            "num_eval_samples": 1163,
            "num_total_samples": 2168,
            "macro_mean_score": 0.5691365176285039,
            "micro_mean_score": 0.5987532244196045
        },
        "overall_score": 0.4377900192406913
    },
    "Claude_3.5": {
        "core_noncot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.4863241841253708,
            "micro_mean_score": 0.4798092874490549
        },
        "core_cot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.5023557473841108,
            "micro_mean_score": 0.4985442599850241
        },
        "open": {
            "num_eval_tasks": 65,
            "num_eval_samples": 1163,
            "num_total_samples": 2288,
            "macro_mean_score": 0.6373907158949892,
            "micro_mean_score": 0.6569647463456579
        },
        "overall_score": 0.519736485905313
    },
    "GPT_4o_mini": {
        "core_noncot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.3974259652331149,
            "micro_mean_score": 0.392578163407945
        },
        "core_cot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.4070959243997505,
            "micro_mean_score": 0.40376078514357017
        },
        "open": {
            "num_eval_tasks": 65,
            "num_eval_samples": 1163,
            "num_total_samples": 1224,
            "macro_mean_score": 0.586537827213665,
            "micro_mean_score": 0.6133276010318144
        },
        "overall_score": 0.43019240694015537
    },
    "Qwen2_VL_72B": {
        "core_noncot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.4623988230573754,
            "micro_mean_score": 0.4568583770401895
        },
        "core_cot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.45284699372478177,
            "micro_mean_score": 0.4487693487093462
        },
        "open": {
            "num_eval_tasks": 65,
            "num_eval_samples": 1163,
            "num_total_samples": 2448,
            "macro_mean_score": 0.5639771804231668,
            "micro_mean_score": 0.5835339638865004
        },
        "overall_score": 0.4754732650945565
    },
    "Qwen2_VL_7B": {
        "core_noncot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.34725455697890745,
            "micro_mean_score": 0.34344091516995323
        },
        "core_cot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.3284357723853296,
            "micro_mean_score": 0.32443422147119677
        },
        "open": {
            "num_eval_tasks": 65,
            "num_eval_samples": 1170,
            "num_total_samples": 2452,
            "macro_mean_score": 0.43955105763038577,
            "micro_mean_score": 0.45508547008546996
        },
        "overall_score": 0.35913430458751355
    },
    "llava_onevision_72B": {
        "core_noncot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.31960132549012704,
            "micro_mean_score": 0.3173848563095166
        },
        "core_cot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.29725827011768174,
            "micro_mean_score": 0.2954433666362564
        },
        "open": {
            "num_eval_tasks": 65,
            "num_eval_samples": 1163,
            "num_total_samples": 1224,
            "macro_mean_score": 0.4599484231632498,
            "micro_mean_score": 0.4850386930352536
        },
        "overall_score": 0.33766580340844976
    },
    "llava_onevision_7B": {
        "core_noncot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.2239290419841492,
            "micro_mean_score": 0.22222171180488767
        },
        "core_cot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.21347545703998197,
            "micro_mean_score": 0.210586172002703
        },
        "open": {
            "num_eval_tasks": 65,
            "num_eval_samples": 1163,
            "num_total_samples": 2448,
            "macro_mean_score": 0.33979975321921935,
            "micro_mean_score": 0.36474634565778147
        },
        "overall_score": 0.23884309392529685
    },
    "InternVL2_76B": {
        "core_noncot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.34977582844066846,
            "micro_mean_score": 0.3452353155814884
        },
        "core_cot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.35539585884136143,
            "micro_mean_score": 0.35043335903915124
        },
        "open": {
            "num_eval_tasks": 65,
            "num_eval_samples": 1163,
            "num_total_samples": 1224,
            "macro_mean_score": 0.5192997443033639,
            "micro_mean_score": 0.5421324161650903
        },
        "overall_score": 0.37649239855429245
    },
    "InternVL2_8B": {
        "core_noncot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.25920867490737526,
            "micro_mean_score": 0.2543416126895087
        },
        "core_cot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.24055897165959364,
            "micro_mean_score": 0.23784634936127952
        },
        "open": {
            "num_eval_tasks": 65,
            "num_eval_samples": 1165,
            "num_total_samples": 2452,
            "macro_mean_score": 0.3978571701460552,
            "micro_mean_score": 0.4108583690987125
        },
        "overall_score": 0.2770545208291856
    },
    "MiniCPM_v2.6": {
        "core_noncot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.22838207666977445,
            "micro_mean_score": 0.22452805919103805
        },
        "core_cot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.22901463640480854,
            "micro_mean_score": 0.2250606411323753
        },
        "open": {
            "num_eval_tasks": 65,
            "num_eval_samples": 1163,
            "num_total_samples": 2448,
            "macro_mean_score": 0.41728623355613875,
            "micro_mean_score": 0.43452278589853827
        },
        "overall_score": 0.25324761425596987
    },
    "Phi-3.5-vision": {
        "core_noncot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.23240864879023493,
            "micro_mean_score": 0.22932978620408923
        },
        "core_cot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.2295097914016776,
            "micro_mean_score": 0.2266573336398296
        },
        "open": {
            "num_eval_tasks": 65,
            "num_eval_samples": 1163,
            "num_total_samples": 2428,
            "macro_mean_score": 0.3947914647737769,
            "micro_mean_score": 0.42459157351676696
        },
        "overall_score": 0.2533094072831661
    },
    "Pixtral_12B": {
        "core_noncot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.3186510310643637,
            "micro_mean_score": 0.3151734861550665
        },
        "core_cot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.3132232487306254,
            "micro_mean_score": 0.30971424472967524
        },
        "open": {
            "num_eval_tasks": 65,
            "num_eval_samples": 1163,
            "num_total_samples": 1224,
            "macro_mean_score": 0.4566234428542061,
            "micro_mean_score": 0.4870593293207223
        },
        "overall_score": 0.3364098563442444
    },
    "Llama_3_2_11B": {
        "core_noncot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.10044261716549671,
            "micro_mean_score": 0.09980638766828835
        },
        "core_cot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.15984490401619783,
            "micro_mean_score": 0.15794038158731832
        },
        "open": {
            "num_eval_tasks": 65,
            "num_eval_samples": 1163,
            "num_total_samples": 1224,
            "macro_mean_score": 0.3173342406187366,
            "micro_mean_score": 0.3487962166809973
        },
        "overall_score": 0.1801158087274157
    },
    "Idefics3": {
        "core_noncot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.11118980301103833,
            "micro_mean_score": 0.11201785633274061
        },
        "core_cot": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "num_not_eval_samples": 0,
            "num_total_samples": 6961,
            "macro_mean_score": 0.08956972487602757,
            "micro_mean_score": 0.08982225274252693
        },
        "open": {
            "num_eval_tasks": 65,
            "num_eval_samples": 1163,
            "num_total_samples": 2448,
            "macro_mean_score": 0.3210866162255635,
            "micro_mean_score": 0.35649183147033553
        },
        "overall_score": 0.138206224513898
    }
}