Linker1907 committed on
Commit c283445 · 1 Parent(s): 22f9e0d
Files changed (2)
  1. app.py +73 -13
  2. experiments.json +662 -0
app.py CHANGED
@@ -2,23 +2,65 @@ from datasets import load_dataset
 import json
 import gradio as gr
 
-# Hardcoded list of subsets from experiments.json
-BENCHMARKS = [
-    "custom|gpqa:diamond|0",
-    "custom|aime24|0",
-    "custom|aime25|0",
-    "extended|ifeval|0"
-]
+# Load experiments.json to get model configurations
+with open('experiments.json', 'r') as f:
+    EXPERIMENTS = json.load(f)
+
+# Get all unique benchmark subsets from experiments.json
+BENCHMARKS = []
+for model_config in EXPERIMENTS.values():
+    for benchmark in model_config['benchmarks'].values():
+        subset = benchmark['subset']
+        if subset not in BENCHMARKS:
+            BENCHMARKS.append(subset)
+
 from datasets import get_dataset_split_names
 
 # Add this near the top with other constants
 REPO_OPTIONS = [
-    "OpenEvals/details_meta-llama__Llama-4-Maverick-17B-128E-Instruct-FP8_private",
-    "OpenEvals/details_meta-llama__Llama-4-Scout-17B-16E-Instruct_private",
-    # Add more common repositories as needed
+    "OpenEvals/details_gpt-4o_private",
+    "OpenEvals/details_claude-3-7-sonnet-20250219_private",
+    "OpenEvals/details_o3-mini-2025-01-31_private",
+    "OpenEvals/details_moonshotai__Moonlight-16B-A3B-Instruct_private",
+    "OpenEvals/details_meta-llama__Llama-3.3-70B-Instruct_private",
+    "OpenEvals/details_deepseek-ai__DeepSeek-R1-Distill-Llama-70B_private",
+    "OpenEvals/details_qihoo360__TinyR1-32B-Preview_private",
+    "OpenEvals/details_openai__gpt-4.5-preview-2025-02-27_private",
+    "OpenEvals/details_deepseek-ai__DeepSeek-R1-Distill-Qwen-32B_private",
+    "OpenEvals/details_openai__deepseek-ai__DeepSeek-R1_private",
+    "OpenEvals/details_Qwen__QwQ-32B_private",
+    "OpenEvals/details_google__gemma-3-1b-it_private",
+    "OpenEvals/details_google__gemma-3-12b-it_private",
+    "OpenEvals/details_google__gemma-3-27b-it_private",
+    "OpenEvals/details_openai__deepseek-ai__DeepSeek-V3-0324_private",
+    "OpenEvals/details_openai__deepseek-ai__DeepSeek-V3_private",
+    "OpenEvals/details_meta-llama__Llama-4-Scout-17B-16E-Instruct_private",
+    "OpenEvals/details_meta-llama__Llama-4-Maverick-17B-128E-Instruct-FP8_private"
 ]
 
+def get_model_name_from_repo(repo):
+    # Extract model name from repository path
+    # Example: "OpenEvals/details_meta-llama__Llama-4-Maverick-17B-128E-Instruct-FP8_private"
+    # -> "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+    parts = repo.split('/')
+    model_name = parts[1].replace('details_', '').replace('_private', '')
+    # Convert double underscores back to forward slashes
+    model_name = model_name.replace('__', '/')
+    return model_name
+
+def get_available_benchmarks(repo):
+    model_name = get_model_name_from_repo(repo)
+    print(model_name)
+    if not model_name or model_name not in EXPERIMENTS:
+        return []
+
+    model_config = EXPERIMENTS[model_name]
+    print(model_config)
+    return [benchmark['subset'] for benchmark in model_config['benchmarks'].values()]
+
 def get_available_splits(repo, benchmark):
+    if not benchmark:
+        return []
     return get_dataset_split_names(repo, config_name=benchmark.replace("|", "_").replace(":", "_"))
 
 def load_details_and_results(repo, subset, split):
@@ -158,14 +200,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
         benchmark = gr.Dropdown(
             label="Benchmark",
-            choices=BENCHMARKS,
-            value=BENCHMARKS[0],
+            choices=[],
             info="Select the benchmark subset"
         )
         split = gr.Dropdown(
            label="Split",
            choices=[],
-            info="Select the evaluation split"
+            info="Select evaluation."
         )
 
     with gr.Row():
@@ -195,6 +236,25 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         outputs=[repo_dropdown, repo_custom]
     )
 
+    # Update the repository change handler to update available benchmarks
+    def update_benchmarks(selection_method, dropdown_value, custom_value):
+        repo = get_active_repo(selection_method, dropdown_value, custom_value)
+        available_benchmarks = get_available_benchmarks(repo)
+        print(available_benchmarks)
+        return gr.Dropdown(choices=available_benchmarks, value=available_benchmarks[0] if available_benchmarks else None)
+
+    repo_dropdown.change(
+        fn=update_benchmarks,
+        inputs=[repo_select, repo_dropdown, repo_custom],
+        outputs=benchmark
+    )
+
+    repo_custom.change(
+        fn=update_benchmarks,
+        inputs=[repo_select, repo_dropdown, repo_custom],
+        outputs=benchmark
+    )
+
     # Update the benchmark change handler
     benchmark.change(
         fn=lambda selection_method, dropdown, custom, bench: update_splits(
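
The new benchmark-discovery path hinges on a naming convention: a details repository id of the form "OpenEvals/details_<org>__<model>_private" maps back to the "<org>/<model>" key used in experiments.json, with double underscores standing in for slashes. Below is a minimal standalone sketch of that round trip (not part of the commit; the helper name repo_to_model_name is illustrative), using repo ids taken from REPO_OPTIONS above:

def repo_to_model_name(repo: str) -> str:
    # Mirrors get_model_name_from_repo in the diff: strip the "details_" prefix
    # and the "_private" suffix, then turn "__" back into "/".
    name = repo.split("/")[1]
    name = name.replace("details_", "").replace("_private", "")
    return name.replace("__", "/")

if __name__ == "__main__":
    for repo in [
        "OpenEvals/details_gpt-4o_private",
        "OpenEvals/details_meta-llama__Llama-4-Scout-17B-16E-Instruct_private",
        "OpenEvals/details_openai__deepseek-ai__DeepSeek-R1_private",
    ]:
        print(repo, "->", repo_to_model_name(repo))
    # -> gpt-4o
    # -> meta-llama/Llama-4-Scout-17B-16E-Instruct
    # -> openai/deepseek-ai/DeepSeek-R1

Each result is expected to match a top-level key in experiments.json, which is what get_available_benchmarks relies on when it looks up EXPERIMENTS[model_name].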
experiments.json ADDED
@@ -0,0 +1,662 @@
+{
+  "gpt-4o": {
+    "display_name": "gpt 4o",
+    "provider": "openai",
+    "open": false,
+    "size": "?B",
+    "thinking": false,
+    "benchmarks": {
+      "gpqa_diamond": {
+        "subset": "lighteval|gpqa:diamond|0",
+        "metrics": [
+          "extractive_match"
+        ],
+        "tags": {
+          "latest": "2025-02-26T10-14-16.106571"
+        }
+      },
+      "aime_24": {
+        "subset": "lighteval|aime24|0",
+        "metrics": [
+          "extractive_match"
+        ],
+        "tags": {
+          "latest": "2025-02-26T10-14-16.106571"
+        }
+      },
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": [
+          "extractive_match"
+        ],
+        "tags": {
+          "latest": "2025-02-26T10-14-16.106571"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": [
+          "prompt_level_strict_acc"
+        ],
+        "tags": {
+          "latest": "2025-02-26T10-14-16.106571"
+        }
+      }
+    }
+  },
+  "claude-3-7-sonnet-20250219": {
+    "display_name": "Claude 3.7 Sonnet",
+    "provider": "anthropic",
+    "open": false,
+    "size": "?B",
+    "thinking": false,
+    "benchmarks": {
+      "gpqa_diamond": {
+        "subset": "lighteval|gpqa:diamond|0",
+        "metrics": [
+          "extractive_match"
+        ],
+        "tags": {
+          "default": "2025-02-25T12-43-49.294245",
+          "thinking": "2025-03-05T15-37-37.180318"
+        }
+      },
+      "aime_24": {
+        "subset": "lighteval|aime24|0",
+        "metrics": [
+          "extractive_match"
+        ],
+        "tags": {
+          "default": "2025-02-25T12-37-52.771787",
+          "thinking": "2025-03-05T12-39-13.627801"
+        }
+      },
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": [
+          "extractive_match"
+        ],
+        "tags": {
+          "default": "2025-02-25T12-37-52.771787",
+          "thinking": "2025-03-05T12-39-13.627801"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": [
+          "prompt_level_strict_acc"
+        ],
+        "tags": {
+          "default": "2025-02-25T12-24-45.750753",
+          "thinking": "2025-03-05T15-37-37.180318"
+        }
+      }
+    }
+  },
+  "o3-mini-2025-01-31": {
+    "display_name": "o3-mini",
+    "provider": "openai",
+    "open": false,
+    "size": "?B",
+    "thinking": true,
+    "benchmarks": {
+      "gpqa_diamond": {
+        "subset": "lighteval|gpqa:diamond|0",
+        "metrics": [
+          "extractive_match"
+        ],
+        "tags": {
+          "latest": "2025-02-26T11-37-01.193437"
+        }
+      },
+      "aime_24": {
+        "subset": "lighteval|aime24|0",
+        "metrics": [
+          "extractive_match"
+        ],
+        "tags": {
+          "latest": "2025-02-26T11-37-01.193437"
+        }
+      },
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": [
+          "extractive_match"
+        ],
+        "tags": {
+          "latest": "2025-02-26T11-37-01.193437"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": [
+          "prompt_level_strict_acc"
+        ],
+        "tags": {
+          "latest": "2025-02-26T11-37-01.193437"
+        }
+      }
+    }
+  },
+  "moonshotai/Moonlight-16B-A3B-Instruct": {
+    "display_name": "Moonlight",
+    "provider": "moonshotai",
+    "open": true,
+    "size": "16B",
+    "thinking": false,
+    "benchmarks": {
+      "gpqa_diamond": {
+        "subset": "lighteval|gpqa:diamond|0",
+        "metrics": [
+          "extractive_match"
+        ],
+        "tags": {
+          "latest": "2025_02_26T13_32_06.104265"
+        }
+      },
+      "aime_24": {
+        "subset": "lighteval|aime24|0",
+        "metrics": [
+          "extractive_match"
+        ],
+        "tags": {
+          "latest": "2025_02_26T13_32_06.104265"
+        }
+      },
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": [
+          "extractive_match"
+        ],
+        "tags": {
+          "latest": "2025_02_26T13_32_06.104265"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": [
+          "prompt_level_strict_acc"
+        ],
+        "tags": {
+          "latest": "2025_02_26T13_32_06.104265"
+        }
+      }
+    }
+  },
+  "meta-llama/Llama-3.3-70B-Instruct": {
+    "display_name": "Llama 3.3 70B",
+    "provider": "meta",
+    "open": true,
+    "size": "70B",
+    "thinking": false,
+    "benchmarks": {
+      "gpqa_diamond": {
+        "subset": "lighteval|gpqa:diamond|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-02-26T17-13-13.448521"
+        }
+      },
+      "aime_24": {
+        "subset": "lighteval|aime24|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-02-26T17-13-13.448521"
+        }
+      },
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-02-26T17-13-13.448521"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": ["prompt_level_strict_acc"],
+        "tags": {
+          "latest": "2025-02-26T17-13-13.448521"
+        }
+      }
+    }
+  },
+  "deepseek-ai/DeepSeek-R1-Distill-Llama-70B": {
+    "display_name": "DeepSeek Llama 70B",
+    "provider": "deepseek",
+    "open": true,
+    "size": "70B",
+    "thinking": true,
+    "benchmarks": {
+      "gpqa_diamond": {
+        "subset": "lighteval|gpqa:diamond|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-02-27T11-09-04.037858"
+        }
+      },
+      "aime_24": {
+        "subset": "lighteval|aime24|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-02-27T11-09-04.037858"
+        }
+      },
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-02-27T11-09-04.037858"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": ["prompt_level_strict_acc"],
+        "tags": {
+          "latest": "2025-02-27T14-02-02.414381"
+        }
+      }
+    }
+  },
+  "qihoo360/TinyR1-32B-Preview": {
+    "display_name": "TinyR1 32B",
+    "provider": "qihoo360",
+    "open": true,
+    "size": "32B",
+    "thinking": false,
+    "benchmarks": {
+      "gpqa_diamond": {
+        "subset": "lighteval|gpqa:diamond|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-02-27T13-32-41.564652"
+        }
+      },
+      "aime_24": {
+        "subset": "lighteval|aime24|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-02-27T13-32-41.564652"
+        }
+      },
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-02-27T13-32-41.564652"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": ["prompt_level_strict_acc"],
+        "tags": {
+          "latest": "2025-02-27T13-32-41.564652"
+        }
+      }
+    }
+  },
+  "openai/gpt-4.5-preview-2025-02-27": {
+    "display_name": "gpt 4.5",
+    "provider": "openai",
+    "open": false,
+    "size": "?B",
+    "thinking": false,
+    "benchmarks": {
+      "gpqa_diamond": {
+        "subset": "lighteval|gpqa:diamond|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-03T11-35-34.241611"
+        }
+      },
+      "aime_24": {
+        "subset": "lighteval|aime24|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-03T11-15-32.836958"
+        }
+      },
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-03T11-15-32.836958"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": ["prompt_level_strict_acc"],
+        "tags": {
+          "latest": "2025-03-03T11-17-20.767980"
+        }
+      }
+    }
+  },
+  "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B": {
+    "display_name": "DeepSeek Qwen 32B",
+    "provider": "deepseek",
+    "open": true,
+    "size": "32B",
+    "thinking": true,
+    "benchmarks": {
+      "gpqa_diamond": {
+        "subset": "lighteval|gpqa:diamond|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-03T14-51-09.849491"
+        }
+      },
+      "aime_24": {
+        "subset": "lighteval|aime24|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-03T14-51-09.849491"
+        }
+      },
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-03T14-51-09.849491"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": ["prompt_level_strict_acc"],
+        "tags": {
+          "latest": "2025-03-03T15-06-10.838105"
+        }
+      }
+    }
+  },
+  "openai/deepseek-ai/DeepSeek-R1": {
+    "display_name": "DeepSeek R1",
+    "provider": "deepseek",
+    "open": true,
+    "size": "671B",
+    "thinking": true,
+    "benchmarks": {
+      "gpqa_diamond": {
+        "subset": "lighteval|gpqa:diamond|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-04T17-06-33.124766"
+        }
+      },
+      "aime_24": {
+        "subset": "lighteval|aime24|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-04T14-52-35.594174"
+        }
+      },
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-04T14-25-05.009799"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": ["prompt_level_strict_acc"],
+        "tags": {
+          "latest": "2025-03-04T15-24-42.488745"
+        }
+      }
+    }
+  },
+  "Qwen/QwQ-32B": {
+    "display_name": "QwQ 32B",
+    "provider": "Qwen",
+    "open": true,
+    "size": "32B",
+    "thinking": true,
+    "benchmarks": {
+      "gpqa_diamond": {
+        "subset": "lighteval|gpqa:diamond|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-10T11-47-46.303371"
+        }
+      },
+      "aime_24": {
+        "subset": "lighteval|aime24|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-10T10-36-07.886033"
+        }
+      },
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-10T10-36-07.886033"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": ["prompt_level_strict_acc"],
+        "tags": {
+          "latest": "2025-03-10T12-21-36.862202"
+        }
+      }
+    }
+  },
+  "google/gemma-3-1b-it": {
+    "display_name": "Gemma 3",
+    "provider": "google",
+    "open": true,
+    "size": "1B",
+    "thinking": false,
+    "benchmarks": {
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T14-25-56.178612"
+        }
+      }
+    }
+  },
+  "google/gemma-3-12b-it": {
+    "display_name": "Gemma 3 12B",
+    "provider": "google",
+    "open": true,
+    "size": "12B",
+    "thinking": false,
+    "benchmarks": {
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T14-36-23.368081"
+        }
+      }
+    }
+  },
+  "google/gemma-3-27b-it": {
+    "display_name": "Gemma 3 27B",
+    "provider": "google",
+    "open": true,
+    "size": "27B",
+    "thinking": false,
+    "benchmarks": {
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T14-41-33.181467"
+        }
+      },
+      "aime_24": {
+        "subset": "lighteval|aime24|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T15-11-34.174477"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": ["prompt_level_strict_acc"],
+        "tags": {
+          "latest": "2025-03-18T15-20-14.979833"
+        }
+      },
+      "gpqa_diamond": {
+        "subset": "lighteval|gpqa:diamond|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T15-20-14.979833"
+        }
+      }
+    }
+  },
+  "openai/deepseek-ai/DeepSeek-V3-0324": {
+    "display_name": "DeepSeek V3 0324",
+    "provider": "deepseek",
+    "open": true,
+    "size": "671B",
+    "thinking": false,
+    "benchmarks": {
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-25T15-00-18.969082"
+        }
+      },
+      "aime_24": {
+        "subset": "lighteval|aime24|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-25T15-00-18.969082"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": ["prompt_level_strict_acc"],
+        "tags": {
+          "latest": "2025-03-25T15-34-22.165555"
+        }
+      },
+      "gpqa_diamond": {
+        "subset": "lighteval|gpqa:diamond|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-25T14-22-37.175021"
+        }
+      }
+    }
+  },
+  "openai/deepseek-ai/DeepSeek-V3": {
+    "display_name": "DeepSeek V3",
+    "provider": "deepseek",
+    "open": true,
+    "size": "671B",
+    "thinking": false,
+    "benchmarks": {
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-25T19-39-33.880476"
+        }
+      },
+      "aime_24": {
+        "subset": "lighteval|aime24|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-25T19-39-33.880476"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": ["prompt_level_strict_acc"],
+        "tags": {
+          "latest": "2025-03-25T19-39-33.880476"
+        }
+      },
+      "gpqa_diamond": {
+        "subset": "lighteval|gpqa:diamond|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-25T19-39-33.880476"
+        }
+      }
+    }
+  },
+  "meta-llama/Llama-4-Scout-17B-16E-Instruct": {
+    "display_name": "Llama 4 Scout 17B",
+    "provider": "meta",
+    "open": true,
+    "size": "17B (109B params)",
+    "thinking": false,
+    "benchmarks": {
+      "aime_25": {
+        "subset": "custom|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-04-07T12-01-58.793350"
+        }
+      },
+      "aime_24": {
+        "subset": "custom|aime24|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-04-07T12-01-58.793350"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": ["prompt_level_strict_acc"],
+        "tags": {
+          "latest": "2025-04-07T12-01-58.793350"
+        }
+      },
+      "gpqa_diamond": {
+        "subset": "custom|gpqa:diamond|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-04-07T12-01-58.793350"
+        }
+      }
+    }
+  },
+  "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": {
+    "display_name": "Llama 4 Maverick 17B FP8",
+    "provider": "meta",
+    "open": true,
+    "size": "17B (400B params)",
+    "thinking": false,
+    "benchmarks": {
+      "aime_25": {
+        "subset": "custom|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-04-07T13-08-22.017751"
+        }
+      },
+      "aime_24": {
+        "subset": "custom|aime24|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-04-07T13-08-22.017751"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": ["prompt_level_strict_acc"],
+        "tags": {
+          "latest": "2025-04-07T13-08-22.017751"
+        }
+      },
+      "gpqa_diamond": {
+        "subset": "custom|gpqa:diamond|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-04-07T13-08-22.017751"
+        }
+      }
+    }
+  }
+}
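
For reference, each entry above maps a model id to display metadata (display_name, provider, open, size, thinking) and a benchmarks object keyed by benchmark name, where every benchmark records the dataset subset, the metrics to read, and timestamp tags naming the evaluation runs. A short sketch (not part of the commit) of walking that schema the same way app.py does, assuming experiments.json sits in the working directory:

import json

# Sketch only: list each model's benchmark subsets and recorded run tags.
with open("experiments.json") as f:
    experiments = json.load(f)

for model_id, cfg in experiments.items():
    print(f"{cfg['display_name']} ({model_id}, size={cfg['size']}, thinking={cfg['thinking']})")
    for bench_name, bench in cfg["benchmarks"].items():
        tags = ", ".join(f"{k}: {v}" for k, v in bench["tags"].items())
        print(f"  {bench_name}: subset={bench['subset']}, metrics={bench['metrics']}, tags=({tags})")

The subset strings here are exactly what app.py collects into BENCHMARKS and passes (with "|" and ":" replaced by "_") as the config name when it queries dataset splits.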