lomahony commited on
Commit
c261350
1 Parent(s): d878f7e

Upload 8 files

Browse files
base-70m-eval-files/EleutherAI-pythia-70m-0shot-shelloutput.txt ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bootstrapping for stddev: perplexity
2
+ {
3
+ "results": {
4
+ "arc_challenge": {
5
+ "acc,none": 0.1757679180887372,
6
+ "acc_stderr,none": 0.011122850863120485,
7
+ "acc_norm,none": 0.21843003412969283,
8
+ "acc_norm_stderr,none": 0.012074291605700975
9
+ },
10
+ "arc_easy": {
11
+ "acc,none": 0.37542087542087543,
12
+ "acc_stderr,none": 0.00993621852711428,
13
+ "acc_norm,none": 0.3522727272727273,
14
+ "acc_norm_stderr,none": 0.009801753933112785
15
+ },
16
+ "boolq": {
17
+ "acc,none": 0.5886850152905199,
18
+ "acc_stderr,none": 0.00860639542630921
19
+ },
20
+ "hellaswag": {
21
+ "acc,none": 0.2666799442342163,
22
+ "acc_stderr,none": 0.004413198640053973,
23
+ "acc_norm,none": 0.27384983071101376,
24
+ "acc_norm_stderr,none": 0.004450214826707207
25
+ },
26
+ "lambada_openai": {
27
+ "perplexity,none": 130.96389727138103,
28
+ "perplexity_stderr,none": 5.501211486155379,
29
+ "acc,none": 0.22705220260042694,
30
+ "acc_stderr,none": 0.005836466732850104
31
+ },
32
+ "openbookqa": {
33
+ "acc,none": 0.126,
34
+ "acc_stderr,none": 0.014855617750787541,
35
+ "acc_norm,none": 0.254,
36
+ "acc_norm_stderr,none": 0.01948659680164337
37
+ },
38
+ "piqa": {
39
+ "acc,none": 0.5984766050054406,
40
+ "acc_stderr,none": 0.011437324373397844,
41
+ "acc_norm,none": 0.5919477693144722,
42
+ "acc_norm_stderr,none": 0.011466872778651264
43
+ },
44
+ "sciq": {
45
+ "acc,none": 0.64,
46
+ "acc_stderr,none": 0.015186527932040117,
47
+ "acc_norm,none": 0.564,
48
+ "acc_norm_stderr,none": 0.015689173023144064
49
+ },
50
+ "wikitext": {
51
+ "word_perplexity,none": 112.6458354552029,
52
+ "byte_perplexity,none": 2.1788907860475493,
53
+ "bits_per_byte,none": 1.1235938851292067
54
+ },
55
+ "winogrande": {
56
+ "acc,none": 0.5295974743488555,
57
+ "acc_stderr,none": 0.014027843827840085
58
+ }
59
+ },
60
+ "configs": {
61
+ "arc_challenge": {
62
+ "task": "arc_challenge",
63
+ "group": [
64
+ "ai2_arc",
65
+ "multiple_choice"
66
+ ],
67
+ "dataset_path": "ai2_arc",
68
+ "dataset_name": "ARC-Challenge",
69
+ "training_split": "train",
70
+ "validation_split": "validation",
71
+ "test_split": "test",
72
+ "doc_to_text": "Question: {{question}}\nAnswer:",
73
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
74
+ "doc_to_choice": "{{choices.text}}",
75
+ "description": "",
76
+ "target_delimiter": " ",
77
+ "fewshot_delimiter": "\n\n",
78
+ "num_fewshot": 0,
79
+ "metric_list": [
80
+ {
81
+ "metric": "acc",
82
+ "aggregation": "mean",
83
+ "higher_is_better": true
84
+ },
85
+ {
86
+ "metric": "acc_norm",
87
+ "aggregation": "mean",
88
+ "higher_is_better": true
89
+ }
90
+ ],
91
+ "output_type": "multiple_choice",
92
+ "repeats": 1,
93
+ "should_decontaminate": true,
94
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
95
+ },
96
+ "arc_easy": {
97
+ "task": "arc_easy",
98
+ "group": [
99
+ "ai2_arc",
100
+ "multiple_choice"
101
+ ],
102
+ "dataset_path": "ai2_arc",
103
+ "dataset_name": "ARC-Easy",
104
+ "training_split": "train",
105
+ "validation_split": "validation",
106
+ "test_split": "test",
107
+ "doc_to_text": "Question: {{question}}\nAnswer:",
108
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
109
+ "doc_to_choice": "{{choices.text}}",
110
+ "description": "",
111
+ "target_delimiter": " ",
112
+ "fewshot_delimiter": "\n\n",
113
+ "num_fewshot": 0,
114
+ "metric_list": [
115
+ {
116
+ "metric": "acc",
117
+ "aggregation": "mean",
118
+ "higher_is_better": true
119
+ },
120
+ {
121
+ "metric": "acc_norm",
122
+ "aggregation": "mean",
123
+ "higher_is_better": true
124
+ }
125
+ ],
126
+ "output_type": "multiple_choice",
127
+ "repeats": 1,
128
+ "should_decontaminate": true,
129
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
130
+ },
131
+ "boolq": {
132
+ "task": "boolq",
133
+ "group": [
134
+ "super-glue-lm-eval-v1"
135
+ ],
136
+ "dataset_path": "super_glue",
137
+ "dataset_name": "boolq",
138
+ "training_split": "train",
139
+ "validation_split": "validation",
140
+ "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:",
141
+ "doc_to_target": "label",
142
+ "doc_to_choice": [
143
+ "no",
144
+ "yes"
145
+ ],
146
+ "description": "",
147
+ "target_delimiter": " ",
148
+ "fewshot_delimiter": "\n\n",
149
+ "num_fewshot": 0,
150
+ "metric_list": [
151
+ {
152
+ "metric": "acc"
153
+ }
154
+ ],
155
+ "output_type": "multiple_choice",
156
+ "repeats": 1,
157
+ "should_decontaminate": true,
158
+ "doc_to_decontamination_query": "passage"
159
+ },
160
+ "hellaswag": {
161
+ "task": "hellaswag",
162
+ "group": [
163
+ "multiple_choice"
164
+ ],
165
+ "dataset_path": "hellaswag",
166
+ "training_split": "train",
167
+ "validation_split": "validation",
168
+ "doc_to_text": "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace(' ', ' ')}}",
169
+ "doc_to_target": "{{label}}",
170
+ "doc_to_choice": "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', ' ', ' ')|list}}",
171
+ "description": "",
172
+ "target_delimiter": " ",
173
+ "fewshot_delimiter": "\n\n",
174
+ "num_fewshot": 0,
175
+ "metric_list": [
176
+ {
177
+ "metric": "acc",
178
+ "aggregation": "mean",
179
+ "higher_is_better": true
180
+ },
181
+ {
182
+ "metric": "acc_norm",
183
+ "aggregation": "mean",
184
+ "higher_is_better": true
185
+ }
186
+ ],
187
+ "output_type": "multiple_choice",
188
+ "repeats": 1,
189
+ "should_decontaminate": false
190
+ },
191
+ "lambada_openai": {
192
+ "task": "lambada_openai",
193
+ "group": [
194
+ "lambada",
195
+ "loglikelihood",
196
+ "perplexity"
197
+ ],
198
+ "dataset_path": "EleutherAI/lambada_openai",
199
+ "dataset_name": "default",
200
+ "test_split": "test",
201
+ "template_aliases": "",
202
+ "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
203
+ "doc_to_target": "{{' '+text.split(' ')[-1]}}",
204
+ "description": "",
205
+ "target_delimiter": " ",
206
+ "fewshot_delimiter": "\n\n",
207
+ "num_fewshot": 0,
208
+ "metric_list": [
209
+ {
210
+ "metric": "perplexity",
211
+ "aggregation": "perplexity",
212
+ "higher_is_better": false
213
+ },
214
+ {
215
+ "metric": "acc",
216
+ "aggregation": "mean",
217
+ "higher_is_better": true
218
+ }
219
+ ],
220
+ "output_type": "loglikelihood",
221
+ "repeats": 1,
222
+ "should_decontaminate": true,
223
+ "doc_to_decontamination_query": "{{text}}"
224
+ },
225
+ "openbookqa": {
226
+ "task": "openbookqa",
227
+ "group": [
228
+ "multiple_choice"
229
+ ],
230
+ "dataset_path": "openbookqa",
231
+ "dataset_name": "main",
232
+ "training_split": "train",
233
+ "validation_split": "validation",
234
+ "test_split": "test",
235
+ "doc_to_text": "question_stem",
236
+ "doc_to_target": "{{choices.label.index(answerKey.lstrip())}}",
237
+ "doc_to_choice": "{{choices.text}}",
238
+ "description": "",
239
+ "target_delimiter": " ",
240
+ "fewshot_delimiter": "\n\n",
241
+ "num_fewshot": 0,
242
+ "metric_list": [
243
+ {
244
+ "metric": "acc",
245
+ "aggregation": "mean",
246
+ "higher_is_better": true
247
+ },
248
+ {
249
+ "metric": "acc_norm",
250
+ "aggregation": "mean",
251
+ "higher_is_better": true
252
+ }
253
+ ],
254
+ "output_type": "multiple_choice",
255
+ "repeats": 1,
256
+ "should_decontaminate": true,
257
+ "doc_to_decontamination_query": "question_stem"
258
+ },
259
+ "piqa": {
260
+ "task": "piqa",
261
+ "group": [
262
+ "multiple_choice"
263
+ ],
264
+ "dataset_path": "piqa",
265
+ "training_split": "train",
266
+ "validation_split": "validation",
267
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
268
+ "doc_to_target": "label",
269
+ "doc_to_choice": "{{[sol1, sol2]}}",
270
+ "description": "",
271
+ "target_delimiter": " ",
272
+ "fewshot_delimiter": "\n\n",
273
+ "num_fewshot": 0,
274
+ "metric_list": [
275
+ {
276
+ "metric": "acc",
277
+ "aggregation": "mean",
278
+ "higher_is_better": true
279
+ },
280
+ {
281
+ "metric": "acc_norm",
282
+ "aggregation": "mean",
283
+ "higher_is_better": true
284
+ }
285
+ ],
286
+ "output_type": "multiple_choice",
287
+ "repeats": 1,
288
+ "should_decontaminate": true,
289
+ "doc_to_decontamination_query": "goal"
290
+ },
291
+ "sciq": {
292
+ "task": "sciq",
293
+ "group": [
294
+ "multiple_choice"
295
+ ],
296
+ "dataset_path": "sciq",
297
+ "training_split": "train",
298
+ "validation_split": "validation",
299
+ "test_split": "test",
300
+ "doc_to_text": "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:",
301
+ "doc_to_target": 3,
302
+ "doc_to_choice": "{{[distractor1, distractor2, distractor3, correct_answer]}}",
303
+ "description": "",
304
+ "target_delimiter": " ",
305
+ "fewshot_delimiter": "\n\n",
306
+ "num_fewshot": 0,
307
+ "metric_list": [
308
+ {
309
+ "metric": "acc",
310
+ "aggregation": "mean",
311
+ "higher_is_better": true
312
+ },
313
+ {
314
+ "metric": "acc_norm",
315
+ "aggregation": "mean",
316
+ "higher_is_better": true
317
+ }
318
+ ],
319
+ "output_type": "multiple_choice",
320
+ "repeats": 1,
321
+ "should_decontaminate": true,
322
+ "doc_to_decontamination_query": "{{support}} {{question}}"
323
+ },
324
+ "wikitext": {
325
+ "task": "wikitext",
326
+ "group": [
327
+ "perplexity",
328
+ "loglikelihood_rolling"
329
+ ],
330
+ "dataset_path": "EleutherAI/wikitext_document_level",
331
+ "dataset_name": "wikitext-2-raw-v1",
332
+ "training_split": "train",
333
+ "validation_split": "validation",
334
+ "test_split": "test",
335
+ "template_aliases": "",
336
+ "doc_to_text": "",
337
+ "doc_to_target": "<function wikitext_detokenizer at 0x7f8221504040>",
338
+ "description": "",
339
+ "target_delimiter": " ",
340
+ "fewshot_delimiter": "\n\n",
341
+ "num_fewshot": 0,
342
+ "metric_list": [
343
+ {
344
+ "metric": "word_perplexity"
345
+ },
346
+ {
347
+ "metric": "byte_perplexity"
348
+ },
349
+ {
350
+ "metric": "bits_per_byte"
351
+ }
352
+ ],
353
+ "output_type": "loglikelihood_rolling",
354
+ "repeats": 1,
355
+ "should_decontaminate": true,
356
+ "doc_to_decontamination_query": "{{page}}"
357
+ },
358
+ "winogrande": {
359
+ "task": "winogrande",
360
+ "dataset_path": "winogrande",
361
+ "dataset_name": "winogrande_xl",
362
+ "training_split": "train",
363
+ "validation_split": "validation",
364
+ "doc_to_text": "<function doc_to_text at 0x7f82214d6ef0>",
365
+ "doc_to_target": "<function doc_to_target at 0x7f82214d7370>",
366
+ "doc_to_choice": "<function doc_to_choice at 0x7f82214d75b0>",
367
+ "description": "",
368
+ "target_delimiter": " ",
369
+ "fewshot_delimiter": "\n\n",
370
+ "num_fewshot": 0,
371
+ "metric_list": [
372
+ {
373
+ "metric": "acc",
374
+ "aggregation": "mean",
375
+ "higher_is_better": true
376
+ }
377
+ ],
378
+ "output_type": "multiple_choice",
379
+ "repeats": 1,
380
+ "should_decontaminate": false
381
+ }
382
+ },
383
+ "versions": {
384
+ "arc_challenge": "Yaml",
385
+ "arc_easy": "Yaml",
386
+ "boolq": "Yaml",
387
+ "hellaswag": "Yaml",
388
+ "lambada_openai": "Yaml",
389
+ "openbookqa": "Yaml",
390
+ "piqa": "Yaml",
391
+ "sciq": "Yaml",
392
+ "wikitext": "Yaml",
393
+ "winogrande": "Yaml"
394
+ },
395
+ "config": {
396
+ "model": "hf",
397
+ "model_args": "pretrained=EleutherAI/pythia-70m",
398
+ "num_fewshot": 0,
399
+ "batch_size": 16,
400
+ "batch_sizes": [],
401
+ "device": "cuda:0",
402
+ "use_cache": null,
403
+ "limit": null,
404
+ "bootstrap_iters": 100000
405
+ },
406
+ "git_hash": "4e44f0a"
407
+ }
408
+ hf (pretrained=EleutherAI/pythia-70m), limit: None, num_fewshot: 0, batch_size: 16
409
+ | Task |Version|Filter| Metric | Value | |Stderr|
410
+ |--------------|-------|------|---------------|-------:|---|-----:|
411
+ |arc_challenge |Yaml |none |acc | 0.1758|± |0.0111|
412
+ | | |none |acc_norm | 0.2184|± |0.0121|
413
+ |arc_easy |Yaml |none |acc | 0.3754|± |0.0099|
414
+ | | |none |acc_norm | 0.3523|± |0.0098|
415
+ |boolq |Yaml |none |acc | 0.5887|± |0.0086|
416
+ |hellaswag |Yaml |none |acc | 0.2667|± |0.0044|
417
+ | | |none |acc_norm | 0.2738|± |0.0045|
418
+ |lambada_openai|Yaml |none |perplexity |130.9639|± |5.5012|
419
+ | | |none |acc | 0.2271|± |0.0058|
420
+ |openbookqa |Yaml |none |acc | 0.1260|± |0.0149|
421
+ | | |none |acc_norm | 0.2540|± |0.0195|
422
+ |piqa |Yaml |none |acc | 0.5985|± |0.0114|
423
+ | | |none |acc_norm | 0.5919|± |0.0115|
424
+ |sciq |Yaml |none |acc | 0.6400|± |0.0152|
425
+ | | |none |acc_norm | 0.5640|± |0.0157|
426
+ |wikitext |Yaml |none |word_perplexity|112.6458| | |
427
+ | | |none |byte_perplexity| 2.1789| | |
428
+ | | |none |bits_per_byte | 1.1236| | |
429
+ |winogrande |Yaml |none |acc | 0.5296|± |0.0140|
430
+
base-70m-eval-files/EleutherAI-pythia-70m-0shot/results.json ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_challenge": {
4
+ "acc,none": 0.1757679180887372,
5
+ "acc_stderr,none": 0.011122850863120485,
6
+ "acc_norm,none": 0.21843003412969283,
7
+ "acc_norm_stderr,none": 0.012074291605700975
8
+ },
9
+ "arc_easy": {
10
+ "acc,none": 0.37542087542087543,
11
+ "acc_stderr,none": 0.00993621852711428,
12
+ "acc_norm,none": 0.3522727272727273,
13
+ "acc_norm_stderr,none": 0.009801753933112785
14
+ },
15
+ "boolq": {
16
+ "acc,none": 0.5886850152905199,
17
+ "acc_stderr,none": 0.00860639542630921
18
+ },
19
+ "hellaswag": {
20
+ "acc,none": 0.2666799442342163,
21
+ "acc_stderr,none": 0.004413198640053973,
22
+ "acc_norm,none": 0.27384983071101376,
23
+ "acc_norm_stderr,none": 0.004450214826707207
24
+ },
25
+ "lambada_openai": {
26
+ "perplexity,none": 130.96389727138103,
27
+ "perplexity_stderr,none": 5.501211486155379,
28
+ "acc,none": 0.22705220260042694,
29
+ "acc_stderr,none": 0.005836466732850104
30
+ },
31
+ "openbookqa": {
32
+ "acc,none": 0.126,
33
+ "acc_stderr,none": 0.014855617750787541,
34
+ "acc_norm,none": 0.254,
35
+ "acc_norm_stderr,none": 0.01948659680164337
36
+ },
37
+ "piqa": {
38
+ "acc,none": 0.5984766050054406,
39
+ "acc_stderr,none": 0.011437324373397844,
40
+ "acc_norm,none": 0.5919477693144722,
41
+ "acc_norm_stderr,none": 0.011466872778651264
42
+ },
43
+ "sciq": {
44
+ "acc,none": 0.64,
45
+ "acc_stderr,none": 0.015186527932040117,
46
+ "acc_norm,none": 0.564,
47
+ "acc_norm_stderr,none": 0.015689173023144064
48
+ },
49
+ "wikitext": {
50
+ "word_perplexity,none": 112.6458354552029,
51
+ "byte_perplexity,none": 2.1788907860475493,
52
+ "bits_per_byte,none": 1.1235938851292067
53
+ },
54
+ "winogrande": {
55
+ "acc,none": 0.5295974743488555,
56
+ "acc_stderr,none": 0.014027843827840085
57
+ }
58
+ },
59
+ "configs": {
60
+ "arc_challenge": {
61
+ "task": "arc_challenge",
62
+ "group": [
63
+ "ai2_arc",
64
+ "multiple_choice"
65
+ ],
66
+ "dataset_path": "ai2_arc",
67
+ "dataset_name": "ARC-Challenge",
68
+ "training_split": "train",
69
+ "validation_split": "validation",
70
+ "test_split": "test",
71
+ "doc_to_text": "Question: {{question}}\nAnswer:",
72
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
73
+ "doc_to_choice": "{{choices.text}}",
74
+ "description": "",
75
+ "target_delimiter": " ",
76
+ "fewshot_delimiter": "\n\n",
77
+ "num_fewshot": 0,
78
+ "metric_list": [
79
+ {
80
+ "metric": "acc",
81
+ "aggregation": "mean",
82
+ "higher_is_better": true
83
+ },
84
+ {
85
+ "metric": "acc_norm",
86
+ "aggregation": "mean",
87
+ "higher_is_better": true
88
+ }
89
+ ],
90
+ "output_type": "multiple_choice",
91
+ "repeats": 1,
92
+ "should_decontaminate": true,
93
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
94
+ },
95
+ "arc_easy": {
96
+ "task": "arc_easy",
97
+ "group": [
98
+ "ai2_arc",
99
+ "multiple_choice"
100
+ ],
101
+ "dataset_path": "ai2_arc",
102
+ "dataset_name": "ARC-Easy",
103
+ "training_split": "train",
104
+ "validation_split": "validation",
105
+ "test_split": "test",
106
+ "doc_to_text": "Question: {{question}}\nAnswer:",
107
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
108
+ "doc_to_choice": "{{choices.text}}",
109
+ "description": "",
110
+ "target_delimiter": " ",
111
+ "fewshot_delimiter": "\n\n",
112
+ "num_fewshot": 0,
113
+ "metric_list": [
114
+ {
115
+ "metric": "acc",
116
+ "aggregation": "mean",
117
+ "higher_is_better": true
118
+ },
119
+ {
120
+ "metric": "acc_norm",
121
+ "aggregation": "mean",
122
+ "higher_is_better": true
123
+ }
124
+ ],
125
+ "output_type": "multiple_choice",
126
+ "repeats": 1,
127
+ "should_decontaminate": true,
128
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
129
+ },
130
+ "boolq": {
131
+ "task": "boolq",
132
+ "group": [
133
+ "super-glue-lm-eval-v1"
134
+ ],
135
+ "dataset_path": "super_glue",
136
+ "dataset_name": "boolq",
137
+ "training_split": "train",
138
+ "validation_split": "validation",
139
+ "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:",
140
+ "doc_to_target": "label",
141
+ "doc_to_choice": [
142
+ "no",
143
+ "yes"
144
+ ],
145
+ "description": "",
146
+ "target_delimiter": " ",
147
+ "fewshot_delimiter": "\n\n",
148
+ "num_fewshot": 0,
149
+ "metric_list": [
150
+ {
151
+ "metric": "acc"
152
+ }
153
+ ],
154
+ "output_type": "multiple_choice",
155
+ "repeats": 1,
156
+ "should_decontaminate": true,
157
+ "doc_to_decontamination_query": "passage"
158
+ },
159
+ "hellaswag": {
160
+ "task": "hellaswag",
161
+ "group": [
162
+ "multiple_choice"
163
+ ],
164
+ "dataset_path": "hellaswag",
165
+ "training_split": "train",
166
+ "validation_split": "validation",
167
+ "doc_to_text": "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace(' ', ' ')}}",
168
+ "doc_to_target": "{{label}}",
169
+ "doc_to_choice": "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', ' ', ' ')|list}}",
170
+ "description": "",
171
+ "target_delimiter": " ",
172
+ "fewshot_delimiter": "\n\n",
173
+ "num_fewshot": 0,
174
+ "metric_list": [
175
+ {
176
+ "metric": "acc",
177
+ "aggregation": "mean",
178
+ "higher_is_better": true
179
+ },
180
+ {
181
+ "metric": "acc_norm",
182
+ "aggregation": "mean",
183
+ "higher_is_better": true
184
+ }
185
+ ],
186
+ "output_type": "multiple_choice",
187
+ "repeats": 1,
188
+ "should_decontaminate": false
189
+ },
190
+ "lambada_openai": {
191
+ "task": "lambada_openai",
192
+ "group": [
193
+ "lambada",
194
+ "loglikelihood",
195
+ "perplexity"
196
+ ],
197
+ "dataset_path": "EleutherAI/lambada_openai",
198
+ "dataset_name": "default",
199
+ "test_split": "test",
200
+ "template_aliases": "",
201
+ "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
202
+ "doc_to_target": "{{' '+text.split(' ')[-1]}}",
203
+ "description": "",
204
+ "target_delimiter": " ",
205
+ "fewshot_delimiter": "\n\n",
206
+ "num_fewshot": 0,
207
+ "metric_list": [
208
+ {
209
+ "metric": "perplexity",
210
+ "aggregation": "perplexity",
211
+ "higher_is_better": false
212
+ },
213
+ {
214
+ "metric": "acc",
215
+ "aggregation": "mean",
216
+ "higher_is_better": true
217
+ }
218
+ ],
219
+ "output_type": "loglikelihood",
220
+ "repeats": 1,
221
+ "should_decontaminate": true,
222
+ "doc_to_decontamination_query": "{{text}}"
223
+ },
224
+ "openbookqa": {
225
+ "task": "openbookqa",
226
+ "group": [
227
+ "multiple_choice"
228
+ ],
229
+ "dataset_path": "openbookqa",
230
+ "dataset_name": "main",
231
+ "training_split": "train",
232
+ "validation_split": "validation",
233
+ "test_split": "test",
234
+ "doc_to_text": "question_stem",
235
+ "doc_to_target": "{{choices.label.index(answerKey.lstrip())}}",
236
+ "doc_to_choice": "{{choices.text}}",
237
+ "description": "",
238
+ "target_delimiter": " ",
239
+ "fewshot_delimiter": "\n\n",
240
+ "num_fewshot": 0,
241
+ "metric_list": [
242
+ {
243
+ "metric": "acc",
244
+ "aggregation": "mean",
245
+ "higher_is_better": true
246
+ },
247
+ {
248
+ "metric": "acc_norm",
249
+ "aggregation": "mean",
250
+ "higher_is_better": true
251
+ }
252
+ ],
253
+ "output_type": "multiple_choice",
254
+ "repeats": 1,
255
+ "should_decontaminate": true,
256
+ "doc_to_decontamination_query": "question_stem"
257
+ },
258
+ "piqa": {
259
+ "task": "piqa",
260
+ "group": [
261
+ "multiple_choice"
262
+ ],
263
+ "dataset_path": "piqa",
264
+ "training_split": "train",
265
+ "validation_split": "validation",
266
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
267
+ "doc_to_target": "label",
268
+ "doc_to_choice": "{{[sol1, sol2]}}",
269
+ "description": "",
270
+ "target_delimiter": " ",
271
+ "fewshot_delimiter": "\n\n",
272
+ "num_fewshot": 0,
273
+ "metric_list": [
274
+ {
275
+ "metric": "acc",
276
+ "aggregation": "mean",
277
+ "higher_is_better": true
278
+ },
279
+ {
280
+ "metric": "acc_norm",
281
+ "aggregation": "mean",
282
+ "higher_is_better": true
283
+ }
284
+ ],
285
+ "output_type": "multiple_choice",
286
+ "repeats": 1,
287
+ "should_decontaminate": true,
288
+ "doc_to_decontamination_query": "goal"
289
+ },
290
+ "sciq": {
291
+ "task": "sciq",
292
+ "group": [
293
+ "multiple_choice"
294
+ ],
295
+ "dataset_path": "sciq",
296
+ "training_split": "train",
297
+ "validation_split": "validation",
298
+ "test_split": "test",
299
+ "doc_to_text": "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:",
300
+ "doc_to_target": 3,
301
+ "doc_to_choice": "{{[distractor1, distractor2, distractor3, correct_answer]}}",
302
+ "description": "",
303
+ "target_delimiter": " ",
304
+ "fewshot_delimiter": "\n\n",
305
+ "num_fewshot": 0,
306
+ "metric_list": [
307
+ {
308
+ "metric": "acc",
309
+ "aggregation": "mean",
310
+ "higher_is_better": true
311
+ },
312
+ {
313
+ "metric": "acc_norm",
314
+ "aggregation": "mean",
315
+ "higher_is_better": true
316
+ }
317
+ ],
318
+ "output_type": "multiple_choice",
319
+ "repeats": 1,
320
+ "should_decontaminate": true,
321
+ "doc_to_decontamination_query": "{{support}} {{question}}"
322
+ },
323
+ "wikitext": {
324
+ "task": "wikitext",
325
+ "group": [
326
+ "perplexity",
327
+ "loglikelihood_rolling"
328
+ ],
329
+ "dataset_path": "EleutherAI/wikitext_document_level",
330
+ "dataset_name": "wikitext-2-raw-v1",
331
+ "training_split": "train",
332
+ "validation_split": "validation",
333
+ "test_split": "test",
334
+ "template_aliases": "",
335
+ "doc_to_text": "",
336
+ "doc_to_target": "<function wikitext_detokenizer at 0x7f8221504040>",
337
+ "description": "",
338
+ "target_delimiter": " ",
339
+ "fewshot_delimiter": "\n\n",
340
+ "num_fewshot": 0,
341
+ "metric_list": [
342
+ {
343
+ "metric": "word_perplexity"
344
+ },
345
+ {
346
+ "metric": "byte_perplexity"
347
+ },
348
+ {
349
+ "metric": "bits_per_byte"
350
+ }
351
+ ],
352
+ "output_type": "loglikelihood_rolling",
353
+ "repeats": 1,
354
+ "should_decontaminate": true,
355
+ "doc_to_decontamination_query": "{{page}}"
356
+ },
357
+ "winogrande": {
358
+ "task": "winogrande",
359
+ "dataset_path": "winogrande",
360
+ "dataset_name": "winogrande_xl",
361
+ "training_split": "train",
362
+ "validation_split": "validation",
363
+ "doc_to_text": "<function doc_to_text at 0x7f82214d6ef0>",
364
+ "doc_to_target": "<function doc_to_target at 0x7f82214d7370>",
365
+ "doc_to_choice": "<function doc_to_choice at 0x7f82214d75b0>",
366
+ "description": "",
367
+ "target_delimiter": " ",
368
+ "fewshot_delimiter": "\n\n",
369
+ "num_fewshot": 0,
370
+ "metric_list": [
371
+ {
372
+ "metric": "acc",
373
+ "aggregation": "mean",
374
+ "higher_is_better": true
375
+ }
376
+ ],
377
+ "output_type": "multiple_choice",
378
+ "repeats": 1,
379
+ "should_decontaminate": false
380
+ }
381
+ },
382
+ "versions": {
383
+ "arc_challenge": "Yaml",
384
+ "arc_easy": "Yaml",
385
+ "boolq": "Yaml",
386
+ "hellaswag": "Yaml",
387
+ "lambada_openai": "Yaml",
388
+ "openbookqa": "Yaml",
389
+ "piqa": "Yaml",
390
+ "sciq": "Yaml",
391
+ "wikitext": "Yaml",
392
+ "winogrande": "Yaml"
393
+ },
394
+ "config": {
395
+ "model": "hf",
396
+ "model_args": "pretrained=EleutherAI/pythia-70m",
397
+ "num_fewshot": 0,
398
+ "batch_size": 16,
399
+ "batch_sizes": [],
400
+ "device": "cuda:0",
401
+ "use_cache": null,
402
+ "limit": null,
403
+ "bootstrap_iters": 100000
404
+ },
405
+ "git_hash": "4e44f0a"
406
+ }
base-70m-eval-files/EleutherAI-pythia-70m-5shot-shelloutput.txt ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bootstrapping for stddev: perplexity
2
+ {
3
+ "results": {
4
+ "arc_challenge": {
5
+ "acc,none": 0.16723549488054607,
6
+ "acc_stderr,none": 0.010905532724601189,
7
+ "acc_norm,none": 0.2090443686006826,
8
+ "acc_norm_stderr,none": 0.011882746987406444
9
+ },
10
+ "arc_easy": {
11
+ "acc,none": 0.367003367003367,
12
+ "acc_stderr,none": 0.009890173658452116,
13
+ "acc_norm,none": 0.3581649831649832,
14
+ "acc_norm_stderr,none": 0.009838331651451848
15
+ },
16
+ "boolq": {
17
+ "acc,none": 0.5480122324159021,
18
+ "acc_stderr,none": 0.008704643851177517
19
+ },
20
+ "hellaswag": {
21
+ "acc,none": 0.26598287193786097,
22
+ "acc_stderr,none": 0.004409521343140176,
23
+ "acc_norm,none": 0.2747460665206134,
24
+ "acc_norm_stderr,none": 0.004454739415705064
25
+ },
26
+ "lambada_openai": {
27
+ "perplexity,none": 300.2755221103115,
28
+ "perplexity_stderr,none": 12.327724137522168,
29
+ "acc,none": 0.14341160489035512,
30
+ "acc_stderr,none": 0.004883040360919453
31
+ },
32
+ "openbookqa": {
33
+ "acc,none": 0.144,
34
+ "acc_stderr,none": 0.015716934945725767,
35
+ "acc_norm,none": 0.244,
36
+ "acc_norm_stderr,none": 0.01922673489361458
37
+ },
38
+ "piqa": {
39
+ "acc,none": 0.5875952121871599,
40
+ "acc_stderr,none": 0.011485407152743137,
41
+ "acc_norm,none": 0.588139281828074,
42
+ "acc_norm_stderr,none": 0.011483141106304395
43
+ },
44
+ "sciq": {
45
+ "acc,none": 0.622,
46
+ "acc_stderr,none": 0.015341165254026637,
47
+ "acc_norm,none": 0.589,
48
+ "acc_norm_stderr,none": 0.01556667341859927
49
+ },
50
+ "wikitext": {
51
+ "word_perplexity,none": 112.6458354552029,
52
+ "byte_perplexity,none": 2.1788907860475493,
53
+ "bits_per_byte,none": 1.1235938851292067
54
+ },
55
+ "winogrande": {
56
+ "acc,none": 0.5295974743488555,
57
+ "acc_stderr,none": 0.014027843827840085
58
+ }
59
+ },
60
+ "configs": {
61
+ "arc_challenge": {
62
+ "task": "arc_challenge",
63
+ "group": [
64
+ "ai2_arc",
65
+ "multiple_choice"
66
+ ],
67
+ "dataset_path": "ai2_arc",
68
+ "dataset_name": "ARC-Challenge",
69
+ "training_split": "train",
70
+ "validation_split": "validation",
71
+ "test_split": "test",
72
+ "doc_to_text": "Question: {{question}}\nAnswer:",
73
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
74
+ "doc_to_choice": "{{choices.text}}",
75
+ "description": "",
76
+ "target_delimiter": " ",
77
+ "fewshot_delimiter": "\n\n",
78
+ "num_fewshot": 5,
79
+ "metric_list": [
80
+ {
81
+ "metric": "acc",
82
+ "aggregation": "mean",
83
+ "higher_is_better": true
84
+ },
85
+ {
86
+ "metric": "acc_norm",
87
+ "aggregation": "mean",
88
+ "higher_is_better": true
89
+ }
90
+ ],
91
+ "output_type": "multiple_choice",
92
+ "repeats": 1,
93
+ "should_decontaminate": true,
94
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
95
+ },
96
+ "arc_easy": {
97
+ "task": "arc_easy",
98
+ "group": [
99
+ "ai2_arc",
100
+ "multiple_choice"
101
+ ],
102
+ "dataset_path": "ai2_arc",
103
+ "dataset_name": "ARC-Easy",
104
+ "training_split": "train",
105
+ "validation_split": "validation",
106
+ "test_split": "test",
107
+ "doc_to_text": "Question: {{question}}\nAnswer:",
108
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
109
+ "doc_to_choice": "{{choices.text}}",
110
+ "description": "",
111
+ "target_delimiter": " ",
112
+ "fewshot_delimiter": "\n\n",
113
+ "num_fewshot": 5,
114
+ "metric_list": [
115
+ {
116
+ "metric": "acc",
117
+ "aggregation": "mean",
118
+ "higher_is_better": true
119
+ },
120
+ {
121
+ "metric": "acc_norm",
122
+ "aggregation": "mean",
123
+ "higher_is_better": true
124
+ }
125
+ ],
126
+ "output_type": "multiple_choice",
127
+ "repeats": 1,
128
+ "should_decontaminate": true,
129
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
130
+ },
131
+ "boolq": {
132
+ "task": "boolq",
133
+ "group": [
134
+ "super-glue-lm-eval-v1"
135
+ ],
136
+ "dataset_path": "super_glue",
137
+ "dataset_name": "boolq",
138
+ "training_split": "train",
139
+ "validation_split": "validation",
140
+ "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:",
141
+ "doc_to_target": "label",
142
+ "doc_to_choice": [
143
+ "no",
144
+ "yes"
145
+ ],
146
+ "description": "",
147
+ "target_delimiter": " ",
148
+ "fewshot_delimiter": "\n\n",
149
+ "num_fewshot": 5,
150
+ "metric_list": [
151
+ {
152
+ "metric": "acc"
153
+ }
154
+ ],
155
+ "output_type": "multiple_choice",
156
+ "repeats": 1,
157
+ "should_decontaminate": true,
158
+ "doc_to_decontamination_query": "passage"
159
+ },
160
+ "hellaswag": {
161
+ "task": "hellaswag",
162
+ "group": [
163
+ "multiple_choice"
164
+ ],
165
+ "dataset_path": "hellaswag",
166
+ "training_split": "train",
167
+ "validation_split": "validation",
168
+ "doc_to_text": "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace(' ', ' ')}}",
169
+ "doc_to_target": "{{label}}",
170
+ "doc_to_choice": "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', ' ', ' ')|list}}",
171
+ "description": "",
172
+ "target_delimiter": " ",
173
+ "fewshot_delimiter": "\n\n",
174
+ "num_fewshot": 5,
175
+ "metric_list": [
176
+ {
177
+ "metric": "acc",
178
+ "aggregation": "mean",
179
+ "higher_is_better": true
180
+ },
181
+ {
182
+ "metric": "acc_norm",
183
+ "aggregation": "mean",
184
+ "higher_is_better": true
185
+ }
186
+ ],
187
+ "output_type": "multiple_choice",
188
+ "repeats": 1,
189
+ "should_decontaminate": false
190
+ },
191
+ "lambada_openai": {
192
+ "task": "lambada_openai",
193
+ "group": [
194
+ "lambada",
195
+ "loglikelihood",
196
+ "perplexity"
197
+ ],
198
+ "dataset_path": "EleutherAI/lambada_openai",
199
+ "dataset_name": "default",
200
+ "test_split": "test",
201
+ "template_aliases": "",
202
+ "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
203
+ "doc_to_target": "{{' '+text.split(' ')[-1]}}",
204
+ "description": "",
205
+ "target_delimiter": " ",
206
+ "fewshot_delimiter": "\n\n",
207
+ "num_fewshot": 5,
208
+ "metric_list": [
209
+ {
210
+ "metric": "perplexity",
211
+ "aggregation": "perplexity",
212
+ "higher_is_better": false
213
+ },
214
+ {
215
+ "metric": "acc",
216
+ "aggregation": "mean",
217
+ "higher_is_better": true
218
+ }
219
+ ],
220
+ "output_type": "loglikelihood",
221
+ "repeats": 1,
222
+ "should_decontaminate": true,
223
+ "doc_to_decontamination_query": "{{text}}"
224
+ },
225
+ "openbookqa": {
226
+ "task": "openbookqa",
227
+ "group": [
228
+ "multiple_choice"
229
+ ],
230
+ "dataset_path": "openbookqa",
231
+ "dataset_name": "main",
232
+ "training_split": "train",
233
+ "validation_split": "validation",
234
+ "test_split": "test",
235
+ "doc_to_text": "question_stem",
236
+ "doc_to_target": "{{choices.label.index(answerKey.lstrip())}}",
237
+ "doc_to_choice": "{{choices.text}}",
238
+ "description": "",
239
+ "target_delimiter": " ",
240
+ "fewshot_delimiter": "\n\n",
241
+ "num_fewshot": 5,
242
+ "metric_list": [
243
+ {
244
+ "metric": "acc",
245
+ "aggregation": "mean",
246
+ "higher_is_better": true
247
+ },
248
+ {
249
+ "metric": "acc_norm",
250
+ "aggregation": "mean",
251
+ "higher_is_better": true
252
+ }
253
+ ],
254
+ "output_type": "multiple_choice",
255
+ "repeats": 1,
256
+ "should_decontaminate": true,
257
+ "doc_to_decontamination_query": "question_stem"
258
+ },
259
+ "piqa": {
260
+ "task": "piqa",
261
+ "group": [
262
+ "multiple_choice"
263
+ ],
264
+ "dataset_path": "piqa",
265
+ "training_split": "train",
266
+ "validation_split": "validation",
267
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
268
+ "doc_to_target": "label",
269
+ "doc_to_choice": "{{[sol1, sol2]}}",
270
+ "description": "",
271
+ "target_delimiter": " ",
272
+ "fewshot_delimiter": "\n\n",
273
+ "num_fewshot": 5,
274
+ "metric_list": [
275
+ {
276
+ "metric": "acc",
277
+ "aggregation": "mean",
278
+ "higher_is_better": true
279
+ },
280
+ {
281
+ "metric": "acc_norm",
282
+ "aggregation": "mean",
283
+ "higher_is_better": true
284
+ }
285
+ ],
286
+ "output_type": "multiple_choice",
287
+ "repeats": 1,
288
+ "should_decontaminate": true,
289
+ "doc_to_decontamination_query": "goal"
290
+ },
291
+ "sciq": {
292
+ "task": "sciq",
293
+ "group": [
294
+ "multiple_choice"
295
+ ],
296
+ "dataset_path": "sciq",
297
+ "training_split": "train",
298
+ "validation_split": "validation",
299
+ "test_split": "test",
300
+ "doc_to_text": "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:",
301
+ "doc_to_target": 3,
302
+ "doc_to_choice": "{{[distractor1, distractor2, distractor3, correct_answer]}}",
303
+ "description": "",
304
+ "target_delimiter": " ",
305
+ "fewshot_delimiter": "\n\n",
306
+ "num_fewshot": 5,
307
+ "metric_list": [
308
+ {
309
+ "metric": "acc",
310
+ "aggregation": "mean",
311
+ "higher_is_better": true
312
+ },
313
+ {
314
+ "metric": "acc_norm",
315
+ "aggregation": "mean",
316
+ "higher_is_better": true
317
+ }
318
+ ],
319
+ "output_type": "multiple_choice",
320
+ "repeats": 1,
321
+ "should_decontaminate": true,
322
+ "doc_to_decontamination_query": "{{support}} {{question}}"
323
+ },
324
+ "wikitext": {
325
+ "task": "wikitext",
326
+ "group": [
327
+ "perplexity",
328
+ "loglikelihood_rolling"
329
+ ],
330
+ "dataset_path": "EleutherAI/wikitext_document_level",
331
+ "dataset_name": "wikitext-2-raw-v1",
332
+ "training_split": "train",
333
+ "validation_split": "validation",
334
+ "test_split": "test",
335
+ "template_aliases": "",
336
+ "doc_to_text": "",
337
+ "doc_to_target": "<function wikitext_detokenizer at 0x7fa6ad108040>",
338
+ "description": "",
339
+ "target_delimiter": " ",
340
+ "fewshot_delimiter": "\n\n",
341
+ "num_fewshot": 5,
342
+ "metric_list": [
343
+ {
344
+ "metric": "word_perplexity"
345
+ },
346
+ {
347
+ "metric": "byte_perplexity"
348
+ },
349
+ {
350
+ "metric": "bits_per_byte"
351
+ }
352
+ ],
353
+ "output_type": "loglikelihood_rolling",
354
+ "repeats": 1,
355
+ "should_decontaminate": true,
356
+ "doc_to_decontamination_query": "{{page}}"
357
+ },
358
+ "winogrande": {
359
+ "task": "winogrande",
360
+ "dataset_path": "winogrande",
361
+ "dataset_name": "winogrande_xl",
362
+ "training_split": "train",
363
+ "validation_split": "validation",
364
+ "doc_to_text": "<function doc_to_text at 0x7fa6ad0daef0>",
365
+ "doc_to_target": "<function doc_to_target at 0x7fa6ad0db370>",
366
+ "doc_to_choice": "<function doc_to_choice at 0x7fa6ad0db5b0>",
367
+ "description": "",
368
+ "target_delimiter": " ",
369
+ "fewshot_delimiter": "\n\n",
370
+ "num_fewshot": 5,
371
+ "metric_list": [
372
+ {
373
+ "metric": "acc",
374
+ "aggregation": "mean",
375
+ "higher_is_better": true
376
+ }
377
+ ],
378
+ "output_type": "multiple_choice",
379
+ "repeats": 1,
380
+ "should_decontaminate": false
381
+ }
382
+ },
383
+ "versions": {
384
+ "arc_challenge": "Yaml",
385
+ "arc_easy": "Yaml",
386
+ "boolq": "Yaml",
387
+ "hellaswag": "Yaml",
388
+ "lambada_openai": "Yaml",
389
+ "openbookqa": "Yaml",
390
+ "piqa": "Yaml",
391
+ "sciq": "Yaml",
392
+ "wikitext": "Yaml",
393
+ "winogrande": "Yaml"
394
+ },
395
+ "config": {
396
+ "model": "hf",
397
+ "model_args": "pretrained=EleutherAI/pythia-70m",
398
+ "num_fewshot": 5,
399
+ "batch_size": 16,
400
+ "batch_sizes": [],
401
+ "device": "cuda:0",
402
+ "use_cache": null,
403
+ "limit": null,
404
+ "bootstrap_iters": 100000
405
+ },
406
+ "git_hash": "4e44f0a"
407
+ }
408
+ hf (pretrained=EleutherAI/pythia-70m), limit: None, num_fewshot: 5, batch_size: 16
409
+ | Task |Version|Filter| Metric | Value | |Stderr |
410
+ |--------------|-------|------|---------------|-------:|---|------:|
411
+ |arc_challenge |Yaml |none |acc | 0.1672|± | 0.0109|
412
+ | | |none |acc_norm | 0.2090|± | 0.0119|
413
+ |arc_easy |Yaml |none |acc | 0.3670|± | 0.0099|
414
+ | | |none |acc_norm | 0.3582|± | 0.0098|
415
+ |boolq |Yaml |none |acc | 0.5480|± | 0.0087|
416
+ |hellaswag |Yaml |none |acc | 0.2660|± | 0.0044|
417
+ | | |none |acc_norm | 0.2747|± | 0.0045|
418
+ |lambada_openai|Yaml |none |perplexity |300.2755|± |12.3277|
419
+ | | |none |acc | 0.1434|± | 0.0049|
420
+ |openbookqa |Yaml |none |acc | 0.1440|± | 0.0157|
421
+ | | |none |acc_norm | 0.2440|± | 0.0192|
422
+ |piqa |Yaml |none |acc | 0.5876|± | 0.0115|
423
+ | | |none |acc_norm | 0.5881|± | 0.0115|
424
+ |sciq |Yaml |none |acc | 0.6220|± | 0.0153|
425
+ | | |none |acc_norm | 0.5890|± | 0.0156|
426
+ |wikitext |Yaml |none |word_perplexity|112.6458| | |
427
+ | | |none |byte_perplexity| 2.1789| | |
428
+ | | |none |bits_per_byte | 1.1236| | |
429
+ |winogrande |Yaml |none |acc | 0.5296|± | 0.0140|
430
+
base-70m-eval-files/EleutherAI-pythia-70m-5shot/results.json ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_challenge": {
4
+ "acc,none": 0.16723549488054607,
5
+ "acc_stderr,none": 0.010905532724601189,
6
+ "acc_norm,none": 0.2090443686006826,
7
+ "acc_norm_stderr,none": 0.011882746987406444
8
+ },
9
+ "arc_easy": {
10
+ "acc,none": 0.367003367003367,
11
+ "acc_stderr,none": 0.009890173658452116,
12
+ "acc_norm,none": 0.3581649831649832,
13
+ "acc_norm_stderr,none": 0.009838331651451848
14
+ },
15
+ "boolq": {
16
+ "acc,none": 0.5480122324159021,
17
+ "acc_stderr,none": 0.008704643851177517
18
+ },
19
+ "hellaswag": {
20
+ "acc,none": 0.26598287193786097,
21
+ "acc_stderr,none": 0.004409521343140176,
22
+ "acc_norm,none": 0.2747460665206134,
23
+ "acc_norm_stderr,none": 0.004454739415705064
24
+ },
25
+ "lambada_openai": {
26
+ "perplexity,none": 300.2755221103115,
27
+ "perplexity_stderr,none": 12.327724137522168,
28
+ "acc,none": 0.14341160489035512,
29
+ "acc_stderr,none": 0.004883040360919453
30
+ },
31
+ "openbookqa": {
32
+ "acc,none": 0.144,
33
+ "acc_stderr,none": 0.015716934945725767,
34
+ "acc_norm,none": 0.244,
35
+ "acc_norm_stderr,none": 0.01922673489361458
36
+ },
37
+ "piqa": {
38
+ "acc,none": 0.5875952121871599,
39
+ "acc_stderr,none": 0.011485407152743137,
40
+ "acc_norm,none": 0.588139281828074,
41
+ "acc_norm_stderr,none": 0.011483141106304395
42
+ },
43
+ "sciq": {
44
+ "acc,none": 0.622,
45
+ "acc_stderr,none": 0.015341165254026637,
46
+ "acc_norm,none": 0.589,
47
+ "acc_norm_stderr,none": 0.01556667341859927
48
+ },
49
+ "wikitext": {
50
+ "word_perplexity,none": 112.6458354552029,
51
+ "byte_perplexity,none": 2.1788907860475493,
52
+ "bits_per_byte,none": 1.1235938851292067
53
+ },
54
+ "winogrande": {
55
+ "acc,none": 0.5295974743488555,
56
+ "acc_stderr,none": 0.014027843827840085
57
+ }
58
+ },
59
+ "configs": {
60
+ "arc_challenge": {
61
+ "task": "arc_challenge",
62
+ "group": [
63
+ "ai2_arc",
64
+ "multiple_choice"
65
+ ],
66
+ "dataset_path": "ai2_arc",
67
+ "dataset_name": "ARC-Challenge",
68
+ "training_split": "train",
69
+ "validation_split": "validation",
70
+ "test_split": "test",
71
+ "doc_to_text": "Question: {{question}}\nAnswer:",
72
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
73
+ "doc_to_choice": "{{choices.text}}",
74
+ "description": "",
75
+ "target_delimiter": " ",
76
+ "fewshot_delimiter": "\n\n",
77
+ "num_fewshot": 5,
78
+ "metric_list": [
79
+ {
80
+ "metric": "acc",
81
+ "aggregation": "mean",
82
+ "higher_is_better": true
83
+ },
84
+ {
85
+ "metric": "acc_norm",
86
+ "aggregation": "mean",
87
+ "higher_is_better": true
88
+ }
89
+ ],
90
+ "output_type": "multiple_choice",
91
+ "repeats": 1,
92
+ "should_decontaminate": true,
93
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
94
+ },
95
+ "arc_easy": {
96
+ "task": "arc_easy",
97
+ "group": [
98
+ "ai2_arc",
99
+ "multiple_choice"
100
+ ],
101
+ "dataset_path": "ai2_arc",
102
+ "dataset_name": "ARC-Easy",
103
+ "training_split": "train",
104
+ "validation_split": "validation",
105
+ "test_split": "test",
106
+ "doc_to_text": "Question: {{question}}\nAnswer:",
107
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
108
+ "doc_to_choice": "{{choices.text}}",
109
+ "description": "",
110
+ "target_delimiter": " ",
111
+ "fewshot_delimiter": "\n\n",
112
+ "num_fewshot": 5,
113
+ "metric_list": [
114
+ {
115
+ "metric": "acc",
116
+ "aggregation": "mean",
117
+ "higher_is_better": true
118
+ },
119
+ {
120
+ "metric": "acc_norm",
121
+ "aggregation": "mean",
122
+ "higher_is_better": true
123
+ }
124
+ ],
125
+ "output_type": "multiple_choice",
126
+ "repeats": 1,
127
+ "should_decontaminate": true,
128
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
129
+ },
130
+ "boolq": {
131
+ "task": "boolq",
132
+ "group": [
133
+ "super-glue-lm-eval-v1"
134
+ ],
135
+ "dataset_path": "super_glue",
136
+ "dataset_name": "boolq",
137
+ "training_split": "train",
138
+ "validation_split": "validation",
139
+ "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:",
140
+ "doc_to_target": "label",
141
+ "doc_to_choice": [
142
+ "no",
143
+ "yes"
144
+ ],
145
+ "description": "",
146
+ "target_delimiter": " ",
147
+ "fewshot_delimiter": "\n\n",
148
+ "num_fewshot": 5,
149
+ "metric_list": [
150
+ {
151
+ "metric": "acc"
152
+ }
153
+ ],
154
+ "output_type": "multiple_choice",
155
+ "repeats": 1,
156
+ "should_decontaminate": true,
157
+ "doc_to_decontamination_query": "passage"
158
+ },
159
+ "hellaswag": {
160
+ "task": "hellaswag",
161
+ "group": [
162
+ "multiple_choice"
163
+ ],
164
+ "dataset_path": "hellaswag",
165
+ "training_split": "train",
166
+ "validation_split": "validation",
167
+ "doc_to_text": "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace(' ', ' ')}}",
168
+ "doc_to_target": "{{label}}",
169
+ "doc_to_choice": "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', ' ', ' ')|list}}",
170
+ "description": "",
171
+ "target_delimiter": " ",
172
+ "fewshot_delimiter": "\n\n",
173
+ "num_fewshot": 5,
174
+ "metric_list": [
175
+ {
176
+ "metric": "acc",
177
+ "aggregation": "mean",
178
+ "higher_is_better": true
179
+ },
180
+ {
181
+ "metric": "acc_norm",
182
+ "aggregation": "mean",
183
+ "higher_is_better": true
184
+ }
185
+ ],
186
+ "output_type": "multiple_choice",
187
+ "repeats": 1,
188
+ "should_decontaminate": false
189
+ },
190
+ "lambada_openai": {
191
+ "task": "lambada_openai",
192
+ "group": [
193
+ "lambada",
194
+ "loglikelihood",
195
+ "perplexity"
196
+ ],
197
+ "dataset_path": "EleutherAI/lambada_openai",
198
+ "dataset_name": "default",
199
+ "test_split": "test",
200
+ "template_aliases": "",
201
+ "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
202
+ "doc_to_target": "{{' '+text.split(' ')[-1]}}",
203
+ "description": "",
204
+ "target_delimiter": " ",
205
+ "fewshot_delimiter": "\n\n",
206
+ "num_fewshot": 5,
207
+ "metric_list": [
208
+ {
209
+ "metric": "perplexity",
210
+ "aggregation": "perplexity",
211
+ "higher_is_better": false
212
+ },
213
+ {
214
+ "metric": "acc",
215
+ "aggregation": "mean",
216
+ "higher_is_better": true
217
+ }
218
+ ],
219
+ "output_type": "loglikelihood",
220
+ "repeats": 1,
221
+ "should_decontaminate": true,
222
+ "doc_to_decontamination_query": "{{text}}"
223
+ },
224
+ "openbookqa": {
225
+ "task": "openbookqa",
226
+ "group": [
227
+ "multiple_choice"
228
+ ],
229
+ "dataset_path": "openbookqa",
230
+ "dataset_name": "main",
231
+ "training_split": "train",
232
+ "validation_split": "validation",
233
+ "test_split": "test",
234
+ "doc_to_text": "question_stem",
235
+ "doc_to_target": "{{choices.label.index(answerKey.lstrip())}}",
236
+ "doc_to_choice": "{{choices.text}}",
237
+ "description": "",
238
+ "target_delimiter": " ",
239
+ "fewshot_delimiter": "\n\n",
240
+ "num_fewshot": 5,
241
+ "metric_list": [
242
+ {
243
+ "metric": "acc",
244
+ "aggregation": "mean",
245
+ "higher_is_better": true
246
+ },
247
+ {
248
+ "metric": "acc_norm",
249
+ "aggregation": "mean",
250
+ "higher_is_better": true
251
+ }
252
+ ],
253
+ "output_type": "multiple_choice",
254
+ "repeats": 1,
255
+ "should_decontaminate": true,
256
+ "doc_to_decontamination_query": "question_stem"
257
+ },
258
+ "piqa": {
259
+ "task": "piqa",
260
+ "group": [
261
+ "multiple_choice"
262
+ ],
263
+ "dataset_path": "piqa",
264
+ "training_split": "train",
265
+ "validation_split": "validation",
266
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
267
+ "doc_to_target": "label",
268
+ "doc_to_choice": "{{[sol1, sol2]}}",
269
+ "description": "",
270
+ "target_delimiter": " ",
271
+ "fewshot_delimiter": "\n\n",
272
+ "num_fewshot": 5,
273
+ "metric_list": [
274
+ {
275
+ "metric": "acc",
276
+ "aggregation": "mean",
277
+ "higher_is_better": true
278
+ },
279
+ {
280
+ "metric": "acc_norm",
281
+ "aggregation": "mean",
282
+ "higher_is_better": true
283
+ }
284
+ ],
285
+ "output_type": "multiple_choice",
286
+ "repeats": 1,
287
+ "should_decontaminate": true,
288
+ "doc_to_decontamination_query": "goal"
289
+ },
290
+ "sciq": {
291
+ "task": "sciq",
292
+ "group": [
293
+ "multiple_choice"
294
+ ],
295
+ "dataset_path": "sciq",
296
+ "training_split": "train",
297
+ "validation_split": "validation",
298
+ "test_split": "test",
299
+ "doc_to_text": "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:",
300
+ "doc_to_target": 3,
301
+ "doc_to_choice": "{{[distractor1, distractor2, distractor3, correct_answer]}}",
302
+ "description": "",
303
+ "target_delimiter": " ",
304
+ "fewshot_delimiter": "\n\n",
305
+ "num_fewshot": 5,
306
+ "metric_list": [
307
+ {
308
+ "metric": "acc",
309
+ "aggregation": "mean",
310
+ "higher_is_better": true
311
+ },
312
+ {
313
+ "metric": "acc_norm",
314
+ "aggregation": "mean",
315
+ "higher_is_better": true
316
+ }
317
+ ],
318
+ "output_type": "multiple_choice",
319
+ "repeats": 1,
320
+ "should_decontaminate": true,
321
+ "doc_to_decontamination_query": "{{support}} {{question}}"
322
+ },
323
+ "wikitext": {
324
+ "task": "wikitext",
325
+ "group": [
326
+ "perplexity",
327
+ "loglikelihood_rolling"
328
+ ],
329
+ "dataset_path": "EleutherAI/wikitext_document_level",
330
+ "dataset_name": "wikitext-2-raw-v1",
331
+ "training_split": "train",
332
+ "validation_split": "validation",
333
+ "test_split": "test",
334
+ "template_aliases": "",
335
+ "doc_to_text": "",
336
+ "doc_to_target": "<function wikitext_detokenizer at 0x7fa6ad108040>",
337
+ "description": "",
338
+ "target_delimiter": " ",
339
+ "fewshot_delimiter": "\n\n",
340
+ "num_fewshot": 5,
341
+ "metric_list": [
342
+ {
343
+ "metric": "word_perplexity"
344
+ },
345
+ {
346
+ "metric": "byte_perplexity"
347
+ },
348
+ {
349
+ "metric": "bits_per_byte"
350
+ }
351
+ ],
352
+ "output_type": "loglikelihood_rolling",
353
+ "repeats": 1,
354
+ "should_decontaminate": true,
355
+ "doc_to_decontamination_query": "{{page}}"
356
+ },
357
+ "winogrande": {
358
+ "task": "winogrande",
359
+ "dataset_path": "winogrande",
360
+ "dataset_name": "winogrande_xl",
361
+ "training_split": "train",
362
+ "validation_split": "validation",
363
+ "doc_to_text": "<function doc_to_text at 0x7fa6ad0daef0>",
364
+ "doc_to_target": "<function doc_to_target at 0x7fa6ad0db370>",
365
+ "doc_to_choice": "<function doc_to_choice at 0x7fa6ad0db5b0>",
366
+ "description": "",
367
+ "target_delimiter": " ",
368
+ "fewshot_delimiter": "\n\n",
369
+ "num_fewshot": 5,
370
+ "metric_list": [
371
+ {
372
+ "metric": "acc",
373
+ "aggregation": "mean",
374
+ "higher_is_better": true
375
+ }
376
+ ],
377
+ "output_type": "multiple_choice",
378
+ "repeats": 1,
379
+ "should_decontaminate": false
380
+ }
381
+ },
382
+ "versions": {
383
+ "arc_challenge": "Yaml",
384
+ "arc_easy": "Yaml",
385
+ "boolq": "Yaml",
386
+ "hellaswag": "Yaml",
387
+ "lambada_openai": "Yaml",
388
+ "openbookqa": "Yaml",
389
+ "piqa": "Yaml",
390
+ "sciq": "Yaml",
391
+ "wikitext": "Yaml",
392
+ "winogrande": "Yaml"
393
+ },
394
+ "config": {
395
+ "model": "hf",
396
+ "model_args": "pretrained=EleutherAI/pythia-70m",
397
+ "num_fewshot": 5,
398
+ "batch_size": 16,
399
+ "batch_sizes": [],
400
+ "device": "cuda:0",
401
+ "use_cache": null,
402
+ "limit": null,
403
+ "bootstrap_iters": 100000
404
+ },
405
+ "git_hash": "4e44f0a"
406
+ }
dpo-70m-eval-files/dpo-pythia-70m-0shot-shelloutput.txt ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bootstrapping for stddev: perplexity
2
+ {
3
+ "results": {
4
+ "arc_challenge": {
5
+ "acc,none": 0.1945392491467577,
6
+ "acc_stderr,none": 0.011567709174648728,
7
+ "acc_norm,none": 0.2235494880546075,
8
+ "acc_norm_stderr,none": 0.012174896631202612
9
+ },
10
+ "arc_easy": {
11
+ "acc,none": 0.39015151515151514,
12
+ "acc_stderr,none": 0.010009118166667422,
13
+ "acc_norm,none": 0.3560606060606061,
14
+ "acc_norm_stderr,none": 0.009825454608416315
15
+ },
16
+ "boolq": {
17
+ "acc,none": 0.6201834862385321,
18
+ "acc_stderr,none": 0.008488668235778608
19
+ },
20
+ "hellaswag": {
21
+ "acc,none": 0.26558454491137223,
22
+ "acc_stderr,none": 0.0044074137233833886,
23
+ "acc_norm,none": 0.27126070503883687,
24
+ "acc_norm_stderr,none": 0.004437016600956912
25
+ },
26
+ "lambada_openai": {
27
+ "perplexity,none": 270.65768032478974,
28
+ "perplexity_stderr,none": 16.606080761646496,
29
+ "acc,none": 0.2233650300795653,
30
+ "acc_stderr,none": 0.005802673494605817
31
+ },
32
+ "openbookqa": {
33
+ "acc,none": 0.132,
34
+ "acc_stderr,none": 0.015152927850580165,
35
+ "acc_norm,none": 0.242,
36
+ "acc_norm_stderr,none": 0.019173085678337167
37
+ },
38
+ "piqa": {
39
+ "acc,none": 0.6044613710554951,
40
+ "acc_stderr,none": 0.011408384494565276,
41
+ "acc_norm,none": 0.5941240478781284,
42
+ "acc_norm_stderr,none": 0.011457256809261783
43
+ },
44
+ "sciq": {
45
+ "acc,none": 0.636,
46
+ "acc_stderr,none": 0.015222868840522019,
47
+ "acc_norm,none": 0.558,
48
+ "acc_norm_stderr,none": 0.015712507211864207
49
+ },
50
+ "wikitext": {
51
+ "word_perplexity,none": 195.77496132792564,
52
+ "byte_perplexity,none": 2.386753581920647,
53
+ "bits_per_byte,none": 1.2550496244455038
54
+ },
55
+ "winogrande": {
56
+ "acc,none": 0.5288082083662194,
57
+ "acc_stderr,none": 0.01402914161590961
58
+ }
59
+ },
60
+ "configs": {
61
+ "arc_challenge": {
62
+ "task": "arc_challenge",
63
+ "group": [
64
+ "ai2_arc",
65
+ "multiple_choice"
66
+ ],
67
+ "dataset_path": "ai2_arc",
68
+ "dataset_name": "ARC-Challenge",
69
+ "training_split": "train",
70
+ "validation_split": "validation",
71
+ "test_split": "test",
72
+ "doc_to_text": "Question: {{question}}\nAnswer:",
73
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
74
+ "doc_to_choice": "{{choices.text}}",
75
+ "description": "",
76
+ "target_delimiter": " ",
77
+ "fewshot_delimiter": "\n\n",
78
+ "num_fewshot": 0,
79
+ "metric_list": [
80
+ {
81
+ "metric": "acc",
82
+ "aggregation": "mean",
83
+ "higher_is_better": true
84
+ },
85
+ {
86
+ "metric": "acc_norm",
87
+ "aggregation": "mean",
88
+ "higher_is_better": true
89
+ }
90
+ ],
91
+ "output_type": "multiple_choice",
92
+ "repeats": 1,
93
+ "should_decontaminate": true,
94
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
95
+ },
96
+ "arc_easy": {
97
+ "task": "arc_easy",
98
+ "group": [
99
+ "ai2_arc",
100
+ "multiple_choice"
101
+ ],
102
+ "dataset_path": "ai2_arc",
103
+ "dataset_name": "ARC-Easy",
104
+ "training_split": "train",
105
+ "validation_split": "validation",
106
+ "test_split": "test",
107
+ "doc_to_text": "Question: {{question}}\nAnswer:",
108
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
109
+ "doc_to_choice": "{{choices.text}}",
110
+ "description": "",
111
+ "target_delimiter": " ",
112
+ "fewshot_delimiter": "\n\n",
113
+ "num_fewshot": 0,
114
+ "metric_list": [
115
+ {
116
+ "metric": "acc",
117
+ "aggregation": "mean",
118
+ "higher_is_better": true
119
+ },
120
+ {
121
+ "metric": "acc_norm",
122
+ "aggregation": "mean",
123
+ "higher_is_better": true
124
+ }
125
+ ],
126
+ "output_type": "multiple_choice",
127
+ "repeats": 1,
128
+ "should_decontaminate": true,
129
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
130
+ },
131
+ "boolq": {
132
+ "task": "boolq",
133
+ "group": [
134
+ "super-glue-lm-eval-v1"
135
+ ],
136
+ "dataset_path": "super_glue",
137
+ "dataset_name": "boolq",
138
+ "training_split": "train",
139
+ "validation_split": "validation",
140
+ "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:",
141
+ "doc_to_target": "label",
142
+ "doc_to_choice": [
143
+ "no",
144
+ "yes"
145
+ ],
146
+ "description": "",
147
+ "target_delimiter": " ",
148
+ "fewshot_delimiter": "\n\n",
149
+ "num_fewshot": 0,
150
+ "metric_list": [
151
+ {
152
+ "metric": "acc"
153
+ }
154
+ ],
155
+ "output_type": "multiple_choice",
156
+ "repeats": 1,
157
+ "should_decontaminate": true,
158
+ "doc_to_decontamination_query": "passage"
159
+ },
160
+ "hellaswag": {
161
+ "task": "hellaswag",
162
+ "group": [
163
+ "multiple_choice"
164
+ ],
165
+ "dataset_path": "hellaswag",
166
+ "training_split": "train",
167
+ "validation_split": "validation",
168
+ "doc_to_text": "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace(' ', ' ')}}",
169
+ "doc_to_target": "{{label}}",
170
+ "doc_to_choice": "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', ' ', ' ')|list}}",
171
+ "description": "",
172
+ "target_delimiter": " ",
173
+ "fewshot_delimiter": "\n\n",
174
+ "num_fewshot": 0,
175
+ "metric_list": [
176
+ {
177
+ "metric": "acc",
178
+ "aggregation": "mean",
179
+ "higher_is_better": true
180
+ },
181
+ {
182
+ "metric": "acc_norm",
183
+ "aggregation": "mean",
184
+ "higher_is_better": true
185
+ }
186
+ ],
187
+ "output_type": "multiple_choice",
188
+ "repeats": 1,
189
+ "should_decontaminate": false
190
+ },
191
+ "lambada_openai": {
192
+ "task": "lambada_openai",
193
+ "group": [
194
+ "lambada",
195
+ "loglikelihood",
196
+ "perplexity"
197
+ ],
198
+ "dataset_path": "EleutherAI/lambada_openai",
199
+ "dataset_name": "default",
200
+ "test_split": "test",
201
+ "template_aliases": "",
202
+ "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
203
+ "doc_to_target": "{{' '+text.split(' ')[-1]}}",
204
+ "description": "",
205
+ "target_delimiter": " ",
206
+ "fewshot_delimiter": "\n\n",
207
+ "num_fewshot": 0,
208
+ "metric_list": [
209
+ {
210
+ "metric": "perplexity",
211
+ "aggregation": "perplexity",
212
+ "higher_is_better": false
213
+ },
214
+ {
215
+ "metric": "acc",
216
+ "aggregation": "mean",
217
+ "higher_is_better": true
218
+ }
219
+ ],
220
+ "output_type": "loglikelihood",
221
+ "repeats": 1,
222
+ "should_decontaminate": true,
223
+ "doc_to_decontamination_query": "{{text}}"
224
+ },
225
+ "openbookqa": {
226
+ "task": "openbookqa",
227
+ "group": [
228
+ "multiple_choice"
229
+ ],
230
+ "dataset_path": "openbookqa",
231
+ "dataset_name": "main",
232
+ "training_split": "train",
233
+ "validation_split": "validation",
234
+ "test_split": "test",
235
+ "doc_to_text": "question_stem",
236
+ "doc_to_target": "{{choices.label.index(answerKey.lstrip())}}",
237
+ "doc_to_choice": "{{choices.text}}",
238
+ "description": "",
239
+ "target_delimiter": " ",
240
+ "fewshot_delimiter": "\n\n",
241
+ "num_fewshot": 0,
242
+ "metric_list": [
243
+ {
244
+ "metric": "acc",
245
+ "aggregation": "mean",
246
+ "higher_is_better": true
247
+ },
248
+ {
249
+ "metric": "acc_norm",
250
+ "aggregation": "mean",
251
+ "higher_is_better": true
252
+ }
253
+ ],
254
+ "output_type": "multiple_choice",
255
+ "repeats": 1,
256
+ "should_decontaminate": true,
257
+ "doc_to_decontamination_query": "question_stem"
258
+ },
259
+ "piqa": {
260
+ "task": "piqa",
261
+ "group": [
262
+ "multiple_choice"
263
+ ],
264
+ "dataset_path": "piqa",
265
+ "training_split": "train",
266
+ "validation_split": "validation",
267
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
268
+ "doc_to_target": "label",
269
+ "doc_to_choice": "{{[sol1, sol2]}}",
270
+ "description": "",
271
+ "target_delimiter": " ",
272
+ "fewshot_delimiter": "\n\n",
273
+ "num_fewshot": 0,
274
+ "metric_list": [
275
+ {
276
+ "metric": "acc",
277
+ "aggregation": "mean",
278
+ "higher_is_better": true
279
+ },
280
+ {
281
+ "metric": "acc_norm",
282
+ "aggregation": "mean",
283
+ "higher_is_better": true
284
+ }
285
+ ],
286
+ "output_type": "multiple_choice",
287
+ "repeats": 1,
288
+ "should_decontaminate": true,
289
+ "doc_to_decontamination_query": "goal"
290
+ },
291
+ "sciq": {
292
+ "task": "sciq",
293
+ "group": [
294
+ "multiple_choice"
295
+ ],
296
+ "dataset_path": "sciq",
297
+ "training_split": "train",
298
+ "validation_split": "validation",
299
+ "test_split": "test",
300
+ "doc_to_text": "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:",
301
+ "doc_to_target": 3,
302
+ "doc_to_choice": "{{[distractor1, distractor2, distractor3, correct_answer]}}",
303
+ "description": "",
304
+ "target_delimiter": " ",
305
+ "fewshot_delimiter": "\n\n",
306
+ "num_fewshot": 0,
307
+ "metric_list": [
308
+ {
309
+ "metric": "acc",
310
+ "aggregation": "mean",
311
+ "higher_is_better": true
312
+ },
313
+ {
314
+ "metric": "acc_norm",
315
+ "aggregation": "mean",
316
+ "higher_is_better": true
317
+ }
318
+ ],
319
+ "output_type": "multiple_choice",
320
+ "repeats": 1,
321
+ "should_decontaminate": true,
322
+ "doc_to_decontamination_query": "{{support}} {{question}}"
323
+ },
324
+ "wikitext": {
325
+ "task": "wikitext",
326
+ "group": [
327
+ "perplexity",
328
+ "loglikelihood_rolling"
329
+ ],
330
+ "dataset_path": "EleutherAI/wikitext_document_level",
331
+ "dataset_name": "wikitext-2-raw-v1",
332
+ "training_split": "train",
333
+ "validation_split": "validation",
334
+ "test_split": "test",
335
+ "template_aliases": "",
336
+ "doc_to_text": "",
337
+ "doc_to_target": "<function wikitext_detokenizer at 0x7fd90c8dfeb0>",
338
+ "description": "",
339
+ "target_delimiter": " ",
340
+ "fewshot_delimiter": "\n\n",
341
+ "num_fewshot": 0,
342
+ "metric_list": [
343
+ {
344
+ "metric": "word_perplexity"
345
+ },
346
+ {
347
+ "metric": "byte_perplexity"
348
+ },
349
+ {
350
+ "metric": "bits_per_byte"
351
+ }
352
+ ],
353
+ "output_type": "loglikelihood_rolling",
354
+ "repeats": 1,
355
+ "should_decontaminate": true,
356
+ "doc_to_decontamination_query": "{{page}}"
357
+ },
358
+ "winogrande": {
359
+ "task": "winogrande",
360
+ "dataset_path": "winogrande",
361
+ "dataset_name": "winogrande_xl",
362
+ "training_split": "train",
363
+ "validation_split": "validation",
364
+ "doc_to_text": "<function doc_to_text at 0x7fd90c8dedd0>",
365
+ "doc_to_target": "<function doc_to_target at 0x7fd90c8df250>",
366
+ "doc_to_choice": "<function doc_to_choice at 0x7fd90c8df490>",
367
+ "description": "",
368
+ "target_delimiter": " ",
369
+ "fewshot_delimiter": "\n\n",
370
+ "num_fewshot": 0,
371
+ "metric_list": [
372
+ {
373
+ "metric": "acc",
374
+ "aggregation": "mean",
375
+ "higher_is_better": true
376
+ }
377
+ ],
378
+ "output_type": "multiple_choice",
379
+ "repeats": 1,
380
+ "should_decontaminate": false
381
+ }
382
+ },
383
+ "versions": {
384
+ "arc_challenge": "Yaml",
385
+ "arc_easy": "Yaml",
386
+ "boolq": "Yaml",
387
+ "hellaswag": "Yaml",
388
+ "lambada_openai": "Yaml",
389
+ "openbookqa": "Yaml",
390
+ "piqa": "Yaml",
391
+ "sciq": "Yaml",
392
+ "wikitext": "Yaml",
393
+ "winogrande": "Yaml"
394
+ },
395
+ "config": {
396
+ "model": "hf",
397
+ "model_args": "pretrained=lomahony/eleuther-pythia70m-hh-dpo",
398
+ "num_fewshot": 0,
399
+ "batch_size": 16,
400
+ "batch_sizes": [],
401
+ "device": "cuda:0",
402
+ "use_cache": null,
403
+ "limit": null,
404
+ "bootstrap_iters": 100000
405
+ },
406
+ "git_hash": "4e44f0a"
407
+ }
408
+ hf (pretrained=lomahony/eleuther-pythia70m-hh-dpo), limit: None, num_fewshot: 0, batch_size: 16
409
+ | Task |Version|Filter| Metric | Value | |Stderr |
410
+ |--------------|-------|------|---------------|-------:|---|------:|
411
+ |arc_challenge |Yaml |none |acc | 0.1945|± | 0.0116|
412
+ | | |none |acc_norm | 0.2235|± | 0.0122|
413
+ |arc_easy |Yaml |none |acc | 0.3902|± | 0.0100|
414
+ | | |none |acc_norm | 0.3561|± | 0.0098|
415
+ |boolq |Yaml |none |acc | 0.6202|± | 0.0085|
416
+ |hellaswag |Yaml |none |acc | 0.2656|± | 0.0044|
417
+ | | |none |acc_norm | 0.2713|± | 0.0044|
418
+ |lambada_openai|Yaml |none |perplexity |270.6577|± |16.6061|
419
+ | | |none |acc | 0.2234|± | 0.0058|
420
+ |openbookqa |Yaml |none |acc | 0.1320|± | 0.0152|
421
+ | | |none |acc_norm | 0.2420|± | 0.0192|
422
+ |piqa |Yaml |none |acc | 0.6045|± | 0.0114|
423
+ | | |none |acc_norm | 0.5941|± | 0.0115|
424
+ |sciq |Yaml |none |acc | 0.6360|± | 0.0152|
425
+ | | |none |acc_norm | 0.5580|± | 0.0157|
426
+ |wikitext |Yaml |none |word_perplexity|195.7750| | |
427
+ | | |none |byte_perplexity| 2.3868| | |
428
+ | | |none |bits_per_byte | 1.2550| | |
429
+ |winogrande |Yaml |none |acc | 0.5288|± | 0.0140|
430
+
dpo-70m-eval-files/dpo-pythia-70m-0shot/results.json ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_challenge": {
4
+ "acc,none": 0.1945392491467577,
5
+ "acc_stderr,none": 0.011567709174648728,
6
+ "acc_norm,none": 0.2235494880546075,
7
+ "acc_norm_stderr,none": 0.012174896631202612
8
+ },
9
+ "arc_easy": {
10
+ "acc,none": 0.39015151515151514,
11
+ "acc_stderr,none": 0.010009118166667422,
12
+ "acc_norm,none": 0.3560606060606061,
13
+ "acc_norm_stderr,none": 0.009825454608416315
14
+ },
15
+ "boolq": {
16
+ "acc,none": 0.6201834862385321,
17
+ "acc_stderr,none": 0.008488668235778608
18
+ },
19
+ "hellaswag": {
20
+ "acc,none": 0.26558454491137223,
21
+ "acc_stderr,none": 0.0044074137233833886,
22
+ "acc_norm,none": 0.27126070503883687,
23
+ "acc_norm_stderr,none": 0.004437016600956912
24
+ },
25
+ "lambada_openai": {
26
+ "perplexity,none": 270.65768032478974,
27
+ "perplexity_stderr,none": 16.606080761646496,
28
+ "acc,none": 0.2233650300795653,
29
+ "acc_stderr,none": 0.005802673494605817
30
+ },
31
+ "openbookqa": {
32
+ "acc,none": 0.132,
33
+ "acc_stderr,none": 0.015152927850580165,
34
+ "acc_norm,none": 0.242,
35
+ "acc_norm_stderr,none": 0.019173085678337167
36
+ },
37
+ "piqa": {
38
+ "acc,none": 0.6044613710554951,
39
+ "acc_stderr,none": 0.011408384494565276,
40
+ "acc_norm,none": 0.5941240478781284,
41
+ "acc_norm_stderr,none": 0.011457256809261783
42
+ },
43
+ "sciq": {
44
+ "acc,none": 0.636,
45
+ "acc_stderr,none": 0.015222868840522019,
46
+ "acc_norm,none": 0.558,
47
+ "acc_norm_stderr,none": 0.015712507211864207
48
+ },
49
+ "wikitext": {
50
+ "word_perplexity,none": 195.77496132792564,
51
+ "byte_perplexity,none": 2.386753581920647,
52
+ "bits_per_byte,none": 1.2550496244455038
53
+ },
54
+ "winogrande": {
55
+ "acc,none": 0.5288082083662194,
56
+ "acc_stderr,none": 0.01402914161590961
57
+ }
58
+ },
59
+ "configs": {
60
+ "arc_challenge": {
61
+ "task": "arc_challenge",
62
+ "group": [
63
+ "ai2_arc",
64
+ "multiple_choice"
65
+ ],
66
+ "dataset_path": "ai2_arc",
67
+ "dataset_name": "ARC-Challenge",
68
+ "training_split": "train",
69
+ "validation_split": "validation",
70
+ "test_split": "test",
71
+ "doc_to_text": "Question: {{question}}\nAnswer:",
72
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
73
+ "doc_to_choice": "{{choices.text}}",
74
+ "description": "",
75
+ "target_delimiter": " ",
76
+ "fewshot_delimiter": "\n\n",
77
+ "num_fewshot": 0,
78
+ "metric_list": [
79
+ {
80
+ "metric": "acc",
81
+ "aggregation": "mean",
82
+ "higher_is_better": true
83
+ },
84
+ {
85
+ "metric": "acc_norm",
86
+ "aggregation": "mean",
87
+ "higher_is_better": true
88
+ }
89
+ ],
90
+ "output_type": "multiple_choice",
91
+ "repeats": 1,
92
+ "should_decontaminate": true,
93
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
94
+ },
95
+ "arc_easy": {
96
+ "task": "arc_easy",
97
+ "group": [
98
+ "ai2_arc",
99
+ "multiple_choice"
100
+ ],
101
+ "dataset_path": "ai2_arc",
102
+ "dataset_name": "ARC-Easy",
103
+ "training_split": "train",
104
+ "validation_split": "validation",
105
+ "test_split": "test",
106
+ "doc_to_text": "Question: {{question}}\nAnswer:",
107
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
108
+ "doc_to_choice": "{{choices.text}}",
109
+ "description": "",
110
+ "target_delimiter": " ",
111
+ "fewshot_delimiter": "\n\n",
112
+ "num_fewshot": 0,
113
+ "metric_list": [
114
+ {
115
+ "metric": "acc",
116
+ "aggregation": "mean",
117
+ "higher_is_better": true
118
+ },
119
+ {
120
+ "metric": "acc_norm",
121
+ "aggregation": "mean",
122
+ "higher_is_better": true
123
+ }
124
+ ],
125
+ "output_type": "multiple_choice",
126
+ "repeats": 1,
127
+ "should_decontaminate": true,
128
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
129
+ },
130
+ "boolq": {
131
+ "task": "boolq",
132
+ "group": [
133
+ "super-glue-lm-eval-v1"
134
+ ],
135
+ "dataset_path": "super_glue",
136
+ "dataset_name": "boolq",
137
+ "training_split": "train",
138
+ "validation_split": "validation",
139
+ "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:",
140
+ "doc_to_target": "label",
141
+ "doc_to_choice": [
142
+ "no",
143
+ "yes"
144
+ ],
145
+ "description": "",
146
+ "target_delimiter": " ",
147
+ "fewshot_delimiter": "\n\n",
148
+ "num_fewshot": 0,
149
+ "metric_list": [
150
+ {
151
+ "metric": "acc"
152
+ }
153
+ ],
154
+ "output_type": "multiple_choice",
155
+ "repeats": 1,
156
+ "should_decontaminate": true,
157
+ "doc_to_decontamination_query": "passage"
158
+ },
159
+ "hellaswag": {
160
+ "task": "hellaswag",
161
+ "group": [
162
+ "multiple_choice"
163
+ ],
164
+ "dataset_path": "hellaswag",
165
+ "training_split": "train",
166
+ "validation_split": "validation",
167
+ "doc_to_text": "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace(' ', ' ')}}",
168
+ "doc_to_target": "{{label}}",
169
+ "doc_to_choice": "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', ' ', ' ')|list}}",
170
+ "description": "",
171
+ "target_delimiter": " ",
172
+ "fewshot_delimiter": "\n\n",
173
+ "num_fewshot": 0,
174
+ "metric_list": [
175
+ {
176
+ "metric": "acc",
177
+ "aggregation": "mean",
178
+ "higher_is_better": true
179
+ },
180
+ {
181
+ "metric": "acc_norm",
182
+ "aggregation": "mean",
183
+ "higher_is_better": true
184
+ }
185
+ ],
186
+ "output_type": "multiple_choice",
187
+ "repeats": 1,
188
+ "should_decontaminate": false
189
+ },
190
+ "lambada_openai": {
191
+ "task": "lambada_openai",
192
+ "group": [
193
+ "lambada",
194
+ "loglikelihood",
195
+ "perplexity"
196
+ ],
197
+ "dataset_path": "EleutherAI/lambada_openai",
198
+ "dataset_name": "default",
199
+ "test_split": "test",
200
+ "template_aliases": "",
201
+ "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
202
+ "doc_to_target": "{{' '+text.split(' ')[-1]}}",
203
+ "description": "",
204
+ "target_delimiter": " ",
205
+ "fewshot_delimiter": "\n\n",
206
+ "num_fewshot": 0,
207
+ "metric_list": [
208
+ {
209
+ "metric": "perplexity",
210
+ "aggregation": "perplexity",
211
+ "higher_is_better": false
212
+ },
213
+ {
214
+ "metric": "acc",
215
+ "aggregation": "mean",
216
+ "higher_is_better": true
217
+ }
218
+ ],
219
+ "output_type": "loglikelihood",
220
+ "repeats": 1,
221
+ "should_decontaminate": true,
222
+ "doc_to_decontamination_query": "{{text}}"
223
+ },
224
+ "openbookqa": {
225
+ "task": "openbookqa",
226
+ "group": [
227
+ "multiple_choice"
228
+ ],
229
+ "dataset_path": "openbookqa",
230
+ "dataset_name": "main",
231
+ "training_split": "train",
232
+ "validation_split": "validation",
233
+ "test_split": "test",
234
+ "doc_to_text": "question_stem",
235
+ "doc_to_target": "{{choices.label.index(answerKey.lstrip())}}",
236
+ "doc_to_choice": "{{choices.text}}",
237
+ "description": "",
238
+ "target_delimiter": " ",
239
+ "fewshot_delimiter": "\n\n",
240
+ "num_fewshot": 0,
241
+ "metric_list": [
242
+ {
243
+ "metric": "acc",
244
+ "aggregation": "mean",
245
+ "higher_is_better": true
246
+ },
247
+ {
248
+ "metric": "acc_norm",
249
+ "aggregation": "mean",
250
+ "higher_is_better": true
251
+ }
252
+ ],
253
+ "output_type": "multiple_choice",
254
+ "repeats": 1,
255
+ "should_decontaminate": true,
256
+ "doc_to_decontamination_query": "question_stem"
257
+ },
258
+ "piqa": {
259
+ "task": "piqa",
260
+ "group": [
261
+ "multiple_choice"
262
+ ],
263
+ "dataset_path": "piqa",
264
+ "training_split": "train",
265
+ "validation_split": "validation",
266
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
267
+ "doc_to_target": "label",
268
+ "doc_to_choice": "{{[sol1, sol2]}}",
269
+ "description": "",
270
+ "target_delimiter": " ",
271
+ "fewshot_delimiter": "\n\n",
272
+ "num_fewshot": 0,
273
+ "metric_list": [
274
+ {
275
+ "metric": "acc",
276
+ "aggregation": "mean",
277
+ "higher_is_better": true
278
+ },
279
+ {
280
+ "metric": "acc_norm",
281
+ "aggregation": "mean",
282
+ "higher_is_better": true
283
+ }
284
+ ],
285
+ "output_type": "multiple_choice",
286
+ "repeats": 1,
287
+ "should_decontaminate": true,
288
+ "doc_to_decontamination_query": "goal"
289
+ },
290
+ "sciq": {
291
+ "task": "sciq",
292
+ "group": [
293
+ "multiple_choice"
294
+ ],
295
+ "dataset_path": "sciq",
296
+ "training_split": "train",
297
+ "validation_split": "validation",
298
+ "test_split": "test",
299
+ "doc_to_text": "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:",
300
+ "doc_to_target": 3,
301
+ "doc_to_choice": "{{[distractor1, distractor2, distractor3, correct_answer]}}",
302
+ "description": "",
303
+ "target_delimiter": " ",
304
+ "fewshot_delimiter": "\n\n",
305
+ "num_fewshot": 0,
306
+ "metric_list": [
307
+ {
308
+ "metric": "acc",
309
+ "aggregation": "mean",
310
+ "higher_is_better": true
311
+ },
312
+ {
313
+ "metric": "acc_norm",
314
+ "aggregation": "mean",
315
+ "higher_is_better": true
316
+ }
317
+ ],
318
+ "output_type": "multiple_choice",
319
+ "repeats": 1,
320
+ "should_decontaminate": true,
321
+ "doc_to_decontamination_query": "{{support}} {{question}}"
322
+ },
323
+ "wikitext": {
324
+ "task": "wikitext",
325
+ "group": [
326
+ "perplexity",
327
+ "loglikelihood_rolling"
328
+ ],
329
+ "dataset_path": "EleutherAI/wikitext_document_level",
330
+ "dataset_name": "wikitext-2-raw-v1",
331
+ "training_split": "train",
332
+ "validation_split": "validation",
333
+ "test_split": "test",
334
+ "template_aliases": "",
335
+ "doc_to_text": "",
336
+ "doc_to_target": "<function wikitext_detokenizer at 0x7fd90c8dfeb0>",
337
+ "description": "",
338
+ "target_delimiter": " ",
339
+ "fewshot_delimiter": "\n\n",
340
+ "num_fewshot": 0,
341
+ "metric_list": [
342
+ {
343
+ "metric": "word_perplexity"
344
+ },
345
+ {
346
+ "metric": "byte_perplexity"
347
+ },
348
+ {
349
+ "metric": "bits_per_byte"
350
+ }
351
+ ],
352
+ "output_type": "loglikelihood_rolling",
353
+ "repeats": 1,
354
+ "should_decontaminate": true,
355
+ "doc_to_decontamination_query": "{{page}}"
356
+ },
357
+ "winogrande": {
358
+ "task": "winogrande",
359
+ "dataset_path": "winogrande",
360
+ "dataset_name": "winogrande_xl",
361
+ "training_split": "train",
362
+ "validation_split": "validation",
363
+ "doc_to_text": "<function doc_to_text at 0x7fd90c8dedd0>",
364
+ "doc_to_target": "<function doc_to_target at 0x7fd90c8df250>",
365
+ "doc_to_choice": "<function doc_to_choice at 0x7fd90c8df490>",
366
+ "description": "",
367
+ "target_delimiter": " ",
368
+ "fewshot_delimiter": "\n\n",
369
+ "num_fewshot": 0,
370
+ "metric_list": [
371
+ {
372
+ "metric": "acc",
373
+ "aggregation": "mean",
374
+ "higher_is_better": true
375
+ }
376
+ ],
377
+ "output_type": "multiple_choice",
378
+ "repeats": 1,
379
+ "should_decontaminate": false
380
+ }
381
+ },
382
+ "versions": {
383
+ "arc_challenge": "Yaml",
384
+ "arc_easy": "Yaml",
385
+ "boolq": "Yaml",
386
+ "hellaswag": "Yaml",
387
+ "lambada_openai": "Yaml",
388
+ "openbookqa": "Yaml",
389
+ "piqa": "Yaml",
390
+ "sciq": "Yaml",
391
+ "wikitext": "Yaml",
392
+ "winogrande": "Yaml"
393
+ },
394
+ "config": {
395
+ "model": "hf",
396
+ "model_args": "pretrained=lomahony/eleuther-pythia70m-hh-dpo",
397
+ "num_fewshot": 0,
398
+ "batch_size": 16,
399
+ "batch_sizes": [],
400
+ "device": "cuda:0",
401
+ "use_cache": null,
402
+ "limit": null,
403
+ "bootstrap_iters": 100000
404
+ },
405
+ "git_hash": "4e44f0a"
406
+ }
dpo-70m-eval-files/dpo-pythia-70m-5shot-shelloutput.txt ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bootstrapping for stddev: perplexity
2
+ {
3
+ "results": {
4
+ "arc_challenge": {
5
+ "acc,none": 0.18515358361774745,
6
+ "acc_stderr,none": 0.011350774438389694,
7
+ "acc_norm,none": 0.21331058020477817,
8
+ "acc_norm_stderr,none": 0.011970971742326334
9
+ },
10
+ "arc_easy": {
11
+ "acc,none": 0.382996632996633,
12
+ "acc_stderr,none": 0.009974920384536484,
13
+ "acc_norm,none": 0.3707912457912458,
14
+ "acc_norm_stderr,none": 0.00991129282205692
15
+ },
16
+ "boolq": {
17
+ "acc,none": 0.5510703363914373,
18
+ "acc_stderr,none": 0.008699318031464162
19
+ },
20
+ "hellaswag": {
21
+ "acc,none": 0.264389563831906,
22
+ "acc_stderr,none": 0.004401063265803198,
23
+ "acc_norm,none": 0.27155945030870343,
24
+ "acc_norm_stderr,none": 0.004438549152538001
25
+ },
26
+ "lambada_openai": {
27
+ "perplexity,none": 708.6120565245322,
28
+ "perplexity_stderr,none": 41.92659162840902,
29
+ "acc,none": 0.1500097030855812,
30
+ "acc_stderr,none": 0.004974835589564718
31
+ },
32
+ "openbookqa": {
33
+ "acc,none": 0.14,
34
+ "acc_stderr,none": 0.015533272840269651,
35
+ "acc_norm,none": 0.236,
36
+ "acc_norm_stderr,none": 0.019008699622084728
37
+ },
38
+ "piqa": {
39
+ "acc,none": 0.5984766050054406,
40
+ "acc_stderr,none": 0.011437324373397841,
41
+ "acc_norm,none": 0.5930359085963003,
42
+ "acc_norm_stderr,none": 0.011462093919190166
43
+ },
44
+ "sciq": {
45
+ "acc,none": 0.619,
46
+ "acc_stderr,none": 0.015364734787007436,
47
+ "acc_norm,none": 0.584,
48
+ "acc_norm_stderr,none": 0.0155944601441406
49
+ },
50
+ "wikitext": {
51
+ "word_perplexity,none": 195.77496132792564,
52
+ "byte_perplexity,none": 2.386753581920647,
53
+ "bits_per_byte,none": 1.2550496244455038
54
+ },
55
+ "winogrande": {
56
+ "acc,none": 0.5288082083662194,
57
+ "acc_stderr,none": 0.01402914161590961
58
+ }
59
+ },
60
+ "configs": {
61
+ "arc_challenge": {
62
+ "task": "arc_challenge",
63
+ "group": [
64
+ "ai2_arc",
65
+ "multiple_choice"
66
+ ],
67
+ "dataset_path": "ai2_arc",
68
+ "dataset_name": "ARC-Challenge",
69
+ "training_split": "train",
70
+ "validation_split": "validation",
71
+ "test_split": "test",
72
+ "doc_to_text": "Question: {{question}}\nAnswer:",
73
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
74
+ "doc_to_choice": "{{choices.text}}",
75
+ "description": "",
76
+ "target_delimiter": " ",
77
+ "fewshot_delimiter": "\n\n",
78
+ "num_fewshot": 5,
79
+ "metric_list": [
80
+ {
81
+ "metric": "acc",
82
+ "aggregation": "mean",
83
+ "higher_is_better": true
84
+ },
85
+ {
86
+ "metric": "acc_norm",
87
+ "aggregation": "mean",
88
+ "higher_is_better": true
89
+ }
90
+ ],
91
+ "output_type": "multiple_choice",
92
+ "repeats": 1,
93
+ "should_decontaminate": true,
94
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
95
+ },
96
+ "arc_easy": {
97
+ "task": "arc_easy",
98
+ "group": [
99
+ "ai2_arc",
100
+ "multiple_choice"
101
+ ],
102
+ "dataset_path": "ai2_arc",
103
+ "dataset_name": "ARC-Easy",
104
+ "training_split": "train",
105
+ "validation_split": "validation",
106
+ "test_split": "test",
107
+ "doc_to_text": "Question: {{question}}\nAnswer:",
108
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
109
+ "doc_to_choice": "{{choices.text}}",
110
+ "description": "",
111
+ "target_delimiter": " ",
112
+ "fewshot_delimiter": "\n\n",
113
+ "num_fewshot": 5,
114
+ "metric_list": [
115
+ {
116
+ "metric": "acc",
117
+ "aggregation": "mean",
118
+ "higher_is_better": true
119
+ },
120
+ {
121
+ "metric": "acc_norm",
122
+ "aggregation": "mean",
123
+ "higher_is_better": true
124
+ }
125
+ ],
126
+ "output_type": "multiple_choice",
127
+ "repeats": 1,
128
+ "should_decontaminate": true,
129
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
130
+ },
131
+ "boolq": {
132
+ "task": "boolq",
133
+ "group": [
134
+ "super-glue-lm-eval-v1"
135
+ ],
136
+ "dataset_path": "super_glue",
137
+ "dataset_name": "boolq",
138
+ "training_split": "train",
139
+ "validation_split": "validation",
140
+ "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:",
141
+ "doc_to_target": "label",
142
+ "doc_to_choice": [
143
+ "no",
144
+ "yes"
145
+ ],
146
+ "description": "",
147
+ "target_delimiter": " ",
148
+ "fewshot_delimiter": "\n\n",
149
+ "num_fewshot": 5,
150
+ "metric_list": [
151
+ {
152
+ "metric": "acc"
153
+ }
154
+ ],
155
+ "output_type": "multiple_choice",
156
+ "repeats": 1,
157
+ "should_decontaminate": true,
158
+ "doc_to_decontamination_query": "passage"
159
+ },
160
+ "hellaswag": {
161
+ "task": "hellaswag",
162
+ "group": [
163
+ "multiple_choice"
164
+ ],
165
+ "dataset_path": "hellaswag",
166
+ "training_split": "train",
167
+ "validation_split": "validation",
168
+ "doc_to_text": "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace(' ', ' ')}}",
169
+ "doc_to_target": "{{label}}",
170
+ "doc_to_choice": "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', ' ', ' ')|list}}",
171
+ "description": "",
172
+ "target_delimiter": " ",
173
+ "fewshot_delimiter": "\n\n",
174
+ "num_fewshot": 5,
175
+ "metric_list": [
176
+ {
177
+ "metric": "acc",
178
+ "aggregation": "mean",
179
+ "higher_is_better": true
180
+ },
181
+ {
182
+ "metric": "acc_norm",
183
+ "aggregation": "mean",
184
+ "higher_is_better": true
185
+ }
186
+ ],
187
+ "output_type": "multiple_choice",
188
+ "repeats": 1,
189
+ "should_decontaminate": false
190
+ },
191
+ "lambada_openai": {
192
+ "task": "lambada_openai",
193
+ "group": [
194
+ "lambada",
195
+ "loglikelihood",
196
+ "perplexity"
197
+ ],
198
+ "dataset_path": "EleutherAI/lambada_openai",
199
+ "dataset_name": "default",
200
+ "test_split": "test",
201
+ "template_aliases": "",
202
+ "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
203
+ "doc_to_target": "{{' '+text.split(' ')[-1]}}",
204
+ "description": "",
205
+ "target_delimiter": " ",
206
+ "fewshot_delimiter": "\n\n",
207
+ "num_fewshot": 5,
208
+ "metric_list": [
209
+ {
210
+ "metric": "perplexity",
211
+ "aggregation": "perplexity",
212
+ "higher_is_better": false
213
+ },
214
+ {
215
+ "metric": "acc",
216
+ "aggregation": "mean",
217
+ "higher_is_better": true
218
+ }
219
+ ],
220
+ "output_type": "loglikelihood",
221
+ "repeats": 1,
222
+ "should_decontaminate": true,
223
+ "doc_to_decontamination_query": "{{text}}"
224
+ },
225
+ "openbookqa": {
226
+ "task": "openbookqa",
227
+ "group": [
228
+ "multiple_choice"
229
+ ],
230
+ "dataset_path": "openbookqa",
231
+ "dataset_name": "main",
232
+ "training_split": "train",
233
+ "validation_split": "validation",
234
+ "test_split": "test",
235
+ "doc_to_text": "question_stem",
236
+ "doc_to_target": "{{choices.label.index(answerKey.lstrip())}}",
237
+ "doc_to_choice": "{{choices.text}}",
238
+ "description": "",
239
+ "target_delimiter": " ",
240
+ "fewshot_delimiter": "\n\n",
241
+ "num_fewshot": 5,
242
+ "metric_list": [
243
+ {
244
+ "metric": "acc",
245
+ "aggregation": "mean",
246
+ "higher_is_better": true
247
+ },
248
+ {
249
+ "metric": "acc_norm",
250
+ "aggregation": "mean",
251
+ "higher_is_better": true
252
+ }
253
+ ],
254
+ "output_type": "multiple_choice",
255
+ "repeats": 1,
256
+ "should_decontaminate": true,
257
+ "doc_to_decontamination_query": "question_stem"
258
+ },
259
+ "piqa": {
260
+ "task": "piqa",
261
+ "group": [
262
+ "multiple_choice"
263
+ ],
264
+ "dataset_path": "piqa",
265
+ "training_split": "train",
266
+ "validation_split": "validation",
267
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
268
+ "doc_to_target": "label",
269
+ "doc_to_choice": "{{[sol1, sol2]}}",
270
+ "description": "",
271
+ "target_delimiter": " ",
272
+ "fewshot_delimiter": "\n\n",
273
+ "num_fewshot": 5,
274
+ "metric_list": [
275
+ {
276
+ "metric": "acc",
277
+ "aggregation": "mean",
278
+ "higher_is_better": true
279
+ },
280
+ {
281
+ "metric": "acc_norm",
282
+ "aggregation": "mean",
283
+ "higher_is_better": true
284
+ }
285
+ ],
286
+ "output_type": "multiple_choice",
287
+ "repeats": 1,
288
+ "should_decontaminate": true,
289
+ "doc_to_decontamination_query": "goal"
290
+ },
291
+ "sciq": {
292
+ "task": "sciq",
293
+ "group": [
294
+ "multiple_choice"
295
+ ],
296
+ "dataset_path": "sciq",
297
+ "training_split": "train",
298
+ "validation_split": "validation",
299
+ "test_split": "test",
300
+ "doc_to_text": "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:",
301
+ "doc_to_target": 3,
302
+ "doc_to_choice": "{{[distractor1, distractor2, distractor3, correct_answer]}}",
303
+ "description": "",
304
+ "target_delimiter": " ",
305
+ "fewshot_delimiter": "\n\n",
306
+ "num_fewshot": 5,
307
+ "metric_list": [
308
+ {
309
+ "metric": "acc",
310
+ "aggregation": "mean",
311
+ "higher_is_better": true
312
+ },
313
+ {
314
+ "metric": "acc_norm",
315
+ "aggregation": "mean",
316
+ "higher_is_better": true
317
+ }
318
+ ],
319
+ "output_type": "multiple_choice",
320
+ "repeats": 1,
321
+ "should_decontaminate": true,
322
+ "doc_to_decontamination_query": "{{support}} {{question}}"
323
+ },
324
+ "wikitext": {
325
+ "task": "wikitext",
326
+ "group": [
327
+ "perplexity",
328
+ "loglikelihood_rolling"
329
+ ],
330
+ "dataset_path": "EleutherAI/wikitext_document_level",
331
+ "dataset_name": "wikitext-2-raw-v1",
332
+ "training_split": "train",
333
+ "validation_split": "validation",
334
+ "test_split": "test",
335
+ "template_aliases": "",
336
+ "doc_to_text": "",
337
+ "doc_to_target": "<function wikitext_detokenizer at 0x7fa675b9beb0>",
338
+ "description": "",
339
+ "target_delimiter": " ",
340
+ "fewshot_delimiter": "\n\n",
341
+ "num_fewshot": 5,
342
+ "metric_list": [
343
+ {
344
+ "metric": "word_perplexity"
345
+ },
346
+ {
347
+ "metric": "byte_perplexity"
348
+ },
349
+ {
350
+ "metric": "bits_per_byte"
351
+ }
352
+ ],
353
+ "output_type": "loglikelihood_rolling",
354
+ "repeats": 1,
355
+ "should_decontaminate": true,
356
+ "doc_to_decontamination_query": "{{page}}"
357
+ },
358
+ "winogrande": {
359
+ "task": "winogrande",
360
+ "dataset_path": "winogrande",
361
+ "dataset_name": "winogrande_xl",
362
+ "training_split": "train",
363
+ "validation_split": "validation",
364
+ "doc_to_text": "<function doc_to_text at 0x7fa675b9add0>",
365
+ "doc_to_target": "<function doc_to_target at 0x7fa675b9b250>",
366
+ "doc_to_choice": "<function doc_to_choice at 0x7fa675b9b490>",
367
+ "description": "",
368
+ "target_delimiter": " ",
369
+ "fewshot_delimiter": "\n\n",
370
+ "num_fewshot": 5,
371
+ "metric_list": [
372
+ {
373
+ "metric": "acc",
374
+ "aggregation": "mean",
375
+ "higher_is_better": true
376
+ }
377
+ ],
378
+ "output_type": "multiple_choice",
379
+ "repeats": 1,
380
+ "should_decontaminate": false
381
+ }
382
+ },
383
+ "versions": {
384
+ "arc_challenge": "Yaml",
385
+ "arc_easy": "Yaml",
386
+ "boolq": "Yaml",
387
+ "hellaswag": "Yaml",
388
+ "lambada_openai": "Yaml",
389
+ "openbookqa": "Yaml",
390
+ "piqa": "Yaml",
391
+ "sciq": "Yaml",
392
+ "wikitext": "Yaml",
393
+ "winogrande": "Yaml"
394
+ },
395
+ "config": {
396
+ "model": "hf",
397
+ "model_args": "pretrained=lomahony/eleuther-pythia70m-hh-dpo",
398
+ "num_fewshot": 5,
399
+ "batch_size": 16,
400
+ "batch_sizes": [],
401
+ "device": "cuda:0",
402
+ "use_cache": null,
403
+ "limit": null,
404
+ "bootstrap_iters": 100000
405
+ },
406
+ "git_hash": "4e44f0a"
407
+ }
408
+ hf (pretrained=lomahony/eleuther-pythia70m-hh-dpo), limit: None, num_fewshot: 5, batch_size: 16
409
+ | Task |Version|Filter| Metric | Value | |Stderr |
410
+ |--------------|-------|------|---------------|-------:|---|------:|
411
+ |arc_challenge |Yaml |none |acc | 0.1852|± | 0.0114|
412
+ | | |none |acc_norm | 0.2133|± | 0.0120|
413
+ |arc_easy |Yaml |none |acc | 0.3830|± | 0.0100|
414
+ | | |none |acc_norm | 0.3708|± | 0.0099|
415
+ |boolq |Yaml |none |acc | 0.5511|± | 0.0087|
416
+ |hellaswag |Yaml |none |acc | 0.2644|± | 0.0044|
417
+ | | |none |acc_norm | 0.2716|± | 0.0044|
418
+ |lambada_openai|Yaml |none |perplexity |708.6121|± |41.9266|
419
+ | | |none |acc | 0.1500|± | 0.0050|
420
+ |openbookqa |Yaml |none |acc | 0.1400|± | 0.0155|
421
+ | | |none |acc_norm | 0.2360|± | 0.0190|
422
+ |piqa |Yaml |none |acc | 0.5985|± | 0.0114|
423
+ | | |none |acc_norm | 0.5930|± | 0.0115|
424
+ |sciq |Yaml |none |acc | 0.6190|± | 0.0154|
425
+ | | |none |acc_norm | 0.5840|± | 0.0156|
426
+ |wikitext |Yaml |none |word_perplexity|195.7750| | |
427
+ | | |none |byte_perplexity| 2.3868| | |
428
+ | | |none |bits_per_byte | 1.2550| | |
429
+ |winogrande |Yaml |none |acc | 0.5288|± | 0.0140|
430
+
dpo-70m-eval-files/dpo-pythia-70m-5shot/results.json ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_challenge": {
4
+ "acc,none": 0.18515358361774745,
5
+ "acc_stderr,none": 0.011350774438389694,
6
+ "acc_norm,none": 0.21331058020477817,
7
+ "acc_norm_stderr,none": 0.011970971742326334
8
+ },
9
+ "arc_easy": {
10
+ "acc,none": 0.382996632996633,
11
+ "acc_stderr,none": 0.009974920384536484,
12
+ "acc_norm,none": 0.3707912457912458,
13
+ "acc_norm_stderr,none": 0.00991129282205692
14
+ },
15
+ "boolq": {
16
+ "acc,none": 0.5510703363914373,
17
+ "acc_stderr,none": 0.008699318031464162
18
+ },
19
+ "hellaswag": {
20
+ "acc,none": 0.264389563831906,
21
+ "acc_stderr,none": 0.004401063265803198,
22
+ "acc_norm,none": 0.27155945030870343,
23
+ "acc_norm_stderr,none": 0.004438549152538001
24
+ },
25
+ "lambada_openai": {
26
+ "perplexity,none": 708.6120565245322,
27
+ "perplexity_stderr,none": 41.92659162840902,
28
+ "acc,none": 0.1500097030855812,
29
+ "acc_stderr,none": 0.004974835589564718
30
+ },
31
+ "openbookqa": {
32
+ "acc,none": 0.14,
33
+ "acc_stderr,none": 0.015533272840269651,
34
+ "acc_norm,none": 0.236,
35
+ "acc_norm_stderr,none": 0.019008699622084728
36
+ },
37
+ "piqa": {
38
+ "acc,none": 0.5984766050054406,
39
+ "acc_stderr,none": 0.011437324373397841,
40
+ "acc_norm,none": 0.5930359085963003,
41
+ "acc_norm_stderr,none": 0.011462093919190166
42
+ },
43
+ "sciq": {
44
+ "acc,none": 0.619,
45
+ "acc_stderr,none": 0.015364734787007436,
46
+ "acc_norm,none": 0.584,
47
+ "acc_norm_stderr,none": 0.0155944601441406
48
+ },
49
+ "wikitext": {
50
+ "word_perplexity,none": 195.77496132792564,
51
+ "byte_perplexity,none": 2.386753581920647,
52
+ "bits_per_byte,none": 1.2550496244455038
53
+ },
54
+ "winogrande": {
55
+ "acc,none": 0.5288082083662194,
56
+ "acc_stderr,none": 0.01402914161590961
57
+ }
58
+ },
59
+ "configs": {
60
+ "arc_challenge": {
61
+ "task": "arc_challenge",
62
+ "group": [
63
+ "ai2_arc",
64
+ "multiple_choice"
65
+ ],
66
+ "dataset_path": "ai2_arc",
67
+ "dataset_name": "ARC-Challenge",
68
+ "training_split": "train",
69
+ "validation_split": "validation",
70
+ "test_split": "test",
71
+ "doc_to_text": "Question: {{question}}\nAnswer:",
72
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
73
+ "doc_to_choice": "{{choices.text}}",
74
+ "description": "",
75
+ "target_delimiter": " ",
76
+ "fewshot_delimiter": "\n\n",
77
+ "num_fewshot": 5,
78
+ "metric_list": [
79
+ {
80
+ "metric": "acc",
81
+ "aggregation": "mean",
82
+ "higher_is_better": true
83
+ },
84
+ {
85
+ "metric": "acc_norm",
86
+ "aggregation": "mean",
87
+ "higher_is_better": true
88
+ }
89
+ ],
90
+ "output_type": "multiple_choice",
91
+ "repeats": 1,
92
+ "should_decontaminate": true,
93
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
94
+ },
95
+ "arc_easy": {
96
+ "task": "arc_easy",
97
+ "group": [
98
+ "ai2_arc",
99
+ "multiple_choice"
100
+ ],
101
+ "dataset_path": "ai2_arc",
102
+ "dataset_name": "ARC-Easy",
103
+ "training_split": "train",
104
+ "validation_split": "validation",
105
+ "test_split": "test",
106
+ "doc_to_text": "Question: {{question}}\nAnswer:",
107
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
108
+ "doc_to_choice": "{{choices.text}}",
109
+ "description": "",
110
+ "target_delimiter": " ",
111
+ "fewshot_delimiter": "\n\n",
112
+ "num_fewshot": 5,
113
+ "metric_list": [
114
+ {
115
+ "metric": "acc",
116
+ "aggregation": "mean",
117
+ "higher_is_better": true
118
+ },
119
+ {
120
+ "metric": "acc_norm",
121
+ "aggregation": "mean",
122
+ "higher_is_better": true
123
+ }
124
+ ],
125
+ "output_type": "multiple_choice",
126
+ "repeats": 1,
127
+ "should_decontaminate": true,
128
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
129
+ },
130
+ "boolq": {
131
+ "task": "boolq",
132
+ "group": [
133
+ "super-glue-lm-eval-v1"
134
+ ],
135
+ "dataset_path": "super_glue",
136
+ "dataset_name": "boolq",
137
+ "training_split": "train",
138
+ "validation_split": "validation",
139
+ "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:",
140
+ "doc_to_target": "label",
141
+ "doc_to_choice": [
142
+ "no",
143
+ "yes"
144
+ ],
145
+ "description": "",
146
+ "target_delimiter": " ",
147
+ "fewshot_delimiter": "\n\n",
148
+ "num_fewshot": 5,
149
+ "metric_list": [
150
+ {
151
+ "metric": "acc"
152
+ }
153
+ ],
154
+ "output_type": "multiple_choice",
155
+ "repeats": 1,
156
+ "should_decontaminate": true,
157
+ "doc_to_decontamination_query": "passage"
158
+ },
159
+ "hellaswag": {
160
+ "task": "hellaswag",
161
+ "group": [
162
+ "multiple_choice"
163
+ ],
164
+ "dataset_path": "hellaswag",
165
+ "training_split": "train",
166
+ "validation_split": "validation",
167
+ "doc_to_text": "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace(' ', ' ')}}",
168
+ "doc_to_target": "{{label}}",
169
+ "doc_to_choice": "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', ' ', ' ')|list}}",
170
+ "description": "",
171
+ "target_delimiter": " ",
172
+ "fewshot_delimiter": "\n\n",
173
+ "num_fewshot": 5,
174
+ "metric_list": [
175
+ {
176
+ "metric": "acc",
177
+ "aggregation": "mean",
178
+ "higher_is_better": true
179
+ },
180
+ {
181
+ "metric": "acc_norm",
182
+ "aggregation": "mean",
183
+ "higher_is_better": true
184
+ }
185
+ ],
186
+ "output_type": "multiple_choice",
187
+ "repeats": 1,
188
+ "should_decontaminate": false
189
+ },
190
+ "lambada_openai": {
191
+ "task": "lambada_openai",
192
+ "group": [
193
+ "lambada",
194
+ "loglikelihood",
195
+ "perplexity"
196
+ ],
197
+ "dataset_path": "EleutherAI/lambada_openai",
198
+ "dataset_name": "default",
199
+ "test_split": "test",
200
+ "template_aliases": "",
201
+ "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
202
+ "doc_to_target": "{{' '+text.split(' ')[-1]}}",
203
+ "description": "",
204
+ "target_delimiter": " ",
205
+ "fewshot_delimiter": "\n\n",
206
+ "num_fewshot": 5,
207
+ "metric_list": [
208
+ {
209
+ "metric": "perplexity",
210
+ "aggregation": "perplexity",
211
+ "higher_is_better": false
212
+ },
213
+ {
214
+ "metric": "acc",
215
+ "aggregation": "mean",
216
+ "higher_is_better": true
217
+ }
218
+ ],
219
+ "output_type": "loglikelihood",
220
+ "repeats": 1,
221
+ "should_decontaminate": true,
222
+ "doc_to_decontamination_query": "{{text}}"
223
+ },
224
+ "openbookqa": {
225
+ "task": "openbookqa",
226
+ "group": [
227
+ "multiple_choice"
228
+ ],
229
+ "dataset_path": "openbookqa",
230
+ "dataset_name": "main",
231
+ "training_split": "train",
232
+ "validation_split": "validation",
233
+ "test_split": "test",
234
+ "doc_to_text": "question_stem",
235
+ "doc_to_target": "{{choices.label.index(answerKey.lstrip())}}",
236
+ "doc_to_choice": "{{choices.text}}",
237
+ "description": "",
238
+ "target_delimiter": " ",
239
+ "fewshot_delimiter": "\n\n",
240
+ "num_fewshot": 5,
241
+ "metric_list": [
242
+ {
243
+ "metric": "acc",
244
+ "aggregation": "mean",
245
+ "higher_is_better": true
246
+ },
247
+ {
248
+ "metric": "acc_norm",
249
+ "aggregation": "mean",
250
+ "higher_is_better": true
251
+ }
252
+ ],
253
+ "output_type": "multiple_choice",
254
+ "repeats": 1,
255
+ "should_decontaminate": true,
256
+ "doc_to_decontamination_query": "question_stem"
257
+ },
258
+ "piqa": {
259
+ "task": "piqa",
260
+ "group": [
261
+ "multiple_choice"
262
+ ],
263
+ "dataset_path": "piqa",
264
+ "training_split": "train",
265
+ "validation_split": "validation",
266
+ "doc_to_text": "Question: {{goal}}\nAnswer:",
267
+ "doc_to_target": "label",
268
+ "doc_to_choice": "{{[sol1, sol2]}}",
269
+ "description": "",
270
+ "target_delimiter": " ",
271
+ "fewshot_delimiter": "\n\n",
272
+ "num_fewshot": 5,
273
+ "metric_list": [
274
+ {
275
+ "metric": "acc",
276
+ "aggregation": "mean",
277
+ "higher_is_better": true
278
+ },
279
+ {
280
+ "metric": "acc_norm",
281
+ "aggregation": "mean",
282
+ "higher_is_better": true
283
+ }
284
+ ],
285
+ "output_type": "multiple_choice",
286
+ "repeats": 1,
287
+ "should_decontaminate": true,
288
+ "doc_to_decontamination_query": "goal"
289
+ },
290
+ "sciq": {
291
+ "task": "sciq",
292
+ "group": [
293
+ "multiple_choice"
294
+ ],
295
+ "dataset_path": "sciq",
296
+ "training_split": "train",
297
+ "validation_split": "validation",
298
+ "test_split": "test",
299
+ "doc_to_text": "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:",
300
+ "doc_to_target": 3,
301
+ "doc_to_choice": "{{[distractor1, distractor2, distractor3, correct_answer]}}",
302
+ "description": "",
303
+ "target_delimiter": " ",
304
+ "fewshot_delimiter": "\n\n",
305
+ "num_fewshot": 5,
306
+ "metric_list": [
307
+ {
308
+ "metric": "acc",
309
+ "aggregation": "mean",
310
+ "higher_is_better": true
311
+ },
312
+ {
313
+ "metric": "acc_norm",
314
+ "aggregation": "mean",
315
+ "higher_is_better": true
316
+ }
317
+ ],
318
+ "output_type": "multiple_choice",
319
+ "repeats": 1,
320
+ "should_decontaminate": true,
321
+ "doc_to_decontamination_query": "{{support}} {{question}}"
322
+ },
323
+ "wikitext": {
324
+ "task": "wikitext",
325
+ "group": [
326
+ "perplexity",
327
+ "loglikelihood_rolling"
328
+ ],
329
+ "dataset_path": "EleutherAI/wikitext_document_level",
330
+ "dataset_name": "wikitext-2-raw-v1",
331
+ "training_split": "train",
332
+ "validation_split": "validation",
333
+ "test_split": "test",
334
+ "template_aliases": "",
335
+ "doc_to_text": "",
336
+ "doc_to_target": "<function wikitext_detokenizer at 0x7fa675b9beb0>",
337
+ "description": "",
338
+ "target_delimiter": " ",
339
+ "fewshot_delimiter": "\n\n",
340
+ "num_fewshot": 5,
341
+ "metric_list": [
342
+ {
343
+ "metric": "word_perplexity"
344
+ },
345
+ {
346
+ "metric": "byte_perplexity"
347
+ },
348
+ {
349
+ "metric": "bits_per_byte"
350
+ }
351
+ ],
352
+ "output_type": "loglikelihood_rolling",
353
+ "repeats": 1,
354
+ "should_decontaminate": true,
355
+ "doc_to_decontamination_query": "{{page}}"
356
+ },
357
+ "winogrande": {
358
+ "task": "winogrande",
359
+ "dataset_path": "winogrande",
360
+ "dataset_name": "winogrande_xl",
361
+ "training_split": "train",
362
+ "validation_split": "validation",
363
+ "doc_to_text": "<function doc_to_text at 0x7fa675b9add0>",
364
+ "doc_to_target": "<function doc_to_target at 0x7fa675b9b250>",
365
+ "doc_to_choice": "<function doc_to_choice at 0x7fa675b9b490>",
366
+ "description": "",
367
+ "target_delimiter": " ",
368
+ "fewshot_delimiter": "\n\n",
369
+ "num_fewshot": 5,
370
+ "metric_list": [
371
+ {
372
+ "metric": "acc",
373
+ "aggregation": "mean",
374
+ "higher_is_better": true
375
+ }
376
+ ],
377
+ "output_type": "multiple_choice",
378
+ "repeats": 1,
379
+ "should_decontaminate": false
380
+ }
381
+ },
382
+ "versions": {
383
+ "arc_challenge": "Yaml",
384
+ "arc_easy": "Yaml",
385
+ "boolq": "Yaml",
386
+ "hellaswag": "Yaml",
387
+ "lambada_openai": "Yaml",
388
+ "openbookqa": "Yaml",
389
+ "piqa": "Yaml",
390
+ "sciq": "Yaml",
391
+ "wikitext": "Yaml",
392
+ "winogrande": "Yaml"
393
+ },
394
+ "config": {
395
+ "model": "hf",
396
+ "model_args": "pretrained=lomahony/eleuther-pythia70m-hh-dpo",
397
+ "num_fewshot": 5,
398
+ "batch_size": 16,
399
+ "batch_sizes": [],
400
+ "device": "cuda:0",
401
+ "use_cache": null,
402
+ "limit": null,
403
+ "bootstrap_iters": 100000
404
+ },
405
+ "git_hash": "4e44f0a"
406
+ }