kanishka committed on
Commit
7bafc0d
·
verified ·
1 Parent(s): 061e649

End of training

Browse files
Files changed (5) hide show
  1. README.md +14 -2
  2. all_results.json +14 -14
  3. eval_results.json +8 -8
  4. train_results.json +7 -7
  5. trainer_state.json +303 -303
README.md CHANGED
@@ -2,11 +2,23 @@
2
  library_name: transformers
3
  tags:
4
  - generated_from_trainer
 
 
5
  metrics:
6
  - accuracy
7
  model-index:
8
  - name: opt-babylm2-rewritten-clean-spacy_no-num-adj-earlystop-bpe_seed-42_1e-3
9
- results: []
 
 
 
 
 
 
 
 
 
 
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -14,7 +26,7 @@ should probably proofread and complete it, then remove this comment. -->
14
 
15
  # opt-babylm2-rewritten-clean-spacy_no-num-adj-earlystop-bpe_seed-42_1e-3
16
 
17
- This model was trained from scratch on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
  - Loss: 2.6900
20
  - Accuracy: 0.4781
 
2
  library_name: transformers
3
  tags:
4
  - generated_from_trainer
5
+ datasets:
6
+ - kanishka/babylm2-rewritten-clean-spacy_no-num-adj
7
  metrics:
8
  - accuracy
9
  model-index:
10
  - name: opt-babylm2-rewritten-clean-spacy_no-num-adj-earlystop-bpe_seed-42_1e-3
11
+ results:
12
+ - task:
13
+ name: Causal Language Modeling
14
+ type: text-generation
15
+ dataset:
16
+ name: kanishka/babylm2-rewritten-clean-spacy_no-num-adj
17
+ type: kanishka/babylm2-rewritten-clean-spacy_no-num-adj
18
+ metrics:
19
+ - name: Accuracy
20
+ type: accuracy
21
+ value: 0.4781093360218181
22
  ---
23
 
24
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
26
 
27
  # opt-babylm2-rewritten-clean-spacy_no-num-adj-earlystop-bpe_seed-42_1e-3
28
 
29
+ This model was trained from scratch on the kanishka/babylm2-rewritten-clean-spacy_no-num-adj dataset.
30
  It achieves the following results on the evaluation set:
31
  - Loss: 2.6900
32
  - Accuracy: 0.4781
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "epoch": 19.991404977248468,
3
- "eval_accuracy": 0.4780018806422093,
4
- "eval_loss": 2.688654899597168,
5
- "eval_runtime": 71.3254,
6
- "eval_samples": 60701,
7
- "eval_samples_per_second": 851.044,
8
- "eval_steps_per_second": 13.305,
9
- "perplexity": 14.711873658228711,
10
- "total_flos": 1.487763384827904e+18,
11
- "train_loss": 2.800290222853757,
12
- "train_runtime": 29774.6232,
13
- "train_samples": 569632,
14
- "train_samples_per_second": 382.629,
15
- "train_steps_per_second": 1.495
16
  }
 
1
  {
2
+ "epoch": 19.99134539732494,
3
+ "eval_accuracy": 0.4781093360218181,
4
+ "eval_loss": 2.690006971359253,
5
+ "eval_runtime": 71.6993,
6
+ "eval_samples": 60680,
7
+ "eval_samples_per_second": 846.313,
8
+ "eval_steps_per_second": 13.236,
9
+ "perplexity": 14.73177862060579,
10
+ "total_flos": 1.487139158163456e+18,
11
+ "train_loss": 2.8016022716494775,
12
+ "train_runtime": 30047.0507,
13
+ "train_samples": 569394,
14
+ "train_samples_per_second": 379.002,
15
+ "train_steps_per_second": 1.48
16
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "epoch": 19.991404977248468,
3
- "eval_accuracy": 0.4780018806422093,
4
- "eval_loss": 2.688654899597168,
5
- "eval_runtime": 71.3254,
6
- "eval_samples": 60701,
7
- "eval_samples_per_second": 851.044,
8
- "eval_steps_per_second": 13.305,
9
- "perplexity": 14.711873658228711
10
  }
 
1
  {
2
+ "epoch": 19.99134539732494,
3
+ "eval_accuracy": 0.4781093360218181,
4
+ "eval_loss": 2.690006971359253,
5
+ "eval_runtime": 71.6993,
6
+ "eval_samples": 60680,
7
+ "eval_samples_per_second": 846.313,
8
+ "eval_steps_per_second": 13.236,
9
+ "perplexity": 14.73177862060579
10
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 19.991404977248468,
3
- "total_flos": 1.487763384827904e+18,
4
- "train_loss": 2.800290222853757,
5
- "train_runtime": 29774.6232,
6
- "train_samples": 569632,
7
- "train_samples_per_second": 382.629,
8
- "train_steps_per_second": 1.495
9
  }
 
1
  {
2
+ "epoch": 19.99134539732494,
3
+ "total_flos": 1.487139158163456e+18,
4
+ "train_loss": 2.8016022716494775,
5
+ "train_runtime": 30047.0507,
6
+ "train_samples": 569394,
7
+ "train_samples_per_second": 379.002,
8
+ "train_steps_per_second": 1.48
9
  }
trainer_state.json CHANGED
@@ -1,513 +1,513 @@
1
  {
2
- "best_metric": 2.688654899597168,
3
- "best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy_no-num-adj-earlystop-bpe_seed-42_1e-3/checkpoint-44500",
4
- "epoch": 19.991404977248468,
5
  "eval_steps": 500,
6
- "global_step": 44500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.4494129543284085,
13
- "grad_norm": 0.41069796681404114,
14
  "learning_rate": 3.125e-05,
15
- "loss": 5.5764,
16
  "step": 1000
17
  },
18
  {
19
- "epoch": 0.898825908656817,
20
- "grad_norm": 0.5602710247039795,
21
  "learning_rate": 6.25e-05,
22
- "loss": 4.0881,
23
  "step": 2000
24
  },
25
  {
26
  "epoch": 1.0,
27
- "eval_accuracy": 0.35879080714178885,
28
- "eval_loss": 3.840114116668701,
29
- "eval_runtime": 73.233,
30
- "eval_samples_per_second": 828.875,
31
- "eval_steps_per_second": 12.959,
32
- "step": 2226
33
  },
34
  {
35
- "epoch": 1.3478456266501881,
36
- "grad_norm": 0.5852290987968445,
37
  "learning_rate": 9.375e-05,
38
- "loss": 3.6865,
39
  "step": 3000
40
  },
41
  {
42
- "epoch": 1.7972585809785966,
43
- "grad_norm": 0.54194575548172,
44
  "learning_rate": 0.000125,
45
- "loss": 3.4347,
46
  "step": 4000
47
  },
48
  {
49
  "epoch": 2.0,
50
- "eval_accuracy": 0.40807745842608145,
51
- "eval_loss": 3.31392502784729,
52
- "eval_runtime": 73.0362,
53
- "eval_samples_per_second": 831.109,
54
- "eval_steps_per_second": 12.994,
55
- "step": 4452
56
  },
57
  {
58
- "epoch": 2.246278298971968,
59
- "grad_norm": 0.4505850672721863,
60
  "learning_rate": 0.00015625,
61
- "loss": 3.2332,
62
  "step": 5000
63
  },
64
  {
65
- "epoch": 2.6956912533003763,
66
- "grad_norm": 0.5012966394424438,
67
  "learning_rate": 0.0001875,
68
- "loss": 3.1182,
69
  "step": 6000
70
  },
71
  {
72
  "epoch": 3.0,
73
- "eval_accuracy": 0.4283898801938528,
74
- "eval_loss": 3.1022610664367676,
75
- "eval_runtime": 72.6546,
76
- "eval_samples_per_second": 835.474,
77
- "eval_steps_per_second": 13.062,
78
- "step": 6678
79
  },
80
  {
81
- "epoch": 3.1447109712937475,
82
- "grad_norm": 0.43612048029899597,
83
  "learning_rate": 0.00021875,
84
- "loss": 3.0238,
85
  "step": 7000
86
  },
87
  {
88
- "epoch": 3.594123925622156,
89
- "grad_norm": 0.38532325625419617,
90
  "learning_rate": 0.00025,
91
- "loss": 2.9562,
92
  "step": 8000
93
  },
94
  {
95
  "epoch": 4.0,
96
- "eval_accuracy": 0.4398597303206879,
97
- "eval_loss": 2.9949750900268555,
98
- "eval_runtime": 72.7491,
99
- "eval_samples_per_second": 834.388,
100
- "eval_steps_per_second": 13.045,
101
- "step": 8904
102
  },
103
  {
104
- "epoch": 4.043143643615527,
105
- "grad_norm": 0.3981912136077881,
106
  "learning_rate": 0.00028125000000000003,
107
- "loss": 2.9072,
108
  "step": 9000
109
  },
110
  {
111
- "epoch": 4.492556597943936,
112
- "grad_norm": 0.3344660997390747,
113
  "learning_rate": 0.0003125,
114
- "loss": 2.8545,
115
  "step": 10000
116
  },
117
  {
118
- "epoch": 4.941969552272345,
119
- "grad_norm": 0.337446391582489,
120
  "learning_rate": 0.00034375,
121
- "loss": 2.8365,
122
  "step": 11000
123
  },
124
  {
125
  "epoch": 5.0,
126
- "eval_accuracy": 0.44592869387751144,
127
- "eval_loss": 2.936906099319458,
128
- "eval_runtime": 72.6273,
129
- "eval_samples_per_second": 835.788,
130
- "eval_steps_per_second": 13.067,
131
- "step": 11130
132
  },
133
  {
134
- "epoch": 5.390989270265715,
135
- "grad_norm": 0.3183291256427765,
136
  "learning_rate": 0.000375,
137
- "loss": 2.7859,
138
  "step": 12000
139
  },
140
  {
141
- "epoch": 5.840402224594124,
142
- "grad_norm": 0.2951858937740326,
143
  "learning_rate": 0.00040625000000000004,
144
- "loss": 2.7807,
145
  "step": 13000
146
  },
147
  {
148
  "epoch": 6.0,
149
- "eval_accuracy": 0.4497357830135563,
150
- "eval_loss": 2.8994719982147217,
151
- "eval_runtime": 72.6109,
152
- "eval_samples_per_second": 835.976,
153
- "eval_steps_per_second": 13.07,
154
- "step": 13356
155
  },
156
  {
157
- "epoch": 6.289421942587495,
158
- "grad_norm": 0.2979063093662262,
159
  "learning_rate": 0.0004375,
160
- "loss": 2.7418,
161
  "step": 14000
162
  },
163
  {
164
- "epoch": 6.738834896915904,
165
- "grad_norm": 0.28142639994621277,
166
- "learning_rate": 0.00046875,
167
- "loss": 2.7389,
168
  "step": 15000
169
  },
170
  {
171
  "epoch": 7.0,
172
- "eval_accuracy": 0.4533856243606156,
173
- "eval_loss": 2.8659732341766357,
174
- "eval_runtime": 72.7067,
175
- "eval_samples_per_second": 834.875,
176
- "eval_steps_per_second": 13.052,
177
- "step": 15582
178
  },
179
  {
180
- "epoch": 7.187854614909274,
181
- "grad_norm": 0.2868495583534241,
182
  "learning_rate": 0.00049996875,
183
- "loss": 2.7131,
184
  "step": 16000
185
  },
186
  {
187
- "epoch": 7.637267569237683,
188
- "grad_norm": 0.25408118963241577,
189
  "learning_rate": 0.00053121875,
190
- "loss": 2.7055,
191
  "step": 17000
192
  },
193
  {
194
  "epoch": 8.0,
195
- "eval_accuracy": 0.4554842427572502,
196
- "eval_loss": 2.8459126949310303,
197
- "eval_runtime": 72.7046,
198
- "eval_samples_per_second": 834.898,
199
- "eval_steps_per_second": 13.053,
200
- "step": 17808
201
- },
202
- {
203
- "epoch": 8.086287287231054,
204
- "grad_norm": 0.25123631954193115,
205
- "learning_rate": 0.0005624375,
206
- "loss": 2.6925,
207
  "step": 18000
208
  },
209
  {
210
- "epoch": 8.535700241559462,
211
- "grad_norm": 0.24333898723125458,
212
  "learning_rate": 0.0005936875,
213
- "loss": 2.6726,
214
  "step": 19000
215
  },
216
  {
217
- "epoch": 8.985113195887871,
218
- "grad_norm": 0.23374955356121063,
219
- "learning_rate": 0.00062490625,
220
- "loss": 2.6857,
221
  "step": 20000
222
  },
223
  {
224
  "epoch": 9.0,
225
- "eval_accuracy": 0.4575513986751518,
226
- "eval_loss": 2.831002950668335,
227
- "eval_runtime": 72.439,
228
- "eval_samples_per_second": 837.96,
229
- "eval_steps_per_second": 13.101,
230
- "step": 20034
231
- },
232
- {
233
- "epoch": 9.434132913881243,
234
- "grad_norm": 0.22414237260818481,
235
- "learning_rate": 0.000656125,
236
- "loss": 2.6461,
237
  "step": 21000
238
  },
239
  {
240
- "epoch": 9.883545868209652,
241
- "grad_norm": 0.21065442264080048,
242
- "learning_rate": 0.0006873749999999999,
243
- "loss": 2.6638,
244
  "step": 22000
245
  },
246
  {
247
  "epoch": 10.0,
248
- "eval_accuracy": 0.4584477885979848,
249
- "eval_loss": 2.822634220123291,
250
- "eval_runtime": 72.5174,
251
- "eval_samples_per_second": 837.054,
252
- "eval_steps_per_second": 13.087,
253
- "step": 22260
254
  },
255
  {
256
- "epoch": 10.332565586203023,
257
- "grad_norm": 0.21908308565616608,
258
  "learning_rate": 0.000718625,
259
- "loss": 2.6333,
260
  "step": 23000
261
  },
262
  {
263
- "epoch": 10.78197854053143,
264
- "grad_norm": 0.19101175665855408,
265
  "learning_rate": 0.000749875,
266
- "loss": 2.6495,
267
  "step": 24000
268
  },
269
  {
270
  "epoch": 11.0,
271
- "eval_accuracy": 0.45945684908120843,
272
- "eval_loss": 2.81111741065979,
273
- "eval_runtime": 72.5362,
274
- "eval_samples_per_second": 836.837,
275
- "eval_steps_per_second": 13.083,
276
- "step": 24486
277
  },
278
  {
279
- "epoch": 11.230998258524801,
280
- "grad_norm": 0.19955122470855713,
281
  "learning_rate": 0.000781125,
282
- "loss": 2.6264,
283
  "step": 25000
284
  },
285
  {
286
- "epoch": 11.68041121285321,
287
- "grad_norm": 0.18576420843601227,
288
  "learning_rate": 0.00081234375,
289
- "loss": 2.6341,
290
  "step": 26000
291
  },
292
  {
293
  "epoch": 12.0,
294
- "eval_accuracy": 0.46001277234506266,
295
- "eval_loss": 2.8056912422180176,
296
- "eval_runtime": 72.7843,
297
- "eval_samples_per_second": 833.984,
298
- "eval_steps_per_second": 13.039,
299
- "step": 26712
300
  },
301
  {
302
- "epoch": 12.129430930846581,
303
- "grad_norm": 0.21167126297950745,
304
  "learning_rate": 0.00084359375,
305
- "loss": 2.6248,
306
  "step": 27000
307
  },
308
  {
309
- "epoch": 12.57884388517499,
310
- "grad_norm": 0.19634310901165009,
311
- "learning_rate": 0.0008748125,
312
- "loss": 2.6198,
313
  "step": 28000
314
  },
315
  {
316
  "epoch": 13.0,
317
- "eval_accuracy": 0.4610777158757277,
318
- "eval_loss": 2.8013079166412354,
319
- "eval_runtime": 72.8617,
320
- "eval_samples_per_second": 833.099,
321
- "eval_steps_per_second": 13.025,
322
- "step": 28938
323
- },
324
- {
325
- "epoch": 13.027863603168361,
326
- "grad_norm": 0.1967993527650833,
327
- "learning_rate": 0.0009060625,
328
- "loss": 2.6292,
329
  "step": 29000
330
  },
331
  {
332
- "epoch": 13.47727655749677,
333
- "grad_norm": 0.21736542880535126,
334
- "learning_rate": 0.00093728125,
335
- "loss": 2.6036,
336
  "step": 30000
337
  },
338
  {
339
- "epoch": 13.926689511825177,
340
- "grad_norm": 0.19267675280570984,
341
- "learning_rate": 0.00096853125,
342
- "loss": 2.6269,
343
  "step": 31000
344
  },
345
  {
346
  "epoch": 14.0,
347
- "eval_accuracy": 0.4611890297378568,
348
- "eval_loss": 2.794591188430786,
349
- "eval_runtime": 72.6832,
350
- "eval_samples_per_second": 835.145,
351
- "eval_steps_per_second": 13.057,
352
- "step": 31164
353
- },
354
- {
355
- "epoch": 14.375709229818549,
356
- "grad_norm": 0.19419512152671814,
357
- "learning_rate": 0.00099975,
358
- "loss": 2.5946,
359
  "step": 32000
360
  },
361
  {
362
- "epoch": 14.825122184146958,
363
- "grad_norm": 0.1727887988090515,
364
- "learning_rate": 0.00092064,
365
- "loss": 2.6122,
366
  "step": 33000
367
  },
368
  {
369
  "epoch": 15.0,
370
- "eval_accuracy": 0.4638763905753402,
371
- "eval_loss": 2.7776918411254883,
372
- "eval_runtime": 72.7228,
373
- "eval_samples_per_second": 834.69,
374
  "eval_steps_per_second": 13.05,
375
- "step": 33390
376
  },
377
  {
378
- "epoch": 15.274141902140329,
379
- "grad_norm": 0.1849382370710373,
380
- "learning_rate": 0.00084072,
381
- "loss": 2.5738,
382
  "step": 34000
383
  },
384
  {
385
- "epoch": 15.723554856468738,
386
- "grad_norm": 0.16888663172721863,
387
- "learning_rate": 0.00076072,
388
- "loss": 2.565,
389
  "step": 35000
390
  },
391
  {
392
  "epoch": 16.0,
393
- "eval_accuracy": 0.4671347921715926,
394
- "eval_loss": 2.7541751861572266,
395
- "eval_runtime": 72.8608,
396
- "eval_samples_per_second": 833.109,
397
- "eval_steps_per_second": 13.025,
398
- "step": 35616
399
- },
400
- {
401
- "epoch": 16.172574574462107,
402
- "grad_norm": 0.17893236875534058,
403
- "learning_rate": 0.00068072,
404
- "loss": 2.5371,
405
  "step": 36000
406
  },
407
  {
408
- "epoch": 16.621987528790516,
409
- "grad_norm": 0.1799129843711853,
410
- "learning_rate": 0.0006008,
411
- "loss": 2.5134,
412
  "step": 37000
413
  },
414
  {
415
  "epoch": 17.0,
416
- "eval_accuracy": 0.47074128377896024,
417
- "eval_loss": 2.7257015705108643,
418
- "eval_runtime": 72.9785,
419
- "eval_samples_per_second": 831.766,
420
- "eval_steps_per_second": 13.004,
421
- "step": 37842
422
- },
423
- {
424
- "epoch": 17.07100724678389,
425
- "grad_norm": 0.17874906957149506,
426
- "learning_rate": 0.0005208000000000001,
427
- "loss": 2.5026,
428
  "step": 38000
429
  },
430
  {
431
- "epoch": 17.520420201112298,
432
- "grad_norm": 0.18105448782444,
433
- "learning_rate": 0.00044088,
434
- "loss": 2.4567,
435
  "step": 39000
436
  },
437
  {
438
- "epoch": 17.969833155440707,
439
- "grad_norm": 0.17908377945423126,
440
- "learning_rate": 0.00036088,
441
- "loss": 2.4592,
442
  "step": 40000
443
  },
444
  {
445
  "epoch": 18.0,
446
- "eval_accuracy": 0.473861818989964,
447
- "eval_loss": 2.7058815956115723,
448
- "eval_runtime": 73.0112,
449
- "eval_samples_per_second": 831.393,
450
- "eval_steps_per_second": 12.998,
451
- "step": 40068
452
- },
453
- {
454
- "epoch": 18.418852873434076,
455
- "grad_norm": 0.18544642627239227,
456
- "learning_rate": 0.00028095999999999997,
457
- "loss": 2.3906,
458
  "step": 41000
459
  },
460
  {
461
- "epoch": 18.868265827762485,
462
- "grad_norm": 0.1904035061597824,
463
- "learning_rate": 0.00020096,
464
- "loss": 2.3964,
465
  "step": 42000
466
  },
467
  {
468
  "epoch": 19.0,
469
- "eval_accuracy": 0.47651713590660233,
470
- "eval_loss": 2.69079327583313,
471
- "eval_runtime": 72.778,
472
- "eval_samples_per_second": 834.057,
473
- "eval_steps_per_second": 13.04,
474
- "step": 42294
475
- },
476
- {
477
- "epoch": 19.317285545755855,
478
- "grad_norm": 0.1982097625732422,
479
- "learning_rate": 0.00012103999999999999,
480
- "loss": 2.3398,
481
  "step": 43000
482
  },
483
  {
484
- "epoch": 19.766698500084264,
485
- "grad_norm": 0.19803358614444733,
486
- "learning_rate": 4.104e-05,
487
- "loss": 2.3229,
488
  "step": 44000
489
  },
490
  {
491
- "epoch": 19.991404977248468,
492
- "eval_accuracy": 0.4780018806422093,
493
- "eval_loss": 2.688654899597168,
494
- "eval_runtime": 73.0151,
495
- "eval_samples_per_second": 831.349,
496
- "eval_steps_per_second": 12.997,
497
- "step": 44500
498
  },
499
  {
500
- "epoch": 19.991404977248468,
501
- "step": 44500,
502
- "total_flos": 1.487763384827904e+18,
503
- "train_loss": 2.800290222853757,
504
- "train_runtime": 29774.6232,
505
- "train_samples_per_second": 382.629,
506
- "train_steps_per_second": 1.495
507
  }
508
  ],
509
  "logging_steps": 1000,
510
- "max_steps": 44500,
511
  "num_input_tokens_seen": 0,
512
  "num_train_epochs": 20,
513
  "save_steps": 500,
@@ -532,7 +532,7 @@
532
  "attributes": {}
533
  }
534
  },
535
- "total_flos": 1.487763384827904e+18,
536
  "train_batch_size": 32,
537
  "trial_name": null,
538
  "trial_params": null
 
1
  {
2
+ "best_metric": 2.690006971359253,
3
+ "best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy_no-num-adj-earlystop-bpe_seed-42_1e-3/checkpoint-44480",
4
+ "epoch": 19.99134539732494,
5
  "eval_steps": 500,
6
+ "global_step": 44480,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.44958974935371476,
13
+ "grad_norm": 0.5849416255950928,
14
  "learning_rate": 3.125e-05,
15
+ "loss": 5.5771,
16
  "step": 1000
17
  },
18
  {
19
+ "epoch": 0.8991794987074295,
20
+ "grad_norm": 0.6677811145782471,
21
  "learning_rate": 6.25e-05,
22
+ "loss": 4.0896,
23
  "step": 2000
24
  },
25
  {
26
  "epoch": 1.0,
27
+ "eval_accuracy": 0.35931850789096126,
28
+ "eval_loss": 3.830660104751587,
29
+ "eval_runtime": 73.9335,
30
+ "eval_samples_per_second": 820.738,
31
+ "eval_steps_per_second": 12.836,
32
+ "step": 2225
33
  },
34
  {
35
+ "epoch": 1.348432055749129,
36
+ "grad_norm": 0.5427906513214111,
37
  "learning_rate": 9.375e-05,
38
+ "loss": 3.6885,
39
  "step": 3000
40
  },
41
  {
42
+ "epoch": 1.7980218051028438,
43
+ "grad_norm": 0.5098850727081299,
44
  "learning_rate": 0.000125,
45
+ "loss": 3.4325,
46
  "step": 4000
47
  },
48
  {
49
  "epoch": 2.0,
50
+ "eval_accuracy": 0.40814210192976336,
51
+ "eval_loss": 3.3144962787628174,
52
+ "eval_runtime": 73.7109,
53
+ "eval_samples_per_second": 823.216,
54
+ "eval_steps_per_second": 12.875,
55
+ "step": 4450
56
  },
57
  {
58
+ "epoch": 2.2472743621445432,
59
+ "grad_norm": 0.47539061307907104,
60
  "learning_rate": 0.00015625,
61
+ "loss": 3.2344,
62
  "step": 5000
63
  },
64
  {
65
+ "epoch": 2.696864111498258,
66
+ "grad_norm": 0.4557252824306488,
67
  "learning_rate": 0.0001875,
68
+ "loss": 3.1208,
69
  "step": 6000
70
  },
71
  {
72
  "epoch": 3.0,
73
+ "eval_accuracy": 0.42951116108935333,
74
+ "eval_loss": 3.1043691635131836,
75
+ "eval_runtime": 73.4454,
76
+ "eval_samples_per_second": 826.192,
77
+ "eval_steps_per_second": 12.921,
78
+ "step": 6675
79
  },
80
  {
81
+ "epoch": 3.1461166685399573,
82
+ "grad_norm": 0.4797649085521698,
83
  "learning_rate": 0.00021875,
84
+ "loss": 3.0251,
85
  "step": 7000
86
  },
87
  {
88
+ "epoch": 3.595706417893672,
89
+ "grad_norm": 0.39974284172058105,
90
  "learning_rate": 0.00025,
91
+ "loss": 2.957,
92
  "step": 8000
93
  },
94
  {
95
  "epoch": 4.0,
96
+ "eval_accuracy": 0.43958987682086675,
97
+ "eval_loss": 2.997297763824463,
98
+ "eval_runtime": 72.7828,
99
+ "eval_samples_per_second": 833.713,
100
+ "eval_steps_per_second": 13.039,
101
+ "step": 8900
102
  },
103
  {
104
+ "epoch": 4.044958974935372,
105
+ "grad_norm": 0.37127092480659485,
106
  "learning_rate": 0.00028125000000000003,
107
+ "loss": 2.91,
108
  "step": 9000
109
  },
110
  {
111
+ "epoch": 4.4945487242890865,
112
+ "grad_norm": 0.35168108344078064,
113
  "learning_rate": 0.0003125,
114
+ "loss": 2.8553,
115
  "step": 10000
116
  },
117
  {
118
+ "epoch": 4.944138473642801,
119
+ "grad_norm": 0.33759039640426636,
120
  "learning_rate": 0.00034375,
121
+ "loss": 2.8381,
122
  "step": 11000
123
  },
124
  {
125
  "epoch": 5.0,
126
+ "eval_accuracy": 0.4463711272247858,
127
+ "eval_loss": 2.9338483810424805,
128
+ "eval_runtime": 73.014,
129
+ "eval_samples_per_second": 831.073,
130
+ "eval_steps_per_second": 12.998,
131
+ "step": 11125
132
  },
133
  {
134
+ "epoch": 5.3933910306845005,
135
+ "grad_norm": 0.3382190465927124,
136
  "learning_rate": 0.000375,
137
+ "loss": 2.7866,
138
  "step": 12000
139
  },
140
  {
141
+ "epoch": 5.842980780038215,
142
+ "grad_norm": 0.30114060640335083,
143
  "learning_rate": 0.00040625000000000004,
144
+ "loss": 2.7819,
145
  "step": 13000
146
  },
147
  {
148
  "epoch": 6.0,
149
+ "eval_accuracy": 0.4507991133170473,
150
+ "eval_loss": 2.8903539180755615,
151
+ "eval_runtime": 72.6589,
152
+ "eval_samples_per_second": 835.135,
153
+ "eval_steps_per_second": 13.061,
154
+ "step": 13350
155
  },
156
  {
157
+ "epoch": 6.292233337079915,
158
+ "grad_norm": 0.29889941215515137,
159
  "learning_rate": 0.0004375,
160
+ "loss": 2.7435,
161
  "step": 14000
162
  },
163
  {
164
+ "epoch": 6.741823086433629,
165
+ "grad_norm": 0.2726060450077057,
166
+ "learning_rate": 0.00046871875,
167
+ "loss": 2.7385,
168
  "step": 15000
169
  },
170
  {
171
  "epoch": 7.0,
172
+ "eval_accuracy": 0.4530601548463815,
173
+ "eval_loss": 2.8666210174560547,
174
+ "eval_runtime": 72.57,
175
+ "eval_samples_per_second": 836.158,
176
+ "eval_steps_per_second": 13.077,
177
+ "step": 15575
178
  },
179
  {
180
+ "epoch": 7.191075643475329,
181
+ "grad_norm": 0.2712298333644867,
182
  "learning_rate": 0.00049996875,
183
+ "loss": 2.7141,
184
  "step": 16000
185
  },
186
  {
187
+ "epoch": 7.640665392829043,
188
+ "grad_norm": 0.2567969560623169,
189
  "learning_rate": 0.00053121875,
190
+ "loss": 2.7061,
191
  "step": 17000
192
  },
193
  {
194
  "epoch": 8.0,
195
+ "eval_accuracy": 0.4559095609239081,
196
+ "eval_loss": 2.8456263542175293,
197
+ "eval_runtime": 72.544,
198
+ "eval_samples_per_second": 836.458,
199
+ "eval_steps_per_second": 13.082,
200
+ "step": 17800
201
+ },
202
+ {
203
+ "epoch": 8.089917949870744,
204
+ "grad_norm": 0.24305634200572968,
205
+ "learning_rate": 0.0005624687499999999,
206
+ "loss": 2.6951,
207
  "step": 18000
208
  },
209
  {
210
+ "epoch": 8.539507699224458,
211
+ "grad_norm": 0.23454323410987854,
212
  "learning_rate": 0.0005936875,
213
+ "loss": 2.6745,
214
  "step": 19000
215
  },
216
  {
217
+ "epoch": 8.989097448578173,
218
+ "grad_norm": 0.2262556552886963,
219
+ "learning_rate": 0.0006249375000000001,
220
+ "loss": 2.6855,
221
  "step": 20000
222
  },
223
  {
224
  "epoch": 9.0,
225
+ "eval_accuracy": 0.4575118590613569,
226
+ "eval_loss": 2.833211898803711,
227
+ "eval_runtime": 72.5848,
228
+ "eval_samples_per_second": 835.988,
229
+ "eval_steps_per_second": 13.074,
230
+ "step": 20025
231
+ },
232
+ {
233
+ "epoch": 9.438350005619872,
234
+ "grad_norm": 0.2205825001001358,
235
+ "learning_rate": 0.0006561562500000001,
236
+ "loss": 2.6456,
237
  "step": 21000
238
  },
239
  {
240
+ "epoch": 9.887939754973587,
241
+ "grad_norm": 0.21933791041374207,
242
+ "learning_rate": 0.00068740625,
243
+ "loss": 2.6669,
244
  "step": 22000
245
  },
246
  {
247
  "epoch": 10.0,
248
+ "eval_accuracy": 0.45864309072343507,
249
+ "eval_loss": 2.819795608520508,
250
+ "eval_runtime": 72.9477,
251
+ "eval_samples_per_second": 831.828,
252
+ "eval_steps_per_second": 13.009,
253
+ "step": 22250
254
  },
255
  {
256
+ "epoch": 10.337192312015286,
257
+ "grad_norm": 0.21199771761894226,
258
  "learning_rate": 0.000718625,
259
+ "loss": 2.636,
260
  "step": 23000
261
  },
262
  {
263
+ "epoch": 10.786782061369001,
264
+ "grad_norm": 0.20166124403476715,
265
  "learning_rate": 0.000749875,
266
+ "loss": 2.6499,
267
  "step": 24000
268
  },
269
  {
270
  "epoch": 11.0,
271
+ "eval_accuracy": 0.4597305698812155,
272
+ "eval_loss": 2.8118433952331543,
273
+ "eval_runtime": 73.1507,
274
+ "eval_samples_per_second": 829.521,
275
+ "eval_steps_per_second": 12.973,
276
+ "step": 24475
277
  },
278
  {
279
+ "epoch": 11.2360346184107,
280
+ "grad_norm": 0.20014500617980957,
281
  "learning_rate": 0.000781125,
282
+ "loss": 2.6298,
283
  "step": 25000
284
  },
285
  {
286
+ "epoch": 11.685624367764415,
287
+ "grad_norm": 0.19325494766235352,
288
  "learning_rate": 0.00081234375,
289
+ "loss": 2.6351,
290
  "step": 26000
291
  },
292
  {
293
  "epoch": 12.0,
294
+ "eval_accuracy": 0.4601314513940052,
295
+ "eval_loss": 2.807219982147217,
296
+ "eval_runtime": 72.6854,
297
+ "eval_samples_per_second": 834.831,
298
+ "eval_steps_per_second": 13.056,
299
+ "step": 26700
300
  },
301
  {
302
+ "epoch": 12.134876924806115,
303
+ "grad_norm": 0.19165903329849243,
304
  "learning_rate": 0.00084359375,
305
+ "loss": 2.6265,
306
  "step": 27000
307
  },
308
  {
309
+ "epoch": 12.58446667415983,
310
+ "grad_norm": 0.1863769292831421,
311
+ "learning_rate": 0.0008748437500000001,
312
+ "loss": 2.6204,
313
  "step": 28000
314
  },
315
  {
316
  "epoch": 13.0,
317
+ "eval_accuracy": 0.4611660010081818,
318
+ "eval_loss": 2.802619218826294,
319
+ "eval_runtime": 72.6835,
320
+ "eval_samples_per_second": 834.852,
321
+ "eval_steps_per_second": 13.057,
322
+ "step": 28925
323
+ },
324
+ {
325
+ "epoch": 13.033719231201529,
326
+ "grad_norm": 0.19991189241409302,
327
+ "learning_rate": 0.00090609375,
328
+ "loss": 2.6286,
329
  "step": 29000
330
  },
331
  {
332
+ "epoch": 13.483308980555243,
333
+ "grad_norm": 0.18545052409172058,
334
+ "learning_rate": 0.0009373125,
335
+ "loss": 2.6068,
336
  "step": 30000
337
  },
338
  {
339
+ "epoch": 13.932898729908958,
340
+ "grad_norm": 0.17478196322917938,
341
+ "learning_rate": 0.0009685625,
342
+ "loss": 2.6277,
343
  "step": 31000
344
  },
345
  {
346
  "epoch": 14.0,
347
+ "eval_accuracy": 0.4612847208758256,
348
+ "eval_loss": 2.801252841949463,
349
+ "eval_runtime": 72.4155,
350
+ "eval_samples_per_second": 837.942,
351
+ "eval_steps_per_second": 13.105,
352
+ "step": 31150
353
+ },
354
+ {
355
+ "epoch": 14.382151286950657,
356
+ "grad_norm": 0.17678463459014893,
357
+ "learning_rate": 0.00099978125,
358
+ "loss": 2.5975,
359
  "step": 32000
360
  },
361
  {
362
+ "epoch": 14.831741036304372,
363
+ "grad_norm": 0.17033128440380096,
364
+ "learning_rate": 0.0009204326923076923,
365
+ "loss": 2.6136,
366
  "step": 33000
367
  },
368
  {
369
  "epoch": 15.0,
370
+ "eval_accuracy": 0.46383548541367764,
371
+ "eval_loss": 2.779118061065674,
372
+ "eval_runtime": 72.723,
373
+ "eval_samples_per_second": 834.399,
374
  "eval_steps_per_second": 13.05,
375
+ "step": 33375
376
  },
377
  {
378
+ "epoch": 15.280993593346071,
379
+ "grad_norm": 0.1788545548915863,
380
+ "learning_rate": 0.0008403044871794871,
381
+ "loss": 2.5726,
382
  "step": 34000
383
  },
384
  {
385
+ "epoch": 15.730583342699786,
386
+ "grad_norm": 0.17410264909267426,
387
+ "learning_rate": 0.0007602564102564103,
388
+ "loss": 2.5687,
389
  "step": 35000
390
  },
391
  {
392
  "epoch": 16.0,
393
+ "eval_accuracy": 0.4676111908177905,
394
+ "eval_loss": 2.75136661529541,
395
+ "eval_runtime": 72.8808,
396
+ "eval_samples_per_second": 832.592,
397
+ "eval_steps_per_second": 13.021,
398
+ "step": 35600
399
+ },
400
+ {
401
+ "epoch": 16.179835899741487,
402
+ "grad_norm": 0.17518466711044312,
403
+ "learning_rate": 0.0006801282051282051,
404
+ "loss": 2.5356,
405
  "step": 36000
406
  },
407
  {
408
+ "epoch": 16.6294256490952,
409
+ "grad_norm": 0.17421123385429382,
410
+ "learning_rate": 0.0006000801282051283,
411
+ "loss": 2.5184,
412
  "step": 37000
413
  },
414
  {
415
  "epoch": 17.0,
416
+ "eval_accuracy": 0.4707787558002766,
417
+ "eval_loss": 2.728271245956421,
418
+ "eval_runtime": 72.8051,
419
+ "eval_samples_per_second": 833.458,
420
+ "eval_steps_per_second": 13.035,
421
+ "step": 37825
422
+ },
423
+ {
424
+ "epoch": 17.0786782061369,
425
+ "grad_norm": 0.18264968693256378,
426
+ "learning_rate": 0.0005199519230769231,
427
+ "loss": 2.4989,
428
  "step": 38000
429
  },
430
  {
431
+ "epoch": 17.528267955490616,
432
+ "grad_norm": 0.18594865500926971,
433
+ "learning_rate": 0.00043990384615384616,
434
+ "loss": 2.4571,
435
  "step": 39000
436
  },
437
  {
438
+ "epoch": 17.97785770484433,
439
+ "grad_norm": 0.17989173531532288,
440
+ "learning_rate": 0.00035977564102564105,
441
+ "loss": 2.4613,
442
  "step": 40000
443
  },
444
  {
445
  "epoch": 18.0,
446
+ "eval_accuracy": 0.47402006023239884,
447
+ "eval_loss": 2.705965518951416,
448
+ "eval_runtime": 72.7008,
449
+ "eval_samples_per_second": 834.654,
450
+ "eval_steps_per_second": 13.054,
451
+ "step": 40050
452
+ },
453
+ {
454
+ "epoch": 18.42711026188603,
455
+ "grad_norm": 0.19071288406848907,
456
+ "learning_rate": 0.0002797275641025641,
457
+ "loss": 2.3913,
458
  "step": 41000
459
  },
460
  {
461
+ "epoch": 18.876700011239745,
462
+ "grad_norm": 0.19211626052856445,
463
+ "learning_rate": 0.00019959935897435898,
464
+ "loss": 2.3966,
465
  "step": 42000
466
  },
467
  {
468
  "epoch": 19.0,
469
+ "eval_accuracy": 0.4765565421949927,
470
+ "eval_loss": 2.6947293281555176,
471
+ "eval_runtime": 72.857,
472
+ "eval_samples_per_second": 832.864,
473
+ "eval_steps_per_second": 13.026,
474
+ "step": 42275
475
+ },
476
+ {
477
+ "epoch": 19.325952568281444,
478
+ "grad_norm": 0.19902721047401428,
479
+ "learning_rate": 0.00011947115384615386,
480
+ "loss": 2.3404,
481
  "step": 43000
482
  },
483
  {
484
+ "epoch": 19.775542317635157,
485
+ "grad_norm": 0.20530302822589874,
486
+ "learning_rate": 3.942307692307692e-05,
487
+ "loss": 2.3227,
488
  "step": 44000
489
  },
490
  {
491
+ "epoch": 19.99134539732494,
492
+ "eval_accuracy": 0.4781093360218181,
493
+ "eval_loss": 2.690006971359253,
494
+ "eval_runtime": 72.887,
495
+ "eval_samples_per_second": 832.521,
496
+ "eval_steps_per_second": 13.02,
497
+ "step": 44480
498
  },
499
  {
500
+ "epoch": 19.99134539732494,
501
+ "step": 44480,
502
+ "total_flos": 1.487139158163456e+18,
503
+ "train_loss": 2.8016022716494775,
504
+ "train_runtime": 30047.0507,
505
+ "train_samples_per_second": 379.002,
506
+ "train_steps_per_second": 1.48
507
  }
508
  ],
509
  "logging_steps": 1000,
510
+ "max_steps": 44480,
511
  "num_input_tokens_seen": 0,
512
  "num_train_epochs": 20,
513
  "save_steps": 500,
 
532
  "attributes": {}
533
  }
534
  },
535
+ "total_flos": 1.487139158163456e+18,
536
  "train_batch_size": 32,
537
  "trial_name": null,
538
  "trial_params": null