eformat commited on
Commit
e62caa0
1 Parent(s): 5bfeb6e

Upload folder using huggingface_hub

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c58239a67307473e58bbc09292e8bf457bccbf3d8f093413f2a9983160f1373
3
  size 34100216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a115dacf1f2435d487c24b16dcdf2e2ded0bc635269a44e344367ce2938d8594
3
  size 34100216
checkpoint-200/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c58239a67307473e58bbc09292e8bf457bccbf3d8f093413f2a9983160f1373
3
  size 34100216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a115dacf1f2435d487c24b16dcdf2e2ded0bc635269a44e344367ce2938d8594
3
  size 34100216
checkpoint-200/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:93a1f6aa5183962874fd62e0d358e0f0ea27e6a68a0535d07f990842e7013963
3
  size 68292346
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7f3eb5fe27f4339c2b2499f040822b2be66cf8314166a745797c2d8e5e499f8
3
  size 68292346
checkpoint-200/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04745946c21cb6d9e7af44a9e436e4207c8437de1201d76c48a7771e0e0f3bad
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f50f0b8cf8a8f5247d0e9729d14f2b46d5491d4b47ff4bafcc913f88950008b8
3
  size 14244
checkpoint-200/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5fcb2e0acbfc451125ca5cd950e0420d092eee3eeb6dd5b682202b7a14cd22e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:112b68374ca1c950140f0870a98c4f3bc5a36bbea3cfec174b29f0bafea1fbee
3
  size 1064
checkpoint-200/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.00485074853113271,
5
  "eval_steps": 500,
6
  "global_step": 200,
7
  "is_hyper_param_search": false,
@@ -9,1403 +9,1403 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 2.4253742655663552e-05,
13
- "grad_norm": 2.9237160682678223,
14
  "learning_rate": 2.0000000000000003e-06,
15
- "loss": 1.4192,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 4.8507485311327104e-05,
20
- "grad_norm": 8.286487579345703,
21
  "learning_rate": 4.000000000000001e-06,
22
- "loss": 1.7523,
23
  "step": 2
24
  },
25
  {
26
- "epoch": 7.276122796699065e-05,
27
- "grad_norm": 1.957672119140625,
28
  "learning_rate": 6e-06,
29
- "loss": 1.9875,
30
  "step": 3
31
  },
32
  {
33
- "epoch": 9.701497062265421e-05,
34
- "grad_norm": 0.5736268758773804,
35
  "learning_rate": 8.000000000000001e-06,
36
- "loss": 1.427,
37
  "step": 4
38
  },
39
  {
40
- "epoch": 0.00012126871327831776,
41
- "grad_norm": 0.7024741172790527,
42
  "learning_rate": 1e-05,
43
- "loss": 1.4541,
44
  "step": 5
45
  },
46
  {
47
- "epoch": 0.0001455224559339813,
48
- "grad_norm": 6.171888828277588,
49
  "learning_rate": 1.2e-05,
50
- "loss": 1.9617,
51
  "step": 6
52
  },
53
  {
54
- "epoch": 0.00016977619858964487,
55
- "grad_norm": 1.4335829019546509,
56
  "learning_rate": 1.4000000000000001e-05,
57
- "loss": 1.6549,
58
  "step": 7
59
  },
60
  {
61
- "epoch": 0.00019402994124530841,
62
- "grad_norm": 0.8052822947502136,
63
  "learning_rate": 1.6000000000000003e-05,
64
- "loss": 1.681,
65
  "step": 8
66
  },
67
  {
68
- "epoch": 0.00021828368390097196,
69
- "grad_norm": 1.0054032802581787,
70
  "learning_rate": 1.8e-05,
71
- "loss": 1.7113,
72
  "step": 9
73
  },
74
  {
75
- "epoch": 0.00024253742655663553,
76
- "grad_norm": 2.378899097442627,
77
  "learning_rate": 2e-05,
78
- "loss": 2.0225,
79
  "step": 10
80
  },
81
  {
82
- "epoch": 0.00026679116921229907,
83
- "grad_norm": 12.526110649108887,
84
- "learning_rate": 2.2000000000000003e-05,
85
- "loss": 2.3219,
86
  "step": 11
87
  },
88
  {
89
- "epoch": 0.0002910449118679626,
90
- "grad_norm": 1.658133864402771,
91
- "learning_rate": 2.4e-05,
92
- "loss": 1.5336,
93
  "step": 12
94
  },
95
  {
96
- "epoch": 0.00031529865452362615,
97
- "grad_norm": 18.127710342407227,
98
- "learning_rate": 2.6000000000000002e-05,
99
- "loss": 2.1609,
100
  "step": 13
101
  },
102
  {
103
- "epoch": 0.00033955239717928975,
104
- "grad_norm": 0.7923838496208191,
105
- "learning_rate": 2.8000000000000003e-05,
106
- "loss": 1.7354,
107
  "step": 14
108
  },
109
  {
110
- "epoch": 0.0003638061398349533,
111
- "grad_norm": 1.6248362064361572,
112
- "learning_rate": 3e-05,
113
- "loss": 1.5774,
114
  "step": 15
115
  },
116
  {
117
- "epoch": 0.00038805988249061683,
118
- "grad_norm": 1.6371029615402222,
119
- "learning_rate": 3.2000000000000005e-05,
120
- "loss": 2.2978,
121
  "step": 16
122
  },
123
  {
124
- "epoch": 0.00041231362514628037,
125
- "grad_norm": 1.409232497215271,
126
- "learning_rate": 3.4000000000000007e-05,
127
- "loss": 1.1136,
128
  "step": 17
129
  },
130
  {
131
- "epoch": 0.0004365673678019439,
132
- "grad_norm": 5.593843936920166,
133
- "learning_rate": 3.6e-05,
134
- "loss": 2.3497,
135
  "step": 18
136
  },
137
  {
138
- "epoch": 0.0004608211104576075,
139
- "grad_norm": 1.9881709814071655,
140
- "learning_rate": 3.8e-05,
141
- "loss": 2.0849,
142
  "step": 19
143
  },
144
  {
145
- "epoch": 0.00048507485311327105,
146
- "grad_norm": 1.7124191522598267,
147
- "learning_rate": 4e-05,
148
- "loss": 1.565,
149
  "step": 20
150
  },
151
  {
152
- "epoch": 0.0005093285957689346,
153
- "grad_norm": 0.6472250819206238,
154
- "learning_rate": 4.2e-05,
155
- "loss": 1.0869,
156
  "step": 21
157
  },
158
  {
159
- "epoch": 0.0005335823384245981,
160
- "grad_norm": 4.816655158996582,
161
- "learning_rate": 4.4000000000000006e-05,
162
- "loss": 2.2343,
163
  "step": 22
164
  },
165
  {
166
- "epoch": 0.0005578360810802617,
167
- "grad_norm": 1.3076063394546509,
168
- "learning_rate": 4.600000000000001e-05,
169
- "loss": 1.8767,
170
  "step": 23
171
  },
172
  {
173
- "epoch": 0.0005820898237359252,
174
- "grad_norm": 1.0358009338378906,
175
- "learning_rate": 4.8e-05,
176
- "loss": 1.7001,
177
  "step": 24
178
  },
179
  {
180
- "epoch": 0.0006063435663915888,
181
- "grad_norm": 0.8611243367195129,
182
- "learning_rate": 5e-05,
183
- "loss": 1.0703,
184
  "step": 25
185
  },
186
  {
187
- "epoch": 0.0006305973090472523,
188
- "grad_norm": 1.8082849979400635,
189
- "learning_rate": 5.2000000000000004e-05,
190
- "loss": 2.409,
191
  "step": 26
192
  },
193
  {
194
- "epoch": 0.000654851051702916,
195
- "grad_norm": 2.4376935958862305,
196
- "learning_rate": 5.4000000000000005e-05,
197
- "loss": 1.6188,
198
  "step": 27
199
  },
200
  {
201
- "epoch": 0.0006791047943585795,
202
- "grad_norm": 1.7561759948730469,
203
- "learning_rate": 5.6000000000000006e-05,
204
- "loss": 1.8782,
205
  "step": 28
206
  },
207
  {
208
- "epoch": 0.000703358537014243,
209
- "grad_norm": 1.4189746379852295,
210
- "learning_rate": 5.8e-05,
211
- "loss": 2.1414,
212
  "step": 29
213
  },
214
  {
215
- "epoch": 0.0007276122796699066,
216
- "grad_norm": 1.186546802520752,
217
- "learning_rate": 6e-05,
218
- "loss": 1.5557,
219
  "step": 30
220
  },
221
  {
222
- "epoch": 0.0007518660223255701,
223
- "grad_norm": 1.365446925163269,
224
- "learning_rate": 6.2e-05,
225
- "loss": 1.9806,
226
  "step": 31
227
  },
228
  {
229
- "epoch": 0.0007761197649812337,
230
- "grad_norm": 5.6013641357421875,
231
- "learning_rate": 6.400000000000001e-05,
232
- "loss": 2.836,
233
  "step": 32
234
  },
235
  {
236
- "epoch": 0.0008003735076368972,
237
- "grad_norm": 1.5393822193145752,
238
- "learning_rate": 6.6e-05,
239
- "loss": 1.6833,
240
  "step": 33
241
  },
242
  {
243
- "epoch": 0.0008246272502925607,
244
- "grad_norm": 1.7934705018997192,
245
- "learning_rate": 6.800000000000001e-05,
246
- "loss": 1.6762,
247
  "step": 34
248
  },
249
  {
250
- "epoch": 0.0008488809929482243,
251
- "grad_norm": 1.1468597650527954,
252
- "learning_rate": 7e-05,
253
- "loss": 1.2819,
254
  "step": 35
255
  },
256
  {
257
- "epoch": 0.0008731347356038878,
258
- "grad_norm": 4.899059772491455,
259
- "learning_rate": 7.2e-05,
260
- "loss": 1.7406,
261
  "step": 36
262
  },
263
  {
264
- "epoch": 0.0008973884782595515,
265
- "grad_norm": 1.6008192300796509,
266
- "learning_rate": 7.4e-05,
267
- "loss": 1.5028,
268
  "step": 37
269
  },
270
  {
271
- "epoch": 0.000921642220915215,
272
- "grad_norm": 3.4235761165618896,
273
- "learning_rate": 7.6e-05,
274
- "loss": 1.6816,
275
  "step": 38
276
  },
277
  {
278
- "epoch": 0.0009458959635708786,
279
- "grad_norm": 1.1148329973220825,
280
- "learning_rate": 7.800000000000001e-05,
281
- "loss": 1.4805,
282
  "step": 39
283
  },
284
  {
285
- "epoch": 0.0009701497062265421,
286
- "grad_norm": 4.683089733123779,
287
- "learning_rate": 8e-05,
288
- "loss": 1.9679,
289
  "step": 40
290
  },
291
  {
292
- "epoch": 0.0009944034488822055,
293
- "grad_norm": 0.75764000415802,
294
- "learning_rate": 8.2e-05,
295
- "loss": 1.3406,
296
  "step": 41
297
  },
298
  {
299
- "epoch": 0.0010186571915378692,
300
- "grad_norm": 1.3041810989379883,
301
- "learning_rate": 8.4e-05,
302
- "loss": 1.4009,
303
  "step": 42
304
  },
305
  {
306
- "epoch": 0.0010429109341935328,
307
- "grad_norm": 1.4765511751174927,
308
- "learning_rate": 8.6e-05,
309
- "loss": 1.3446,
310
  "step": 43
311
  },
312
  {
313
- "epoch": 0.0010671646768491963,
314
- "grad_norm": 3.716846227645874,
315
- "learning_rate": 8.800000000000001e-05,
316
- "loss": 2.1417,
317
  "step": 44
318
  },
319
  {
320
- "epoch": 0.00109141841950486,
321
- "grad_norm": 1.0331724882125854,
322
- "learning_rate": 9e-05,
323
- "loss": 1.396,
324
  "step": 45
325
  },
326
  {
327
- "epoch": 0.0011156721621605234,
328
- "grad_norm": 2.426039934158325,
329
- "learning_rate": 9.200000000000001e-05,
330
- "loss": 1.466,
331
  "step": 46
332
  },
333
  {
334
- "epoch": 0.001139925904816187,
335
- "grad_norm": 1.3340977430343628,
336
- "learning_rate": 9.4e-05,
337
- "loss": 1.7891,
338
  "step": 47
339
  },
340
  {
341
- "epoch": 0.0011641796474718504,
342
- "grad_norm": 3.5191874504089355,
343
- "learning_rate": 9.6e-05,
344
- "loss": 1.6633,
345
  "step": 48
346
  },
347
  {
348
- "epoch": 0.001188433390127514,
349
- "grad_norm": 1.6775161027908325,
350
- "learning_rate": 9.8e-05,
351
- "loss": 1.4875,
352
  "step": 49
353
  },
354
  {
355
- "epoch": 0.0012126871327831775,
356
- "grad_norm": 8.406580924987793,
357
- "learning_rate": 0.0001,
358
- "loss": 2.1048,
359
  "step": 50
360
  },
361
  {
362
- "epoch": 0.0012369408754388412,
363
- "grad_norm": 6.012252330780029,
364
- "learning_rate": 0.00010200000000000001,
365
- "loss": 2.0742,
366
  "step": 51
367
  },
368
  {
369
- "epoch": 0.0012611946180945046,
370
- "grad_norm": 12.059433937072754,
371
- "learning_rate": 0.00010400000000000001,
372
- "loss": 1.843,
373
  "step": 52
374
  },
375
  {
376
- "epoch": 0.0012854483607501683,
377
- "grad_norm": 1.3695584535598755,
378
- "learning_rate": 0.00010600000000000002,
379
- "loss": 1.8865,
380
  "step": 53
381
  },
382
  {
383
- "epoch": 0.001309702103405832,
384
- "grad_norm": 2.331925630569458,
385
- "learning_rate": 0.00010800000000000001,
386
- "loss": 1.8662,
387
  "step": 54
388
  },
389
  {
390
- "epoch": 0.0013339558460614953,
391
- "grad_norm": 2.2023472785949707,
392
- "learning_rate": 0.00011000000000000002,
393
- "loss": 1.3941,
394
  "step": 55
395
  },
396
  {
397
- "epoch": 0.001358209588717159,
398
- "grad_norm": 1.971096396446228,
399
- "learning_rate": 0.00011200000000000001,
400
- "loss": 1.9362,
401
  "step": 56
402
  },
403
  {
404
- "epoch": 0.0013824633313728224,
405
- "grad_norm": 1.254345417022705,
406
- "learning_rate": 0.00011399999999999999,
407
- "loss": 1.5581,
408
  "step": 57
409
  },
410
  {
411
- "epoch": 0.001406717074028486,
412
- "grad_norm": Infinity,
413
- "learning_rate": 0.00011399999999999999,
414
- "loss": 1.5454,
415
  "step": 58
416
  },
417
  {
418
- "epoch": 0.0014309708166841495,
419
- "grad_norm": 1.5801585912704468,
420
- "learning_rate": 0.000116,
421
- "loss": 1.553,
422
  "step": 59
423
  },
424
  {
425
- "epoch": 0.0014552245593398132,
426
- "grad_norm": 0.9962314963340759,
427
- "learning_rate": 0.000118,
428
- "loss": 1.4924,
429
  "step": 60
430
  },
431
  {
432
- "epoch": 0.0014794783019954766,
433
- "grad_norm": 5.291327476501465,
434
- "learning_rate": 0.00012,
435
- "loss": 1.8365,
436
  "step": 61
437
  },
438
  {
439
- "epoch": 0.0015037320446511402,
440
- "grad_norm": 1.6945866346359253,
441
- "learning_rate": 0.000122,
442
- "loss": 1.4683,
443
  "step": 62
444
  },
445
  {
446
- "epoch": 0.0015279857873068039,
447
- "grad_norm": 1.0400323867797852,
448
- "learning_rate": 0.000124,
449
- "loss": 0.8587,
450
  "step": 63
451
  },
452
  {
453
- "epoch": 0.0015522395299624673,
454
- "grad_norm": 3.5120689868927,
455
- "learning_rate": 0.000126,
456
- "loss": 1.2127,
457
  "step": 64
458
  },
459
  {
460
- "epoch": 0.001576493272618131,
461
- "grad_norm": 1.0469675064086914,
462
- "learning_rate": 0.00012800000000000002,
463
- "loss": 1.654,
464
  "step": 65
465
  },
466
  {
467
- "epoch": 0.0016007470152737944,
468
- "grad_norm": 1.5673848390579224,
469
- "learning_rate": 0.00013000000000000002,
470
- "loss": 1.801,
471
  "step": 66
472
  },
473
  {
474
- "epoch": 0.001625000757929458,
475
- "grad_norm": 2.0134799480438232,
476
- "learning_rate": 0.000132,
477
- "loss": 2.0432,
478
  "step": 67
479
  },
480
  {
481
- "epoch": 0.0016492545005851215,
482
- "grad_norm": 7.747060775756836,
483
- "learning_rate": 0.000134,
484
- "loss": 1.4823,
485
  "step": 68
486
  },
487
  {
488
- "epoch": 0.0016735082432407851,
489
- "grad_norm": 3.2787840366363525,
490
- "learning_rate": 0.00013600000000000003,
491
- "loss": 1.4865,
492
  "step": 69
493
  },
494
  {
495
- "epoch": 0.0016977619858964486,
496
- "grad_norm": 4.277163028717041,
497
- "learning_rate": 0.000138,
498
- "loss": 1.4665,
499
  "step": 70
500
  },
501
  {
502
- "epoch": 0.0017220157285521122,
503
- "grad_norm": 11.562691688537598,
504
- "learning_rate": 0.00014,
505
- "loss": 2.0221,
506
  "step": 71
507
  },
508
  {
509
- "epoch": 0.0017462694712077757,
510
- "grad_norm": 2.5653882026672363,
511
- "learning_rate": 0.000142,
512
- "loss": 1.4631,
513
  "step": 72
514
  },
515
  {
516
- "epoch": 0.0017705232138634393,
517
- "grad_norm": 2.392688751220703,
518
- "learning_rate": 0.000144,
519
- "loss": 1.3514,
520
  "step": 73
521
  },
522
  {
523
- "epoch": 0.001794776956519103,
524
- "grad_norm": 3.874115467071533,
525
- "learning_rate": 0.000146,
526
- "loss": 1.5983,
527
  "step": 74
528
  },
529
  {
530
- "epoch": 0.0018190306991747664,
531
- "grad_norm": 3.085160732269287,
532
- "learning_rate": 0.000148,
533
- "loss": 1.321,
534
  "step": 75
535
  },
536
  {
537
- "epoch": 0.00184328444183043,
538
- "grad_norm": 1.7499727010726929,
539
- "learning_rate": 0.00015000000000000001,
540
- "loss": 1.385,
541
  "step": 76
542
  },
543
  {
544
- "epoch": 0.0018675381844860935,
545
- "grad_norm": 14.051904678344727,
546
- "learning_rate": 0.000152,
547
- "loss": 1.5748,
548
  "step": 77
549
  },
550
  {
551
- "epoch": 0.0018917919271417571,
552
- "grad_norm": 4.689906597137451,
553
- "learning_rate": 0.000154,
554
- "loss": 1.2126,
555
  "step": 78
556
  },
557
  {
558
- "epoch": 0.0019160456697974206,
559
- "grad_norm": 1.0368577241897583,
560
- "learning_rate": 0.00015600000000000002,
561
- "loss": 1.0542,
562
  "step": 79
563
  },
564
  {
565
- "epoch": 0.0019402994124530842,
566
- "grad_norm": 3.577094078063965,
567
- "learning_rate": 0.00015800000000000002,
568
- "loss": 1.9481,
569
  "step": 80
570
  },
571
  {
572
- "epoch": 0.001964553155108748,
573
- "grad_norm": NaN,
574
- "learning_rate": 0.00015800000000000002,
575
- "loss": 1.2913,
576
  "step": 81
577
  },
578
  {
579
- "epoch": 0.001988806897764411,
580
- "grad_norm": 4.969227313995361,
581
- "learning_rate": 0.00016,
582
- "loss": 1.5551,
583
  "step": 82
584
  },
585
  {
586
- "epoch": 0.0020130606404200747,
587
- "grad_norm": 71.41371154785156,
588
- "learning_rate": 0.000162,
589
- "loss": 1.5847,
590
  "step": 83
591
  },
592
  {
593
- "epoch": 0.0020373143830757384,
594
- "grad_norm": 0.9679685831069946,
595
- "learning_rate": 0.000164,
596
- "loss": 1.1516,
597
  "step": 84
598
  },
599
  {
600
- "epoch": 0.002061568125731402,
601
- "grad_norm": 7.050138473510742,
602
- "learning_rate": 0.000166,
603
- "loss": 1.6904,
604
  "step": 85
605
  },
606
  {
607
- "epoch": 0.0020858218683870657,
608
- "grad_norm": 1.4181028604507446,
609
- "learning_rate": 0.000168,
610
- "loss": 1.5385,
611
  "step": 86
612
  },
613
  {
614
- "epoch": 0.002110075611042729,
615
- "grad_norm": 1.122887134552002,
616
- "learning_rate": 0.00017,
617
- "loss": 1.282,
618
  "step": 87
619
  },
620
  {
621
- "epoch": 0.0021343293536983925,
622
- "grad_norm": 3.0532054901123047,
623
- "learning_rate": 0.000172,
624
- "loss": 1.7126,
625
  "step": 88
626
  },
627
  {
628
- "epoch": 0.002158583096354056,
629
- "grad_norm": 1.1368091106414795,
630
- "learning_rate": 0.000174,
631
- "loss": 1.3792,
632
  "step": 89
633
  },
634
  {
635
- "epoch": 0.00218283683900972,
636
- "grad_norm": 1.5556614398956299,
637
- "learning_rate": 0.00017600000000000002,
638
- "loss": 1.1266,
639
  "step": 90
640
  },
641
  {
642
- "epoch": 0.002207090581665383,
643
- "grad_norm": 2.5969090461730957,
644
- "learning_rate": 0.00017800000000000002,
645
- "loss": 1.1916,
646
  "step": 91
647
  },
648
  {
649
- "epoch": 0.0022313443243210467,
650
- "grad_norm": 7.9968719482421875,
651
- "learning_rate": 0.00018,
652
- "loss": 2.3005,
653
  "step": 92
654
  },
655
  {
656
- "epoch": 0.0022555980669767104,
657
- "grad_norm": 1.0678150653839111,
658
- "learning_rate": 0.000182,
659
- "loss": 1.1991,
660
  "step": 93
661
  },
662
  {
663
- "epoch": 0.002279851809632374,
664
- "grad_norm": 6.678997993469238,
665
- "learning_rate": 0.00018400000000000003,
666
- "loss": 1.9749,
667
  "step": 94
668
  },
669
  {
670
- "epoch": 0.0023041055522880377,
671
- "grad_norm": 2.6500961780548096,
672
- "learning_rate": 0.00018600000000000002,
673
- "loss": 1.7569,
674
  "step": 95
675
  },
676
  {
677
- "epoch": 0.002328359294943701,
678
- "grad_norm": 1.547878384590149,
679
- "learning_rate": 0.000188,
680
- "loss": 1.7286,
681
  "step": 96
682
  },
683
  {
684
- "epoch": 0.0023526130375993645,
685
- "grad_norm": 1.910861849784851,
686
- "learning_rate": 0.00019,
687
- "loss": 2.0815,
688
  "step": 97
689
  },
690
  {
691
- "epoch": 0.002376866780255028,
692
- "grad_norm": 3.678144693374634,
693
- "learning_rate": 0.000192,
694
- "loss": 1.7011,
695
  "step": 98
696
  },
697
  {
698
- "epoch": 0.002401120522910692,
699
- "grad_norm": 1.6496407985687256,
700
- "learning_rate": 0.000194,
701
- "loss": 1.8013,
702
  "step": 99
703
  },
704
  {
705
- "epoch": 0.002425374265566355,
706
- "grad_norm": 1.3245445489883423,
707
- "learning_rate": 0.000196,
708
- "loss": 1.5302,
709
  "step": 100
710
  },
711
  {
712
- "epoch": 0.0024496280082220187,
713
- "grad_norm": 1.609408974647522,
714
- "learning_rate": 0.00019800000000000002,
715
- "loss": 1.6082,
716
  "step": 101
717
  },
718
  {
719
- "epoch": 0.0024738817508776823,
720
- "grad_norm": 2.7757482528686523,
721
- "learning_rate": 0.0002,
722
- "loss": 1.6157,
723
  "step": 102
724
  },
725
  {
726
- "epoch": 0.002498135493533346,
727
- "grad_norm": 2.0618674755096436,
728
  "learning_rate": 0.00019800000000000002,
729
- "loss": 1.7212,
730
  "step": 103
731
  },
732
  {
733
- "epoch": 0.002522389236189009,
734
- "grad_norm": 1.7433160543441772,
735
- "learning_rate": 0.000196,
736
- "loss": 1.9133,
737
  "step": 104
738
  },
739
  {
740
- "epoch": 0.002546642978844673,
741
- "grad_norm": 1.4754176139831543,
742
- "learning_rate": 0.000194,
743
- "loss": 1.3311,
744
  "step": 105
745
  },
746
  {
747
- "epoch": 0.0025708967215003365,
748
- "grad_norm": 1.4527512788772583,
749
- "learning_rate": 0.000192,
750
- "loss": 1.6432,
751
  "step": 106
752
  },
753
  {
754
- "epoch": 0.002595150464156,
755
- "grad_norm": 5.3099045753479,
756
- "learning_rate": 0.00019,
757
- "loss": 1.3233,
758
  "step": 107
759
  },
760
  {
761
- "epoch": 0.002619404206811664,
762
- "grad_norm": 1.016135573387146,
763
- "learning_rate": 0.000188,
764
- "loss": 1.3291,
765
  "step": 108
766
  },
767
  {
768
- "epoch": 0.002643657949467327,
769
- "grad_norm": 1.0468915700912476,
770
- "learning_rate": 0.00018600000000000002,
771
  "loss": 1.4454,
772
  "step": 109
773
  },
774
  {
775
- "epoch": 0.0026679116921229907,
776
- "grad_norm": 3.245159387588501,
777
- "learning_rate": 0.00018400000000000003,
778
- "loss": 1.6501,
779
  "step": 110
780
  },
781
  {
782
- "epoch": 0.0026921654347786543,
783
- "grad_norm": 1.1628005504608154,
784
- "learning_rate": 0.000182,
785
- "loss": 1.399,
786
  "step": 111
787
  },
788
  {
789
- "epoch": 0.002716419177434318,
790
- "grad_norm": 3.3215761184692383,
791
- "learning_rate": 0.00018,
792
- "loss": 1.4829,
793
  "step": 112
794
  },
795
  {
796
- "epoch": 0.002740672920089981,
797
- "grad_norm": 1.424054503440857,
798
- "learning_rate": 0.00017800000000000002,
799
- "loss": 1.7692,
800
  "step": 113
801
  },
802
  {
803
- "epoch": 0.002764926662745645,
804
- "grad_norm": 1.2700444459915161,
805
- "learning_rate": 0.00017600000000000002,
806
- "loss": 1.6959,
807
  "step": 114
808
  },
809
  {
810
- "epoch": 0.0027891804054013085,
811
- "grad_norm": 1.262736201286316,
812
- "learning_rate": 0.000174,
813
- "loss": 1.5445,
814
  "step": 115
815
  },
816
  {
817
- "epoch": 0.002813434148056972,
818
- "grad_norm": 4.976276397705078,
819
- "learning_rate": 0.000172,
820
- "loss": 1.4571,
821
  "step": 116
822
  },
823
  {
824
- "epoch": 0.002837687890712636,
825
- "grad_norm": 1.2716983556747437,
826
- "learning_rate": 0.00017,
827
- "loss": 1.217,
828
  "step": 117
829
  },
830
  {
831
- "epoch": 0.002861941633368299,
832
- "grad_norm": 1.5143672227859497,
833
- "learning_rate": 0.000168,
834
- "loss": 1.3059,
835
  "step": 118
836
  },
837
  {
838
- "epoch": 0.0028861953760239627,
839
- "grad_norm": 0.9907928705215454,
840
- "learning_rate": 0.000166,
841
- "loss": 1.8035,
842
  "step": 119
843
  },
844
  {
845
- "epoch": 0.0029104491186796263,
846
- "grad_norm": 1.7163773775100708,
847
- "learning_rate": 0.000164,
848
- "loss": 2.1635,
849
  "step": 120
850
  },
851
  {
852
- "epoch": 0.00293470286133529,
853
- "grad_norm": 7.744179725646973,
854
- "learning_rate": 0.000162,
855
- "loss": 2.2964,
856
  "step": 121
857
  },
858
  {
859
- "epoch": 0.002958956603990953,
860
- "grad_norm": 1.101208209991455,
861
- "learning_rate": 0.00016,
862
- "loss": 1.46,
863
  "step": 122
864
  },
865
  {
866
- "epoch": 0.002983210346646617,
867
- "grad_norm": 0.7589418888092041,
868
- "learning_rate": 0.00015800000000000002,
869
- "loss": 1.1036,
870
  "step": 123
871
  },
872
  {
873
- "epoch": 0.0030074640893022805,
874
- "grad_norm": 4.626345634460449,
875
- "learning_rate": 0.00015600000000000002,
876
- "loss": 1.5257,
877
  "step": 124
878
  },
879
  {
880
- "epoch": 0.003031717831957944,
881
- "grad_norm": 1.5994218587875366,
882
- "learning_rate": 0.000154,
883
- "loss": 1.2856,
884
  "step": 125
885
  },
886
  {
887
- "epoch": 0.0030559715746136078,
888
- "grad_norm": 1.3893097639083862,
889
- "learning_rate": 0.000152,
890
- "loss": 1.3359,
891
  "step": 126
892
  },
893
  {
894
- "epoch": 0.003080225317269271,
895
- "grad_norm": 1.0536174774169922,
896
- "learning_rate": 0.00015000000000000001,
897
- "loss": 1.1718,
898
  "step": 127
899
  },
900
  {
901
- "epoch": 0.0031044790599249346,
902
- "grad_norm": 1.2898694276809692,
903
- "learning_rate": 0.000148,
904
- "loss": 1.2908,
905
  "step": 128
906
  },
907
  {
908
- "epoch": 0.0031287328025805983,
909
- "grad_norm": 1.6943238973617554,
910
- "learning_rate": 0.000146,
911
- "loss": 1.7723,
912
  "step": 129
913
  },
914
  {
915
- "epoch": 0.003152986545236262,
916
- "grad_norm": 1.2562038898468018,
917
- "learning_rate": 0.000144,
918
- "loss": 1.4476,
919
  "step": 130
920
  },
921
  {
922
- "epoch": 0.003177240287891925,
923
- "grad_norm": 1.0170366764068604,
924
- "learning_rate": 0.000142,
925
- "loss": 1.2967,
926
  "step": 131
927
  },
928
  {
929
- "epoch": 0.003201494030547589,
930
- "grad_norm": 5.646616458892822,
931
- "learning_rate": 0.00014,
932
- "loss": 1.4366,
933
  "step": 132
934
  },
935
  {
936
- "epoch": 0.0032257477732032525,
937
- "grad_norm": 1.29356849193573,
938
- "learning_rate": 0.000138,
939
- "loss": 1.4713,
940
  "step": 133
941
  },
942
  {
943
- "epoch": 0.003250001515858916,
944
- "grad_norm": 1.3575730323791504,
945
- "learning_rate": 0.00013600000000000003,
946
- "loss": 1.2233,
947
  "step": 134
948
  },
949
  {
950
- "epoch": 0.0032742552585145793,
951
- "grad_norm": 1.158686876296997,
952
- "learning_rate": 0.000134,
953
- "loss": 1.3622,
954
  "step": 135
955
  },
956
  {
957
- "epoch": 0.003298509001170243,
958
- "grad_norm": 1.4680081605911255,
959
- "learning_rate": 0.000132,
960
- "loss": 1.7191,
961
  "step": 136
962
  },
963
  {
964
- "epoch": 0.0033227627438259066,
965
- "grad_norm": 1.2448982000350952,
966
- "learning_rate": 0.00013000000000000002,
967
- "loss": 1.9376,
968
  "step": 137
969
  },
970
  {
971
- "epoch": 0.0033470164864815703,
972
- "grad_norm": 1.2404478788375854,
973
- "learning_rate": 0.00012800000000000002,
974
- "loss": 1.5334,
975
  "step": 138
976
  },
977
  {
978
- "epoch": 0.003371270229137234,
979
- "grad_norm": 0.9862974882125854,
980
- "learning_rate": 0.000126,
981
- "loss": 1.2073,
982
  "step": 139
983
  },
984
  {
985
- "epoch": 0.003395523971792897,
986
- "grad_norm": 7.36776876449585,
987
- "learning_rate": 0.000124,
988
- "loss": 1.5745,
989
  "step": 140
990
  },
991
  {
992
- "epoch": 0.003419777714448561,
993
- "grad_norm": 1.1356145143508911,
994
- "learning_rate": 0.000122,
995
- "loss": 1.563,
996
  "step": 141
997
  },
998
  {
999
- "epoch": 0.0034440314571042244,
1000
- "grad_norm": 2.073420286178589,
1001
- "learning_rate": 0.00012,
1002
- "loss": 1.7438,
1003
  "step": 142
1004
  },
1005
  {
1006
- "epoch": 0.003468285199759888,
1007
- "grad_norm": 3.029547691345215,
1008
- "learning_rate": 0.000118,
1009
- "loss": 1.6755,
1010
  "step": 143
1011
  },
1012
  {
1013
- "epoch": 0.0034925389424155513,
1014
- "grad_norm": 14.66256046295166,
1015
- "learning_rate": 0.000116,
1016
- "loss": 1.8375,
1017
  "step": 144
1018
  },
1019
  {
1020
- "epoch": 0.003516792685071215,
1021
- "grad_norm": 0.8765383958816528,
1022
- "learning_rate": 0.00011399999999999999,
1023
- "loss": 0.6782,
1024
  "step": 145
1025
  },
1026
  {
1027
- "epoch": 0.0035410464277268786,
1028
- "grad_norm": 3.167731761932373,
1029
- "learning_rate": 0.00011200000000000001,
1030
- "loss": 1.4619,
1031
  "step": 146
1032
  },
1033
  {
1034
- "epoch": 0.0035653001703825423,
1035
- "grad_norm": 1.1016606092453003,
1036
- "learning_rate": 0.00011000000000000002,
1037
- "loss": 1.5817,
1038
  "step": 147
1039
  },
1040
  {
1041
- "epoch": 0.003589553913038206,
1042
- "grad_norm": 2.5535902976989746,
1043
- "learning_rate": 0.00010800000000000001,
1044
- "loss": 1.3491,
1045
  "step": 148
1046
  },
1047
  {
1048
- "epoch": 0.003613807655693869,
1049
- "grad_norm": 1.1920926570892334,
1050
- "learning_rate": 0.00010600000000000002,
1051
- "loss": 1.5675,
1052
  "step": 149
1053
  },
1054
  {
1055
- "epoch": 0.0036380613983495328,
1056
- "grad_norm": 1.1023300886154175,
1057
- "learning_rate": 0.00010400000000000001,
1058
- "loss": 1.2821,
1059
  "step": 150
1060
  },
1061
  {
1062
- "epoch": 0.0036623151410051964,
1063
- "grad_norm": 2.1146063804626465,
1064
- "learning_rate": 0.00010200000000000001,
1065
- "loss": 1.5396,
1066
  "step": 151
1067
  },
1068
  {
1069
- "epoch": 0.00368656888366086,
1070
- "grad_norm": 1.1838195323944092,
1071
- "learning_rate": 0.0001,
1072
- "loss": 1.2992,
1073
  "step": 152
1074
  },
1075
  {
1076
- "epoch": 0.0037108226263165233,
1077
- "grad_norm": 1.414258360862732,
1078
- "learning_rate": 9.8e-05,
1079
- "loss": 1.6888,
1080
  "step": 153
1081
  },
1082
  {
1083
- "epoch": 0.003735076368972187,
1084
- "grad_norm": 0.8630995154380798,
1085
- "learning_rate": 9.6e-05,
1086
- "loss": 1.141,
1087
  "step": 154
1088
  },
1089
  {
1090
- "epoch": 0.0037593301116278506,
1091
- "grad_norm": 2.4763197898864746,
1092
- "learning_rate": 9.4e-05,
1093
- "loss": 1.5743,
1094
  "step": 155
1095
  },
1096
  {
1097
- "epoch": 0.0037835838542835142,
1098
- "grad_norm": 0.9457703828811646,
1099
- "learning_rate": 9.200000000000001e-05,
1100
- "loss": 1.3879,
1101
  "step": 156
1102
  },
1103
  {
1104
- "epoch": 0.003807837596939178,
1105
- "grad_norm": 1.308862328529358,
1106
- "learning_rate": 9e-05,
1107
- "loss": 1.1564,
1108
  "step": 157
1109
  },
1110
  {
1111
- "epoch": 0.003832091339594841,
1112
- "grad_norm": 1.208833932876587,
1113
- "learning_rate": 8.800000000000001e-05,
1114
- "loss": 1.2059,
1115
  "step": 158
1116
  },
1117
  {
1118
- "epoch": 0.0038563450822505048,
1119
- "grad_norm": 1.9145225286483765,
1120
- "learning_rate": 8.6e-05,
1121
- "loss": 1.4744,
1122
  "step": 159
1123
  },
1124
  {
1125
- "epoch": 0.0038805988249061684,
1126
- "grad_norm": 1.0121599435806274,
1127
- "learning_rate": 8.4e-05,
1128
- "loss": 1.8676,
1129
  "step": 160
1130
  },
1131
  {
1132
- "epoch": 0.003904852567561832,
1133
- "grad_norm": 1.9128226041793823,
1134
- "learning_rate": 8.2e-05,
1135
- "loss": 1.6948,
1136
  "step": 161
1137
  },
1138
  {
1139
- "epoch": 0.003929106310217496,
1140
- "grad_norm": 1.7783511877059937,
1141
- "learning_rate": 8e-05,
1142
- "loss": 1.4198,
1143
  "step": 162
1144
  },
1145
  {
1146
- "epoch": 0.003953360052873159,
1147
- "grad_norm": 0.9799928665161133,
1148
- "learning_rate": 7.800000000000001e-05,
1149
- "loss": 1.185,
1150
  "step": 163
1151
  },
1152
  {
1153
- "epoch": 0.003977613795528822,
1154
- "grad_norm": 1.5025819540023804,
1155
- "learning_rate": 7.6e-05,
1156
- "loss": 1.0638,
1157
  "step": 164
1158
  },
1159
  {
1160
- "epoch": 0.004001867538184486,
1161
- "grad_norm": 1.2552540302276611,
1162
- "learning_rate": 7.4e-05,
1163
- "loss": 1.3463,
1164
  "step": 165
1165
  },
1166
  {
1167
- "epoch": 0.0040261212808401494,
1168
- "grad_norm": 1.0616928339004517,
1169
- "learning_rate": 7.2e-05,
1170
- "loss": 1.9275,
1171
  "step": 166
1172
  },
1173
  {
1174
- "epoch": 0.004050375023495813,
1175
- "grad_norm": 2.8277781009674072,
1176
- "learning_rate": 7e-05,
1177
- "loss": 1.9653,
1178
  "step": 167
1179
  },
1180
  {
1181
- "epoch": 0.004074628766151477,
1182
- "grad_norm": 1.7752915620803833,
1183
- "learning_rate": 6.800000000000001e-05,
1184
- "loss": 1.2783,
1185
  "step": 168
1186
  },
1187
  {
1188
- "epoch": 0.00409888250880714,
1189
- "grad_norm": 2.4863102436065674,
1190
- "learning_rate": 6.6e-05,
1191
- "loss": 1.2803,
1192
  "step": 169
1193
  },
1194
  {
1195
- "epoch": 0.004123136251462804,
1196
- "grad_norm": 0.9784297943115234,
1197
- "learning_rate": 6.400000000000001e-05,
1198
- "loss": 1.0347,
1199
  "step": 170
1200
  },
1201
  {
1202
- "epoch": 0.004147389994118468,
1203
- "grad_norm": 2.0077602863311768,
1204
- "learning_rate": 6.2e-05,
1205
- "loss": 1.5035,
1206
  "step": 171
1207
  },
1208
  {
1209
- "epoch": 0.004171643736774131,
1210
- "grad_norm": 1.5943875312805176,
1211
- "learning_rate": 6e-05,
1212
- "loss": 1.7834,
1213
  "step": 172
1214
  },
1215
  {
1216
- "epoch": 0.004195897479429794,
1217
- "grad_norm": 1.0898661613464355,
1218
- "learning_rate": 5.8e-05,
1219
- "loss": 1.5043,
1220
  "step": 173
1221
  },
1222
  {
1223
- "epoch": 0.004220151222085458,
1224
- "grad_norm": 1.8181873559951782,
1225
- "learning_rate": 5.6000000000000006e-05,
1226
- "loss": 1.5026,
1227
  "step": 174
1228
  },
1229
  {
1230
- "epoch": 0.004244404964741121,
1231
- "grad_norm": 0.9227128028869629,
1232
- "learning_rate": 5.4000000000000005e-05,
1233
- "loss": 1.3945,
1234
  "step": 175
1235
  },
1236
  {
1237
- "epoch": 0.004268658707396785,
1238
- "grad_norm": 1.3131438493728638,
1239
- "learning_rate": 5.2000000000000004e-05,
1240
- "loss": 1.2629,
1241
  "step": 176
1242
  },
1243
  {
1244
- "epoch": 0.004292912450052449,
1245
- "grad_norm": 1.388433814048767,
1246
- "learning_rate": 5e-05,
1247
- "loss": 1.607,
1248
  "step": 177
1249
  },
1250
  {
1251
- "epoch": 0.004317166192708112,
1252
- "grad_norm": 1.5145083665847778,
1253
- "learning_rate": 4.8e-05,
1254
- "loss": 1.8671,
1255
  "step": 178
1256
  },
1257
  {
1258
- "epoch": 0.004341419935363776,
1259
- "grad_norm": 2.0021004676818848,
1260
- "learning_rate": 4.600000000000001e-05,
1261
- "loss": 2.0679,
1262
  "step": 179
1263
  },
1264
  {
1265
- "epoch": 0.00436567367801944,
1266
- "grad_norm": 1.8512483835220337,
1267
- "learning_rate": 4.4000000000000006e-05,
1268
- "loss": 1.3095,
1269
  "step": 180
1270
  },
1271
  {
1272
- "epoch": 0.004389927420675103,
1273
- "grad_norm": 1.0164090394973755,
1274
- "learning_rate": 4.2e-05,
1275
- "loss": 1.3749,
1276
  "step": 181
1277
  },
1278
  {
1279
- "epoch": 0.004414181163330766,
1280
- "grad_norm": 30.730253219604492,
1281
- "learning_rate": 4e-05,
1282
- "loss": 2.3106,
1283
  "step": 182
1284
  },
1285
  {
1286
- "epoch": 0.00443843490598643,
1287
- "grad_norm": 1.2311471700668335,
1288
- "learning_rate": 3.8e-05,
1289
- "loss": 1.1078,
1290
  "step": 183
1291
  },
1292
  {
1293
- "epoch": 0.004462688648642093,
1294
- "grad_norm": 0.9555554389953613,
1295
- "learning_rate": 3.6e-05,
1296
- "loss": 1.4175,
1297
  "step": 184
1298
  },
1299
  {
1300
- "epoch": 0.004486942391297757,
1301
- "grad_norm": 1.4511969089508057,
1302
- "learning_rate": 3.4000000000000007e-05,
1303
- "loss": 1.4099,
1304
  "step": 185
1305
  },
1306
  {
1307
- "epoch": 0.004511196133953421,
1308
- "grad_norm": 5.319721221923828,
1309
- "learning_rate": 3.2000000000000005e-05,
1310
- "loss": 1.2749,
1311
  "step": 186
1312
  },
1313
  {
1314
- "epoch": 0.004535449876609084,
1315
- "grad_norm": 1.8668482303619385,
1316
- "learning_rate": 3e-05,
1317
- "loss": 1.4658,
1318
  "step": 187
1319
  },
1320
  {
1321
- "epoch": 0.004559703619264748,
1322
- "grad_norm": 1.737425684928894,
1323
- "learning_rate": 2.8000000000000003e-05,
1324
- "loss": 1.8142,
1325
  "step": 188
1326
  },
1327
  {
1328
- "epoch": 0.004583957361920412,
1329
- "grad_norm": 12.860699653625488,
1330
- "learning_rate": 2.6000000000000002e-05,
1331
- "loss": 1.6338,
1332
  "step": 189
1333
  },
1334
  {
1335
- "epoch": 0.004608211104576075,
1336
- "grad_norm": 0.6315305233001709,
1337
- "learning_rate": 2.4e-05,
1338
- "loss": 1.0426,
1339
  "step": 190
1340
  },
1341
  {
1342
- "epoch": 0.004632464847231738,
1343
- "grad_norm": 1.0747138261795044,
1344
- "learning_rate": 2.2000000000000003e-05,
1345
- "loss": 1.4042,
1346
  "step": 191
1347
  },
1348
  {
1349
- "epoch": 0.004656718589887402,
1350
- "grad_norm": 1.1410670280456543,
1351
- "learning_rate": 2e-05,
1352
- "loss": 1.2148,
1353
  "step": 192
1354
  },
1355
  {
1356
- "epoch": 0.004680972332543065,
1357
- "grad_norm": 1.4486732482910156,
1358
- "learning_rate": 1.8e-05,
1359
- "loss": 1.3208,
1360
  "step": 193
1361
  },
1362
  {
1363
- "epoch": 0.004705226075198729,
1364
- "grad_norm": 2.5336716175079346,
1365
- "learning_rate": 1.6000000000000003e-05,
1366
- "loss": 1.7136,
1367
  "step": 194
1368
  },
1369
  {
1370
- "epoch": 0.004729479817854393,
1371
- "grad_norm": 3.4056637287139893,
1372
- "learning_rate": 1.4000000000000001e-05,
1373
- "loss": 1.5443,
1374
  "step": 195
1375
  },
1376
  {
1377
- "epoch": 0.004753733560510056,
1378
- "grad_norm": 1.1721996068954468,
1379
- "learning_rate": 1.2e-05,
1380
- "loss": 1.4518,
1381
  "step": 196
1382
  },
1383
  {
1384
- "epoch": 0.00477798730316572,
1385
- "grad_norm": 1.9326874017715454,
1386
- "learning_rate": 1e-05,
1387
- "loss": 1.6381,
1388
  "step": 197
1389
  },
1390
  {
1391
- "epoch": 0.004802241045821384,
1392
- "grad_norm": 1.0234136581420898,
1393
- "learning_rate": 8.000000000000001e-06,
1394
- "loss": 1.4116,
1395
  "step": 198
1396
  },
1397
  {
1398
- "epoch": 0.004826494788477047,
1399
- "grad_norm": 2.1815526485443115,
1400
- "learning_rate": 6e-06,
1401
- "loss": 1.7442,
1402
  "step": 199
1403
  },
1404
  {
1405
- "epoch": 0.00485074853113271,
1406
- "grad_norm": 1.9557406902313232,
1407
- "learning_rate": 4.000000000000001e-06,
1408
- "loss": 1.9568,
1409
  "step": 200
1410
  }
1411
  ],
@@ -1426,7 +1426,7 @@
1426
  "attributes": {}
1427
  }
1428
  },
1429
- "total_flos": 6034692870365184.0,
1430
  "train_batch_size": 1,
1431
  "trial_name": null,
1432
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.3189792663476874,
5
  "eval_steps": 500,
6
  "global_step": 200,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.001594896331738437,
13
+ "grad_norm": 9.529223442077637,
14
  "learning_rate": 2.0000000000000003e-06,
15
+ "loss": 3.8964,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.003189792663476874,
20
+ "grad_norm": 3.4800214767456055,
21
  "learning_rate": 4.000000000000001e-06,
22
+ "loss": 2.2247,
23
  "step": 2
24
  },
25
  {
26
+ "epoch": 0.004784688995215311,
27
+ "grad_norm": 3.026850461959839,
28
  "learning_rate": 6e-06,
29
+ "loss": 2.2337,
30
  "step": 3
31
  },
32
  {
33
+ "epoch": 0.006379585326953748,
34
+ "grad_norm": 7.065993785858154,
35
  "learning_rate": 8.000000000000001e-06,
36
+ "loss": 1.9733,
37
  "step": 4
38
  },
39
  {
40
+ "epoch": 0.007974481658692184,
41
+ "grad_norm": 5.338789463043213,
42
  "learning_rate": 1e-05,
43
+ "loss": 2.2808,
44
  "step": 5
45
  },
46
  {
47
+ "epoch": 0.009569377990430622,
48
+ "grad_norm": 12.933262825012207,
49
  "learning_rate": 1.2e-05,
50
+ "loss": 2.2984,
51
  "step": 6
52
  },
53
  {
54
+ "epoch": 0.011164274322169059,
55
+ "grad_norm": 3.535792350769043,
56
  "learning_rate": 1.4000000000000001e-05,
57
+ "loss": 2.6236,
58
  "step": 7
59
  },
60
  {
61
+ "epoch": 0.012759170653907496,
62
+ "grad_norm": 7.564616680145264,
63
  "learning_rate": 1.6000000000000003e-05,
64
+ "loss": 2.4849,
65
  "step": 8
66
  },
67
  {
68
+ "epoch": 0.014354066985645933,
69
+ "grad_norm": 7.379055976867676,
70
  "learning_rate": 1.8e-05,
71
+ "loss": 2.9017,
72
  "step": 9
73
  },
74
  {
75
+ "epoch": 0.01594896331738437,
76
+ "grad_norm": 6.781640529632568,
77
  "learning_rate": 2e-05,
78
+ "loss": 2.5836,
79
  "step": 10
80
  },
81
  {
82
+ "epoch": 0.017543859649122806,
83
+ "grad_norm": Infinity,
84
+ "learning_rate": 2e-05,
85
+ "loss": 2.6482,
86
  "step": 11
87
  },
88
  {
89
+ "epoch": 0.019138755980861243,
90
+ "grad_norm": 9.666836738586426,
91
+ "learning_rate": 2.2000000000000003e-05,
92
+ "loss": 1.5775,
93
  "step": 12
94
  },
95
  {
96
+ "epoch": 0.02073365231259968,
97
+ "grad_norm": 7.910143852233887,
98
+ "learning_rate": 2.4e-05,
99
+ "loss": 2.259,
100
  "step": 13
101
  },
102
  {
103
+ "epoch": 0.022328548644338118,
104
+ "grad_norm": 5.05098295211792,
105
+ "learning_rate": 2.6000000000000002e-05,
106
+ "loss": 2.1146,
107
  "step": 14
108
  },
109
  {
110
+ "epoch": 0.023923444976076555,
111
+ "grad_norm": 4.457101345062256,
112
+ "learning_rate": 2.8000000000000003e-05,
113
+ "loss": 2.1058,
114
  "step": 15
115
  },
116
  {
117
+ "epoch": 0.025518341307814992,
118
+ "grad_norm": NaN,
119
+ "learning_rate": 2.8000000000000003e-05,
120
+ "loss": 2.4278,
121
  "step": 16
122
  },
123
  {
124
+ "epoch": 0.02711323763955343,
125
+ "grad_norm": 4.420550346374512,
126
+ "learning_rate": 3e-05,
127
+ "loss": 1.9174,
128
  "step": 17
129
  },
130
  {
131
+ "epoch": 0.028708133971291867,
132
+ "grad_norm": 9.368809700012207,
133
+ "learning_rate": 3.2000000000000005e-05,
134
+ "loss": 2.0925,
135
  "step": 18
136
  },
137
  {
138
+ "epoch": 0.030303030303030304,
139
+ "grad_norm": 4.382827281951904,
140
+ "learning_rate": 3.4000000000000007e-05,
141
+ "loss": 1.3716,
142
  "step": 19
143
  },
144
  {
145
+ "epoch": 0.03189792663476874,
146
+ "grad_norm": 24.012392044067383,
147
+ "learning_rate": 3.6e-05,
148
+ "loss": 2.348,
149
  "step": 20
150
  },
151
  {
152
+ "epoch": 0.03349282296650718,
153
+ "grad_norm": 2.227252721786499,
154
+ "learning_rate": 3.8e-05,
155
+ "loss": 2.0678,
156
  "step": 21
157
  },
158
  {
159
+ "epoch": 0.03508771929824561,
160
+ "grad_norm": 6.851357460021973,
161
+ "learning_rate": 4e-05,
162
+ "loss": 3.2008,
163
  "step": 22
164
  },
165
  {
166
+ "epoch": 0.03668261562998405,
167
+ "grad_norm": 5.75844144821167,
168
+ "learning_rate": 4.2e-05,
169
+ "loss": 1.8704,
170
  "step": 23
171
  },
172
  {
173
+ "epoch": 0.03827751196172249,
174
+ "grad_norm": 14.692899703979492,
175
+ "learning_rate": 4.4000000000000006e-05,
176
+ "loss": 2.9055,
177
  "step": 24
178
  },
179
  {
180
+ "epoch": 0.03987240829346093,
181
+ "grad_norm": NaN,
182
+ "learning_rate": 4.4000000000000006e-05,
183
+ "loss": 2.4497,
184
  "step": 25
185
  },
186
  {
187
+ "epoch": 0.04146730462519936,
188
+ "grad_norm": NaN,
189
+ "learning_rate": 4.4000000000000006e-05,
190
+ "loss": 2.4382,
191
  "step": 26
192
  },
193
  {
194
+ "epoch": 0.0430622009569378,
195
+ "grad_norm": 6.065037727355957,
196
+ "learning_rate": 4.600000000000001e-05,
197
+ "loss": 2.105,
198
  "step": 27
199
  },
200
  {
201
+ "epoch": 0.044657097288676235,
202
+ "grad_norm": 4.765100479125977,
203
+ "learning_rate": 4.8e-05,
204
+ "loss": 2.5292,
205
  "step": 28
206
  },
207
  {
208
+ "epoch": 0.046251993620414676,
209
+ "grad_norm": 10.693521499633789,
210
+ "learning_rate": 5e-05,
211
+ "loss": 1.4471,
212
  "step": 29
213
  },
214
  {
215
+ "epoch": 0.04784688995215311,
216
+ "grad_norm": 5.703505992889404,
217
+ "learning_rate": 5.2000000000000004e-05,
218
+ "loss": 1.8759,
219
  "step": 30
220
  },
221
  {
222
+ "epoch": 0.049441786283891544,
223
+ "grad_norm": 6.594554424285889,
224
+ "learning_rate": 5.4000000000000005e-05,
225
+ "loss": 2.3555,
226
  "step": 31
227
  },
228
  {
229
+ "epoch": 0.051036682615629984,
230
+ "grad_norm": 10.944120407104492,
231
+ "learning_rate": 5.6000000000000006e-05,
232
+ "loss": 2.6607,
233
  "step": 32
234
  },
235
  {
236
+ "epoch": 0.05263157894736842,
237
+ "grad_norm": 4.437515735626221,
238
+ "learning_rate": 5.8e-05,
239
+ "loss": 2.1366,
240
  "step": 33
241
  },
242
  {
243
+ "epoch": 0.05422647527910686,
244
+ "grad_norm": 18.751192092895508,
245
+ "learning_rate": 6e-05,
246
+ "loss": 2.8504,
247
  "step": 34
248
  },
249
  {
250
+ "epoch": 0.05582137161084529,
251
+ "grad_norm": 4.625219821929932,
252
+ "learning_rate": 6.2e-05,
253
+ "loss": 1.7409,
254
  "step": 35
255
  },
256
  {
257
+ "epoch": 0.05741626794258373,
258
+ "grad_norm": 5.256422996520996,
259
+ "learning_rate": 6.400000000000001e-05,
260
+ "loss": 2.965,
261
  "step": 36
262
  },
263
  {
264
+ "epoch": 0.05901116427432217,
265
+ "grad_norm": 5.383383274078369,
266
+ "learning_rate": 6.6e-05,
267
+ "loss": 2.5555,
268
  "step": 37
269
  },
270
  {
271
+ "epoch": 0.06060606060606061,
272
+ "grad_norm": 20.616491317749023,
273
+ "learning_rate": 6.800000000000001e-05,
274
+ "loss": 2.1693,
275
  "step": 38
276
  },
277
  {
278
+ "epoch": 0.06220095693779904,
279
+ "grad_norm": 14.188122749328613,
280
+ "learning_rate": 7e-05,
281
+ "loss": 3.2098,
282
  "step": 39
283
  },
284
  {
285
+ "epoch": 0.06379585326953748,
286
+ "grad_norm": 7.294703483581543,
287
+ "learning_rate": 7.2e-05,
288
+ "loss": 2.8137,
289
  "step": 40
290
  },
291
  {
292
+ "epoch": 0.06539074960127592,
293
+ "grad_norm": 16.109020233154297,
294
+ "learning_rate": 7.4e-05,
295
+ "loss": 3.4821,
296
  "step": 41
297
  },
298
  {
299
+ "epoch": 0.06698564593301436,
300
+ "grad_norm": 12.77609920501709,
301
+ "learning_rate": 7.6e-05,
302
+ "loss": 3.2586,
303
  "step": 42
304
  },
305
  {
306
+ "epoch": 0.0685805422647528,
307
+ "grad_norm": 7.247976303100586,
308
+ "learning_rate": 7.800000000000001e-05,
309
+ "loss": 2.1131,
310
  "step": 43
311
  },
312
  {
313
+ "epoch": 0.07017543859649122,
314
+ "grad_norm": 3.0302693843841553,
315
+ "learning_rate": 8e-05,
316
+ "loss": 2.3047,
317
  "step": 44
318
  },
319
  {
320
+ "epoch": 0.07177033492822966,
321
+ "grad_norm": 6.998560905456543,
322
+ "learning_rate": 8.2e-05,
323
+ "loss": 1.9685,
324
  "step": 45
325
  },
326
  {
327
+ "epoch": 0.0733652312599681,
328
+ "grad_norm": 11.82168197631836,
329
+ "learning_rate": 8.4e-05,
330
+ "loss": 2.412,
331
  "step": 46
332
  },
333
  {
334
+ "epoch": 0.07496012759170653,
335
+ "grad_norm": 2.4560766220092773,
336
+ "learning_rate": 8.6e-05,
337
+ "loss": 1.5546,
338
  "step": 47
339
  },
340
  {
341
+ "epoch": 0.07655502392344497,
342
+ "grad_norm": 7.5531110763549805,
343
+ "learning_rate": 8.800000000000001e-05,
344
+ "loss": 2.0706,
345
  "step": 48
346
  },
347
  {
348
+ "epoch": 0.07814992025518341,
349
+ "grad_norm": 9.41441535949707,
350
+ "learning_rate": 9e-05,
351
+ "loss": 2.0769,
352
  "step": 49
353
  },
354
  {
355
+ "epoch": 0.07974481658692185,
356
+ "grad_norm": 5.027219295501709,
357
+ "learning_rate": 9.200000000000001e-05,
358
+ "loss": 1.0112,
359
  "step": 50
360
  },
361
  {
362
+ "epoch": 0.08133971291866028,
363
+ "grad_norm": 4.590622901916504,
364
+ "learning_rate": 9.4e-05,
365
+ "loss": 2.4014,
366
  "step": 51
367
  },
368
  {
369
+ "epoch": 0.08293460925039872,
370
+ "grad_norm": 110.29186248779297,
371
+ "learning_rate": 9.6e-05,
372
+ "loss": 2.2312,
373
  "step": 52
374
  },
375
  {
376
+ "epoch": 0.08452950558213716,
377
+ "grad_norm": 13.907599449157715,
378
+ "learning_rate": 9.8e-05,
379
+ "loss": 3.74,
380
  "step": 53
381
  },
382
  {
383
+ "epoch": 0.0861244019138756,
384
+ "grad_norm": 21.45476722717285,
385
+ "learning_rate": 0.0001,
386
+ "loss": 2.2449,
387
  "step": 54
388
  },
389
  {
390
+ "epoch": 0.08771929824561403,
391
+ "grad_norm": 26.367050170898438,
392
+ "learning_rate": 0.00010200000000000001,
393
+ "loss": 2.4117,
394
  "step": 55
395
  },
396
  {
397
+ "epoch": 0.08931419457735247,
398
+ "grad_norm": 11.473040580749512,
399
+ "learning_rate": 0.00010400000000000001,
400
+ "loss": 1.9575,
401
  "step": 56
402
  },
403
  {
404
+ "epoch": 0.09090909090909091,
405
+ "grad_norm": 7.996298789978027,
406
+ "learning_rate": 0.00010600000000000002,
407
+ "loss": 2.2506,
408
  "step": 57
409
  },
410
  {
411
+ "epoch": 0.09250398724082935,
412
+ "grad_norm": 5.780291557312012,
413
+ "learning_rate": 0.00010800000000000001,
414
+ "loss": 1.685,
415
  "step": 58
416
  },
417
  {
418
+ "epoch": 0.09409888357256778,
419
+ "grad_norm": 14.359979629516602,
420
+ "learning_rate": 0.00011000000000000002,
421
+ "loss": 2.1997,
422
  "step": 59
423
  },
424
  {
425
+ "epoch": 0.09569377990430622,
426
+ "grad_norm": 2.4853787422180176,
427
+ "learning_rate": 0.00011200000000000001,
428
+ "loss": 1.2731,
429
  "step": 60
430
  },
431
  {
432
+ "epoch": 0.09728867623604466,
433
+ "grad_norm": 7.136777877807617,
434
+ "learning_rate": 0.00011399999999999999,
435
+ "loss": 2.1987,
436
  "step": 61
437
  },
438
  {
439
+ "epoch": 0.09888357256778309,
440
+ "grad_norm": 7.219399929046631,
441
+ "learning_rate": 0.000116,
442
+ "loss": 2.3118,
443
  "step": 62
444
  },
445
  {
446
+ "epoch": 0.10047846889952153,
447
+ "grad_norm": 15.516590118408203,
448
+ "learning_rate": 0.000118,
449
+ "loss": 2.2534,
450
  "step": 63
451
  },
452
  {
453
+ "epoch": 0.10207336523125997,
454
+ "grad_norm": 4.05905818939209,
455
+ "learning_rate": 0.00012,
456
+ "loss": 2.0509,
457
  "step": 64
458
  },
459
  {
460
+ "epoch": 0.10366826156299841,
461
+ "grad_norm": 51.53736877441406,
462
+ "learning_rate": 0.000122,
463
+ "loss": 3.7686,
464
  "step": 65
465
  },
466
  {
467
+ "epoch": 0.10526315789473684,
468
+ "grad_norm": 4.316274642944336,
469
+ "learning_rate": 0.000124,
470
+ "loss": 1.332,
471
  "step": 66
472
  },
473
  {
474
+ "epoch": 0.10685805422647528,
475
+ "grad_norm": 47.97142028808594,
476
+ "learning_rate": 0.000126,
477
+ "loss": 1.4256,
478
  "step": 67
479
  },
480
  {
481
+ "epoch": 0.10845295055821372,
482
+ "grad_norm": 10.22885513305664,
483
+ "learning_rate": 0.00012800000000000002,
484
+ "loss": 1.4572,
485
  "step": 68
486
  },
487
  {
488
+ "epoch": 0.11004784688995216,
489
+ "grad_norm": 5.966179370880127,
490
+ "learning_rate": 0.00013000000000000002,
491
+ "loss": 1.3998,
492
  "step": 69
493
  },
494
  {
495
+ "epoch": 0.11164274322169059,
496
+ "grad_norm": 3.413966178894043,
497
+ "learning_rate": 0.000132,
498
+ "loss": 1.6306,
499
  "step": 70
500
  },
501
  {
502
+ "epoch": 0.11323763955342903,
503
+ "grad_norm": 197.0709991455078,
504
+ "learning_rate": 0.000134,
505
+ "loss": 1.6587,
506
  "step": 71
507
  },
508
  {
509
+ "epoch": 0.11483253588516747,
510
+ "grad_norm": 12.735696792602539,
511
+ "learning_rate": 0.00013600000000000003,
512
+ "loss": 2.0253,
513
  "step": 72
514
  },
515
  {
516
+ "epoch": 0.11642743221690591,
517
+ "grad_norm": 14.38369083404541,
518
+ "learning_rate": 0.000138,
519
+ "loss": 2.0833,
520
  "step": 73
521
  },
522
  {
523
+ "epoch": 0.11802232854864433,
524
+ "grad_norm": 5.746401786804199,
525
+ "learning_rate": 0.00014,
526
+ "loss": 1.959,
527
  "step": 74
528
  },
529
  {
530
+ "epoch": 0.11961722488038277,
531
+ "grad_norm": 4.266767501831055,
532
+ "learning_rate": 0.000142,
533
+ "loss": 1.5752,
534
  "step": 75
535
  },
536
  {
537
+ "epoch": 0.12121212121212122,
538
+ "grad_norm": 16.618648529052734,
539
+ "learning_rate": 0.000144,
540
+ "loss": 3.0947,
541
  "step": 76
542
  },
543
  {
544
+ "epoch": 0.12280701754385964,
545
+ "grad_norm": 2.3666510581970215,
546
+ "learning_rate": 0.000146,
547
+ "loss": 2.0715,
548
  "step": 77
549
  },
550
  {
551
+ "epoch": 0.12440191387559808,
552
+ "grad_norm": 6.622753143310547,
553
+ "learning_rate": 0.000148,
554
+ "loss": 1.8844,
555
  "step": 78
556
  },
557
  {
558
+ "epoch": 0.12599681020733652,
559
+ "grad_norm": 43.816375732421875,
560
+ "learning_rate": 0.00015000000000000001,
561
+ "loss": 1.9108,
562
  "step": 79
563
  },
564
  {
565
+ "epoch": 0.12759170653907495,
566
+ "grad_norm": 6.682168006896973,
567
+ "learning_rate": 0.000152,
568
+ "loss": 1.5588,
569
  "step": 80
570
  },
571
  {
572
+ "epoch": 0.1291866028708134,
573
+ "grad_norm": 3.0997276306152344,
574
+ "learning_rate": 0.000154,
575
+ "loss": 2.1988,
576
  "step": 81
577
  },
578
  {
579
+ "epoch": 0.13078149920255183,
580
+ "grad_norm": 4.309176445007324,
581
+ "learning_rate": 0.00015600000000000002,
582
+ "loss": 1.406,
583
  "step": 82
584
  },
585
  {
586
+ "epoch": 0.13237639553429026,
587
+ "grad_norm": 5.416724681854248,
588
+ "learning_rate": 0.00015800000000000002,
589
+ "loss": 1.576,
590
  "step": 83
591
  },
592
  {
593
+ "epoch": 0.1339712918660287,
594
+ "grad_norm": 4.361196517944336,
595
+ "learning_rate": 0.00016,
596
+ "loss": 2.2985,
597
  "step": 84
598
  },
599
  {
600
+ "epoch": 0.13556618819776714,
601
+ "grad_norm": 7.655956745147705,
602
+ "learning_rate": 0.000162,
603
+ "loss": 1.3693,
604
  "step": 85
605
  },
606
  {
607
+ "epoch": 0.1371610845295056,
608
+ "grad_norm": 5.973109245300293,
609
+ "learning_rate": 0.000164,
610
+ "loss": 2.4914,
611
  "step": 86
612
  },
613
  {
614
+ "epoch": 0.13875598086124402,
615
+ "grad_norm": 5.1589436531066895,
616
+ "learning_rate": 0.000166,
617
+ "loss": 1.6638,
618
  "step": 87
619
  },
620
  {
621
+ "epoch": 0.14035087719298245,
622
+ "grad_norm": 6.356327056884766,
623
+ "learning_rate": 0.000168,
624
+ "loss": 1.381,
625
  "step": 88
626
  },
627
  {
628
+ "epoch": 0.1419457735247209,
629
+ "grad_norm": 1.5582388639450073,
630
+ "learning_rate": 0.00017,
631
+ "loss": 1.4864,
632
  "step": 89
633
  },
634
  {
635
+ "epoch": 0.14354066985645933,
636
+ "grad_norm": 2.830798387527466,
637
+ "learning_rate": 0.000172,
638
+ "loss": 1.897,
639
  "step": 90
640
  },
641
  {
642
+ "epoch": 0.14513556618819776,
643
+ "grad_norm": 20.24082374572754,
644
+ "learning_rate": 0.000174,
645
+ "loss": 1.7698,
646
  "step": 91
647
  },
648
  {
649
+ "epoch": 0.1467304625199362,
650
+ "grad_norm": 35.429466247558594,
651
+ "learning_rate": 0.00017600000000000002,
652
+ "loss": 3.6051,
653
  "step": 92
654
  },
655
  {
656
+ "epoch": 0.14832535885167464,
657
+ "grad_norm": 9.8428373336792,
658
+ "learning_rate": 0.00017800000000000002,
659
+ "loss": 1.9402,
660
  "step": 93
661
  },
662
  {
663
+ "epoch": 0.14992025518341306,
664
+ "grad_norm": 3.7131340503692627,
665
+ "learning_rate": 0.00018,
666
+ "loss": 2.2192,
667
  "step": 94
668
  },
669
  {
670
+ "epoch": 0.15151515151515152,
671
+ "grad_norm": 8.838521003723145,
672
+ "learning_rate": 0.000182,
673
+ "loss": 2.0167,
674
  "step": 95
675
  },
676
  {
677
+ "epoch": 0.15311004784688995,
678
+ "grad_norm": 9.63149356842041,
679
+ "learning_rate": 0.00018400000000000003,
680
+ "loss": 1.6169,
681
  "step": 96
682
  },
683
  {
684
+ "epoch": 0.1547049441786284,
685
+ "grad_norm": 6.012756824493408,
686
+ "learning_rate": 0.00018600000000000002,
687
+ "loss": 1.4477,
688
  "step": 97
689
  },
690
  {
691
+ "epoch": 0.15629984051036683,
692
+ "grad_norm": 9.02092456817627,
693
+ "learning_rate": 0.000188,
694
+ "loss": 1.5999,
695
  "step": 98
696
  },
697
  {
698
+ "epoch": 0.15789473684210525,
699
+ "grad_norm": 5.8646416664123535,
700
+ "learning_rate": 0.00019,
701
+ "loss": 1.2515,
702
  "step": 99
703
  },
704
  {
705
+ "epoch": 0.1594896331738437,
706
+ "grad_norm": 3.3447864055633545,
707
+ "learning_rate": 0.000192,
708
+ "loss": 1.257,
709
  "step": 100
710
  },
711
  {
712
+ "epoch": 0.16108452950558214,
713
+ "grad_norm": 8.706202507019043,
714
+ "learning_rate": 0.000194,
715
+ "loss": 2.0047,
716
  "step": 101
717
  },
718
  {
719
+ "epoch": 0.16267942583732056,
720
+ "grad_norm": 17.863378524780273,
721
+ "learning_rate": 0.000196,
722
+ "loss": 2.421,
723
  "step": 102
724
  },
725
  {
726
+ "epoch": 0.16427432216905902,
727
+ "grad_norm": 5.376917362213135,
728
  "learning_rate": 0.00019800000000000002,
729
+ "loss": 2.1448,
730
  "step": 103
731
  },
732
  {
733
+ "epoch": 0.16586921850079744,
734
+ "grad_norm": 3.9824860095977783,
735
+ "learning_rate": 0.0002,
736
+ "loss": 1.7987,
737
  "step": 104
738
  },
739
  {
740
+ "epoch": 0.1674641148325359,
741
+ "grad_norm": 6.7771382331848145,
742
+ "learning_rate": 0.00019800000000000002,
743
+ "loss": 1.8173,
744
  "step": 105
745
  },
746
  {
747
+ "epoch": 0.16905901116427433,
748
+ "grad_norm": 6.204737186431885,
749
+ "learning_rate": 0.000196,
750
+ "loss": 1.2615,
751
  "step": 106
752
  },
753
  {
754
+ "epoch": 0.17065390749601275,
755
+ "grad_norm": 5.108335018157959,
756
+ "learning_rate": 0.000194,
757
+ "loss": 1.6966,
758
  "step": 107
759
  },
760
  {
761
+ "epoch": 0.1722488038277512,
762
+ "grad_norm": 51.948585510253906,
763
+ "learning_rate": 0.000192,
764
+ "loss": 2.8226,
765
  "step": 108
766
  },
767
  {
768
+ "epoch": 0.17384370015948963,
769
+ "grad_norm": 3.510647773742676,
770
+ "learning_rate": 0.00019,
771
  "loss": 1.4454,
772
  "step": 109
773
  },
774
  {
775
+ "epoch": 0.17543859649122806,
776
+ "grad_norm": 5.786037921905518,
777
+ "learning_rate": 0.000188,
778
+ "loss": 1.3979,
779
  "step": 110
780
  },
781
  {
782
+ "epoch": 0.17703349282296652,
783
+ "grad_norm": 5.4267964363098145,
784
+ "learning_rate": 0.00018600000000000002,
785
+ "loss": 1.8775,
786
  "step": 111
787
  },
788
  {
789
+ "epoch": 0.17862838915470494,
790
+ "grad_norm": 7.448083400726318,
791
+ "learning_rate": 0.00018400000000000003,
792
+ "loss": 1.8377,
793
  "step": 112
794
  },
795
  {
796
+ "epoch": 0.18022328548644337,
797
+ "grad_norm": 6.855521202087402,
798
+ "learning_rate": 0.000182,
799
+ "loss": 1.7079,
800
  "step": 113
801
  },
802
  {
803
+ "epoch": 0.18181818181818182,
804
+ "grad_norm": 6.257575988769531,
805
+ "learning_rate": 0.00018,
806
+ "loss": 2.2364,
807
  "step": 114
808
  },
809
  {
810
+ "epoch": 0.18341307814992025,
811
+ "grad_norm": 15.47193431854248,
812
+ "learning_rate": 0.00017800000000000002,
813
+ "loss": 1.7272,
814
  "step": 115
815
  },
816
  {
817
+ "epoch": 0.1850079744816587,
818
+ "grad_norm": 12.51389217376709,
819
+ "learning_rate": 0.00017600000000000002,
820
+ "loss": 1.8385,
821
  "step": 116
822
  },
823
  {
824
+ "epoch": 0.18660287081339713,
825
+ "grad_norm": 8.150806427001953,
826
+ "learning_rate": 0.000174,
827
+ "loss": 2.429,
828
  "step": 117
829
  },
830
  {
831
+ "epoch": 0.18819776714513556,
832
+ "grad_norm": 29.065210342407227,
833
+ "learning_rate": 0.000172,
834
+ "loss": 1.3318,
835
  "step": 118
836
  },
837
  {
838
+ "epoch": 0.189792663476874,
839
+ "grad_norm": 5.457766532897949,
840
+ "learning_rate": 0.00017,
841
+ "loss": 1.2329,
842
  "step": 119
843
  },
844
  {
845
+ "epoch": 0.19138755980861244,
846
+ "grad_norm": 3.7080814838409424,
847
+ "learning_rate": 0.000168,
848
+ "loss": 1.1145,
849
  "step": 120
850
  },
851
  {
852
+ "epoch": 0.19298245614035087,
853
+ "grad_norm": 5.2060065269470215,
854
+ "learning_rate": 0.000166,
855
+ "loss": 1.6808,
856
  "step": 121
857
  },
858
  {
859
+ "epoch": 0.19457735247208932,
860
+ "grad_norm": 15.03097152709961,
861
+ "learning_rate": 0.000164,
862
+ "loss": 2.207,
863
  "step": 122
864
  },
865
  {
866
+ "epoch": 0.19617224880382775,
867
+ "grad_norm": 3.1523945331573486,
868
+ "learning_rate": 0.000162,
869
+ "loss": 0.8121,
870
  "step": 123
871
  },
872
  {
873
+ "epoch": 0.19776714513556617,
874
+ "grad_norm": 12.531198501586914,
875
+ "learning_rate": 0.00016,
876
+ "loss": 1.6493,
877
  "step": 124
878
  },
879
  {
880
+ "epoch": 0.19936204146730463,
881
+ "grad_norm": 18.13838768005371,
882
+ "learning_rate": 0.00015800000000000002,
883
+ "loss": 1.9895,
884
  "step": 125
885
  },
886
  {
887
+ "epoch": 0.20095693779904306,
888
+ "grad_norm": 3.7068324089050293,
889
+ "learning_rate": 0.00015600000000000002,
890
+ "loss": 1.2397,
891
  "step": 126
892
  },
893
  {
894
+ "epoch": 0.2025518341307815,
895
+ "grad_norm": 3.565275192260742,
896
+ "learning_rate": 0.000154,
897
+ "loss": 1.1711,
898
  "step": 127
899
  },
900
  {
901
+ "epoch": 0.20414673046251994,
902
+ "grad_norm": 2.20239520072937,
903
+ "learning_rate": 0.000152,
904
+ "loss": 1.8436,
905
  "step": 128
906
  },
907
  {
908
+ "epoch": 0.20574162679425836,
909
+ "grad_norm": 3.103001117706299,
910
+ "learning_rate": 0.00015000000000000001,
911
+ "loss": 1.0085,
912
  "step": 129
913
  },
914
  {
915
+ "epoch": 0.20733652312599682,
916
+ "grad_norm": 3.8708302974700928,
917
+ "learning_rate": 0.000148,
918
+ "loss": 1.0336,
919
  "step": 130
920
  },
921
  {
922
+ "epoch": 0.20893141945773525,
923
+ "grad_norm": 4.008622646331787,
924
+ "learning_rate": 0.000146,
925
+ "loss": 1.7669,
926
  "step": 131
927
  },
928
  {
929
+ "epoch": 0.21052631578947367,
930
+ "grad_norm": 3.847217559814453,
931
+ "learning_rate": 0.000144,
932
+ "loss": 1.5163,
933
  "step": 132
934
  },
935
  {
936
+ "epoch": 0.21212121212121213,
937
+ "grad_norm": 9.75152587890625,
938
+ "learning_rate": 0.000142,
939
+ "loss": 2.3286,
940
  "step": 133
941
  },
942
  {
943
+ "epoch": 0.21371610845295055,
944
+ "grad_norm": 9.416215896606445,
945
+ "learning_rate": 0.00014,
946
+ "loss": 2.0397,
947
  "step": 134
948
  },
949
  {
950
+ "epoch": 0.215311004784689,
951
+ "grad_norm": 8.208052635192871,
952
+ "learning_rate": 0.000138,
953
+ "loss": 1.7927,
954
  "step": 135
955
  },
956
  {
957
+ "epoch": 0.21690590111642744,
958
+ "grad_norm": 11.827594757080078,
959
+ "learning_rate": 0.00013600000000000003,
960
+ "loss": 2.1958,
961
  "step": 136
962
  },
963
  {
964
+ "epoch": 0.21850079744816586,
965
+ "grad_norm": 5.788820266723633,
966
+ "learning_rate": 0.000134,
967
+ "loss": 1.8336,
968
  "step": 137
969
  },
970
  {
971
+ "epoch": 0.22009569377990432,
972
+ "grad_norm": 3.6653785705566406,
973
+ "learning_rate": 0.000132,
974
+ "loss": 1.0477,
975
  "step": 138
976
  },
977
  {
978
+ "epoch": 0.22169059011164274,
979
+ "grad_norm": 63.57593536376953,
980
+ "learning_rate": 0.00013000000000000002,
981
+ "loss": 1.5667,
982
  "step": 139
983
  },
984
  {
985
+ "epoch": 0.22328548644338117,
986
+ "grad_norm": 12.209519386291504,
987
+ "learning_rate": 0.00012800000000000002,
988
+ "loss": 1.6414,
989
  "step": 140
990
  },
991
  {
992
+ "epoch": 0.22488038277511962,
993
+ "grad_norm": 8.834230422973633,
994
+ "learning_rate": 0.000126,
995
+ "loss": 1.1292,
996
  "step": 141
997
  },
998
  {
999
+ "epoch": 0.22647527910685805,
1000
+ "grad_norm": 3.4640305042266846,
1001
+ "learning_rate": 0.000124,
1002
+ "loss": 1.3858,
1003
  "step": 142
1004
  },
1005
  {
1006
+ "epoch": 0.22807017543859648,
1007
+ "grad_norm": 5.367834091186523,
1008
+ "learning_rate": 0.000122,
1009
+ "loss": 1.4735,
1010
  "step": 143
1011
  },
1012
  {
1013
+ "epoch": 0.22966507177033493,
1014
+ "grad_norm": 3.5475995540618896,
1015
+ "learning_rate": 0.00012,
1016
+ "loss": 1.8889,
1017
  "step": 144
1018
  },
1019
  {
1020
+ "epoch": 0.23125996810207336,
1021
+ "grad_norm": 4.0117621421813965,
1022
+ "learning_rate": 0.000118,
1023
+ "loss": 0.6609,
1024
  "step": 145
1025
  },
1026
  {
1027
+ "epoch": 0.23285486443381181,
1028
+ "grad_norm": 2.641918420791626,
1029
+ "learning_rate": 0.000116,
1030
+ "loss": 1.439,
1031
  "step": 146
1032
  },
1033
  {
1034
+ "epoch": 0.23444976076555024,
1035
+ "grad_norm": 24.166297912597656,
1036
+ "learning_rate": 0.00011399999999999999,
1037
+ "loss": 1.5829,
1038
  "step": 147
1039
  },
1040
  {
1041
+ "epoch": 0.23604465709728867,
1042
+ "grad_norm": 12.261051177978516,
1043
+ "learning_rate": 0.00011200000000000001,
1044
+ "loss": 2.6259,
1045
  "step": 148
1046
  },
1047
  {
1048
+ "epoch": 0.23763955342902712,
1049
+ "grad_norm": 4.844594955444336,
1050
+ "learning_rate": 0.00011000000000000002,
1051
+ "loss": 2.2162,
1052
  "step": 149
1053
  },
1054
  {
1055
+ "epoch": 0.23923444976076555,
1056
+ "grad_norm": 6.697972297668457,
1057
+ "learning_rate": 0.00010800000000000001,
1058
+ "loss": 2.7383,
1059
  "step": 150
1060
  },
1061
  {
1062
+ "epoch": 0.24082934609250398,
1063
+ "grad_norm": 4.018860340118408,
1064
+ "learning_rate": 0.00010600000000000002,
1065
+ "loss": 2.4861,
1066
  "step": 151
1067
  },
1068
  {
1069
+ "epoch": 0.24242424242424243,
1070
+ "grad_norm": 7.205791473388672,
1071
+ "learning_rate": 0.00010400000000000001,
1072
+ "loss": 1.4837,
1073
  "step": 152
1074
  },
1075
  {
1076
+ "epoch": 0.24401913875598086,
1077
+ "grad_norm": 8.773712158203125,
1078
+ "learning_rate": 0.00010200000000000001,
1079
+ "loss": 1.6268,
1080
  "step": 153
1081
  },
1082
  {
1083
+ "epoch": 0.24561403508771928,
1084
+ "grad_norm": 4.254454135894775,
1085
+ "learning_rate": 0.0001,
1086
+ "loss": 1.6405,
1087
  "step": 154
1088
  },
1089
  {
1090
+ "epoch": 0.24720893141945774,
1091
+ "grad_norm": 4.130655288696289,
1092
+ "learning_rate": 9.8e-05,
1093
+ "loss": 0.9554,
1094
  "step": 155
1095
  },
1096
  {
1097
+ "epoch": 0.24880382775119617,
1098
+ "grad_norm": 7.5216169357299805,
1099
+ "learning_rate": 9.6e-05,
1100
+ "loss": 1.3912,
1101
  "step": 156
1102
  },
1103
  {
1104
+ "epoch": 0.2503987240829346,
1105
+ "grad_norm": 70.0249252319336,
1106
+ "learning_rate": 9.4e-05,
1107
+ "loss": 1.8041,
1108
  "step": 157
1109
  },
1110
  {
1111
+ "epoch": 0.25199362041467305,
1112
+ "grad_norm": 3.9001526832580566,
1113
+ "learning_rate": 9.200000000000001e-05,
1114
+ "loss": 1.7109,
1115
  "step": 158
1116
  },
1117
  {
1118
+ "epoch": 0.2535885167464115,
1119
+ "grad_norm": 5.0965094566345215,
1120
+ "learning_rate": 9e-05,
1121
+ "loss": 1.0314,
1122
  "step": 159
1123
  },
1124
  {
1125
+ "epoch": 0.2551834130781499,
1126
+ "grad_norm": 3.0462706089019775,
1127
+ "learning_rate": 8.800000000000001e-05,
1128
+ "loss": 1.3541,
1129
  "step": 160
1130
  },
1131
  {
1132
+ "epoch": 0.2567783094098884,
1133
+ "grad_norm": 2.8334619998931885,
1134
+ "learning_rate": 8.6e-05,
1135
+ "loss": 1.9827,
1136
  "step": 161
1137
  },
1138
  {
1139
+ "epoch": 0.2583732057416268,
1140
+ "grad_norm": 5.991235256195068,
1141
+ "learning_rate": 8.4e-05,
1142
+ "loss": 1.4319,
1143
  "step": 162
1144
  },
1145
  {
1146
+ "epoch": 0.25996810207336524,
1147
+ "grad_norm": 6.401395320892334,
1148
+ "learning_rate": 8.2e-05,
1149
+ "loss": 1.912,
1150
  "step": 163
1151
  },
1152
  {
1153
+ "epoch": 0.26156299840510366,
1154
+ "grad_norm": 3.0269479751586914,
1155
+ "learning_rate": 8e-05,
1156
+ "loss": 1.5637,
1157
  "step": 164
1158
  },
1159
  {
1160
+ "epoch": 0.2631578947368421,
1161
+ "grad_norm": 10.380670547485352,
1162
+ "learning_rate": 7.800000000000001e-05,
1163
+ "loss": 1.8585,
1164
  "step": 165
1165
  },
1166
  {
1167
+ "epoch": 0.2647527910685805,
1168
+ "grad_norm": 7.375792503356934,
1169
+ "learning_rate": 7.6e-05,
1170
+ "loss": 1.8773,
1171
  "step": 166
1172
  },
1173
  {
1174
+ "epoch": 0.266347687400319,
1175
+ "grad_norm": 3.75229811668396,
1176
+ "learning_rate": 7.4e-05,
1177
+ "loss": 1.9936,
1178
  "step": 167
1179
  },
1180
  {
1181
+ "epoch": 0.2679425837320574,
1182
+ "grad_norm": 4.587918281555176,
1183
+ "learning_rate": 7.2e-05,
1184
+ "loss": 1.0402,
1185
  "step": 168
1186
  },
1187
  {
1188
+ "epoch": 0.26953748006379585,
1189
+ "grad_norm": 4.558103561401367,
1190
+ "learning_rate": 7e-05,
1191
+ "loss": 2.0408,
1192
  "step": 169
1193
  },
1194
  {
1195
+ "epoch": 0.2711323763955343,
1196
+ "grad_norm": 6.410488605499268,
1197
+ "learning_rate": 6.800000000000001e-05,
1198
+ "loss": 1.448,
1199
  "step": 170
1200
  },
1201
  {
1202
+ "epoch": 0.2727272727272727,
1203
+ "grad_norm": 9.21990966796875,
1204
+ "learning_rate": 6.6e-05,
1205
+ "loss": 0.963,
1206
  "step": 171
1207
  },
1208
  {
1209
+ "epoch": 0.2743221690590112,
1210
+ "grad_norm": 10.446578025817871,
1211
+ "learning_rate": 6.400000000000001e-05,
1212
+ "loss": 1.4381,
1213
  "step": 172
1214
  },
1215
  {
1216
+ "epoch": 0.2759170653907496,
1217
+ "grad_norm": 13.717737197875977,
1218
+ "learning_rate": 6.2e-05,
1219
+ "loss": 1.2965,
1220
  "step": 173
1221
  },
1222
  {
1223
+ "epoch": 0.27751196172248804,
1224
+ "grad_norm": 36.117401123046875,
1225
+ "learning_rate": 6e-05,
1226
+ "loss": 1.2144,
1227
  "step": 174
1228
  },
1229
  {
1230
+ "epoch": 0.27910685805422647,
1231
+ "grad_norm": 8.741774559020996,
1232
+ "learning_rate": 5.8e-05,
1233
+ "loss": 1.5059,
1234
  "step": 175
1235
  },
1236
  {
1237
+ "epoch": 0.2807017543859649,
1238
+ "grad_norm": 3.040428400039673,
1239
+ "learning_rate": 5.6000000000000006e-05,
1240
+ "loss": 1.4881,
1241
  "step": 176
1242
  },
1243
  {
1244
+ "epoch": 0.2822966507177033,
1245
+ "grad_norm": 7.401648044586182,
1246
+ "learning_rate": 5.4000000000000005e-05,
1247
+ "loss": 0.7462,
1248
  "step": 177
1249
  },
1250
  {
1251
+ "epoch": 0.2838915470494418,
1252
+ "grad_norm": 3.3486616611480713,
1253
+ "learning_rate": 5.2000000000000004e-05,
1254
+ "loss": 0.8773,
1255
  "step": 178
1256
  },
1257
  {
1258
+ "epoch": 0.28548644338118023,
1259
+ "grad_norm": 9.647512435913086,
1260
+ "learning_rate": 5e-05,
1261
+ "loss": 1.3147,
1262
  "step": 179
1263
  },
1264
  {
1265
+ "epoch": 0.28708133971291866,
1266
+ "grad_norm": 4.068251132965088,
1267
+ "learning_rate": 4.8e-05,
1268
+ "loss": 1.4796,
1269
  "step": 180
1270
  },
1271
  {
1272
+ "epoch": 0.2886762360446571,
1273
+ "grad_norm": 5.119231700897217,
1274
+ "learning_rate": 4.600000000000001e-05,
1275
+ "loss": 1.2465,
1276
  "step": 181
1277
  },
1278
  {
1279
+ "epoch": 0.2902711323763955,
1280
+ "grad_norm": 11.434249877929688,
1281
+ "learning_rate": 4.4000000000000006e-05,
1282
+ "loss": 1.0839,
1283
  "step": 182
1284
  },
1285
  {
1286
+ "epoch": 0.291866028708134,
1287
+ "grad_norm": 2.6633126735687256,
1288
+ "learning_rate": 4.2e-05,
1289
+ "loss": 1.1653,
1290
  "step": 183
1291
  },
1292
  {
1293
+ "epoch": 0.2934609250398724,
1294
+ "grad_norm": 8.366178512573242,
1295
+ "learning_rate": 4e-05,
1296
+ "loss": 1.8623,
1297
  "step": 184
1298
  },
1299
  {
1300
+ "epoch": 0.29505582137161085,
1301
+ "grad_norm": 13.434002876281738,
1302
+ "learning_rate": 3.8e-05,
1303
+ "loss": 1.4409,
1304
  "step": 185
1305
  },
1306
  {
1307
+ "epoch": 0.2966507177033493,
1308
+ "grad_norm": 3.535421371459961,
1309
+ "learning_rate": 3.6e-05,
1310
+ "loss": 1.2367,
1311
  "step": 186
1312
  },
1313
  {
1314
+ "epoch": 0.2982456140350877,
1315
+ "grad_norm": 5.332886219024658,
1316
+ "learning_rate": 3.4000000000000007e-05,
1317
+ "loss": 1.4744,
1318
  "step": 187
1319
  },
1320
  {
1321
+ "epoch": 0.29984051036682613,
1322
+ "grad_norm": 3.7560222148895264,
1323
+ "learning_rate": 3.2000000000000005e-05,
1324
+ "loss": 1.3489,
1325
  "step": 188
1326
  },
1327
  {
1328
+ "epoch": 0.3014354066985646,
1329
+ "grad_norm": 4.513449668884277,
1330
+ "learning_rate": 3e-05,
1331
+ "loss": 1.3738,
1332
  "step": 189
1333
  },
1334
  {
1335
+ "epoch": 0.30303030303030304,
1336
+ "grad_norm": 7.969113826751709,
1337
+ "learning_rate": 2.8000000000000003e-05,
1338
+ "loss": 1.7352,
1339
  "step": 190
1340
  },
1341
  {
1342
+ "epoch": 0.30462519936204147,
1343
+ "grad_norm": 4.11577033996582,
1344
+ "learning_rate": 2.6000000000000002e-05,
1345
+ "loss": 1.6171,
1346
  "step": 191
1347
  },
1348
  {
1349
+ "epoch": 0.3062200956937799,
1350
+ "grad_norm": 4.727830410003662,
1351
+ "learning_rate": 2.4e-05,
1352
+ "loss": 1.3359,
1353
  "step": 192
1354
  },
1355
  {
1356
+ "epoch": 0.3078149920255183,
1357
+ "grad_norm": 1.7988307476043701,
1358
+ "learning_rate": 2.2000000000000003e-05,
1359
+ "loss": 0.834,
1360
  "step": 193
1361
  },
1362
  {
1363
+ "epoch": 0.3094098883572568,
1364
+ "grad_norm": 4.455323696136475,
1365
+ "learning_rate": 2e-05,
1366
+ "loss": 1.2367,
1367
  "step": 194
1368
  },
1369
  {
1370
+ "epoch": 0.31100478468899523,
1371
+ "grad_norm": 5.334070682525635,
1372
+ "learning_rate": 1.8e-05,
1373
+ "loss": 1.0581,
1374
  "step": 195
1375
  },
1376
  {
1377
+ "epoch": 0.31259968102073366,
1378
+ "grad_norm": 4.967501640319824,
1379
+ "learning_rate": 1.6000000000000003e-05,
1380
+ "loss": 2.2129,
1381
  "step": 196
1382
  },
1383
  {
1384
+ "epoch": 0.3141945773524721,
1385
+ "grad_norm": 4.0120673179626465,
1386
+ "learning_rate": 1.4000000000000001e-05,
1387
+ "loss": 1.2043,
1388
  "step": 197
1389
  },
1390
  {
1391
+ "epoch": 0.3157894736842105,
1392
+ "grad_norm": 4.491064548492432,
1393
+ "learning_rate": 1.2e-05,
1394
+ "loss": 1.3383,
1395
  "step": 198
1396
  },
1397
  {
1398
+ "epoch": 0.31738437001594894,
1399
+ "grad_norm": 3.6377949714660645,
1400
+ "learning_rate": 1e-05,
1401
+ "loss": 1.2878,
1402
  "step": 199
1403
  },
1404
  {
1405
+ "epoch": 0.3189792663476874,
1406
+ "grad_norm": 3.7990877628326416,
1407
+ "learning_rate": 8.000000000000001e-06,
1408
+ "loss": 1.7178,
1409
  "step": 200
1410
  }
1411
  ],
 
1426
  "attributes": {}
1427
  }
1428
  },
1429
+ "total_flos": 1717079281606656.0,
1430
  "train_batch_size": 1,
1431
  "trial_name": null,
1432
  "trial_params": null
checkpoint-200/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02e9962458f016a1c07dc5280d7007cfd14653c05542b51edd26eefb09ce3f00
3
  size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0882660315b6d44be153a534ef331a1a652a772eed227ca85ccc42c14c629577
3
  size 5176
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02e9962458f016a1c07dc5280d7007cfd14653c05542b51edd26eefb09ce3f00
3
  size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0882660315b6d44be153a534ef331a1a652a772eed227ca85ccc42c14c629577
3
  size 5176