eformat commited on
Commit
5bfeb6e
1 Parent(s): f66ac36

Upload folder using huggingface_hub

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a09f72564127e675a61ec7d58c2b23ee0225211e1d406d4e469fe6bd74a8dc1
3
  size 34100216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c58239a67307473e58bbc09292e8bf457bccbf3d8f093413f2a9983160f1373
3
  size 34100216
checkpoint-200/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a09f72564127e675a61ec7d58c2b23ee0225211e1d406d4e469fe6bd74a8dc1
3
  size 34100216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c58239a67307473e58bbc09292e8bf457bccbf3d8f093413f2a9983160f1373
3
  size 34100216
checkpoint-200/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:442b87a1767f1aa22edbdbe9e6f4ca2c59b5587a8346c755af8d0ae24e68f1ba
3
  size 68292346
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93a1f6aa5183962874fd62e0d358e0f0ea27e6a68a0535d07f990842e7013963
3
  size 68292346
checkpoint-200/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6db1565fdf085bc8560b9b4bdd5f55abb40db42feac17284b294419663599c75
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04745946c21cb6d9e7af44a9e436e4207c8437de1201d76c48a7771e0e0f3bad
3
  size 14244
checkpoint-200/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a39755222377151208b988e4341f8c8d9958c119c2f9cf7ee718109a8d7880b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5fcb2e0acbfc451125ca5cd950e0420d092eee3eeb6dd5b682202b7a14cd22e
3
  size 1064
checkpoint-200/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.15891934843067143,
5
  "eval_steps": 500,
6
  "global_step": 200,
7
  "is_hyper_param_search": false,
@@ -9,1403 +9,1403 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0007945967421533572,
13
- "grad_norm": NaN,
14
- "learning_rate": 0.0,
15
- "loss": 2.9992,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.0015891934843067143,
20
- "grad_norm": 2.3542397022247314,
21
- "learning_rate": 2.0000000000000003e-06,
22
- "loss": 2.4211,
23
  "step": 2
24
  },
25
  {
26
- "epoch": 0.0023837902264600714,
27
- "grad_norm": 1.7234137058258057,
28
- "learning_rate": 4.000000000000001e-06,
29
- "loss": 2.4802,
30
  "step": 3
31
  },
32
  {
33
- "epoch": 0.0031783869686134287,
34
- "grad_norm": 1.8402104377746582,
35
- "learning_rate": 6e-06,
36
- "loss": 2.5004,
37
  "step": 4
38
  },
39
  {
40
- "epoch": 0.003972983710766786,
41
- "grad_norm": 1.9447062015533447,
42
- "learning_rate": 8.000000000000001e-06,
43
- "loss": 2.5601,
44
  "step": 5
45
  },
46
  {
47
- "epoch": 0.004767580452920143,
48
- "grad_norm": 1.6480785608291626,
49
- "learning_rate": 1e-05,
50
- "loss": 2.3849,
51
  "step": 6
52
  },
53
  {
54
- "epoch": 0.0055621771950735005,
55
- "grad_norm": 8.942625999450684,
56
- "learning_rate": 1.2e-05,
57
- "loss": 2.9789,
58
  "step": 7
59
  },
60
  {
61
- "epoch": 0.006356773937226857,
62
- "grad_norm": 4.155797958374023,
63
- "learning_rate": 1.4000000000000001e-05,
64
- "loss": 2.6198,
65
  "step": 8
66
  },
67
  {
68
- "epoch": 0.007151370679380214,
69
- "grad_norm": 1.6816856861114502,
70
- "learning_rate": 1.6000000000000003e-05,
71
- "loss": 2.4154,
72
  "step": 9
73
  },
74
  {
75
- "epoch": 0.007945967421533572,
76
- "grad_norm": 2.22641921043396,
77
- "learning_rate": 1.8e-05,
78
- "loss": 2.5419,
79
  "step": 10
80
  },
81
  {
82
- "epoch": 0.00874056416368693,
83
- "grad_norm": 8.44351577758789,
84
- "learning_rate": 2e-05,
85
- "loss": 2.2415,
86
  "step": 11
87
  },
88
  {
89
- "epoch": 0.009535160905840286,
90
- "grad_norm": 1.797033667564392,
91
- "learning_rate": 2.2000000000000003e-05,
92
- "loss": 2.4743,
93
  "step": 12
94
  },
95
  {
96
- "epoch": 0.010329757647993643,
97
- "grad_norm": 2.222118616104126,
98
- "learning_rate": 2.4e-05,
99
- "loss": 2.3,
100
  "step": 13
101
  },
102
  {
103
- "epoch": 0.011124354390147001,
104
- "grad_norm": 2.0043039321899414,
105
- "learning_rate": 2.6000000000000002e-05,
106
- "loss": 2.1184,
107
  "step": 14
108
  },
109
  {
110
- "epoch": 0.011918951132300357,
111
- "grad_norm": 2.0246055126190186,
112
- "learning_rate": 2.8000000000000003e-05,
113
- "loss": 2.159,
114
  "step": 15
115
  },
116
  {
117
- "epoch": 0.012713547874453715,
118
- "grad_norm": 9.5096435546875,
119
- "learning_rate": 3e-05,
120
- "loss": 3.0276,
121
  "step": 16
122
  },
123
  {
124
- "epoch": 0.013508144616607072,
125
- "grad_norm": 2.858501672744751,
126
- "learning_rate": 3.2000000000000005e-05,
127
- "loss": 2.4969,
128
  "step": 17
129
  },
130
  {
131
- "epoch": 0.014302741358760428,
132
- "grad_norm": 10.519010543823242,
133
- "learning_rate": 3.4000000000000007e-05,
134
- "loss": 2.9221,
135
  "step": 18
136
  },
137
  {
138
- "epoch": 0.015097338100913786,
139
- "grad_norm": 2.304163932800293,
140
- "learning_rate": 3.6e-05,
141
- "loss": 2.4857,
142
  "step": 19
143
  },
144
  {
145
- "epoch": 0.015891934843067144,
146
- "grad_norm": 2.059422731399536,
147
- "learning_rate": 3.8e-05,
148
- "loss": 2.2824,
149
  "step": 20
150
  },
151
  {
152
- "epoch": 0.0166865315852205,
153
- "grad_norm": 2.584183692932129,
154
- "learning_rate": 4e-05,
155
- "loss": 2.2176,
156
  "step": 21
157
  },
158
  {
159
- "epoch": 0.01748112832737386,
160
- "grad_norm": 3.7771871089935303,
161
- "learning_rate": 4.2e-05,
162
- "loss": 2.9221,
163
  "step": 22
164
  },
165
  {
166
- "epoch": 0.018275725069527213,
167
- "grad_norm": 1.889650583267212,
168
- "learning_rate": 4.4000000000000006e-05,
169
- "loss": 2.3082,
170
  "step": 23
171
  },
172
  {
173
- "epoch": 0.01907032181168057,
174
- "grad_norm": 3.319291353225708,
175
- "learning_rate": 4.600000000000001e-05,
176
- "loss": 2.19,
177
  "step": 24
178
  },
179
  {
180
- "epoch": 0.01986491855383393,
181
- "grad_norm": 4.1281352043151855,
182
- "learning_rate": 4.8e-05,
183
- "loss": 2.1096,
184
  "step": 25
185
  },
186
  {
187
- "epoch": 0.020659515295987287,
188
- "grad_norm": 2.4024288654327393,
189
- "learning_rate": 5e-05,
190
- "loss": 2.208,
191
  "step": 26
192
  },
193
  {
194
- "epoch": 0.021454112038140644,
195
- "grad_norm": 15.567670822143555,
196
- "learning_rate": 5.2000000000000004e-05,
197
- "loss": 2.4704,
198
  "step": 27
199
  },
200
  {
201
- "epoch": 0.022248708780294002,
202
- "grad_norm": 2.64872145652771,
203
- "learning_rate": 5.4000000000000005e-05,
204
- "loss": 2.1112,
205
  "step": 28
206
  },
207
  {
208
- "epoch": 0.023043305522447356,
209
- "grad_norm": 6.033721446990967,
210
- "learning_rate": 5.6000000000000006e-05,
211
- "loss": 2.7668,
212
  "step": 29
213
  },
214
  {
215
- "epoch": 0.023837902264600714,
216
- "grad_norm": 2.7915563583374023,
217
- "learning_rate": 5.8e-05,
218
- "loss": 2.1103,
219
  "step": 30
220
  },
221
  {
222
- "epoch": 0.02463249900675407,
223
- "grad_norm": 2.611234664916992,
224
- "learning_rate": 6e-05,
225
- "loss": 2.038,
226
  "step": 31
227
  },
228
  {
229
- "epoch": 0.02542709574890743,
230
- "grad_norm": 2.3000996112823486,
231
- "learning_rate": 6.2e-05,
232
- "loss": 1.6268,
233
  "step": 32
234
  },
235
  {
236
- "epoch": 0.026221692491060787,
237
- "grad_norm": 3.713061571121216,
238
- "learning_rate": 6.400000000000001e-05,
239
- "loss": 1.9121,
240
  "step": 33
241
  },
242
  {
243
- "epoch": 0.027016289233214145,
244
- "grad_norm": 2.776019811630249,
245
- "learning_rate": 6.6e-05,
246
- "loss": 1.817,
247
  "step": 34
248
  },
249
  {
250
- "epoch": 0.027810885975367503,
251
- "grad_norm": 2.564723491668701,
252
- "learning_rate": 6.800000000000001e-05,
253
- "loss": 1.9355,
254
  "step": 35
255
  },
256
  {
257
- "epoch": 0.028605482717520857,
258
- "grad_norm": 6.116189002990723,
259
- "learning_rate": 7e-05,
260
- "loss": 1.7766,
261
  "step": 36
262
  },
263
  {
264
- "epoch": 0.029400079459674214,
265
- "grad_norm": 2.6253204345703125,
266
- "learning_rate": 7.2e-05,
267
- "loss": 1.7565,
268
  "step": 37
269
  },
270
  {
271
- "epoch": 0.030194676201827572,
272
- "grad_norm": 2.706721544265747,
273
- "learning_rate": 7.4e-05,
274
- "loss": 1.9456,
275
  "step": 38
276
  },
277
  {
278
- "epoch": 0.03098927294398093,
279
- "grad_norm": 2.7343411445617676,
280
- "learning_rate": 7.6e-05,
281
- "loss": 1.6768,
282
  "step": 39
283
  },
284
  {
285
- "epoch": 0.03178386968613429,
286
- "grad_norm": 2.1730287075042725,
287
- "learning_rate": 7.800000000000001e-05,
288
- "loss": 1.4658,
289
  "step": 40
290
  },
291
  {
292
- "epoch": 0.03257846642828764,
293
- "grad_norm": 2.681889295578003,
294
- "learning_rate": 8e-05,
295
- "loss": 1.5583,
296
  "step": 41
297
  },
298
  {
299
- "epoch": 0.033373063170441,
300
- "grad_norm": 2.6841020584106445,
301
- "learning_rate": 8.2e-05,
302
- "loss": 1.6734,
303
  "step": 42
304
  },
305
  {
306
- "epoch": 0.03416765991259436,
307
- "grad_norm": 3.1267893314361572,
308
- "learning_rate": 8.4e-05,
309
- "loss": 1.6114,
310
  "step": 43
311
  },
312
  {
313
- "epoch": 0.03496225665474772,
314
- "grad_norm": 3.066925525665283,
315
- "learning_rate": 8.6e-05,
316
- "loss": 1.558,
317
  "step": 44
318
  },
319
  {
320
- "epoch": 0.03575685339690107,
321
- "grad_norm": 2.6647801399230957,
322
- "learning_rate": 8.800000000000001e-05,
323
- "loss": 1.4592,
324
  "step": 45
325
  },
326
  {
327
- "epoch": 0.03655145013905443,
328
- "grad_norm": 2.6715352535247803,
329
- "learning_rate": 9e-05,
330
- "loss": 1.4846,
331
  "step": 46
332
  },
333
  {
334
- "epoch": 0.03734604688120779,
335
- "grad_norm": 8.9569091796875,
336
- "learning_rate": 9.200000000000001e-05,
337
- "loss": 1.7494,
338
  "step": 47
339
  },
340
  {
341
- "epoch": 0.03814064362336114,
342
- "grad_norm": 2.7488503456115723,
343
- "learning_rate": 9.4e-05,
344
- "loss": 1.2653,
345
  "step": 48
346
  },
347
  {
348
- "epoch": 0.038935240365514504,
349
- "grad_norm": 2.520211696624756,
350
- "learning_rate": 9.6e-05,
351
- "loss": 1.2965,
352
  "step": 49
353
  },
354
  {
355
- "epoch": 0.03972983710766786,
356
- "grad_norm": 2.678025960922241,
357
- "learning_rate": 9.8e-05,
358
- "loss": 1.199,
359
  "step": 50
360
  },
361
  {
362
- "epoch": 0.04052443384982122,
363
- "grad_norm": 2.6799986362457275,
364
- "learning_rate": 0.0001,
365
- "loss": 1.2672,
366
  "step": 51
367
  },
368
  {
369
- "epoch": 0.04131903059197457,
370
- "grad_norm": 2.907582998275757,
371
- "learning_rate": 0.00010200000000000001,
372
- "loss": 1.3114,
373
  "step": 52
374
  },
375
  {
376
- "epoch": 0.04211362733412793,
377
- "grad_norm": 2.7622241973876953,
378
- "learning_rate": 0.00010400000000000001,
379
- "loss": 1.2173,
380
  "step": 53
381
  },
382
  {
383
- "epoch": 0.04290822407628129,
384
- "grad_norm": 3.829127550125122,
385
- "learning_rate": 0.00010600000000000002,
386
- "loss": 1.2657,
387
  "step": 54
388
  },
389
  {
390
- "epoch": 0.04370282081843464,
391
- "grad_norm": 4.464277267456055,
392
- "learning_rate": 0.00010800000000000001,
393
- "loss": 1.3206,
394
  "step": 55
395
  },
396
  {
397
- "epoch": 0.044497417560588004,
398
- "grad_norm": 2.5400142669677734,
399
- "learning_rate": 0.00011000000000000002,
400
- "loss": 1.1097,
401
  "step": 56
402
  },
403
  {
404
- "epoch": 0.04529201430274136,
405
- "grad_norm": 2.6496193408966064,
406
- "learning_rate": 0.00011200000000000001,
407
- "loss": 1.097,
408
  "step": 57
409
  },
410
  {
411
- "epoch": 0.04608661104489471,
412
- "grad_norm": 4.215746879577637,
413
  "learning_rate": 0.00011399999999999999,
414
- "loss": 1.1642,
415
  "step": 58
416
  },
417
  {
418
- "epoch": 0.046881207787048074,
419
- "grad_norm": 2.93621826171875,
420
  "learning_rate": 0.000116,
421
- "loss": 1.2385,
422
  "step": 59
423
  },
424
  {
425
- "epoch": 0.04767580452920143,
426
- "grad_norm": 10.44421672821045,
427
  "learning_rate": 0.000118,
428
- "loss": 1.1682,
429
  "step": 60
430
  },
431
  {
432
- "epoch": 0.04847040127135479,
433
- "grad_norm": 3.617307424545288,
434
  "learning_rate": 0.00012,
435
- "loss": 1.3271,
436
  "step": 61
437
  },
438
  {
439
- "epoch": 0.04926499801350814,
440
- "grad_norm": 2.5892326831817627,
441
  "learning_rate": 0.000122,
442
- "loss": 0.9714,
443
  "step": 62
444
  },
445
  {
446
- "epoch": 0.050059594755661505,
447
- "grad_norm": NaN,
448
- "learning_rate": 0.000122,
449
- "loss": 1.2764,
450
  "step": 63
451
  },
452
  {
453
- "epoch": 0.05085419149781486,
454
- "grad_norm": 2.9196863174438477,
455
- "learning_rate": 0.000124,
456
- "loss": 1.1406,
457
  "step": 64
458
  },
459
  {
460
- "epoch": 0.05164878823996821,
461
- "grad_norm": 4.006603717803955,
462
- "learning_rate": 0.000126,
463
- "loss": 0.9797,
464
  "step": 65
465
  },
466
  {
467
- "epoch": 0.052443384982121574,
468
- "grad_norm": 2.5575106143951416,
469
- "learning_rate": 0.00012800000000000002,
470
- "loss": 1.2092,
471
  "step": 66
472
  },
473
  {
474
- "epoch": 0.05323798172427493,
475
- "grad_norm": 3.1613786220550537,
476
- "learning_rate": 0.00013000000000000002,
477
- "loss": 1.2598,
478
  "step": 67
479
  },
480
  {
481
- "epoch": 0.05403257846642829,
482
- "grad_norm": 2.542789936065674,
483
- "learning_rate": 0.000132,
484
- "loss": 0.9497,
485
  "step": 68
486
  },
487
  {
488
- "epoch": 0.054827175208581644,
489
- "grad_norm": 2.4051780700683594,
490
- "learning_rate": 0.000134,
491
- "loss": 0.7992,
492
  "step": 69
493
  },
494
  {
495
- "epoch": 0.055621771950735005,
496
- "grad_norm": 51.893409729003906,
497
- "learning_rate": 0.00013600000000000003,
498
- "loss": 0.9683,
499
  "step": 70
500
  },
501
  {
502
- "epoch": 0.05641636869288836,
503
- "grad_norm": 4.105207920074463,
504
- "learning_rate": 0.000138,
505
- "loss": 1.0815,
506
  "step": 71
507
  },
508
  {
509
- "epoch": 0.057210965435041714,
510
- "grad_norm": 92.10639190673828,
511
- "learning_rate": 0.00014,
512
- "loss": 1.1767,
513
  "step": 72
514
  },
515
  {
516
- "epoch": 0.058005562177195075,
517
- "grad_norm": 2.4499878883361816,
518
- "learning_rate": 0.000142,
519
- "loss": 0.8912,
520
  "step": 73
521
  },
522
  {
523
- "epoch": 0.05880015891934843,
524
- "grad_norm": 2.4928042888641357,
525
- "learning_rate": 0.000144,
526
- "loss": 0.9457,
527
  "step": 74
528
  },
529
  {
530
- "epoch": 0.05959475566150179,
531
- "grad_norm": 2.761523485183716,
532
- "learning_rate": 0.000146,
533
- "loss": 0.9493,
534
  "step": 75
535
  },
536
  {
537
- "epoch": 0.060389352403655144,
538
- "grad_norm": 2.9320006370544434,
539
- "learning_rate": 0.000148,
540
- "loss": 0.9657,
541
  "step": 76
542
  },
543
  {
544
- "epoch": 0.061183949145808506,
545
- "grad_norm": 2.067227840423584,
546
- "learning_rate": 0.00015000000000000001,
547
- "loss": 0.8115,
548
  "step": 77
549
  },
550
  {
551
- "epoch": 0.06197854588796186,
552
- "grad_norm": 78.78492736816406,
553
- "learning_rate": 0.000152,
554
- "loss": 1.4676,
555
  "step": 78
556
  },
557
  {
558
- "epoch": 0.06277314263011521,
559
- "grad_norm": 2.335451364517212,
560
- "learning_rate": 0.000154,
561
- "loss": 0.9271,
562
  "step": 79
563
  },
564
  {
565
- "epoch": 0.06356773937226858,
566
- "grad_norm": 3.434222459793091,
567
- "learning_rate": 0.00015600000000000002,
568
- "loss": 0.8938,
569
  "step": 80
570
  },
571
  {
572
- "epoch": 0.06436233611442194,
573
- "grad_norm": 2.38944673538208,
574
  "learning_rate": 0.00015800000000000002,
575
- "loss": 0.9277,
576
  "step": 81
577
  },
578
  {
579
- "epoch": 0.06515693285657528,
580
- "grad_norm": 2.960742712020874,
581
  "learning_rate": 0.00016,
582
- "loss": 1.1237,
583
  "step": 82
584
  },
585
  {
586
- "epoch": 0.06595152959872864,
587
- "grad_norm": 2.1033873558044434,
588
  "learning_rate": 0.000162,
589
- "loss": 0.8095,
590
  "step": 83
591
  },
592
  {
593
- "epoch": 0.066746126340882,
594
- "grad_norm": 2.611531972885132,
595
  "learning_rate": 0.000164,
596
- "loss": 0.927,
597
  "step": 84
598
  },
599
  {
600
- "epoch": 0.06754072308303535,
601
- "grad_norm": 2.9163801670074463,
602
  "learning_rate": 0.000166,
603
- "loss": 0.8363,
604
  "step": 85
605
  },
606
  {
607
- "epoch": 0.06833531982518871,
608
- "grad_norm": 2.6492106914520264,
609
  "learning_rate": 0.000168,
610
- "loss": 0.9831,
611
  "step": 86
612
  },
613
  {
614
- "epoch": 0.06912991656734208,
615
- "grad_norm": 2.027588367462158,
616
  "learning_rate": 0.00017,
617
- "loss": 0.8676,
618
  "step": 87
619
  },
620
  {
621
- "epoch": 0.06992451330949544,
622
- "grad_norm": 2.7688043117523193,
623
  "learning_rate": 0.000172,
624
- "loss": 1.1266,
625
  "step": 88
626
  },
627
  {
628
- "epoch": 0.07071911005164878,
629
- "grad_norm": 2.4787685871124268,
630
  "learning_rate": 0.000174,
631
- "loss": 0.8832,
632
  "step": 89
633
  },
634
  {
635
- "epoch": 0.07151370679380215,
636
- "grad_norm": 5.127519130706787,
637
  "learning_rate": 0.00017600000000000002,
638
- "loss": 0.8945,
639
  "step": 90
640
  },
641
  {
642
- "epoch": 0.0723083035359555,
643
- "grad_norm": 3.4531705379486084,
644
  "learning_rate": 0.00017800000000000002,
645
- "loss": 1.1205,
646
  "step": 91
647
  },
648
  {
649
- "epoch": 0.07310290027810885,
650
- "grad_norm": 2.749129056930542,
651
  "learning_rate": 0.00018,
652
- "loss": 1.019,
653
  "step": 92
654
  },
655
  {
656
- "epoch": 0.07389749702026222,
657
- "grad_norm": Infinity,
658
- "learning_rate": 0.00018,
659
- "loss": 1.3832,
660
  "step": 93
661
  },
662
  {
663
- "epoch": 0.07469209376241558,
664
- "grad_norm": 2.6057441234588623,
665
- "learning_rate": 0.000182,
666
- "loss": 1.1318,
667
  "step": 94
668
  },
669
  {
670
- "epoch": 0.07548669050456894,
671
- "grad_norm": 4.503351211547852,
672
- "learning_rate": 0.00018400000000000003,
673
- "loss": 1.033,
674
  "step": 95
675
  },
676
  {
677
- "epoch": 0.07628128724672228,
678
- "grad_norm": 2.7054409980773926,
679
- "learning_rate": 0.00018600000000000002,
680
- "loss": 0.9466,
681
  "step": 96
682
  },
683
  {
684
- "epoch": 0.07707588398887565,
685
- "grad_norm": 2.1828339099884033,
686
- "learning_rate": 0.000188,
687
- "loss": 0.8253,
688
  "step": 97
689
  },
690
  {
691
- "epoch": 0.07787048073102901,
692
- "grad_norm": 2.062911033630371,
693
- "learning_rate": 0.00019,
694
- "loss": 0.8997,
695
  "step": 98
696
  },
697
  {
698
- "epoch": 0.07866507747318235,
699
- "grad_norm": 2.1747360229492188,
700
- "learning_rate": 0.000192,
701
- "loss": 0.928,
702
  "step": 99
703
  },
704
  {
705
- "epoch": 0.07945967421533572,
706
- "grad_norm": 2.3616063594818115,
707
- "learning_rate": 0.000194,
708
- "loss": 0.8595,
709
  "step": 100
710
  },
711
  {
712
- "epoch": 0.08025427095748908,
713
- "grad_norm": 2.007453203201294,
714
- "learning_rate": 0.000196,
715
- "loss": 0.9071,
716
  "step": 101
717
  },
718
  {
719
- "epoch": 0.08104886769964244,
720
- "grad_norm": 2.822878837585449,
721
- "learning_rate": 0.00019800000000000002,
722
- "loss": 1.1856,
723
  "step": 102
724
  },
725
  {
726
- "epoch": 0.08184346444179579,
727
- "grad_norm": 2.162837505340576,
728
- "learning_rate": 0.0002,
729
- "loss": 1.0198,
730
  "step": 103
731
  },
732
  {
733
- "epoch": 0.08263806118394915,
734
- "grad_norm": 2.004056453704834,
735
- "learning_rate": 0.00019800000000000002,
736
- "loss": 0.8352,
737
  "step": 104
738
  },
739
  {
740
- "epoch": 0.08343265792610251,
741
- "grad_norm": 2.5484063625335693,
742
- "learning_rate": 0.000196,
743
- "loss": 1.0226,
744
  "step": 105
745
  },
746
  {
747
- "epoch": 0.08422725466825585,
748
- "grad_norm": 2.2759130001068115,
749
- "learning_rate": 0.000194,
750
- "loss": 0.8884,
751
  "step": 106
752
  },
753
  {
754
- "epoch": 0.08502185141040922,
755
- "grad_norm": 2.4136946201324463,
756
- "learning_rate": 0.000192,
757
- "loss": 0.8374,
758
  "step": 107
759
  },
760
  {
761
- "epoch": 0.08581644815256258,
762
- "grad_norm": 2.8566195964813232,
763
- "learning_rate": 0.00019,
764
- "loss": 0.9112,
765
  "step": 108
766
  },
767
  {
768
- "epoch": 0.08661104489471594,
769
- "grad_norm": 2.241708755493164,
770
- "learning_rate": 0.000188,
771
- "loss": 0.8695,
772
  "step": 109
773
  },
774
  {
775
- "epoch": 0.08740564163686929,
776
- "grad_norm": 2.1674909591674805,
777
- "learning_rate": 0.00018600000000000002,
778
- "loss": 0.708,
779
  "step": 110
780
  },
781
  {
782
- "epoch": 0.08820023837902265,
783
- "grad_norm": 3.16097092628479,
784
- "learning_rate": 0.00018400000000000003,
785
- "loss": 1.0946,
786
  "step": 111
787
  },
788
  {
789
- "epoch": 0.08899483512117601,
790
- "grad_norm": 2.7531492710113525,
791
- "learning_rate": 0.000182,
792
- "loss": 0.8907,
793
  "step": 112
794
  },
795
  {
796
- "epoch": 0.08978943186332936,
797
- "grad_norm": 2.3830761909484863,
798
- "learning_rate": 0.00018,
799
- "loss": 0.9243,
800
  "step": 113
801
  },
802
  {
803
- "epoch": 0.09058402860548272,
804
- "grad_norm": 27.347421646118164,
805
- "learning_rate": 0.00017800000000000002,
806
- "loss": 1.0328,
807
  "step": 114
808
  },
809
  {
810
- "epoch": 0.09137862534763608,
811
- "grad_norm": 2.1043972969055176,
812
- "learning_rate": 0.00017600000000000002,
813
- "loss": 0.9195,
814
  "step": 115
815
  },
816
  {
817
- "epoch": 0.09217322208978942,
818
- "grad_norm": 1.9787133932113647,
819
- "learning_rate": 0.000174,
820
- "loss": 0.7395,
821
  "step": 116
822
  },
823
  {
824
- "epoch": 0.09296781883194279,
825
- "grad_norm": 2.395308256149292,
826
- "learning_rate": 0.000172,
827
- "loss": 1.0193,
828
  "step": 117
829
  },
830
  {
831
- "epoch": 0.09376241557409615,
832
- "grad_norm": 1.9864846467971802,
833
- "learning_rate": 0.00017,
834
- "loss": 0.948,
835
  "step": 118
836
  },
837
  {
838
- "epoch": 0.09455701231624951,
839
- "grad_norm": 2.823315382003784,
840
- "learning_rate": 0.000168,
841
- "loss": 1.0902,
842
  "step": 119
843
  },
844
  {
845
- "epoch": 0.09535160905840286,
846
- "grad_norm": 2.5823678970336914,
847
- "learning_rate": 0.000166,
848
- "loss": 1.0905,
849
  "step": 120
850
  },
851
  {
852
- "epoch": 0.09614620580055622,
853
- "grad_norm": 1.8948626518249512,
854
- "learning_rate": 0.000164,
855
- "loss": 0.9203,
856
  "step": 121
857
  },
858
  {
859
- "epoch": 0.09694080254270958,
860
- "grad_norm": 1.9721623659133911,
861
- "learning_rate": 0.000162,
862
- "loss": 0.9591,
863
  "step": 122
864
  },
865
  {
866
- "epoch": 0.09773539928486293,
867
- "grad_norm": 1.9776030778884888,
868
- "learning_rate": 0.00016,
869
- "loss": 0.9887,
870
  "step": 123
871
  },
872
  {
873
- "epoch": 0.09852999602701629,
874
- "grad_norm": 2.8131155967712402,
875
- "learning_rate": 0.00015800000000000002,
876
- "loss": 1.0197,
877
  "step": 124
878
  },
879
  {
880
- "epoch": 0.09932459276916965,
881
- "grad_norm": 2.0642547607421875,
882
- "learning_rate": 0.00015600000000000002,
883
- "loss": 0.9931,
884
  "step": 125
885
  },
886
  {
887
- "epoch": 0.10011918951132301,
888
- "grad_norm": 2.9410674571990967,
889
- "learning_rate": 0.000154,
890
- "loss": 1.3742,
891
  "step": 126
892
  },
893
  {
894
- "epoch": 0.10091378625347636,
895
- "grad_norm": 2.1221704483032227,
896
- "learning_rate": 0.000152,
897
- "loss": 1.0526,
898
  "step": 127
899
  },
900
  {
901
- "epoch": 0.10170838299562972,
902
- "grad_norm": 2.831902503967285,
903
- "learning_rate": 0.00015000000000000001,
904
- "loss": 0.8048,
905
  "step": 128
906
  },
907
  {
908
- "epoch": 0.10250297973778308,
909
- "grad_norm": 2.066681146621704,
910
- "learning_rate": 0.000148,
911
- "loss": 0.985,
912
  "step": 129
913
  },
914
  {
915
- "epoch": 0.10329757647993643,
916
- "grad_norm": 1.8252568244934082,
917
- "learning_rate": 0.000146,
918
- "loss": 0.7689,
919
  "step": 130
920
  },
921
  {
922
- "epoch": 0.10409217322208979,
923
- "grad_norm": 2.0231878757476807,
924
- "learning_rate": 0.000144,
925
- "loss": 1.0247,
926
  "step": 131
927
  },
928
  {
929
- "epoch": 0.10488676996424315,
930
- "grad_norm": 2.200442314147949,
931
- "learning_rate": 0.000142,
932
- "loss": 0.8775,
933
  "step": 132
934
  },
935
  {
936
- "epoch": 0.10568136670639651,
937
- "grad_norm": 1.7937211990356445,
938
- "learning_rate": 0.00014,
939
- "loss": 0.9357,
940
  "step": 133
941
  },
942
  {
943
- "epoch": 0.10647596344854986,
944
- "grad_norm": 1.685763955116272,
945
- "learning_rate": 0.000138,
946
- "loss": 0.8664,
947
  "step": 134
948
  },
949
  {
950
- "epoch": 0.10727056019070322,
951
- "grad_norm": 1.9033604860305786,
952
- "learning_rate": 0.00013600000000000003,
953
- "loss": 0.9793,
954
  "step": 135
955
  },
956
  {
957
- "epoch": 0.10806515693285658,
958
- "grad_norm": 1.8052781820297241,
959
- "learning_rate": 0.000134,
960
- "loss": 0.8154,
961
  "step": 136
962
  },
963
  {
964
- "epoch": 0.10885975367500993,
965
- "grad_norm": 2.400908946990967,
966
- "learning_rate": 0.000132,
967
- "loss": 0.9604,
968
  "step": 137
969
  },
970
  {
971
- "epoch": 0.10965435041716329,
972
- "grad_norm": 1.7619413137435913,
973
- "learning_rate": 0.00013000000000000002,
974
- "loss": 0.8187,
975
  "step": 138
976
  },
977
  {
978
- "epoch": 0.11044894715931665,
979
- "grad_norm": 1.8535854816436768,
980
- "learning_rate": 0.00012800000000000002,
981
- "loss": 0.9487,
982
  "step": 139
983
  },
984
  {
985
- "epoch": 0.11124354390147001,
986
- "grad_norm": 1.8936333656311035,
987
- "learning_rate": 0.000126,
988
- "loss": 0.9852,
989
  "step": 140
990
  },
991
  {
992
- "epoch": 0.11203814064362336,
993
- "grad_norm": 1.821515440940857,
994
- "learning_rate": 0.000124,
995
- "loss": 0.8442,
996
  "step": 141
997
  },
998
  {
999
- "epoch": 0.11283273738577672,
1000
- "grad_norm": 2.1713123321533203,
1001
- "learning_rate": 0.000122,
1002
- "loss": 1.1723,
1003
  "step": 142
1004
  },
1005
  {
1006
- "epoch": 0.11362733412793008,
1007
- "grad_norm": 1.6912201642990112,
1008
- "learning_rate": 0.00012,
1009
- "loss": 0.7486,
1010
  "step": 143
1011
  },
1012
  {
1013
- "epoch": 0.11442193087008343,
1014
- "grad_norm": 1.630624532699585,
1015
- "learning_rate": 0.000118,
1016
- "loss": 0.7462,
1017
  "step": 144
1018
  },
1019
  {
1020
- "epoch": 0.11521652761223679,
1021
- "grad_norm": 2.0852651596069336,
1022
- "learning_rate": 0.000116,
1023
- "loss": 0.7821,
1024
  "step": 145
1025
  },
1026
  {
1027
- "epoch": 0.11601112435439015,
1028
- "grad_norm": 1.5113400220870972,
1029
- "learning_rate": 0.00011399999999999999,
1030
- "loss": 0.7768,
1031
  "step": 146
1032
  },
1033
  {
1034
- "epoch": 0.11680572109654351,
1035
- "grad_norm": 1.9536205530166626,
1036
- "learning_rate": 0.00011200000000000001,
1037
- "loss": 0.9478,
1038
  "step": 147
1039
  },
1040
  {
1041
- "epoch": 0.11760031783869686,
1042
- "grad_norm": 1.8963311910629272,
1043
- "learning_rate": 0.00011000000000000002,
1044
- "loss": 0.9085,
1045
  "step": 148
1046
  },
1047
  {
1048
- "epoch": 0.11839491458085022,
1049
- "grad_norm": 1.8368561267852783,
1050
- "learning_rate": 0.00010800000000000001,
1051
- "loss": 0.757,
1052
  "step": 149
1053
  },
1054
  {
1055
- "epoch": 0.11918951132300358,
1056
- "grad_norm": 2.7951648235321045,
1057
- "learning_rate": 0.00010600000000000002,
1058
- "loss": 1.0677,
1059
  "step": 150
1060
  },
1061
  {
1062
- "epoch": 0.11998410806515693,
1063
- "grad_norm": 2.015962839126587,
1064
- "learning_rate": 0.00010400000000000001,
1065
- "loss": 0.7874,
1066
  "step": 151
1067
  },
1068
  {
1069
- "epoch": 0.12077870480731029,
1070
- "grad_norm": 2.661062479019165,
1071
- "learning_rate": 0.00010200000000000001,
1072
- "loss": 0.9311,
1073
  "step": 152
1074
  },
1075
  {
1076
- "epoch": 0.12157330154946365,
1077
- "grad_norm": 2.020232677459717,
1078
- "learning_rate": 0.0001,
1079
- "loss": 0.7958,
1080
  "step": 153
1081
  },
1082
  {
1083
- "epoch": 0.12236789829161701,
1084
- "grad_norm": 2.5150179862976074,
1085
- "learning_rate": 9.8e-05,
1086
- "loss": 0.8354,
1087
  "step": 154
1088
  },
1089
  {
1090
- "epoch": 0.12316249503377036,
1091
- "grad_norm": 1.65029776096344,
1092
- "learning_rate": 9.6e-05,
1093
- "loss": 0.8512,
1094
  "step": 155
1095
  },
1096
  {
1097
- "epoch": 0.12395709177592372,
1098
- "grad_norm": 2.004103183746338,
1099
- "learning_rate": 9.4e-05,
1100
- "loss": 0.7002,
1101
  "step": 156
1102
  },
1103
  {
1104
- "epoch": 0.12475168851807708,
1105
- "grad_norm": 2.198091506958008,
1106
- "learning_rate": 9.200000000000001e-05,
1107
- "loss": 0.8539,
1108
  "step": 157
1109
  },
1110
  {
1111
- "epoch": 0.12554628526023043,
1112
- "grad_norm": 2.1458988189697266,
1113
- "learning_rate": 9e-05,
1114
- "loss": 1.1292,
1115
  "step": 158
1116
  },
1117
  {
1118
- "epoch": 0.1263408820023838,
1119
- "grad_norm": 1.7759689092636108,
1120
- "learning_rate": 8.800000000000001e-05,
1121
- "loss": 0.8792,
1122
  "step": 159
1123
  },
1124
  {
1125
- "epoch": 0.12713547874453715,
1126
- "grad_norm": 2.0215320587158203,
1127
- "learning_rate": 8.6e-05,
1128
- "loss": 0.9206,
1129
  "step": 160
1130
  },
1131
  {
1132
- "epoch": 0.1279300754866905,
1133
- "grad_norm": 2.324572801589966,
1134
- "learning_rate": 8.4e-05,
1135
- "loss": 1.0041,
1136
  "step": 161
1137
  },
1138
  {
1139
- "epoch": 0.12872467222884387,
1140
- "grad_norm": 2.3949570655822754,
1141
- "learning_rate": 8.2e-05,
1142
- "loss": 0.7894,
1143
  "step": 162
1144
  },
1145
  {
1146
- "epoch": 0.12951926897099722,
1147
- "grad_norm": 1.8736156225204468,
1148
- "learning_rate": 8e-05,
1149
- "loss": 0.8928,
1150
  "step": 163
1151
  },
1152
  {
1153
- "epoch": 0.13031386571315057,
1154
- "grad_norm": 2.373562812805176,
1155
- "learning_rate": 7.800000000000001e-05,
1156
- "loss": 0.8739,
1157
  "step": 164
1158
  },
1159
  {
1160
- "epoch": 0.13110846245530394,
1161
- "grad_norm": 1.6126917600631714,
1162
- "learning_rate": 7.6e-05,
1163
- "loss": 0.6292,
1164
  "step": 165
1165
  },
1166
  {
1167
- "epoch": 0.1319030591974573,
1168
- "grad_norm": 2.6151697635650635,
1169
- "learning_rate": 7.4e-05,
1170
- "loss": 1.2176,
1171
  "step": 166
1172
  },
1173
  {
1174
- "epoch": 0.13269765593961064,
1175
- "grad_norm": 1.883436918258667,
1176
- "learning_rate": 7.2e-05,
1177
- "loss": 0.7664,
1178
  "step": 167
1179
  },
1180
  {
1181
- "epoch": 0.133492252681764,
1182
- "grad_norm": 2.2091007232666016,
1183
- "learning_rate": 7e-05,
1184
- "loss": 0.9501,
1185
  "step": 168
1186
  },
1187
  {
1188
- "epoch": 0.13428684942391736,
1189
- "grad_norm": 1.903889775276184,
1190
- "learning_rate": 6.800000000000001e-05,
1191
- "loss": 0.9322,
1192
  "step": 169
1193
  },
1194
  {
1195
- "epoch": 0.1350814461660707,
1196
- "grad_norm": 1.9040536880493164,
1197
- "learning_rate": 6.6e-05,
1198
- "loss": 0.851,
1199
  "step": 170
1200
  },
1201
  {
1202
- "epoch": 0.13587604290822408,
1203
- "grad_norm": 1.7239336967468262,
1204
- "learning_rate": 6.400000000000001e-05,
1205
- "loss": 0.9033,
1206
  "step": 171
1207
  },
1208
  {
1209
- "epoch": 0.13667063965037743,
1210
- "grad_norm": 3.3629558086395264,
1211
- "learning_rate": 6.2e-05,
1212
- "loss": 0.918,
1213
  "step": 172
1214
  },
1215
  {
1216
- "epoch": 0.1374652363925308,
1217
- "grad_norm": 1.9918290376663208,
1218
- "learning_rate": 6e-05,
1219
- "loss": 0.8825,
1220
  "step": 173
1221
  },
1222
  {
1223
- "epoch": 0.13825983313468415,
1224
- "grad_norm": 2.3540871143341064,
1225
- "learning_rate": 5.8e-05,
1226
- "loss": 0.6484,
1227
  "step": 174
1228
  },
1229
  {
1230
- "epoch": 0.1390544298768375,
1231
- "grad_norm": 2.2806777954101562,
1232
- "learning_rate": 5.6000000000000006e-05,
1233
- "loss": 0.7137,
1234
  "step": 175
1235
  },
1236
  {
1237
- "epoch": 0.13984902661899087,
1238
- "grad_norm": 2.21384334564209,
1239
- "learning_rate": 5.4000000000000005e-05,
1240
- "loss": 0.5965,
1241
  "step": 176
1242
  },
1243
  {
1244
- "epoch": 0.14064362336114422,
1245
- "grad_norm": 1.9977487325668335,
1246
- "learning_rate": 5.2000000000000004e-05,
1247
- "loss": 0.7671,
1248
  "step": 177
1249
  },
1250
  {
1251
- "epoch": 0.14143822010329757,
1252
- "grad_norm": 31.215999603271484,
1253
- "learning_rate": 5e-05,
1254
- "loss": 0.9116,
1255
  "step": 178
1256
  },
1257
  {
1258
- "epoch": 0.14223281684545094,
1259
- "grad_norm": 2.2257730960845947,
1260
- "learning_rate": 4.8e-05,
1261
- "loss": 0.8838,
1262
  "step": 179
1263
  },
1264
  {
1265
- "epoch": 0.1430274135876043,
1266
- "grad_norm": 2.9200854301452637,
1267
- "learning_rate": 4.600000000000001e-05,
1268
- "loss": 0.9615,
1269
  "step": 180
1270
  },
1271
  {
1272
- "epoch": 0.14382201032975764,
1273
- "grad_norm": 2.2704315185546875,
1274
- "learning_rate": 4.4000000000000006e-05,
1275
- "loss": 1.0132,
1276
  "step": 181
1277
  },
1278
  {
1279
- "epoch": 0.144616607071911,
1280
- "grad_norm": 2.058189868927002,
1281
- "learning_rate": 4.2e-05,
1282
- "loss": 0.8201,
1283
  "step": 182
1284
  },
1285
  {
1286
- "epoch": 0.14541120381406436,
1287
- "grad_norm": 1.7903186082839966,
1288
- "learning_rate": 4e-05,
1289
- "loss": 0.7128,
1290
  "step": 183
1291
  },
1292
  {
1293
- "epoch": 0.1462058005562177,
1294
- "grad_norm": 1.9608381986618042,
1295
- "learning_rate": 3.8e-05,
1296
- "loss": 0.8236,
1297
  "step": 184
1298
  },
1299
  {
1300
- "epoch": 0.14700039729837108,
1301
- "grad_norm": 2.1937220096588135,
1302
- "learning_rate": 3.6e-05,
1303
- "loss": 0.8442,
1304
  "step": 185
1305
  },
1306
  {
1307
- "epoch": 0.14779499404052443,
1308
- "grad_norm": 2.047407627105713,
1309
- "learning_rate": 3.4000000000000007e-05,
1310
- "loss": 0.966,
1311
  "step": 186
1312
  },
1313
  {
1314
- "epoch": 0.14858959078267778,
1315
- "grad_norm": 2.0216119289398193,
1316
- "learning_rate": 3.2000000000000005e-05,
1317
- "loss": 0.9564,
1318
  "step": 187
1319
  },
1320
  {
1321
- "epoch": 0.14938418752483115,
1322
- "grad_norm": 1.629753589630127,
1323
- "learning_rate": 3e-05,
1324
- "loss": 0.6504,
1325
  "step": 188
1326
  },
1327
  {
1328
- "epoch": 0.1501787842669845,
1329
- "grad_norm": 1.7381154298782349,
1330
- "learning_rate": 2.8000000000000003e-05,
1331
- "loss": 0.8346,
1332
  "step": 189
1333
  },
1334
  {
1335
- "epoch": 0.15097338100913787,
1336
- "grad_norm": 2.1300833225250244,
1337
- "learning_rate": 2.6000000000000002e-05,
1338
- "loss": 0.8631,
1339
  "step": 190
1340
  },
1341
  {
1342
- "epoch": 0.15176797775129122,
1343
- "grad_norm": 1.8977206945419312,
1344
- "learning_rate": 2.4e-05,
1345
- "loss": 0.8653,
1346
  "step": 191
1347
  },
1348
  {
1349
- "epoch": 0.15256257449344457,
1350
- "grad_norm": 1.7362009286880493,
1351
- "learning_rate": 2.2000000000000003e-05,
1352
- "loss": 0.8278,
1353
  "step": 192
1354
  },
1355
  {
1356
- "epoch": 0.15335717123559794,
1357
- "grad_norm": 1.838982105255127,
1358
- "learning_rate": 2e-05,
1359
- "loss": 0.6521,
1360
  "step": 193
1361
  },
1362
  {
1363
- "epoch": 0.1541517679777513,
1364
- "grad_norm": 2.4685189723968506,
1365
- "learning_rate": 1.8e-05,
1366
- "loss": 0.9639,
1367
  "step": 194
1368
  },
1369
  {
1370
- "epoch": 0.15494636471990464,
1371
- "grad_norm": 1.8960801362991333,
1372
- "learning_rate": 1.6000000000000003e-05,
1373
- "loss": 0.7338,
1374
  "step": 195
1375
  },
1376
  {
1377
- "epoch": 0.15574096146205801,
1378
- "grad_norm": 2.139636993408203,
1379
- "learning_rate": 1.4000000000000001e-05,
1380
- "loss": 0.6774,
1381
  "step": 196
1382
  },
1383
  {
1384
- "epoch": 0.15653555820421136,
1385
- "grad_norm": 2.0349552631378174,
1386
- "learning_rate": 1.2e-05,
1387
- "loss": 0.8435,
1388
  "step": 197
1389
  },
1390
  {
1391
- "epoch": 0.1573301549463647,
1392
- "grad_norm": 1.5157055854797363,
1393
- "learning_rate": 1e-05,
1394
- "loss": 0.7087,
1395
  "step": 198
1396
  },
1397
  {
1398
- "epoch": 0.15812475168851808,
1399
- "grad_norm": 2.498692035675049,
1400
- "learning_rate": 8.000000000000001e-06,
1401
- "loss": 1.0298,
1402
  "step": 199
1403
  },
1404
  {
1405
- "epoch": 0.15891934843067143,
1406
- "grad_norm": 1.7967320680618286,
1407
- "learning_rate": 6e-06,
1408
- "loss": 0.7717,
1409
  "step": 200
1410
  }
1411
  ],
@@ -1426,7 +1426,7 @@
1426
  "attributes": {}
1427
  }
1428
  },
1429
- "total_flos": 2473157095243776.0,
1430
  "train_batch_size": 1,
1431
  "trial_name": null,
1432
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.00485074853113271,
5
  "eval_steps": 500,
6
  "global_step": 200,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 2.4253742655663552e-05,
13
+ "grad_norm": 2.9237160682678223,
14
+ "learning_rate": 2.0000000000000003e-06,
15
+ "loss": 1.4192,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 4.8507485311327104e-05,
20
+ "grad_norm": 8.286487579345703,
21
+ "learning_rate": 4.000000000000001e-06,
22
+ "loss": 1.7523,
23
  "step": 2
24
  },
25
  {
26
+ "epoch": 7.276122796699065e-05,
27
+ "grad_norm": 1.957672119140625,
28
+ "learning_rate": 6e-06,
29
+ "loss": 1.9875,
30
  "step": 3
31
  },
32
  {
33
+ "epoch": 9.701497062265421e-05,
34
+ "grad_norm": 0.5736268758773804,
35
+ "learning_rate": 8.000000000000001e-06,
36
+ "loss": 1.427,
37
  "step": 4
38
  },
39
  {
40
+ "epoch": 0.00012126871327831776,
41
+ "grad_norm": 0.7024741172790527,
42
+ "learning_rate": 1e-05,
43
+ "loss": 1.4541,
44
  "step": 5
45
  },
46
  {
47
+ "epoch": 0.0001455224559339813,
48
+ "grad_norm": 6.171888828277588,
49
+ "learning_rate": 1.2e-05,
50
+ "loss": 1.9617,
51
  "step": 6
52
  },
53
  {
54
+ "epoch": 0.00016977619858964487,
55
+ "grad_norm": 1.4335829019546509,
56
+ "learning_rate": 1.4000000000000001e-05,
57
+ "loss": 1.6549,
58
  "step": 7
59
  },
60
  {
61
+ "epoch": 0.00019402994124530841,
62
+ "grad_norm": 0.8052822947502136,
63
+ "learning_rate": 1.6000000000000003e-05,
64
+ "loss": 1.681,
65
  "step": 8
66
  },
67
  {
68
+ "epoch": 0.00021828368390097196,
69
+ "grad_norm": 1.0054032802581787,
70
+ "learning_rate": 1.8e-05,
71
+ "loss": 1.7113,
72
  "step": 9
73
  },
74
  {
75
+ "epoch": 0.00024253742655663553,
76
+ "grad_norm": 2.378899097442627,
77
+ "learning_rate": 2e-05,
78
+ "loss": 2.0225,
79
  "step": 10
80
  },
81
  {
82
+ "epoch": 0.00026679116921229907,
83
+ "grad_norm": 12.526110649108887,
84
+ "learning_rate": 2.2000000000000003e-05,
85
+ "loss": 2.3219,
86
  "step": 11
87
  },
88
  {
89
+ "epoch": 0.0002910449118679626,
90
+ "grad_norm": 1.658133864402771,
91
+ "learning_rate": 2.4e-05,
92
+ "loss": 1.5336,
93
  "step": 12
94
  },
95
  {
96
+ "epoch": 0.00031529865452362615,
97
+ "grad_norm": 18.127710342407227,
98
+ "learning_rate": 2.6000000000000002e-05,
99
+ "loss": 2.1609,
100
  "step": 13
101
  },
102
  {
103
+ "epoch": 0.00033955239717928975,
104
+ "grad_norm": 0.7923838496208191,
105
+ "learning_rate": 2.8000000000000003e-05,
106
+ "loss": 1.7354,
107
  "step": 14
108
  },
109
  {
110
+ "epoch": 0.0003638061398349533,
111
+ "grad_norm": 1.6248362064361572,
112
+ "learning_rate": 3e-05,
113
+ "loss": 1.5774,
114
  "step": 15
115
  },
116
  {
117
+ "epoch": 0.00038805988249061683,
118
+ "grad_norm": 1.6371029615402222,
119
+ "learning_rate": 3.2000000000000005e-05,
120
+ "loss": 2.2978,
121
  "step": 16
122
  },
123
  {
124
+ "epoch": 0.00041231362514628037,
125
+ "grad_norm": 1.409232497215271,
126
+ "learning_rate": 3.4000000000000007e-05,
127
+ "loss": 1.1136,
128
  "step": 17
129
  },
130
  {
131
+ "epoch": 0.0004365673678019439,
132
+ "grad_norm": 5.593843936920166,
133
+ "learning_rate": 3.6e-05,
134
+ "loss": 2.3497,
135
  "step": 18
136
  },
137
  {
138
+ "epoch": 0.0004608211104576075,
139
+ "grad_norm": 1.9881709814071655,
140
+ "learning_rate": 3.8e-05,
141
+ "loss": 2.0849,
142
  "step": 19
143
  },
144
  {
145
+ "epoch": 0.00048507485311327105,
146
+ "grad_norm": 1.7124191522598267,
147
+ "learning_rate": 4e-05,
148
+ "loss": 1.565,
149
  "step": 20
150
  },
151
  {
152
+ "epoch": 0.0005093285957689346,
153
+ "grad_norm": 0.6472250819206238,
154
+ "learning_rate": 4.2e-05,
155
+ "loss": 1.0869,
156
  "step": 21
157
  },
158
  {
159
+ "epoch": 0.0005335823384245981,
160
+ "grad_norm": 4.816655158996582,
161
+ "learning_rate": 4.4000000000000006e-05,
162
+ "loss": 2.2343,
163
  "step": 22
164
  },
165
  {
166
+ "epoch": 0.0005578360810802617,
167
+ "grad_norm": 1.3076063394546509,
168
+ "learning_rate": 4.600000000000001e-05,
169
+ "loss": 1.8767,
170
  "step": 23
171
  },
172
  {
173
+ "epoch": 0.0005820898237359252,
174
+ "grad_norm": 1.0358009338378906,
175
+ "learning_rate": 4.8e-05,
176
+ "loss": 1.7001,
177
  "step": 24
178
  },
179
  {
180
+ "epoch": 0.0006063435663915888,
181
+ "grad_norm": 0.8611243367195129,
182
+ "learning_rate": 5e-05,
183
+ "loss": 1.0703,
184
  "step": 25
185
  },
186
  {
187
+ "epoch": 0.0006305973090472523,
188
+ "grad_norm": 1.8082849979400635,
189
+ "learning_rate": 5.2000000000000004e-05,
190
+ "loss": 2.409,
191
  "step": 26
192
  },
193
  {
194
+ "epoch": 0.000654851051702916,
195
+ "grad_norm": 2.4376935958862305,
196
+ "learning_rate": 5.4000000000000005e-05,
197
+ "loss": 1.6188,
198
  "step": 27
199
  },
200
  {
201
+ "epoch": 0.0006791047943585795,
202
+ "grad_norm": 1.7561759948730469,
203
+ "learning_rate": 5.6000000000000006e-05,
204
+ "loss": 1.8782,
205
  "step": 28
206
  },
207
  {
208
+ "epoch": 0.000703358537014243,
209
+ "grad_norm": 1.4189746379852295,
210
+ "learning_rate": 5.8e-05,
211
+ "loss": 2.1414,
212
  "step": 29
213
  },
214
  {
215
+ "epoch": 0.0007276122796699066,
216
+ "grad_norm": 1.186546802520752,
217
+ "learning_rate": 6e-05,
218
+ "loss": 1.5557,
219
  "step": 30
220
  },
221
  {
222
+ "epoch": 0.0007518660223255701,
223
+ "grad_norm": 1.365446925163269,
224
+ "learning_rate": 6.2e-05,
225
+ "loss": 1.9806,
226
  "step": 31
227
  },
228
  {
229
+ "epoch": 0.0007761197649812337,
230
+ "grad_norm": 5.6013641357421875,
231
+ "learning_rate": 6.400000000000001e-05,
232
+ "loss": 2.836,
233
  "step": 32
234
  },
235
  {
236
+ "epoch": 0.0008003735076368972,
237
+ "grad_norm": 1.5393822193145752,
238
+ "learning_rate": 6.6e-05,
239
+ "loss": 1.6833,
240
  "step": 33
241
  },
242
  {
243
+ "epoch": 0.0008246272502925607,
244
+ "grad_norm": 1.7934705018997192,
245
+ "learning_rate": 6.800000000000001e-05,
246
+ "loss": 1.6762,
247
  "step": 34
248
  },
249
  {
250
+ "epoch": 0.0008488809929482243,
251
+ "grad_norm": 1.1468597650527954,
252
+ "learning_rate": 7e-05,
253
+ "loss": 1.2819,
254
  "step": 35
255
  },
256
  {
257
+ "epoch": 0.0008731347356038878,
258
+ "grad_norm": 4.899059772491455,
259
+ "learning_rate": 7.2e-05,
260
+ "loss": 1.7406,
261
  "step": 36
262
  },
263
  {
264
+ "epoch": 0.0008973884782595515,
265
+ "grad_norm": 1.6008192300796509,
266
+ "learning_rate": 7.4e-05,
267
+ "loss": 1.5028,
268
  "step": 37
269
  },
270
  {
271
+ "epoch": 0.000921642220915215,
272
+ "grad_norm": 3.4235761165618896,
273
+ "learning_rate": 7.6e-05,
274
+ "loss": 1.6816,
275
  "step": 38
276
  },
277
  {
278
+ "epoch": 0.0009458959635708786,
279
+ "grad_norm": 1.1148329973220825,
280
+ "learning_rate": 7.800000000000001e-05,
281
+ "loss": 1.4805,
282
  "step": 39
283
  },
284
  {
285
+ "epoch": 0.0009701497062265421,
286
+ "grad_norm": 4.683089733123779,
287
+ "learning_rate": 8e-05,
288
+ "loss": 1.9679,
289
  "step": 40
290
  },
291
  {
292
+ "epoch": 0.0009944034488822055,
293
+ "grad_norm": 0.75764000415802,
294
+ "learning_rate": 8.2e-05,
295
+ "loss": 1.3406,
296
  "step": 41
297
  },
298
  {
299
+ "epoch": 0.0010186571915378692,
300
+ "grad_norm": 1.3041810989379883,
301
+ "learning_rate": 8.4e-05,
302
+ "loss": 1.4009,
303
  "step": 42
304
  },
305
  {
306
+ "epoch": 0.0010429109341935328,
307
+ "grad_norm": 1.4765511751174927,
308
+ "learning_rate": 8.6e-05,
309
+ "loss": 1.3446,
310
  "step": 43
311
  },
312
  {
313
+ "epoch": 0.0010671646768491963,
314
+ "grad_norm": 3.716846227645874,
315
+ "learning_rate": 8.800000000000001e-05,
316
+ "loss": 2.1417,
317
  "step": 44
318
  },
319
  {
320
+ "epoch": 0.00109141841950486,
321
+ "grad_norm": 1.0331724882125854,
322
+ "learning_rate": 9e-05,
323
+ "loss": 1.396,
324
  "step": 45
325
  },
326
  {
327
+ "epoch": 0.0011156721621605234,
328
+ "grad_norm": 2.426039934158325,
329
+ "learning_rate": 9.200000000000001e-05,
330
+ "loss": 1.466,
331
  "step": 46
332
  },
333
  {
334
+ "epoch": 0.001139925904816187,
335
+ "grad_norm": 1.3340977430343628,
336
+ "learning_rate": 9.4e-05,
337
+ "loss": 1.7891,
338
  "step": 47
339
  },
340
  {
341
+ "epoch": 0.0011641796474718504,
342
+ "grad_norm": 3.5191874504089355,
343
+ "learning_rate": 9.6e-05,
344
+ "loss": 1.6633,
345
  "step": 48
346
  },
347
  {
348
+ "epoch": 0.001188433390127514,
349
+ "grad_norm": 1.6775161027908325,
350
+ "learning_rate": 9.8e-05,
351
+ "loss": 1.4875,
352
  "step": 49
353
  },
354
  {
355
+ "epoch": 0.0012126871327831775,
356
+ "grad_norm": 8.406580924987793,
357
+ "learning_rate": 0.0001,
358
+ "loss": 2.1048,
359
  "step": 50
360
  },
361
  {
362
+ "epoch": 0.0012369408754388412,
363
+ "grad_norm": 6.012252330780029,
364
+ "learning_rate": 0.00010200000000000001,
365
+ "loss": 2.0742,
366
  "step": 51
367
  },
368
  {
369
+ "epoch": 0.0012611946180945046,
370
+ "grad_norm": 12.059433937072754,
371
+ "learning_rate": 0.00010400000000000001,
372
+ "loss": 1.843,
373
  "step": 52
374
  },
375
  {
376
+ "epoch": 0.0012854483607501683,
377
+ "grad_norm": 1.3695584535598755,
378
+ "learning_rate": 0.00010600000000000002,
379
+ "loss": 1.8865,
380
  "step": 53
381
  },
382
  {
383
+ "epoch": 0.001309702103405832,
384
+ "grad_norm": 2.331925630569458,
385
+ "learning_rate": 0.00010800000000000001,
386
+ "loss": 1.8662,
387
  "step": 54
388
  },
389
  {
390
+ "epoch": 0.0013339558460614953,
391
+ "grad_norm": 2.2023472785949707,
392
+ "learning_rate": 0.00011000000000000002,
393
+ "loss": 1.3941,
394
  "step": 55
395
  },
396
  {
397
+ "epoch": 0.001358209588717159,
398
+ "grad_norm": 1.971096396446228,
399
+ "learning_rate": 0.00011200000000000001,
400
+ "loss": 1.9362,
401
  "step": 56
402
  },
403
  {
404
+ "epoch": 0.0013824633313728224,
405
+ "grad_norm": 1.254345417022705,
406
+ "learning_rate": 0.00011399999999999999,
407
+ "loss": 1.5581,
408
  "step": 57
409
  },
410
  {
411
+ "epoch": 0.001406717074028486,
412
+ "grad_norm": Infinity,
413
  "learning_rate": 0.00011399999999999999,
414
+ "loss": 1.5454,
415
  "step": 58
416
  },
417
  {
418
+ "epoch": 0.0014309708166841495,
419
+ "grad_norm": 1.5801585912704468,
420
  "learning_rate": 0.000116,
421
+ "loss": 1.553,
422
  "step": 59
423
  },
424
  {
425
+ "epoch": 0.0014552245593398132,
426
+ "grad_norm": 0.9962314963340759,
427
  "learning_rate": 0.000118,
428
+ "loss": 1.4924,
429
  "step": 60
430
  },
431
  {
432
+ "epoch": 0.0014794783019954766,
433
+ "grad_norm": 5.291327476501465,
434
  "learning_rate": 0.00012,
435
+ "loss": 1.8365,
436
  "step": 61
437
  },
438
  {
439
+ "epoch": 0.0015037320446511402,
440
+ "grad_norm": 1.6945866346359253,
441
  "learning_rate": 0.000122,
442
+ "loss": 1.4683,
443
  "step": 62
444
  },
445
  {
446
+ "epoch": 0.0015279857873068039,
447
+ "grad_norm": 1.0400323867797852,
448
+ "learning_rate": 0.000124,
449
+ "loss": 0.8587,
450
  "step": 63
451
  },
452
  {
453
+ "epoch": 0.0015522395299624673,
454
+ "grad_norm": 3.5120689868927,
455
+ "learning_rate": 0.000126,
456
+ "loss": 1.2127,
457
  "step": 64
458
  },
459
  {
460
+ "epoch": 0.001576493272618131,
461
+ "grad_norm": 1.0469675064086914,
462
+ "learning_rate": 0.00012800000000000002,
463
+ "loss": 1.654,
464
  "step": 65
465
  },
466
  {
467
+ "epoch": 0.0016007470152737944,
468
+ "grad_norm": 1.5673848390579224,
469
+ "learning_rate": 0.00013000000000000002,
470
+ "loss": 1.801,
471
  "step": 66
472
  },
473
  {
474
+ "epoch": 0.001625000757929458,
475
+ "grad_norm": 2.0134799480438232,
476
+ "learning_rate": 0.000132,
477
+ "loss": 2.0432,
478
  "step": 67
479
  },
480
  {
481
+ "epoch": 0.0016492545005851215,
482
+ "grad_norm": 7.747060775756836,
483
+ "learning_rate": 0.000134,
484
+ "loss": 1.4823,
485
  "step": 68
486
  },
487
  {
488
+ "epoch": 0.0016735082432407851,
489
+ "grad_norm": 3.2787840366363525,
490
+ "learning_rate": 0.00013600000000000003,
491
+ "loss": 1.4865,
492
  "step": 69
493
  },
494
  {
495
+ "epoch": 0.0016977619858964486,
496
+ "grad_norm": 4.277163028717041,
497
+ "learning_rate": 0.000138,
498
+ "loss": 1.4665,
499
  "step": 70
500
  },
501
  {
502
+ "epoch": 0.0017220157285521122,
503
+ "grad_norm": 11.562691688537598,
504
+ "learning_rate": 0.00014,
505
+ "loss": 2.0221,
506
  "step": 71
507
  },
508
  {
509
+ "epoch": 0.0017462694712077757,
510
+ "grad_norm": 2.5653882026672363,
511
+ "learning_rate": 0.000142,
512
+ "loss": 1.4631,
513
  "step": 72
514
  },
515
  {
516
+ "epoch": 0.0017705232138634393,
517
+ "grad_norm": 2.392688751220703,
518
+ "learning_rate": 0.000144,
519
+ "loss": 1.3514,
520
  "step": 73
521
  },
522
  {
523
+ "epoch": 0.001794776956519103,
524
+ "grad_norm": 3.874115467071533,
525
+ "learning_rate": 0.000146,
526
+ "loss": 1.5983,
527
  "step": 74
528
  },
529
  {
530
+ "epoch": 0.0018190306991747664,
531
+ "grad_norm": 3.085160732269287,
532
+ "learning_rate": 0.000148,
533
+ "loss": 1.321,
534
  "step": 75
535
  },
536
  {
537
+ "epoch": 0.00184328444183043,
538
+ "grad_norm": 1.7499727010726929,
539
+ "learning_rate": 0.00015000000000000001,
540
+ "loss": 1.385,
541
  "step": 76
542
  },
543
  {
544
+ "epoch": 0.0018675381844860935,
545
+ "grad_norm": 14.051904678344727,
546
+ "learning_rate": 0.000152,
547
+ "loss": 1.5748,
548
  "step": 77
549
  },
550
  {
551
+ "epoch": 0.0018917919271417571,
552
+ "grad_norm": 4.689906597137451,
553
+ "learning_rate": 0.000154,
554
+ "loss": 1.2126,
555
  "step": 78
556
  },
557
  {
558
+ "epoch": 0.0019160456697974206,
559
+ "grad_norm": 1.0368577241897583,
560
+ "learning_rate": 0.00015600000000000002,
561
+ "loss": 1.0542,
562
  "step": 79
563
  },
564
  {
565
+ "epoch": 0.0019402994124530842,
566
+ "grad_norm": 3.577094078063965,
567
+ "learning_rate": 0.00015800000000000002,
568
+ "loss": 1.9481,
569
  "step": 80
570
  },
571
  {
572
+ "epoch": 0.001964553155108748,
573
+ "grad_norm": NaN,
574
  "learning_rate": 0.00015800000000000002,
575
+ "loss": 1.2913,
576
  "step": 81
577
  },
578
  {
579
+ "epoch": 0.001988806897764411,
580
+ "grad_norm": 4.969227313995361,
581
  "learning_rate": 0.00016,
582
+ "loss": 1.5551,
583
  "step": 82
584
  },
585
  {
586
+ "epoch": 0.0020130606404200747,
587
+ "grad_norm": 71.41371154785156,
588
  "learning_rate": 0.000162,
589
+ "loss": 1.5847,
590
  "step": 83
591
  },
592
  {
593
+ "epoch": 0.0020373143830757384,
594
+ "grad_norm": 0.9679685831069946,
595
  "learning_rate": 0.000164,
596
+ "loss": 1.1516,
597
  "step": 84
598
  },
599
  {
600
+ "epoch": 0.002061568125731402,
601
+ "grad_norm": 7.050138473510742,
602
  "learning_rate": 0.000166,
603
+ "loss": 1.6904,
604
  "step": 85
605
  },
606
  {
607
+ "epoch": 0.0020858218683870657,
608
+ "grad_norm": 1.4181028604507446,
609
  "learning_rate": 0.000168,
610
+ "loss": 1.5385,
611
  "step": 86
612
  },
613
  {
614
+ "epoch": 0.002110075611042729,
615
+ "grad_norm": 1.122887134552002,
616
  "learning_rate": 0.00017,
617
+ "loss": 1.282,
618
  "step": 87
619
  },
620
  {
621
+ "epoch": 0.0021343293536983925,
622
+ "grad_norm": 3.0532054901123047,
623
  "learning_rate": 0.000172,
624
+ "loss": 1.7126,
625
  "step": 88
626
  },
627
  {
628
+ "epoch": 0.002158583096354056,
629
+ "grad_norm": 1.1368091106414795,
630
  "learning_rate": 0.000174,
631
+ "loss": 1.3792,
632
  "step": 89
633
  },
634
  {
635
+ "epoch": 0.00218283683900972,
636
+ "grad_norm": 1.5556614398956299,
637
  "learning_rate": 0.00017600000000000002,
638
+ "loss": 1.1266,
639
  "step": 90
640
  },
641
  {
642
+ "epoch": 0.002207090581665383,
643
+ "grad_norm": 2.5969090461730957,
644
  "learning_rate": 0.00017800000000000002,
645
+ "loss": 1.1916,
646
  "step": 91
647
  },
648
  {
649
+ "epoch": 0.0022313443243210467,
650
+ "grad_norm": 7.9968719482421875,
651
  "learning_rate": 0.00018,
652
+ "loss": 2.3005,
653
  "step": 92
654
  },
655
  {
656
+ "epoch": 0.0022555980669767104,
657
+ "grad_norm": 1.0678150653839111,
658
+ "learning_rate": 0.000182,
659
+ "loss": 1.1991,
660
  "step": 93
661
  },
662
  {
663
+ "epoch": 0.002279851809632374,
664
+ "grad_norm": 6.678997993469238,
665
+ "learning_rate": 0.00018400000000000003,
666
+ "loss": 1.9749,
667
  "step": 94
668
  },
669
  {
670
+ "epoch": 0.0023041055522880377,
671
+ "grad_norm": 2.6500961780548096,
672
+ "learning_rate": 0.00018600000000000002,
673
+ "loss": 1.7569,
674
  "step": 95
675
  },
676
  {
677
+ "epoch": 0.002328359294943701,
678
+ "grad_norm": 1.547878384590149,
679
+ "learning_rate": 0.000188,
680
+ "loss": 1.7286,
681
  "step": 96
682
  },
683
  {
684
+ "epoch": 0.0023526130375993645,
685
+ "grad_norm": 1.910861849784851,
686
+ "learning_rate": 0.00019,
687
+ "loss": 2.0815,
688
  "step": 97
689
  },
690
  {
691
+ "epoch": 0.002376866780255028,
692
+ "grad_norm": 3.678144693374634,
693
+ "learning_rate": 0.000192,
694
+ "loss": 1.7011,
695
  "step": 98
696
  },
697
  {
698
+ "epoch": 0.002401120522910692,
699
+ "grad_norm": 1.6496407985687256,
700
+ "learning_rate": 0.000194,
701
+ "loss": 1.8013,
702
  "step": 99
703
  },
704
  {
705
+ "epoch": 0.002425374265566355,
706
+ "grad_norm": 1.3245445489883423,
707
+ "learning_rate": 0.000196,
708
+ "loss": 1.5302,
709
  "step": 100
710
  },
711
  {
712
+ "epoch": 0.0024496280082220187,
713
+ "grad_norm": 1.609408974647522,
714
+ "learning_rate": 0.00019800000000000002,
715
+ "loss": 1.6082,
716
  "step": 101
717
  },
718
  {
719
+ "epoch": 0.0024738817508776823,
720
+ "grad_norm": 2.7757482528686523,
721
+ "learning_rate": 0.0002,
722
+ "loss": 1.6157,
723
  "step": 102
724
  },
725
  {
726
+ "epoch": 0.002498135493533346,
727
+ "grad_norm": 2.0618674755096436,
728
+ "learning_rate": 0.00019800000000000002,
729
+ "loss": 1.7212,
730
  "step": 103
731
  },
732
  {
733
+ "epoch": 0.002522389236189009,
734
+ "grad_norm": 1.7433160543441772,
735
+ "learning_rate": 0.000196,
736
+ "loss": 1.9133,
737
  "step": 104
738
  },
739
  {
740
+ "epoch": 0.002546642978844673,
741
+ "grad_norm": 1.4754176139831543,
742
+ "learning_rate": 0.000194,
743
+ "loss": 1.3311,
744
  "step": 105
745
  },
746
  {
747
+ "epoch": 0.0025708967215003365,
748
+ "grad_norm": 1.4527512788772583,
749
+ "learning_rate": 0.000192,
750
+ "loss": 1.6432,
751
  "step": 106
752
  },
753
  {
754
+ "epoch": 0.002595150464156,
755
+ "grad_norm": 5.3099045753479,
756
+ "learning_rate": 0.00019,
757
+ "loss": 1.3233,
758
  "step": 107
759
  },
760
  {
761
+ "epoch": 0.002619404206811664,
762
+ "grad_norm": 1.016135573387146,
763
+ "learning_rate": 0.000188,
764
+ "loss": 1.3291,
765
  "step": 108
766
  },
767
  {
768
+ "epoch": 0.002643657949467327,
769
+ "grad_norm": 1.0468915700912476,
770
+ "learning_rate": 0.00018600000000000002,
771
+ "loss": 1.4454,
772
  "step": 109
773
  },
774
  {
775
+ "epoch": 0.0026679116921229907,
776
+ "grad_norm": 3.245159387588501,
777
+ "learning_rate": 0.00018400000000000003,
778
+ "loss": 1.6501,
779
  "step": 110
780
  },
781
  {
782
+ "epoch": 0.0026921654347786543,
783
+ "grad_norm": 1.1628005504608154,
784
+ "learning_rate": 0.000182,
785
+ "loss": 1.399,
786
  "step": 111
787
  },
788
  {
789
+ "epoch": 0.002716419177434318,
790
+ "grad_norm": 3.3215761184692383,
791
+ "learning_rate": 0.00018,
792
+ "loss": 1.4829,
793
  "step": 112
794
  },
795
  {
796
+ "epoch": 0.002740672920089981,
797
+ "grad_norm": 1.424054503440857,
798
+ "learning_rate": 0.00017800000000000002,
799
+ "loss": 1.7692,
800
  "step": 113
801
  },
802
  {
803
+ "epoch": 0.002764926662745645,
804
+ "grad_norm": 1.2700444459915161,
805
+ "learning_rate": 0.00017600000000000002,
806
+ "loss": 1.6959,
807
  "step": 114
808
  },
809
  {
810
+ "epoch": 0.0027891804054013085,
811
+ "grad_norm": 1.262736201286316,
812
+ "learning_rate": 0.000174,
813
+ "loss": 1.5445,
814
  "step": 115
815
  },
816
  {
817
+ "epoch": 0.002813434148056972,
818
+ "grad_norm": 4.976276397705078,
819
+ "learning_rate": 0.000172,
820
+ "loss": 1.4571,
821
  "step": 116
822
  },
823
  {
824
+ "epoch": 0.002837687890712636,
825
+ "grad_norm": 1.2716983556747437,
826
+ "learning_rate": 0.00017,
827
+ "loss": 1.217,
828
  "step": 117
829
  },
830
  {
831
+ "epoch": 0.002861941633368299,
832
+ "grad_norm": 1.5143672227859497,
833
+ "learning_rate": 0.000168,
834
+ "loss": 1.3059,
835
  "step": 118
836
  },
837
  {
838
+ "epoch": 0.0028861953760239627,
839
+ "grad_norm": 0.9907928705215454,
840
+ "learning_rate": 0.000166,
841
+ "loss": 1.8035,
842
  "step": 119
843
  },
844
  {
845
+ "epoch": 0.0029104491186796263,
846
+ "grad_norm": 1.7163773775100708,
847
+ "learning_rate": 0.000164,
848
+ "loss": 2.1635,
849
  "step": 120
850
  },
851
  {
852
+ "epoch": 0.00293470286133529,
853
+ "grad_norm": 7.744179725646973,
854
+ "learning_rate": 0.000162,
855
+ "loss": 2.2964,
856
  "step": 121
857
  },
858
  {
859
+ "epoch": 0.002958956603990953,
860
+ "grad_norm": 1.101208209991455,
861
+ "learning_rate": 0.00016,
862
+ "loss": 1.46,
863
  "step": 122
864
  },
865
  {
866
+ "epoch": 0.002983210346646617,
867
+ "grad_norm": 0.7589418888092041,
868
+ "learning_rate": 0.00015800000000000002,
869
+ "loss": 1.1036,
870
  "step": 123
871
  },
872
  {
873
+ "epoch": 0.0030074640893022805,
874
+ "grad_norm": 4.626345634460449,
875
+ "learning_rate": 0.00015600000000000002,
876
+ "loss": 1.5257,
877
  "step": 124
878
  },
879
  {
880
+ "epoch": 0.003031717831957944,
881
+ "grad_norm": 1.5994218587875366,
882
+ "learning_rate": 0.000154,
883
+ "loss": 1.2856,
884
  "step": 125
885
  },
886
  {
887
+ "epoch": 0.0030559715746136078,
888
+ "grad_norm": 1.3893097639083862,
889
+ "learning_rate": 0.000152,
890
+ "loss": 1.3359,
891
  "step": 126
892
  },
893
  {
894
+ "epoch": 0.003080225317269271,
895
+ "grad_norm": 1.0536174774169922,
896
+ "learning_rate": 0.00015000000000000001,
897
+ "loss": 1.1718,
898
  "step": 127
899
  },
900
  {
901
+ "epoch": 0.0031044790599249346,
902
+ "grad_norm": 1.2898694276809692,
903
+ "learning_rate": 0.000148,
904
+ "loss": 1.2908,
905
  "step": 128
906
  },
907
  {
908
+ "epoch": 0.0031287328025805983,
909
+ "grad_norm": 1.6943238973617554,
910
+ "learning_rate": 0.000146,
911
+ "loss": 1.7723,
912
  "step": 129
913
  },
914
  {
915
+ "epoch": 0.003152986545236262,
916
+ "grad_norm": 1.2562038898468018,
917
+ "learning_rate": 0.000144,
918
+ "loss": 1.4476,
919
  "step": 130
920
  },
921
  {
922
+ "epoch": 0.003177240287891925,
923
+ "grad_norm": 1.0170366764068604,
924
+ "learning_rate": 0.000142,
925
+ "loss": 1.2967,
926
  "step": 131
927
  },
928
  {
929
+ "epoch": 0.003201494030547589,
930
+ "grad_norm": 5.646616458892822,
931
+ "learning_rate": 0.00014,
932
+ "loss": 1.4366,
933
  "step": 132
934
  },
935
  {
936
+ "epoch": 0.0032257477732032525,
937
+ "grad_norm": 1.29356849193573,
938
+ "learning_rate": 0.000138,
939
+ "loss": 1.4713,
940
  "step": 133
941
  },
942
  {
943
+ "epoch": 0.003250001515858916,
944
+ "grad_norm": 1.3575730323791504,
945
+ "learning_rate": 0.00013600000000000003,
946
+ "loss": 1.2233,
947
  "step": 134
948
  },
949
  {
950
+ "epoch": 0.0032742552585145793,
951
+ "grad_norm": 1.158686876296997,
952
+ "learning_rate": 0.000134,
953
+ "loss": 1.3622,
954
  "step": 135
955
  },
956
  {
957
+ "epoch": 0.003298509001170243,
958
+ "grad_norm": 1.4680081605911255,
959
+ "learning_rate": 0.000132,
960
+ "loss": 1.7191,
961
  "step": 136
962
  },
963
  {
964
+ "epoch": 0.0033227627438259066,
965
+ "grad_norm": 1.2448982000350952,
966
+ "learning_rate": 0.00013000000000000002,
967
+ "loss": 1.9376,
968
  "step": 137
969
  },
970
  {
971
+ "epoch": 0.0033470164864815703,
972
+ "grad_norm": 1.2404478788375854,
973
+ "learning_rate": 0.00012800000000000002,
974
+ "loss": 1.5334,
975
  "step": 138
976
  },
977
  {
978
+ "epoch": 0.003371270229137234,
979
+ "grad_norm": 0.9862974882125854,
980
+ "learning_rate": 0.000126,
981
+ "loss": 1.2073,
982
  "step": 139
983
  },
984
  {
985
+ "epoch": 0.003395523971792897,
986
+ "grad_norm": 7.36776876449585,
987
+ "learning_rate": 0.000124,
988
+ "loss": 1.5745,
989
  "step": 140
990
  },
991
  {
992
+ "epoch": 0.003419777714448561,
993
+ "grad_norm": 1.1356145143508911,
994
+ "learning_rate": 0.000122,
995
+ "loss": 1.563,
996
  "step": 141
997
  },
998
  {
999
+ "epoch": 0.0034440314571042244,
1000
+ "grad_norm": 2.073420286178589,
1001
+ "learning_rate": 0.00012,
1002
+ "loss": 1.7438,
1003
  "step": 142
1004
  },
1005
  {
1006
+ "epoch": 0.003468285199759888,
1007
+ "grad_norm": 3.029547691345215,
1008
+ "learning_rate": 0.000118,
1009
+ "loss": 1.6755,
1010
  "step": 143
1011
  },
1012
  {
1013
+ "epoch": 0.0034925389424155513,
1014
+ "grad_norm": 14.66256046295166,
1015
+ "learning_rate": 0.000116,
1016
+ "loss": 1.8375,
1017
  "step": 144
1018
  },
1019
  {
1020
+ "epoch": 0.003516792685071215,
1021
+ "grad_norm": 0.8765383958816528,
1022
+ "learning_rate": 0.00011399999999999999,
1023
+ "loss": 0.6782,
1024
  "step": 145
1025
  },
1026
  {
1027
+ "epoch": 0.0035410464277268786,
1028
+ "grad_norm": 3.167731761932373,
1029
+ "learning_rate": 0.00011200000000000001,
1030
+ "loss": 1.4619,
1031
  "step": 146
1032
  },
1033
  {
1034
+ "epoch": 0.0035653001703825423,
1035
+ "grad_norm": 1.1016606092453003,
1036
+ "learning_rate": 0.00011000000000000002,
1037
+ "loss": 1.5817,
1038
  "step": 147
1039
  },
1040
  {
1041
+ "epoch": 0.003589553913038206,
1042
+ "grad_norm": 2.5535902976989746,
1043
+ "learning_rate": 0.00010800000000000001,
1044
+ "loss": 1.3491,
1045
  "step": 148
1046
  },
1047
  {
1048
+ "epoch": 0.003613807655693869,
1049
+ "grad_norm": 1.1920926570892334,
1050
+ "learning_rate": 0.00010600000000000002,
1051
+ "loss": 1.5675,
1052
  "step": 149
1053
  },
1054
  {
1055
+ "epoch": 0.0036380613983495328,
1056
+ "grad_norm": 1.1023300886154175,
1057
+ "learning_rate": 0.00010400000000000001,
1058
+ "loss": 1.2821,
1059
  "step": 150
1060
  },
1061
  {
1062
+ "epoch": 0.0036623151410051964,
1063
+ "grad_norm": 2.1146063804626465,
1064
+ "learning_rate": 0.00010200000000000001,
1065
+ "loss": 1.5396,
1066
  "step": 151
1067
  },
1068
  {
1069
+ "epoch": 0.00368656888366086,
1070
+ "grad_norm": 1.1838195323944092,
1071
+ "learning_rate": 0.0001,
1072
+ "loss": 1.2992,
1073
  "step": 152
1074
  },
1075
  {
1076
+ "epoch": 0.0037108226263165233,
1077
+ "grad_norm": 1.414258360862732,
1078
+ "learning_rate": 9.8e-05,
1079
+ "loss": 1.6888,
1080
  "step": 153
1081
  },
1082
  {
1083
+ "epoch": 0.003735076368972187,
1084
+ "grad_norm": 0.8630995154380798,
1085
+ "learning_rate": 9.6e-05,
1086
+ "loss": 1.141,
1087
  "step": 154
1088
  },
1089
  {
1090
+ "epoch": 0.0037593301116278506,
1091
+ "grad_norm": 2.4763197898864746,
1092
+ "learning_rate": 9.4e-05,
1093
+ "loss": 1.5743,
1094
  "step": 155
1095
  },
1096
  {
1097
+ "epoch": 0.0037835838542835142,
1098
+ "grad_norm": 0.9457703828811646,
1099
+ "learning_rate": 9.200000000000001e-05,
1100
+ "loss": 1.3879,
1101
  "step": 156
1102
  },
1103
  {
1104
+ "epoch": 0.003807837596939178,
1105
+ "grad_norm": 1.308862328529358,
1106
+ "learning_rate": 9e-05,
1107
+ "loss": 1.1564,
1108
  "step": 157
1109
  },
1110
  {
1111
+ "epoch": 0.003832091339594841,
1112
+ "grad_norm": 1.208833932876587,
1113
+ "learning_rate": 8.800000000000001e-05,
1114
+ "loss": 1.2059,
1115
  "step": 158
1116
  },
1117
  {
1118
+ "epoch": 0.0038563450822505048,
1119
+ "grad_norm": 1.9145225286483765,
1120
+ "learning_rate": 8.6e-05,
1121
+ "loss": 1.4744,
1122
  "step": 159
1123
  },
1124
  {
1125
+ "epoch": 0.0038805988249061684,
1126
+ "grad_norm": 1.0121599435806274,
1127
+ "learning_rate": 8.4e-05,
1128
+ "loss": 1.8676,
1129
  "step": 160
1130
  },
1131
  {
1132
+ "epoch": 0.003904852567561832,
1133
+ "grad_norm": 1.9128226041793823,
1134
+ "learning_rate": 8.2e-05,
1135
+ "loss": 1.6948,
1136
  "step": 161
1137
  },
1138
  {
1139
+ "epoch": 0.003929106310217496,
1140
+ "grad_norm": 1.7783511877059937,
1141
+ "learning_rate": 8e-05,
1142
+ "loss": 1.4198,
1143
  "step": 162
1144
  },
1145
  {
1146
+ "epoch": 0.003953360052873159,
1147
+ "grad_norm": 0.9799928665161133,
1148
+ "learning_rate": 7.800000000000001e-05,
1149
+ "loss": 1.185,
1150
  "step": 163
1151
  },
1152
  {
1153
+ "epoch": 0.003977613795528822,
1154
+ "grad_norm": 1.5025819540023804,
1155
+ "learning_rate": 7.6e-05,
1156
+ "loss": 1.0638,
1157
  "step": 164
1158
  },
1159
  {
1160
+ "epoch": 0.004001867538184486,
1161
+ "grad_norm": 1.2552540302276611,
1162
+ "learning_rate": 7.4e-05,
1163
+ "loss": 1.3463,
1164
  "step": 165
1165
  },
1166
  {
1167
+ "epoch": 0.0040261212808401494,
1168
+ "grad_norm": 1.0616928339004517,
1169
+ "learning_rate": 7.2e-05,
1170
+ "loss": 1.9275,
1171
  "step": 166
1172
  },
1173
  {
1174
+ "epoch": 0.004050375023495813,
1175
+ "grad_norm": 2.8277781009674072,
1176
+ "learning_rate": 7e-05,
1177
+ "loss": 1.9653,
1178
  "step": 167
1179
  },
1180
  {
1181
+ "epoch": 0.004074628766151477,
1182
+ "grad_norm": 1.7752915620803833,
1183
+ "learning_rate": 6.800000000000001e-05,
1184
+ "loss": 1.2783,
1185
  "step": 168
1186
  },
1187
  {
1188
+ "epoch": 0.00409888250880714,
1189
+ "grad_norm": 2.4863102436065674,
1190
+ "learning_rate": 6.6e-05,
1191
+ "loss": 1.2803,
1192
  "step": 169
1193
  },
1194
  {
1195
+ "epoch": 0.004123136251462804,
1196
+ "grad_norm": 0.9784297943115234,
1197
+ "learning_rate": 6.400000000000001e-05,
1198
+ "loss": 1.0347,
1199
  "step": 170
1200
  },
1201
  {
1202
+ "epoch": 0.004147389994118468,
1203
+ "grad_norm": 2.0077602863311768,
1204
+ "learning_rate": 6.2e-05,
1205
+ "loss": 1.5035,
1206
  "step": 171
1207
  },
1208
  {
1209
+ "epoch": 0.004171643736774131,
1210
+ "grad_norm": 1.5943875312805176,
1211
+ "learning_rate": 6e-05,
1212
+ "loss": 1.7834,
1213
  "step": 172
1214
  },
1215
  {
1216
+ "epoch": 0.004195897479429794,
1217
+ "grad_norm": 1.0898661613464355,
1218
+ "learning_rate": 5.8e-05,
1219
+ "loss": 1.5043,
1220
  "step": 173
1221
  },
1222
  {
1223
+ "epoch": 0.004220151222085458,
1224
+ "grad_norm": 1.8181873559951782,
1225
+ "learning_rate": 5.6000000000000006e-05,
1226
+ "loss": 1.5026,
1227
  "step": 174
1228
  },
1229
  {
1230
+ "epoch": 0.004244404964741121,
1231
+ "grad_norm": 0.9227128028869629,
1232
+ "learning_rate": 5.4000000000000005e-05,
1233
+ "loss": 1.3945,
1234
  "step": 175
1235
  },
1236
  {
1237
+ "epoch": 0.004268658707396785,
1238
+ "grad_norm": 1.3131438493728638,
1239
+ "learning_rate": 5.2000000000000004e-05,
1240
+ "loss": 1.2629,
1241
  "step": 176
1242
  },
1243
  {
1244
+ "epoch": 0.004292912450052449,
1245
+ "grad_norm": 1.388433814048767,
1246
+ "learning_rate": 5e-05,
1247
+ "loss": 1.607,
1248
  "step": 177
1249
  },
1250
  {
1251
+ "epoch": 0.004317166192708112,
1252
+ "grad_norm": 1.5145083665847778,
1253
+ "learning_rate": 4.8e-05,
1254
+ "loss": 1.8671,
1255
  "step": 178
1256
  },
1257
  {
1258
+ "epoch": 0.004341419935363776,
1259
+ "grad_norm": 2.0021004676818848,
1260
+ "learning_rate": 4.600000000000001e-05,
1261
+ "loss": 2.0679,
1262
  "step": 179
1263
  },
1264
  {
1265
+ "epoch": 0.00436567367801944,
1266
+ "grad_norm": 1.8512483835220337,
1267
+ "learning_rate": 4.4000000000000006e-05,
1268
+ "loss": 1.3095,
1269
  "step": 180
1270
  },
1271
  {
1272
+ "epoch": 0.004389927420675103,
1273
+ "grad_norm": 1.0164090394973755,
1274
+ "learning_rate": 4.2e-05,
1275
+ "loss": 1.3749,
1276
  "step": 181
1277
  },
1278
  {
1279
+ "epoch": 0.004414181163330766,
1280
+ "grad_norm": 30.730253219604492,
1281
+ "learning_rate": 4e-05,
1282
+ "loss": 2.3106,
1283
  "step": 182
1284
  },
1285
  {
1286
+ "epoch": 0.00443843490598643,
1287
+ "grad_norm": 1.2311471700668335,
1288
+ "learning_rate": 3.8e-05,
1289
+ "loss": 1.1078,
1290
  "step": 183
1291
  },
1292
  {
1293
+ "epoch": 0.004462688648642093,
1294
+ "grad_norm": 0.9555554389953613,
1295
+ "learning_rate": 3.6e-05,
1296
+ "loss": 1.4175,
1297
  "step": 184
1298
  },
1299
  {
1300
+ "epoch": 0.004486942391297757,
1301
+ "grad_norm": 1.4511969089508057,
1302
+ "learning_rate": 3.4000000000000007e-05,
1303
+ "loss": 1.4099,
1304
  "step": 185
1305
  },
1306
  {
1307
+ "epoch": 0.004511196133953421,
1308
+ "grad_norm": 5.319721221923828,
1309
+ "learning_rate": 3.2000000000000005e-05,
1310
+ "loss": 1.2749,
1311
  "step": 186
1312
  },
1313
  {
1314
+ "epoch": 0.004535449876609084,
1315
+ "grad_norm": 1.8668482303619385,
1316
+ "learning_rate": 3e-05,
1317
+ "loss": 1.4658,
1318
  "step": 187
1319
  },
1320
  {
1321
+ "epoch": 0.004559703619264748,
1322
+ "grad_norm": 1.737425684928894,
1323
+ "learning_rate": 2.8000000000000003e-05,
1324
+ "loss": 1.8142,
1325
  "step": 188
1326
  },
1327
  {
1328
+ "epoch": 0.004583957361920412,
1329
+ "grad_norm": 12.860699653625488,
1330
+ "learning_rate": 2.6000000000000002e-05,
1331
+ "loss": 1.6338,
1332
  "step": 189
1333
  },
1334
  {
1335
+ "epoch": 0.004608211104576075,
1336
+ "grad_norm": 0.6315305233001709,
1337
+ "learning_rate": 2.4e-05,
1338
+ "loss": 1.0426,
1339
  "step": 190
1340
  },
1341
  {
1342
+ "epoch": 0.004632464847231738,
1343
+ "grad_norm": 1.0747138261795044,
1344
+ "learning_rate": 2.2000000000000003e-05,
1345
+ "loss": 1.4042,
1346
  "step": 191
1347
  },
1348
  {
1349
+ "epoch": 0.004656718589887402,
1350
+ "grad_norm": 1.1410670280456543,
1351
+ "learning_rate": 2e-05,
1352
+ "loss": 1.2148,
1353
  "step": 192
1354
  },
1355
  {
1356
+ "epoch": 0.004680972332543065,
1357
+ "grad_norm": 1.4486732482910156,
1358
+ "learning_rate": 1.8e-05,
1359
+ "loss": 1.3208,
1360
  "step": 193
1361
  },
1362
  {
1363
+ "epoch": 0.004705226075198729,
1364
+ "grad_norm": 2.5336716175079346,
1365
+ "learning_rate": 1.6000000000000003e-05,
1366
+ "loss": 1.7136,
1367
  "step": 194
1368
  },
1369
  {
1370
+ "epoch": 0.004729479817854393,
1371
+ "grad_norm": 3.4056637287139893,
1372
+ "learning_rate": 1.4000000000000001e-05,
1373
+ "loss": 1.5443,
1374
  "step": 195
1375
  },
1376
  {
1377
+ "epoch": 0.004753733560510056,
1378
+ "grad_norm": 1.1721996068954468,
1379
+ "learning_rate": 1.2e-05,
1380
+ "loss": 1.4518,
1381
  "step": 196
1382
  },
1383
  {
1384
+ "epoch": 0.00477798730316572,
1385
+ "grad_norm": 1.9326874017715454,
1386
+ "learning_rate": 1e-05,
1387
+ "loss": 1.6381,
1388
  "step": 197
1389
  },
1390
  {
1391
+ "epoch": 0.004802241045821384,
1392
+ "grad_norm": 1.0234136581420898,
1393
+ "learning_rate": 8.000000000000001e-06,
1394
+ "loss": 1.4116,
1395
  "step": 198
1396
  },
1397
  {
1398
+ "epoch": 0.004826494788477047,
1399
+ "grad_norm": 2.1815526485443115,
1400
+ "learning_rate": 6e-06,
1401
+ "loss": 1.7442,
1402
  "step": 199
1403
  },
1404
  {
1405
+ "epoch": 0.00485074853113271,
1406
+ "grad_norm": 1.9557406902313232,
1407
+ "learning_rate": 4.000000000000001e-06,
1408
+ "loss": 1.9568,
1409
  "step": 200
1410
  }
1411
  ],
 
1426
  "attributes": {}
1427
  }
1428
  },
1429
+ "total_flos": 6034692870365184.0,
1430
  "train_batch_size": 1,
1431
  "trial_name": null,
1432
  "trial_params": null
checkpoint-200/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:098cc62850a9422967145f5b3c4822dcaa7f739cabac8d97cbe7d3678e81010e
3
  size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02e9962458f016a1c07dc5280d7007cfd14653c05542b51edd26eefb09ce3f00
3
  size 5176
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:098cc62850a9422967145f5b3c4822dcaa7f739cabac8d97cbe7d3678e81010e
3
  size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02e9962458f016a1c07dc5280d7007cfd14653c05542b51edd26eefb09ce3f00
3
  size 5176