PeterJinGo commited on
Commit
f20027e
1 Parent(s): 6bc4f05

Model save

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. all_results.json +4 -4
  3. train_results.json +4 -4
  4. trainer_state.json +422 -422
README.md CHANGED
@@ -56,7 +56,7 @@ The following hyperparameters were used during training:
56
 
57
  | Training Loss | Epoch | Step | Validation Loss |
58
  |:-------------:|:-----:|:----:|:---------------:|
59
- | 0.9403 | 1.0 | 1084 | 0.9415 |
60
 
61
 
62
  ### Framework versions
 
56
 
57
  | Training Loss | Epoch | Step | Validation Loss |
58
  |:-------------:|:-----:|:----:|:---------------:|
59
+ | 0.9404 | 1.0 | 1084 | 0.9415 |
60
 
61
 
62
  ### Framework versions
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 453935093514240.0,
4
- "train_loss": 0.9830407258329357,
5
- "train_runtime": 11998.8497,
6
  "train_samples": 207864,
7
- "train_samples_per_second": 11.558,
8
- "train_steps_per_second": 0.09
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 453935093514240.0,
4
+ "train_loss": 0.9826947098728476,
5
+ "train_runtime": 5906.6996,
6
  "train_samples": 207864,
7
+ "train_samples_per_second": 23.48,
8
+ "train_steps_per_second": 0.184
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 453935093514240.0,
4
- "train_loss": 0.9830407258329357,
5
- "train_runtime": 11998.8497,
6
  "train_samples": 207864,
7
- "train_samples_per_second": 11.558,
8
- "train_steps_per_second": 0.09
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 453935093514240.0,
4
+ "train_loss": 0.9826947098728476,
5
+ "train_runtime": 5906.6996,
6
  "train_samples": 207864,
7
+ "train_samples_per_second": 23.48,
8
+ "train_steps_per_second": 0.184
9
  }
trainer_state.json CHANGED
@@ -10,1539 +10,1539 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0009225092250922509,
13
- "grad_norm": 9.870575577438405,
14
  "learning_rate": 1.8348623853211012e-07,
15
  "loss": 1.1607,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.004612546125461255,
20
- "grad_norm": 7.65916956083968,
21
  "learning_rate": 9.174311926605506e-07,
22
  "loss": 1.1303,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.00922509225092251,
27
- "grad_norm": 3.7726483117427345,
28
  "learning_rate": 1.8348623853211011e-06,
29
- "loss": 1.057,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.013837638376383764,
34
- "grad_norm": 2.5379513268735505,
35
  "learning_rate": 2.7522935779816517e-06,
36
- "loss": 1.0238,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.01845018450184502,
41
- "grad_norm": 2.0372936699866466,
42
  "learning_rate": 3.6697247706422022e-06,
43
- "loss": 0.9931,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.023062730627306273,
48
- "grad_norm": 1.9015079532614876,
49
  "learning_rate": 4.587155963302753e-06,
50
- "loss": 0.9821,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.027675276752767528,
55
- "grad_norm": 2.2588401749573546,
56
  "learning_rate": 5.504587155963303e-06,
57
- "loss": 0.9798,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.03228782287822878,
62
- "grad_norm": 2.7788236168992237,
63
  "learning_rate": 6.422018348623854e-06,
64
- "loss": 0.9812,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.03690036900369004,
69
- "grad_norm": 2.0946702560307022,
70
  "learning_rate": 7.3394495412844045e-06,
71
- "loss": 0.9807,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.04151291512915129,
76
- "grad_norm": 2.5421232639973415,
77
  "learning_rate": 8.256880733944956e-06,
78
- "loss": 0.9867,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.046125461254612546,
83
- "grad_norm": 2.346175802405767,
84
  "learning_rate": 9.174311926605506e-06,
85
- "loss": 0.9966,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.0507380073800738,
90
- "grad_norm": 2.4887196180378783,
91
  "learning_rate": 1.0091743119266055e-05,
92
- "loss": 0.9908,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.055350553505535055,
97
- "grad_norm": 2.0063521735753165,
98
  "learning_rate": 1.1009174311926607e-05,
99
- "loss": 0.9924,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.05996309963099631,
104
- "grad_norm": 1.9753528261008033,
105
  "learning_rate": 1.1926605504587156e-05,
106
- "loss": 1.0028,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.06457564575645756,
111
- "grad_norm": 2.8188122479797793,
112
  "learning_rate": 1.2844036697247708e-05,
113
- "loss": 0.9874,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.06918819188191883,
118
- "grad_norm": 2.4283436777324243,
119
  "learning_rate": 1.3761467889908258e-05,
120
- "loss": 1.0038,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.07380073800738007,
125
- "grad_norm": 2.2694140697845735,
126
  "learning_rate": 1.4678899082568809e-05,
127
- "loss": 1.0162,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.07841328413284133,
132
- "grad_norm": 2.2296135287203533,
133
  "learning_rate": 1.559633027522936e-05,
134
- "loss": 1.0022,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.08302583025830258,
139
- "grad_norm": 2.496334289275322,
140
  "learning_rate": 1.6513761467889912e-05,
141
- "loss": 0.9963,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.08763837638376384,
146
- "grad_norm": 3.048918418901738,
147
  "learning_rate": 1.743119266055046e-05,
148
- "loss": 0.9874,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.09225092250922509,
153
- "grad_norm": 2.219949213385871,
154
  "learning_rate": 1.834862385321101e-05,
155
- "loss": 1.0366,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.09686346863468635,
160
- "grad_norm": 2.3152361863940305,
161
  "learning_rate": 1.9266055045871563e-05,
162
- "loss": 1.009,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.1014760147601476,
167
- "grad_norm": 2.1557669545307925,
168
  "learning_rate": 1.9999948088910656e-05,
169
- "loss": 1.0126,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.10608856088560886,
174
- "grad_norm": 2.5252882972323283,
175
  "learning_rate": 1.9998131257372878e-05,
176
- "loss": 1.0228,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.11070110701107011,
181
- "grad_norm": 1.754976914882788,
182
  "learning_rate": 1.999371941029485e-05,
183
- "loss": 1.0132,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.11531365313653137,
188
- "grad_norm": 1.8015448977018316,
189
  "learning_rate": 1.9986713692771732e-05,
190
- "loss": 1.0354,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.11992619926199262,
195
- "grad_norm": 1.8342985482903624,
196
  "learning_rate": 1.9977115923137912e-05,
197
- "loss": 1.0299,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.12453874538745388,
202
- "grad_norm": 1.7924526745821787,
203
  "learning_rate": 1.9964928592495046e-05,
204
- "loss": 1.0315,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.12915129151291513,
209
- "grad_norm": 15.523610268563516,
210
  "learning_rate": 1.9950154864065497e-05,
211
- "loss": 1.0376,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.13376383763837638,
216
- "grad_norm": 2.102764923940876,
217
  "learning_rate": 1.993279857237133e-05,
218
- "loss": 1.0436,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.13837638376383765,
223
- "grad_norm": 2.1109629634915197,
224
  "learning_rate": 1.9912864222239045e-05,
225
- "loss": 1.0436,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.1429889298892989,
230
- "grad_norm": 2.01098830232536,
231
  "learning_rate": 1.9890356987630362e-05,
232
- "loss": 1.0206,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.14760147601476015,
237
- "grad_norm": 2.0368730493904157,
238
  "learning_rate": 1.986528271029931e-05,
239
- "loss": 1.0232,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.1522140221402214,
244
- "grad_norm": 1.6688489601440277,
245
  "learning_rate": 1.9837647898276008e-05,
246
- "loss": 1.0236,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.15682656826568267,
251
- "grad_norm": 2.0005420507660334,
252
  "learning_rate": 1.9807459724177497e-05,
253
- "loss": 1.0246,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.16143911439114392,
258
- "grad_norm": 1.830621233659763,
259
  "learning_rate": 1.977472602334609e-05,
260
- "loss": 1.0322,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.16605166051660517,
265
- "grad_norm": 2.0747249565016426,
266
  "learning_rate": 1.973945529181572e-05,
267
- "loss": 1.0356,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.1706642066420664,
272
- "grad_norm": 1.8657819252798358,
273
  "learning_rate": 1.9701656684106764e-05,
274
- "loss": 1.0406,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.1752767527675277,
279
- "grad_norm": 1.863882659265557,
280
  "learning_rate": 1.9661340010850025e-05,
281
- "loss": 1.0295,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.17988929889298894,
286
- "grad_norm": 1.907383023598496,
287
  "learning_rate": 1.9618515736240353e-05,
288
- "loss": 1.0518,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.18450184501845018,
293
- "grad_norm": 1.8946641894052252,
294
  "learning_rate": 1.9573194975320672e-05,
295
- "loss": 1.0382,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.18911439114391143,
300
- "grad_norm": 1.8166789629573785,
301
  "learning_rate": 1.952538949109708e-05,
302
- "loss": 1.0443,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.1937269372693727,
307
- "grad_norm": 1.7352343980788758,
308
  "learning_rate": 1.9475111691485737e-05,
309
- "loss": 1.0217,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.19833948339483395,
314
- "grad_norm": 1.6974181401216584,
315
  "learning_rate": 1.9422374626092414e-05,
316
- "loss": 1.0305,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.2029520295202952,
321
- "grad_norm": 1.5750696152336092,
322
  "learning_rate": 1.936719198282545e-05,
323
- "loss": 1.0293,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.20756457564575645,
328
- "grad_norm": 1.623225736738918,
329
  "learning_rate": 1.930957808434307e-05,
330
- "loss": 1.0269,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.21217712177121772,
335
- "grad_norm": 1.631576712487477,
336
  "learning_rate": 1.9249547884335917e-05,
337
- "loss": 1.0305,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.21678966789667897,
342
- "grad_norm": 1.7005295416853186,
343
  "learning_rate": 1.9187116963645845e-05,
344
- "loss": 1.0356,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.22140221402214022,
349
- "grad_norm": 1.7890847993052705,
350
  "learning_rate": 1.912230152622189e-05,
351
- "loss": 1.033,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.22601476014760147,
356
- "grad_norm": 1.7968201186150952,
357
  "learning_rate": 1.9055118394914545e-05,
358
  "loss": 1.0367,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.23062730627306274,
363
- "grad_norm": 1.5783711589062268,
364
  "learning_rate": 1.898558500710939e-05,
365
- "loss": 1.0329,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.235239852398524,
370
- "grad_norm": 1.7821446933403304,
371
  "learning_rate": 1.891371941020121e-05,
372
- "loss": 1.0538,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.23985239852398524,
377
- "grad_norm": 1.7571850494815153,
378
  "learning_rate": 1.88395402569098e-05,
379
- "loss": 1.0172,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.2444649446494465,
384
- "grad_norm": 1.6777787820198717,
385
  "learning_rate": 1.8763066800438638e-05,
386
- "loss": 1.0262,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.24907749077490776,
391
- "grad_norm": 1.7754650640017842,
392
  "learning_rate": 1.868431888947773e-05,
393
- "loss": 1.0378,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.253690036900369,
398
- "grad_norm": 1.6204893672219858,
399
  "learning_rate": 1.860331696305188e-05,
400
- "loss": 1.0272,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.25830258302583026,
405
- "grad_norm": 1.6082347328850755,
406
  "learning_rate": 1.852008204521572e-05,
407
- "loss": 1.0199,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.2629151291512915,
412
- "grad_norm": 1.5512676811150408,
413
  "learning_rate": 1.8434635739596945e-05,
414
- "loss": 1.0365,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.26752767527675275,
419
- "grad_norm": 1.7967133178474277,
420
  "learning_rate": 1.834700022378907e-05,
421
- "loss": 1.0541,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.272140221402214,
426
- "grad_norm": 1.6009722961453385,
427
  "learning_rate": 1.825719824359524e-05,
428
- "loss": 1.0399,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.2767527675276753,
433
- "grad_norm": 1.549904165894849,
434
  "learning_rate": 1.816525310712456e-05,
435
- "loss": 1.0096,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.28136531365313655,
440
- "grad_norm": 1.515466383587945,
441
  "learning_rate": 1.8071188678742457e-05,
442
  "loss": 1.0166,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.2859778597785978,
447
- "grad_norm": 1.5677132403955465,
448
  "learning_rate": 1.7975029372876706e-05,
449
- "loss": 1.0209,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.29059040590405905,
454
- "grad_norm": 1.542178919337743,
455
  "learning_rate": 1.787680014768065e-05,
456
  "loss": 1.0434,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.2952029520295203,
461
- "grad_norm": 1.6414967467646593,
462
  "learning_rate": 1.777652649855531e-05,
463
- "loss": 1.0184,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 0.29981549815498154,
468
- "grad_norm": 1.6372383463858382,
469
  "learning_rate": 1.7674234451532065e-05,
470
- "loss": 1.0317,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 0.3044280442804428,
475
- "grad_norm": 1.650440536079242,
476
  "learning_rate": 1.7569950556517566e-05,
477
  "loss": 1.052,
478
  "step": 330
479
  },
480
  {
481
  "epoch": 0.30904059040590404,
482
- "grad_norm": 1.553654913479007,
483
  "learning_rate": 1.7463701880402738e-05,
484
- "loss": 1.0205,
485
  "step": 335
486
  },
487
  {
488
  "epoch": 0.31365313653136534,
489
- "grad_norm": 1.6079774780306095,
490
  "learning_rate": 1.7355516000037555e-05,
491
- "loss": 1.0289,
492
  "step": 340
493
  },
494
  {
495
  "epoch": 0.3182656826568266,
496
- "grad_norm": 1.6788267575243834,
497
  "learning_rate": 1.7245420995073453e-05,
498
- "loss": 1.0452,
499
  "step": 345
500
  },
501
  {
502
  "epoch": 0.32287822878228783,
503
- "grad_norm": 1.5790256320964164,
504
  "learning_rate": 1.7133445440675268e-05,
505
  "loss": 1.028,
506
  "step": 350
507
  },
508
  {
509
  "epoch": 0.3274907749077491,
510
- "grad_norm": 1.5621840814508567,
511
  "learning_rate": 1.7019618400104572e-05,
512
- "loss": 1.0262,
513
  "step": 355
514
  },
515
  {
516
  "epoch": 0.33210332103321033,
517
- "grad_norm": 1.519849047216207,
518
  "learning_rate": 1.6903969417176244e-05,
519
- "loss": 1.0183,
520
  "step": 360
521
  },
522
  {
523
  "epoch": 0.3367158671586716,
524
- "grad_norm": 1.5037917545786645,
525
  "learning_rate": 1.6786528508590436e-05,
526
- "loss": 1.0334,
527
  "step": 365
528
  },
529
  {
530
  "epoch": 0.3413284132841328,
531
- "grad_norm": 1.750901636962616,
532
  "learning_rate": 1.666732615614169e-05,
533
- "loss": 1.0198,
534
  "step": 370
535
  },
536
  {
537
  "epoch": 0.3459409594095941,
538
- "grad_norm": 1.4681811166568075,
539
  "learning_rate": 1.6546393298807405e-05,
540
- "loss": 1.0181,
541
  "step": 375
542
  },
543
  {
544
  "epoch": 0.3505535055350554,
545
- "grad_norm": 1.9552415843064699,
546
  "learning_rate": 1.6423761324717636e-05,
547
- "loss": 1.0237,
548
  "step": 380
549
  },
550
  {
551
  "epoch": 0.3551660516605166,
552
- "grad_norm": 1.5359322834804126,
553
  "learning_rate": 1.6299462063008272e-05,
554
- "loss": 1.0164,
555
  "step": 385
556
  },
557
  {
558
  "epoch": 0.35977859778597787,
559
- "grad_norm": 1.538202612072466,
560
  "learning_rate": 1.61735277755598e-05,
561
- "loss": 1.0236,
562
  "step": 390
563
  },
564
  {
565
  "epoch": 0.3643911439114391,
566
- "grad_norm": 1.8749904513501805,
567
  "learning_rate": 1.6045991148623752e-05,
568
- "loss": 1.0168,
569
  "step": 395
570
  },
571
  {
572
  "epoch": 0.36900369003690037,
573
- "grad_norm": 1.4674737229917856,
574
  "learning_rate": 1.5916885284338937e-05,
575
- "loss": 1.0042,
576
  "step": 400
577
  },
578
  {
579
  "epoch": 0.3736162361623616,
580
- "grad_norm": 1.4565983698821008,
581
  "learning_rate": 1.5786243692139826e-05,
582
- "loss": 1.0269,
583
  "step": 405
584
  },
585
  {
586
  "epoch": 0.37822878228782286,
587
- "grad_norm": 1.5930896810188973,
588
  "learning_rate": 1.5654100280059155e-05,
589
- "loss": 1.0039,
590
  "step": 410
591
  },
592
  {
593
  "epoch": 0.3828413284132841,
594
- "grad_norm": 1.4721927126588246,
595
  "learning_rate": 1.5520489345927095e-05,
596
- "loss": 1.0036,
597
  "step": 415
598
  },
599
  {
600
  "epoch": 0.3874538745387454,
601
- "grad_norm": 1.5395462162350828,
602
  "learning_rate": 1.538544556846925e-05,
603
- "loss": 1.0253,
604
  "step": 420
605
  },
606
  {
607
  "epoch": 0.39206642066420666,
608
- "grad_norm": 1.4973959788085256,
609
  "learning_rate": 1.5249003998305787e-05,
610
- "loss": 1.0142,
611
  "step": 425
612
  },
613
  {
614
  "epoch": 0.3966789667896679,
615
- "grad_norm": 1.6021819916230071,
616
  "learning_rate": 1.5111200048854055e-05,
617
- "loss": 1.0203,
618
  "step": 430
619
  },
620
  {
621
  "epoch": 0.40129151291512916,
622
- "grad_norm": 1.5164653354921802,
623
  "learning_rate": 1.4972069487137024e-05,
624
- "loss": 1.0097,
625
  "step": 435
626
  },
627
  {
628
  "epoch": 0.4059040590405904,
629
- "grad_norm": 1.6120537324024575,
630
  "learning_rate": 1.4831648424499953e-05,
631
- "loss": 1.0156,
632
  "step": 440
633
  },
634
  {
635
  "epoch": 0.41051660516605165,
636
- "grad_norm": 1.4054842646806989,
637
  "learning_rate": 1.4689973307237687e-05,
638
- "loss": 1.0282,
639
  "step": 445
640
  },
641
  {
642
  "epoch": 0.4151291512915129,
643
- "grad_norm": 1.4671822665223906,
644
  "learning_rate": 1.4547080907135024e-05,
645
  "loss": 1.0125,
646
  "step": 450
647
  },
648
  {
649
  "epoch": 0.41974169741697415,
650
- "grad_norm": 1.464262893049375,
651
  "learning_rate": 1.4403008311922593e-05,
652
- "loss": 1.0106,
653
  "step": 455
654
  },
655
  {
656
  "epoch": 0.42435424354243545,
657
- "grad_norm": 1.443587186675441,
658
  "learning_rate": 1.4257792915650728e-05,
659
- "loss": 1.013,
660
  "step": 460
661
  },
662
  {
663
  "epoch": 0.4289667896678967,
664
- "grad_norm": 1.3761731833498907,
665
  "learning_rate": 1.4111472408983843e-05,
666
- "loss": 1.0073,
667
  "step": 465
668
  },
669
  {
670
  "epoch": 0.43357933579335795,
671
- "grad_norm": 1.529603561908737,
672
  "learning_rate": 1.3964084769417823e-05,
673
- "loss": 1.0296,
674
  "step": 470
675
  },
676
  {
677
  "epoch": 0.4381918819188192,
678
- "grad_norm": 1.6733724515060537,
679
  "learning_rate": 1.3815668251422953e-05,
680
- "loss": 0.9951,
681
  "step": 475
682
  },
683
  {
684
  "epoch": 0.44280442804428044,
685
- "grad_norm": 1.4450871420978533,
686
  "learning_rate": 1.3666261376514978e-05,
687
- "loss": 0.9927,
688
  "step": 480
689
  },
690
  {
691
  "epoch": 0.4474169741697417,
692
- "grad_norm": 1.4561810147046073,
693
  "learning_rate": 1.3515902923256832e-05,
694
- "loss": 1.0082,
695
  "step": 485
696
  },
697
  {
698
  "epoch": 0.45202952029520294,
699
- "grad_norm": 1.5238257427571544,
700
  "learning_rate": 1.3364631917193671e-05,
701
- "loss": 1.0025,
702
  "step": 490
703
  },
704
  {
705
  "epoch": 0.4566420664206642,
706
- "grad_norm": 1.5528476103643793,
707
  "learning_rate": 1.321248762072377e-05,
708
- "loss": 1.0031,
709
  "step": 495
710
  },
711
  {
712
  "epoch": 0.4612546125461255,
713
- "grad_norm": 1.4687610879413946,
714
  "learning_rate": 1.3059509522907998e-05,
715
- "loss": 0.9982,
716
  "step": 500
717
  },
718
  {
719
  "epoch": 0.46586715867158673,
720
- "grad_norm": 1.4093011733734708,
721
  "learning_rate": 1.2905737329220394e-05,
722
- "loss": 0.9966,
723
  "step": 505
724
  },
725
  {
726
  "epoch": 0.470479704797048,
727
- "grad_norm": 1.44715856596638,
728
  "learning_rate": 1.2751210951242636e-05,
729
  "loss": 1.0052,
730
  "step": 510
731
  },
732
  {
733
  "epoch": 0.47509225092250923,
734
- "grad_norm": 1.3996570148561756,
735
  "learning_rate": 1.2595970496304975e-05,
736
  "loss": 0.9942,
737
  "step": 515
738
  },
739
  {
740
  "epoch": 0.4797047970479705,
741
- "grad_norm": 1.4325383073010012,
742
  "learning_rate": 1.2440056257076376e-05,
743
- "loss": 1.0081,
744
  "step": 520
745
  },
746
  {
747
  "epoch": 0.4843173431734317,
748
- "grad_norm": 1.4383492965547426,
749
  "learning_rate": 1.2283508701106559e-05,
750
- "loss": 0.9788,
751
  "step": 525
752
  },
753
  {
754
  "epoch": 0.488929889298893,
755
- "grad_norm": 1.4099081664313255,
756
  "learning_rate": 1.2126368460322637e-05,
757
- "loss": 1.0083,
758
  "step": 530
759
  },
760
  {
761
  "epoch": 0.4935424354243542,
762
- "grad_norm": 1.3412375300770663,
763
  "learning_rate": 1.1968676320483103e-05,
764
  "loss": 0.9684,
765
  "step": 535
766
  },
767
  {
768
  "epoch": 0.4981549815498155,
769
- "grad_norm": 1.4515358137974845,
770
  "learning_rate": 1.1810473210591882e-05,
771
  "loss": 0.9852,
772
  "step": 540
773
  },
774
  {
775
  "epoch": 0.5027675276752768,
776
- "grad_norm": 1.4597275022169718,
777
  "learning_rate": 1.1651800192275197e-05,
778
- "loss": 0.9793,
779
  "step": 545
780
  },
781
  {
782
  "epoch": 0.507380073800738,
783
- "grad_norm": 1.3868232464146542,
784
  "learning_rate": 1.1492698449124042e-05,
785
- "loss": 0.9907,
786
  "step": 550
787
  },
788
  {
789
  "epoch": 0.5119926199261993,
790
- "grad_norm": 1.4023901770546143,
791
  "learning_rate": 1.1333209276004959e-05,
792
- "loss": 0.9967,
793
  "step": 555
794
  },
795
  {
796
  "epoch": 0.5166051660516605,
797
- "grad_norm": 1.403043028756451,
798
  "learning_rate": 1.1173374068341962e-05,
799
- "loss": 1.0058,
800
  "step": 560
801
  },
802
  {
803
  "epoch": 0.5212177121771218,
804
- "grad_norm": 1.5463650175486758,
805
  "learning_rate": 1.1013234311372353e-05,
806
- "loss": 0.9843,
807
  "step": 565
808
  },
809
  {
810
  "epoch": 0.525830258302583,
811
- "grad_norm": 1.566717033090475,
812
  "learning_rate": 1.0852831569379217e-05,
813
- "loss": 0.9911,
814
  "step": 570
815
  },
816
  {
817
  "epoch": 0.5304428044280443,
818
- "grad_norm": 1.4828461511508186,
819
  "learning_rate": 1.0692207474903421e-05,
820
- "loss": 0.9909,
821
  "step": 575
822
  },
823
  {
824
  "epoch": 0.5350553505535055,
825
- "grad_norm": 2.8521203782189253,
826
  "learning_rate": 1.0531403717937888e-05,
827
- "loss": 0.9927,
828
  "step": 580
829
  },
830
  {
831
  "epoch": 0.5396678966789668,
832
- "grad_norm": 1.512595456828133,
833
  "learning_rate": 1.037046203510694e-05,
834
- "loss": 0.979,
835
  "step": 585
836
  },
837
  {
838
  "epoch": 0.544280442804428,
839
- "grad_norm": 1.5368674133781381,
840
  "learning_rate": 1.0209424198833571e-05,
841
- "loss": 0.9877,
842
  "step": 590
843
  },
844
  {
845
  "epoch": 0.5488929889298892,
846
- "grad_norm": 1.6413121795161139,
847
  "learning_rate": 1.0048332006497406e-05,
848
- "loss": 0.9954,
849
  "step": 595
850
  },
851
  {
852
  "epoch": 0.5535055350553506,
853
- "grad_norm": 1.4636611051453832,
854
  "learning_rate": 9.887227269586184e-06,
855
- "loss": 0.9801,
856
  "step": 600
857
  },
858
  {
859
  "epoch": 0.5581180811808119,
860
- "grad_norm": 1.4876078711300504,
861
  "learning_rate": 9.7261518028436e-06,
862
- "loss": 0.9896,
863
  "step": 605
864
  },
865
  {
866
  "epoch": 0.5627306273062731,
867
- "grad_norm": 1.4565652096222934,
868
  "learning_rate": 9.565147413416266e-06,
869
- "loss": 0.9782,
870
  "step": 610
871
  },
872
  {
873
  "epoch": 0.5673431734317343,
874
- "grad_norm": 1.3376736240483553,
875
  "learning_rate": 9.404255890002677e-06,
876
- "loss": 0.996,
877
  "step": 615
878
  },
879
  {
880
  "epoch": 0.5719557195571956,
881
- "grad_norm": 1.266054229637192,
882
  "learning_rate": 9.243518992006944e-06,
883
- "loss": 0.974,
884
  "step": 620
885
  },
886
  {
887
  "epoch": 0.5765682656826568,
888
- "grad_norm": 1.3235752371132756,
889
  "learning_rate": 9.082978438700138e-06,
890
- "loss": 0.9679,
891
  "step": 625
892
  },
893
  {
894
  "epoch": 0.5811808118081181,
895
- "grad_norm": 1.4334968823366236,
896
  "learning_rate": 8.922675898392072e-06,
897
  "loss": 0.9666,
898
  "step": 630
899
  },
900
  {
901
  "epoch": 0.5857933579335793,
902
- "grad_norm": 1.373620518219274,
903
  "learning_rate": 8.762652977616258e-06,
904
- "loss": 0.9862,
905
  "step": 635
906
  },
907
  {
908
  "epoch": 0.5904059040590406,
909
- "grad_norm": 1.4317910304850499,
910
  "learning_rate": 8.602951210330942e-06,
911
- "loss": 0.9662,
912
  "step": 640
913
  },
914
  {
915
  "epoch": 0.5950184501845018,
916
- "grad_norm": 1.3805470019673511,
917
  "learning_rate": 8.443612047138965e-06,
918
- "loss": 0.9836,
919
  "step": 645
920
  },
921
  {
922
  "epoch": 0.5996309963099631,
923
- "grad_norm": 1.4344389139852456,
924
  "learning_rate": 8.284676844529258e-06,
925
- "loss": 0.9458,
926
  "step": 650
927
  },
928
  {
929
  "epoch": 0.6042435424354243,
930
- "grad_norm": 1.2923675142084592,
931
  "learning_rate": 8.126186854142752e-06,
932
- "loss": 0.953,
933
  "step": 655
934
  },
935
  {
936
  "epoch": 0.6088560885608856,
937
- "grad_norm": 1.3520769915175057,
938
  "learning_rate": 7.968183212065537e-06,
939
- "loss": 0.965,
940
  "step": 660
941
  },
942
  {
943
  "epoch": 0.6134686346863468,
944
- "grad_norm": 1.3540469095893106,
945
  "learning_rate": 7.81070692815195e-06,
946
- "loss": 0.962,
947
  "step": 665
948
  },
949
  {
950
  "epoch": 0.6180811808118081,
951
- "grad_norm": 1.4140291089073116,
952
  "learning_rate": 7.6537988753805e-06,
953
- "loss": 0.9566,
954
  "step": 670
955
  },
956
  {
957
  "epoch": 0.6226937269372693,
958
- "grad_norm": 1.3299616725752847,
959
  "learning_rate": 7.497499779245268e-06,
960
- "loss": 0.965,
961
  "step": 675
962
  },
963
  {
964
  "epoch": 0.6273062730627307,
965
- "grad_norm": 1.3769480821952969,
966
  "learning_rate": 7.3418502071856004e-06,
967
- "loss": 0.9736,
968
  "step": 680
969
  },
970
  {
971
  "epoch": 0.6319188191881919,
972
- "grad_norm": 1.3550016015001538,
973
  "learning_rate": 7.186890558056836e-06,
974
- "loss": 0.9528,
975
  "step": 685
976
  },
977
  {
978
  "epoch": 0.6365313653136532,
979
- "grad_norm": 1.4444985355599935,
980
  "learning_rate": 7.0326610516447825e-06,
981
- "loss": 0.9518,
982
  "step": 690
983
  },
984
  {
985
  "epoch": 0.6411439114391144,
986
- "grad_norm": 1.3752886192742222,
987
  "learning_rate": 6.879201718226658e-06,
988
- "loss": 0.9437,
989
  "step": 695
990
  },
991
  {
992
  "epoch": 0.6457564575645757,
993
- "grad_norm": 1.2728918848531818,
994
  "learning_rate": 6.7265523881812335e-06,
995
- "loss": 0.9564,
996
  "step": 700
997
  },
998
  {
999
  "epoch": 0.6503690036900369,
1000
- "grad_norm": 1.2776578000630632,
1001
  "learning_rate": 6.574752681650864e-06,
1002
- "loss": 0.9431,
1003
  "step": 705
1004
  },
1005
  {
1006
  "epoch": 0.6549815498154982,
1007
- "grad_norm": 1.2934037917676169,
1008
  "learning_rate": 6.423841998258069e-06,
1009
- "loss": 0.9491,
1010
  "step": 710
1011
  },
1012
  {
1013
  "epoch": 0.6595940959409594,
1014
- "grad_norm": 1.3237574013597018,
1015
  "learning_rate": 6.273859506879365e-06,
1016
- "loss": 0.9681,
1017
  "step": 715
1018
  },
1019
  {
1020
  "epoch": 0.6642066420664207,
1021
- "grad_norm": 1.3217921823273022,
1022
  "learning_rate": 6.124844135478971e-06,
1023
- "loss": 0.9649,
1024
  "step": 720
1025
  },
1026
  {
1027
  "epoch": 0.6688191881918819,
1028
- "grad_norm": 1.2737386646079398,
1029
  "learning_rate": 5.976834561005069e-06,
1030
- "loss": 0.9606,
1031
  "step": 725
1032
  },
1033
  {
1034
  "epoch": 0.6734317343173432,
1035
- "grad_norm": 1.3636332749577897,
1036
  "learning_rate": 5.829869199351188e-06,
1037
- "loss": 0.9491,
1038
  "step": 730
1039
  },
1040
  {
1041
  "epoch": 0.6780442804428044,
1042
- "grad_norm": 1.308891744452198,
1043
  "learning_rate": 5.68398619538536e-06,
1044
- "loss": 0.9431,
1045
  "step": 735
1046
  },
1047
  {
1048
  "epoch": 0.6826568265682657,
1049
- "grad_norm": 1.3623472799172087,
1050
  "learning_rate": 5.53922341304961e-06,
1051
- "loss": 0.938,
1052
  "step": 740
1053
  },
1054
  {
1055
  "epoch": 0.6872693726937269,
1056
- "grad_norm": 1.3779395937417078,
1057
  "learning_rate": 5.39561842553239e-06,
1058
- "loss": 0.9765,
1059
  "step": 745
1060
  },
1061
  {
1062
  "epoch": 0.6918819188191881,
1063
- "grad_norm": 1.314594616549353,
1064
  "learning_rate": 5.2532085055164205e-06,
1065
- "loss": 0.9495,
1066
  "step": 750
1067
  },
1068
  {
1069
  "epoch": 0.6964944649446494,
1070
- "grad_norm": 1.2958945545107472,
1071
  "learning_rate": 5.112030615504601e-06,
1072
- "loss": 0.9526,
1073
  "step": 755
1074
  },
1075
  {
1076
  "epoch": 0.7011070110701108,
1077
- "grad_norm": 1.268037176857953,
1078
  "learning_rate": 4.972121398226371e-06,
1079
  "loss": 0.9389,
1080
  "step": 760
1081
  },
1082
  {
1083
  "epoch": 0.705719557195572,
1084
- "grad_norm": 1.3213967171034264,
1085
  "learning_rate": 4.833517167127077e-06,
1086
- "loss": 0.9541,
1087
  "step": 765
1088
  },
1089
  {
1090
  "epoch": 0.7103321033210332,
1091
- "grad_norm": 1.303739776653261,
1092
  "learning_rate": 4.6962538969428416e-06,
1093
- "loss": 0.9523,
1094
  "step": 770
1095
  },
1096
  {
1097
  "epoch": 0.7149446494464945,
1098
- "grad_norm": 1.2626443878609368,
1099
  "learning_rate": 4.560367214363295e-06,
1100
- "loss": 0.9463,
1101
  "step": 775
1102
  },
1103
  {
1104
  "epoch": 0.7195571955719557,
1105
- "grad_norm": 1.4159625474171884,
1106
  "learning_rate": 4.425892388784681e-06,
1107
- "loss": 0.9486,
1108
  "step": 780
1109
  },
1110
  {
1111
  "epoch": 0.724169741697417,
1112
- "grad_norm": 1.4010891439452202,
1113
  "learning_rate": 4.292864323155684e-06,
1114
- "loss": 0.9353,
1115
  "step": 785
1116
  },
1117
  {
1118
  "epoch": 0.7287822878228782,
1119
- "grad_norm": 1.3287864388096529,
1120
  "learning_rate": 4.161317544918345e-06,
1121
  "loss": 0.9389,
1122
  "step": 790
1123
  },
1124
  {
1125
  "epoch": 0.7333948339483395,
1126
- "grad_norm": 1.306056357380541,
1127
  "learning_rate": 4.031286197046493e-06,
1128
  "loss": 0.9423,
1129
  "step": 795
1130
  },
1131
  {
1132
  "epoch": 0.7380073800738007,
1133
- "grad_norm": 1.3004419319996845,
1134
  "learning_rate": 3.902804029183907e-06,
1135
- "loss": 0.9448,
1136
  "step": 800
1137
  },
1138
  {
1139
  "epoch": 0.742619926199262,
1140
- "grad_norm": 1.3193243689915886,
1141
  "learning_rate": 3.775904388884618e-06,
1142
- "loss": 0.9427,
1143
  "step": 805
1144
  },
1145
  {
1146
  "epoch": 0.7472324723247232,
1147
- "grad_norm": 1.296148330228006,
1148
  "learning_rate": 3.650620212957524e-06,
1149
- "loss": 0.9412,
1150
  "step": 810
1151
  },
1152
  {
1153
  "epoch": 0.7518450184501845,
1154
- "grad_norm": 1.4326318299445335,
1155
  "learning_rate": 3.5269840189176616e-06,
1156
- "loss": 0.9289,
1157
  "step": 815
1158
  },
1159
  {
1160
  "epoch": 0.7564575645756457,
1161
- "grad_norm": 1.3055827977236605,
1162
  "learning_rate": 3.405027896546277e-06,
1163
- "loss": 0.926,
1164
  "step": 820
1165
  },
1166
  {
1167
  "epoch": 0.761070110701107,
1168
- "grad_norm": 1.327865337759123,
1169
  "learning_rate": 3.2847834995619067e-06,
1170
- "loss": 0.9453,
1171
  "step": 825
1172
  },
1173
  {
1174
  "epoch": 0.7656826568265682,
1175
- "grad_norm": 1.2543373923102779,
1176
  "learning_rate": 3.1662820374046776e-06,
1177
- "loss": 0.9357,
1178
  "step": 830
1179
  },
1180
  {
1181
  "epoch": 0.7702952029520295,
1182
- "grad_norm": 1.311192918139976,
1183
  "learning_rate": 3.0495542671358745e-06,
1184
- "loss": 0.9299,
1185
  "step": 835
1186
  },
1187
  {
1188
  "epoch": 0.7749077490774908,
1189
- "grad_norm": 1.3013405217794898,
1190
  "learning_rate": 2.934630485454948e-06,
1191
- "loss": 0.9363,
1192
  "step": 840
1193
  },
1194
  {
1195
  "epoch": 0.7795202952029521,
1196
- "grad_norm": 1.3139133572420352,
1197
  "learning_rate": 2.8215405208360237e-06,
1198
  "loss": 0.9399,
1199
  "step": 845
1200
  },
1201
  {
1202
  "epoch": 0.7841328413284133,
1203
- "grad_norm": 1.325489126281033,
1204
  "learning_rate": 2.7103137257858867e-06,
1205
- "loss": 0.9359,
1206
  "step": 850
1207
  },
1208
  {
1209
  "epoch": 0.7887453874538746,
1210
- "grad_norm": 1.3557387383978536,
1211
  "learning_rate": 2.600978969225558e-06,
1212
- "loss": 0.9408,
1213
  "step": 855
1214
  },
1215
  {
1216
  "epoch": 0.7933579335793358,
1217
- "grad_norm": 1.2968402542167223,
1218
  "learning_rate": 2.493564628997369e-06,
1219
- "loss": 0.9473,
1220
  "step": 860
1221
  },
1222
  {
1223
  "epoch": 0.7979704797047971,
1224
- "grad_norm": 1.307351413531213,
1225
  "learning_rate": 2.3880985844994674e-06,
1226
- "loss": 0.9288,
1227
  "step": 865
1228
  },
1229
  {
1230
  "epoch": 0.8025830258302583,
1231
- "grad_norm": 1.301711812307168,
1232
  "learning_rate": 2.284608209449746e-06,
1233
- "loss": 0.9367,
1234
  "step": 870
1235
  },
1236
  {
1237
  "epoch": 0.8071955719557196,
1238
- "grad_norm": 1.3331160835163591,
1239
  "learning_rate": 2.183120364780975e-06,
1240
- "loss": 0.9394,
1241
  "step": 875
1242
  },
1243
  {
1244
  "epoch": 0.8118081180811808,
1245
- "grad_norm": 1.2683359981117004,
1246
  "learning_rate": 2.083661391669043e-06,
1247
  "loss": 0.9255,
1248
  "step": 880
1249
  },
1250
  {
1251
  "epoch": 0.816420664206642,
1252
- "grad_norm": 1.2993174833270567,
1253
  "learning_rate": 1.986257104696121e-06,
1254
- "loss": 0.9368,
1255
  "step": 885
1256
  },
1257
  {
1258
  "epoch": 0.8210332103321033,
1259
- "grad_norm": 1.3745325636574788,
1260
  "learning_rate": 1.8909327851504633e-06,
1261
- "loss": 0.9527,
1262
  "step": 890
1263
  },
1264
  {
1265
  "epoch": 0.8256457564575646,
1266
- "grad_norm": 1.339181816674666,
1267
  "learning_rate": 1.7977131744646724e-06,
1268
- "loss": 0.9331,
1269
  "step": 895
1270
  },
1271
  {
1272
  "epoch": 0.8302583025830258,
1273
- "grad_norm": 1.327094422276523,
1274
  "learning_rate": 1.7066224677940313e-06,
1275
- "loss": 0.9437,
1276
  "step": 900
1277
  },
1278
  {
1279
  "epoch": 0.834870848708487,
1280
- "grad_norm": 1.2652133382811892,
1281
  "learning_rate": 1.6176843077366755e-06,
1282
  "loss": 0.9139,
1283
  "step": 905
1284
  },
1285
  {
1286
  "epoch": 0.8394833948339483,
1287
- "grad_norm": 1.2892238897012769,
1288
  "learning_rate": 1.5309217781971419e-06,
1289
- "loss": 0.9135,
1290
  "step": 910
1291
  },
1292
  {
1293
  "epoch": 0.8440959409594095,
1294
- "grad_norm": 1.3102845800852012,
1295
  "learning_rate": 1.446357398394934e-06,
1296
- "loss": 0.923,
1297
  "step": 915
1298
  },
1299
  {
1300
  "epoch": 0.8487084870848709,
1301
- "grad_norm": 1.3117325038831478,
1302
  "learning_rate": 1.3640131170196758e-06,
1303
- "loss": 0.9159,
1304
  "step": 920
1305
  },
1306
  {
1307
  "epoch": 0.8533210332103321,
1308
- "grad_norm": 5.1971689200456375,
1309
  "learning_rate": 1.2839103065343084e-06,
1310
- "loss": 0.9295,
1311
  "step": 925
1312
  },
1313
  {
1314
  "epoch": 0.8579335793357934,
1315
- "grad_norm": 1.314904030690533,
1316
  "learning_rate": 1.2060697576278812e-06,
1317
- "loss": 0.9336,
1318
  "step": 930
1319
  },
1320
  {
1321
  "epoch": 0.8625461254612546,
1322
- "grad_norm": 1.2791863891720787,
1323
  "learning_rate": 1.1305116738193211e-06,
1324
- "loss": 0.9346,
1325
  "step": 935
1326
  },
1327
  {
1328
  "epoch": 0.8671586715867159,
1329
- "grad_norm": 1.2890531276807027,
1330
  "learning_rate": 1.0572556662136036e-06,
1331
- "loss": 0.9302,
1332
  "step": 940
1333
  },
1334
  {
1335
  "epoch": 0.8717712177121771,
1336
- "grad_norm": 1.2742267464237356,
1337
  "learning_rate": 9.863207484116987e-07,
1338
- "loss": 0.9181,
1339
  "step": 945
1340
  },
1341
  {
1342
  "epoch": 0.8763837638376384,
1343
- "grad_norm": 1.2644327659379573,
1344
  "learning_rate": 9.177253315755796e-07,
1345
- "loss": 0.9379,
1346
  "step": 950
1347
  },
1348
  {
1349
  "epoch": 0.8809963099630996,
1350
- "grad_norm": 1.2421396307830366,
1351
  "learning_rate": 8.514872196496182e-07,
1352
  "loss": 0.935,
1353
  "step": 955
1354
  },
1355
  {
1356
  "epoch": 0.8856088560885609,
1357
- "grad_norm": 1.2203962112131188,
1358
  "learning_rate": 7.876236047395525e-07,
1359
- "loss": 0.9264,
1360
  "step": 960
1361
  },
1362
  {
1363
  "epoch": 0.8902214022140221,
1364
- "grad_norm": 1.2994699310922848,
1365
  "learning_rate": 7.26151062650291e-07,
1366
- "loss": 0.934,
1367
  "step": 965
1368
  },
1369
  {
1370
  "epoch": 0.8948339483394834,
1371
- "grad_norm": 1.3062465605195615,
1372
  "learning_rate": 6.670855485836525e-07,
1373
- "loss": 0.9285,
1374
  "step": 970
1375
  },
1376
  {
1377
  "epoch": 0.8994464944649446,
1378
- "grad_norm": 1.2690114831578048,
1379
  "learning_rate": 6.104423929971948e-07,
1380
- "loss": 0.933,
1381
  "step": 975
1382
  },
1383
  {
1384
  "epoch": 0.9040590405904059,
1385
- "grad_norm": 1.2785142678323616,
1386
  "learning_rate": 5.562362976251901e-07,
1387
- "loss": 0.9379,
1388
  "step": 980
1389
  },
1390
  {
1391
  "epoch": 0.9086715867158671,
1392
- "grad_norm": 1.2131026491304173,
1393
  "learning_rate": 5.044813316627994e-07,
1394
- "loss": 0.9276,
1395
  "step": 985
1396
  },
1397
  {
1398
  "epoch": 0.9132841328413284,
1399
- "grad_norm": 1.300615964359426,
1400
  "learning_rate": 4.5519092811439627e-07,
1401
- "loss": 0.9255,
1402
  "step": 990
1403
  },
1404
  {
1405
  "epoch": 0.9178966789667896,
1406
- "grad_norm": 1.3050095257612062,
1407
  "learning_rate": 4.083778803070504e-07,
1408
- "loss": 0.9437,
1409
  "step": 995
1410
  },
1411
  {
1412
  "epoch": 0.922509225092251,
1413
- "grad_norm": 1.2410745191815336,
1414
  "learning_rate": 3.6405433856999684e-07,
1415
- "loss": 0.957,
1416
  "step": 1000
1417
  },
1418
  {
1419
  "epoch": 0.9271217712177122,
1420
- "grad_norm": 1.3063199127205747,
1421
  "learning_rate": 3.2223180708102933e-07,
1422
- "loss": 0.9324,
1423
  "step": 1005
1424
  },
1425
  {
1426
  "epoch": 0.9317343173431735,
1427
- "grad_norm": 1.3222002154954164,
1428
  "learning_rate": 2.829211408805932e-07,
1429
- "loss": 0.9176,
1430
  "step": 1010
1431
  },
1432
  {
1433
  "epoch": 0.9363468634686347,
1434
- "grad_norm": 1.2578620181074625,
1435
  "learning_rate": 2.461325430543482e-07,
1436
- "loss": 0.9322,
1437
  "step": 1015
1438
  },
1439
  {
1440
  "epoch": 0.940959409594096,
1441
- "grad_norm": 1.2337296124585635,
1442
  "learning_rate": 2.1187556208496885e-07,
1443
- "loss": 0.9218,
1444
  "step": 1020
1445
  },
1446
  {
1447
  "epoch": 0.9455719557195572,
1448
- "grad_norm": 1.2291744208349304,
1449
  "learning_rate": 1.8015908937382587e-07,
1450
- "loss": 0.9331,
1451
  "step": 1025
1452
  },
1453
  {
1454
  "epoch": 0.9501845018450185,
1455
- "grad_norm": 1.2819268220912796,
1456
  "learning_rate": 1.5099135693322776e-07,
1457
- "loss": 0.9337,
1458
  "step": 1030
1459
  },
1460
  {
1461
  "epoch": 0.9547970479704797,
1462
- "grad_norm": 1.2311873702844616,
1463
  "learning_rate": 1.2437993524979984e-07,
1464
- "loss": 0.9034,
1465
  "step": 1035
1466
  },
1467
  {
1468
  "epoch": 0.959409594095941,
1469
- "grad_norm": 1.306307352211591,
1470
  "learning_rate": 1.0033173131956175e-07,
1471
- "loss": 0.9213,
1472
  "step": 1040
1473
  },
1474
  {
1475
  "epoch": 0.9640221402214022,
1476
- "grad_norm": 1.2302162082382204,
1477
  "learning_rate": 7.885298685522235e-08,
1478
- "loss": 0.9153,
1479
  "step": 1045
1480
  },
1481
  {
1482
  "epoch": 0.9686346863468634,
1483
- "grad_norm": 1.2949224700302513,
1484
  "learning_rate": 5.99492766661347e-08,
1485
- "loss": 0.9207,
1486
  "step": 1050
1487
  },
1488
  {
1489
  "epoch": 0.9732472324723247,
1490
- "grad_norm": 1.2979324069801879,
1491
  "learning_rate": 4.362550721136338e-08,
1492
- "loss": 0.9254,
1493
  "step": 1055
1494
  },
1495
  {
1496
  "epoch": 0.977859778597786,
1497
- "grad_norm": 1.2464202848956238,
1498
  "learning_rate": 2.988591532620322e-08,
1499
- "loss": 0.9155,
1500
  "step": 1060
1501
  },
1502
  {
1503
  "epoch": 0.9824723247232472,
1504
- "grad_norm": 1.2826213715670498,
1505
  "learning_rate": 1.8734067122514464e-08,
1506
- "loss": 0.9229,
1507
  "step": 1065
1508
  },
1509
  {
1510
  "epoch": 0.9870848708487084,
1511
- "grad_norm": 1.2838870564000777,
1512
  "learning_rate": 1.0172857063137643e-08,
1513
- "loss": 0.9254,
1514
  "step": 1070
1515
  },
1516
  {
1517
  "epoch": 0.9916974169741697,
1518
- "grad_norm": 1.2809568528218904,
1519
  "learning_rate": 4.204507210633368e-09,
1520
- "loss": 0.9466,
1521
  "step": 1075
1522
  },
1523
  {
1524
  "epoch": 0.996309963099631,
1525
- "grad_norm": 1.2838585438818646,
1526
  "learning_rate": 8.30566650548148e-10,
1527
- "loss": 0.9403,
1528
  "step": 1080
1529
  },
1530
  {
1531
  "epoch": 1.0,
1532
- "eval_loss": 0.9415282607078552,
1533
- "eval_runtime": 141.608,
1534
- "eval_samples_per_second": 108.398,
1535
- "eval_steps_per_second": 1.695,
1536
  "step": 1084
1537
  },
1538
  {
1539
  "epoch": 1.0,
1540
  "step": 1084,
1541
  "total_flos": 453935093514240.0,
1542
- "train_loss": 0.9830407258329357,
1543
- "train_runtime": 11998.8497,
1544
- "train_samples_per_second": 11.558,
1545
- "train_steps_per_second": 0.09
1546
  }
1547
  ],
1548
  "logging_steps": 5,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0009225092250922509,
13
+ "grad_norm": 9.87069124888532,
14
  "learning_rate": 1.8348623853211012e-07,
15
  "loss": 1.1607,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.004612546125461255,
20
+ "grad_norm": 7.637481536818217,
21
  "learning_rate": 9.174311926605506e-07,
22
  "loss": 1.1303,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.00922509225092251,
27
+ "grad_norm": 3.662821001105185,
28
  "learning_rate": 1.8348623853211011e-06,
29
+ "loss": 1.0569,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.013837638376383764,
34
+ "grad_norm": 2.537889977772848,
35
  "learning_rate": 2.7522935779816517e-06,
36
+ "loss": 1.0237,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.01845018450184502,
41
+ "grad_norm": 2.1108929850361218,
42
  "learning_rate": 3.6697247706422022e-06,
43
+ "loss": 0.993,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.023062730627306273,
48
+ "grad_norm": 1.9063826501229393,
49
  "learning_rate": 4.587155963302753e-06,
50
+ "loss": 0.9822,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.027675276752767528,
55
+ "grad_norm": 2.303421644374396,
56
  "learning_rate": 5.504587155963303e-06,
57
+ "loss": 0.9799,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.03228782287822878,
62
+ "grad_norm": 2.4361534479639824,
63
  "learning_rate": 6.422018348623854e-06,
64
+ "loss": 0.9816,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.03690036900369004,
69
+ "grad_norm": 2.0144702285707354,
70
  "learning_rate": 7.3394495412844045e-06,
71
+ "loss": 0.9806,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.04151291512915129,
76
+ "grad_norm": 2.0841326391410573,
77
  "learning_rate": 8.256880733944956e-06,
78
+ "loss": 0.9861,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.046125461254612546,
83
+ "grad_norm": 1.9352245609335417,
84
  "learning_rate": 9.174311926605506e-06,
85
+ "loss": 0.9963,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.0507380073800738,
90
+ "grad_norm": 2.3043607004443283,
91
  "learning_rate": 1.0091743119266055e-05,
92
+ "loss": 0.9904,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.055350553505535055,
97
+ "grad_norm": 2.199239666479046,
98
  "learning_rate": 1.1009174311926607e-05,
99
+ "loss": 0.994,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.05996309963099631,
104
+ "grad_norm": 1.9024609720272683,
105
  "learning_rate": 1.1926605504587156e-05,
106
+ "loss": 1.0039,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.06457564575645756,
111
+ "grad_norm": 2.57414786410357,
112
  "learning_rate": 1.2844036697247708e-05,
113
+ "loss": 0.9881,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.06918819188191883,
118
+ "grad_norm": 2.4073690509013734,
119
  "learning_rate": 1.3761467889908258e-05,
120
+ "loss": 1.0037,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.07380073800738007,
125
+ "grad_norm": 2.344151583739395,
126
  "learning_rate": 1.4678899082568809e-05,
127
+ "loss": 1.0155,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.07841328413284133,
132
+ "grad_norm": 2.398529050285677,
133
  "learning_rate": 1.559633027522936e-05,
134
+ "loss": 1.0012,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.08302583025830258,
139
+ "grad_norm": 2.418044809142461,
140
  "learning_rate": 1.6513761467889912e-05,
141
+ "loss": 0.9945,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.08763837638376384,
146
+ "grad_norm": 2.5129111485618942,
147
  "learning_rate": 1.743119266055046e-05,
148
+ "loss": 0.9857,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.09225092250922509,
153
+ "grad_norm": 1.9698114649305183,
154
  "learning_rate": 1.834862385321101e-05,
155
+ "loss": 1.0359,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.09686346863468635,
160
+ "grad_norm": 2.1358990392067705,
161
  "learning_rate": 1.9266055045871563e-05,
162
+ "loss": 1.0084,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.1014760147601476,
167
+ "grad_norm": 2.0245472890624825,
168
  "learning_rate": 1.9999948088910656e-05,
169
+ "loss": 1.0141,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.10608856088560886,
174
+ "grad_norm": 2.0374124109562066,
175
  "learning_rate": 1.9998131257372878e-05,
176
+ "loss": 1.0244,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.11070110701107011,
181
+ "grad_norm": 1.7325251719074102,
182
  "learning_rate": 1.999371941029485e-05,
183
+ "loss": 1.0162,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.11531365313653137,
188
+ "grad_norm": 1.9546491823840284,
189
  "learning_rate": 1.9986713692771732e-05,
190
+ "loss": 1.0385,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.11992619926199262,
195
+ "grad_norm": 1.7981992444328092,
196
  "learning_rate": 1.9977115923137912e-05,
197
+ "loss": 1.03,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.12453874538745388,
202
+ "grad_norm": 1.787669857072768,
203
  "learning_rate": 1.9964928592495046e-05,
204
+ "loss": 1.0301,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.12915129151291513,
209
+ "grad_norm": 1.8171487016580647,
210
  "learning_rate": 1.9950154864065497e-05,
211
+ "loss": 1.0284,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.13376383763837638,
216
+ "grad_norm": 1.8557221003169828,
217
  "learning_rate": 1.993279857237133e-05,
218
+ "loss": 1.0318,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.13837638376383765,
223
+ "grad_norm": 2.1551905075917976,
224
  "learning_rate": 1.9912864222239045e-05,
225
+ "loss": 1.0407,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.1429889298892989,
230
+ "grad_norm": 2.514388634020912,
231
  "learning_rate": 1.9890356987630362e-05,
232
+ "loss": 1.0212,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.14760147601476015,
237
+ "grad_norm": 1.9323789502583069,
238
  "learning_rate": 1.986528271029931e-05,
239
+ "loss": 1.0247,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.1522140221402214,
244
+ "grad_norm": 2.1684894847599407,
245
  "learning_rate": 1.9837647898276008e-05,
246
+ "loss": 1.0242,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.15682656826568267,
251
+ "grad_norm": 1.9242427718398765,
252
  "learning_rate": 1.9807459724177497e-05,
253
+ "loss": 1.0237,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.16143911439114392,
258
+ "grad_norm": 2.1094860370607793,
259
  "learning_rate": 1.977472602334609e-05,
260
+ "loss": 1.0293,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.16605166051660517,
265
+ "grad_norm": 1.6601715392116458,
266
  "learning_rate": 1.973945529181572e-05,
267
+ "loss": 1.0321,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.1706642066420664,
272
+ "grad_norm": 1.7762559074553415,
273
  "learning_rate": 1.9701656684106764e-05,
274
+ "loss": 1.037,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.1752767527675277,
279
+ "grad_norm": 1.805704388216368,
280
  "learning_rate": 1.9661340010850025e-05,
281
+ "loss": 1.027,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.17988929889298894,
286
+ "grad_norm": 1.7681290984831735,
287
  "learning_rate": 1.9618515736240353e-05,
288
+ "loss": 1.05,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.18450184501845018,
293
+ "grad_norm": 1.9579635085842144,
294
  "learning_rate": 1.9573194975320672e-05,
295
+ "loss": 1.0384,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.18911439114391143,
300
+ "grad_norm": 1.7758217950165514,
301
  "learning_rate": 1.952538949109708e-05,
302
+ "loss": 1.0455,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.1937269372693727,
307
+ "grad_norm": 1.7447577937143957,
308
  "learning_rate": 1.9475111691485737e-05,
309
+ "loss": 1.0223,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.19833948339483395,
314
+ "grad_norm": 1.7070823353641547,
315
  "learning_rate": 1.9422374626092414e-05,
316
+ "loss": 1.0304,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.2029520295202952,
321
+ "grad_norm": 1.608022610830541,
322
  "learning_rate": 1.936719198282545e-05,
323
+ "loss": 1.0295,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.20756457564575645,
328
+ "grad_norm": 1.501683364990764,
329
  "learning_rate": 1.930957808434307e-05,
330
+ "loss": 1.0259,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.21217712177121772,
335
+ "grad_norm": 1.6270563682677235,
336
  "learning_rate": 1.9249547884335917e-05,
337
+ "loss": 1.0297,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.21678966789667897,
342
+ "grad_norm": 1.7238840286664376,
343
  "learning_rate": 1.9187116963645845e-05,
344
+ "loss": 1.0346,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.22140221402214022,
349
+ "grad_norm": 1.9345013049084874,
350
  "learning_rate": 1.912230152622189e-05,
351
+ "loss": 1.0327,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.22601476014760147,
356
+ "grad_norm": 1.8517916832280532,
357
  "learning_rate": 1.9055118394914545e-05,
358
  "loss": 1.0367,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.23062730627306274,
363
+ "grad_norm": 1.6298646424255816,
364
  "learning_rate": 1.898558500710939e-05,
365
+ "loss": 1.0328,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.235239852398524,
370
+ "grad_norm": 1.6424022648576855,
371
  "learning_rate": 1.891371941020121e-05,
372
+ "loss": 1.0527,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.23985239852398524,
377
+ "grad_norm": 1.7425295163698495,
378
  "learning_rate": 1.88395402569098e-05,
379
+ "loss": 1.0165,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.2444649446494465,
384
+ "grad_norm": 1.6241841879077579,
385
  "learning_rate": 1.8763066800438638e-05,
386
+ "loss": 1.0261,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.24907749077490776,
391
+ "grad_norm": 1.6945753894174898,
392
  "learning_rate": 1.868431888947773e-05,
393
+ "loss": 1.0365,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.253690036900369,
398
+ "grad_norm": 1.576532058041066,
399
  "learning_rate": 1.860331696305188e-05,
400
+ "loss": 1.0267,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.25830258302583026,
405
+ "grad_norm": 1.584573157237757,
406
  "learning_rate": 1.852008204521572e-05,
407
+ "loss": 1.0196,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.2629151291512915,
412
+ "grad_norm": 1.525407220806812,
413
  "learning_rate": 1.8434635739596945e-05,
414
+ "loss": 1.0359,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.26752767527675275,
419
+ "grad_norm": 1.6099653934208198,
420
  "learning_rate": 1.834700022378907e-05,
421
+ "loss": 1.053,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.272140221402214,
426
+ "grad_norm": 1.6022748273878131,
427
  "learning_rate": 1.825719824359524e-05,
428
+ "loss": 1.0392,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.2767527675276753,
433
+ "grad_norm": 1.581538008430849,
434
  "learning_rate": 1.816525310712456e-05,
435
+ "loss": 1.0094,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.28136531365313655,
440
+ "grad_norm": 1.547945989389372,
441
  "learning_rate": 1.8071188678742457e-05,
442
  "loss": 1.0166,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.2859778597785978,
447
+ "grad_norm": 1.5893709618261977,
448
  "learning_rate": 1.7975029372876706e-05,
449
+ "loss": 1.0215,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.29059040590405905,
454
+ "grad_norm": 1.5451010927075632,
455
  "learning_rate": 1.787680014768065e-05,
456
  "loss": 1.0434,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.2952029520295203,
461
+ "grad_norm": 1.56800364704446,
462
  "learning_rate": 1.777652649855531e-05,
463
+ "loss": 1.0112,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 0.29981549815498154,
468
+ "grad_norm": 1.5993537873748278,
469
  "learning_rate": 1.7674234451532065e-05,
470
+ "loss": 1.0309,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 0.3044280442804428,
475
+ "grad_norm": 1.585947275219834,
476
  "learning_rate": 1.7569950556517566e-05,
477
  "loss": 1.052,
478
  "step": 330
479
  },
480
  {
481
  "epoch": 0.30904059040590404,
482
+ "grad_norm": 1.5112282757161204,
483
  "learning_rate": 1.7463701880402738e-05,
484
+ "loss": 1.021,
485
  "step": 335
486
  },
487
  {
488
  "epoch": 0.31365313653136534,
489
+ "grad_norm": 1.589682997809413,
490
  "learning_rate": 1.7355516000037555e-05,
491
+ "loss": 1.0292,
492
  "step": 340
493
  },
494
  {
495
  "epoch": 0.3182656826568266,
496
+ "grad_norm": 1.7115271953853421,
497
  "learning_rate": 1.7245420995073453e-05,
498
+ "loss": 1.0416,
499
  "step": 345
500
  },
501
  {
502
  "epoch": 0.32287822878228783,
503
+ "grad_norm": 1.602143873918192,
504
  "learning_rate": 1.7133445440675268e-05,
505
  "loss": 1.028,
506
  "step": 350
507
  },
508
  {
509
  "epoch": 0.3274907749077491,
510
+ "grad_norm": 1.5571424522990809,
511
  "learning_rate": 1.7019618400104572e-05,
512
+ "loss": 1.0271,
513
  "step": 355
514
  },
515
  {
516
  "epoch": 0.33210332103321033,
517
+ "grad_norm": 1.5239968044057388,
518
  "learning_rate": 1.6903969417176244e-05,
519
+ "loss": 1.0184,
520
  "step": 360
521
  },
522
  {
523
  "epoch": 0.3367158671586716,
524
+ "grad_norm": 1.5569745783828202,
525
  "learning_rate": 1.6786528508590436e-05,
526
+ "loss": 1.0329,
527
  "step": 365
528
  },
529
  {
530
  "epoch": 0.3413284132841328,
531
+ "grad_norm": 1.9748263576909895,
532
  "learning_rate": 1.666732615614169e-05,
533
+ "loss": 1.0202,
534
  "step": 370
535
  },
536
  {
537
  "epoch": 0.3459409594095941,
538
+ "grad_norm": 1.4426758781209656,
539
  "learning_rate": 1.6546393298807405e-05,
540
+ "loss": 1.0184,
541
  "step": 375
542
  },
543
  {
544
  "epoch": 0.3505535055350554,
545
+ "grad_norm": 1.956219718617806,
546
  "learning_rate": 1.6423761324717636e-05,
547
+ "loss": 1.0232,
548
  "step": 380
549
  },
550
  {
551
  "epoch": 0.3551660516605166,
552
+ "grad_norm": 1.552656856196208,
553
  "learning_rate": 1.6299462063008272e-05,
554
+ "loss": 1.0155,
555
  "step": 385
556
  },
557
  {
558
  "epoch": 0.35977859778597787,
559
+ "grad_norm": 1.547453869406189,
560
  "learning_rate": 1.61735277755598e-05,
561
+ "loss": 1.023,
562
  "step": 390
563
  },
564
  {
565
  "epoch": 0.3643911439114391,
566
+ "grad_norm": 1.9841965884936383,
567
  "learning_rate": 1.6045991148623752e-05,
568
+ "loss": 1.0169,
569
  "step": 395
570
  },
571
  {
572
  "epoch": 0.36900369003690037,
573
+ "grad_norm": 1.5804913256312618,
574
  "learning_rate": 1.5916885284338937e-05,
575
+ "loss": 1.005,
576
  "step": 400
577
  },
578
  {
579
  "epoch": 0.3736162361623616,
580
+ "grad_norm": 1.4500292243389075,
581
  "learning_rate": 1.5786243692139826e-05,
582
+ "loss": 1.0268,
583
  "step": 405
584
  },
585
  {
586
  "epoch": 0.37822878228782286,
587
+ "grad_norm": 1.5270225195154143,
588
  "learning_rate": 1.5654100280059155e-05,
589
+ "loss": 1.0033,
590
  "step": 410
591
  },
592
  {
593
  "epoch": 0.3828413284132841,
594
+ "grad_norm": 1.4524051988943654,
595
  "learning_rate": 1.5520489345927095e-05,
596
+ "loss": 1.0034,
597
  "step": 415
598
  },
599
  {
600
  "epoch": 0.3874538745387454,
601
+ "grad_norm": 1.5364744313244658,
602
  "learning_rate": 1.538544556846925e-05,
603
+ "loss": 1.0248,
604
  "step": 420
605
  },
606
  {
607
  "epoch": 0.39206642066420666,
608
+ "grad_norm": 1.4472396779072696,
609
  "learning_rate": 1.5249003998305787e-05,
610
+ "loss": 1.0135,
611
  "step": 425
612
  },
613
  {
614
  "epoch": 0.3966789667896679,
615
+ "grad_norm": 1.520494566330917,
616
  "learning_rate": 1.5111200048854055e-05,
617
+ "loss": 1.0201,
618
  "step": 430
619
  },
620
  {
621
  "epoch": 0.40129151291512916,
622
+ "grad_norm": 1.4719100022352993,
623
  "learning_rate": 1.4972069487137024e-05,
624
+ "loss": 1.0095,
625
  "step": 435
626
  },
627
  {
628
  "epoch": 0.4059040590405904,
629
+ "grad_norm": 1.598812244722486,
630
  "learning_rate": 1.4831648424499953e-05,
631
+ "loss": 1.0162,
632
  "step": 440
633
  },
634
  {
635
  "epoch": 0.41051660516605165,
636
+ "grad_norm": 1.4065472391213587,
637
  "learning_rate": 1.4689973307237687e-05,
638
+ "loss": 1.0291,
639
  "step": 445
640
  },
641
  {
642
  "epoch": 0.4151291512915129,
643
+ "grad_norm": 1.4646473697790325,
644
  "learning_rate": 1.4547080907135024e-05,
645
  "loss": 1.0125,
646
  "step": 450
647
  },
648
  {
649
  "epoch": 0.41974169741697415,
650
+ "grad_norm": 1.4001634930874887,
651
  "learning_rate": 1.4403008311922593e-05,
652
+ "loss": 1.0116,
653
  "step": 455
654
  },
655
  {
656
  "epoch": 0.42435424354243545,
657
+ "grad_norm": 1.398806355327192,
658
  "learning_rate": 1.4257792915650728e-05,
659
+ "loss": 1.0134,
660
  "step": 460
661
  },
662
  {
663
  "epoch": 0.4289667896678967,
664
+ "grad_norm": 1.3844222701270295,
665
  "learning_rate": 1.4111472408983843e-05,
666
+ "loss": 1.006,
667
  "step": 465
668
  },
669
  {
670
  "epoch": 0.43357933579335795,
671
+ "grad_norm": 1.5808754908114087,
672
  "learning_rate": 1.3964084769417823e-05,
673
+ "loss": 1.0305,
674
  "step": 470
675
  },
676
  {
677
  "epoch": 0.4381918819188192,
678
+ "grad_norm": 1.6160615173021982,
679
  "learning_rate": 1.3815668251422953e-05,
680
+ "loss": 0.9955,
681
  "step": 475
682
  },
683
  {
684
  "epoch": 0.44280442804428044,
685
+ "grad_norm": 1.4667586778867048,
686
  "learning_rate": 1.3666261376514978e-05,
687
+ "loss": 0.9926,
688
  "step": 480
689
  },
690
  {
691
  "epoch": 0.4474169741697417,
692
+ "grad_norm": 1.4591073794026388,
693
  "learning_rate": 1.3515902923256832e-05,
694
+ "loss": 1.0086,
695
  "step": 485
696
  },
697
  {
698
  "epoch": 0.45202952029520294,
699
+ "grad_norm": 1.5638562371641933,
700
  "learning_rate": 1.3364631917193671e-05,
701
+ "loss": 1.0009,
702
  "step": 490
703
  },
704
  {
705
  "epoch": 0.4566420664206642,
706
+ "grad_norm": 1.6344875906653302,
707
  "learning_rate": 1.321248762072377e-05,
708
+ "loss": 1.0035,
709
  "step": 495
710
  },
711
  {
712
  "epoch": 0.4612546125461255,
713
+ "grad_norm": 1.451893714311935,
714
  "learning_rate": 1.3059509522907998e-05,
715
+ "loss": 0.9977,
716
  "step": 500
717
  },
718
  {
719
  "epoch": 0.46586715867158673,
720
+ "grad_norm": 1.4139701174616282,
721
  "learning_rate": 1.2905737329220394e-05,
722
+ "loss": 0.9965,
723
  "step": 505
724
  },
725
  {
726
  "epoch": 0.470479704797048,
727
+ "grad_norm": 1.455878372967596,
728
  "learning_rate": 1.2751210951242636e-05,
729
  "loss": 1.0052,
730
  "step": 510
731
  },
732
  {
733
  "epoch": 0.47509225092250923,
734
+ "grad_norm": 1.4569058816755027,
735
  "learning_rate": 1.2595970496304975e-05,
736
  "loss": 0.9942,
737
  "step": 515
738
  },
739
  {
740
  "epoch": 0.4797047970479705,
741
+ "grad_norm": 1.427126351694174,
742
  "learning_rate": 1.2440056257076376e-05,
743
+ "loss": 1.0083,
744
  "step": 520
745
  },
746
  {
747
  "epoch": 0.4843173431734317,
748
+ "grad_norm": 1.4756361746839437,
749
  "learning_rate": 1.2283508701106559e-05,
750
+ "loss": 0.9791,
751
  "step": 525
752
  },
753
  {
754
  "epoch": 0.488929889298893,
755
+ "grad_norm": 1.3844767567894687,
756
  "learning_rate": 1.2126368460322637e-05,
757
+ "loss": 1.0081,
758
  "step": 530
759
  },
760
  {
761
  "epoch": 0.4935424354243542,
762
+ "grad_norm": 1.3341569254535863,
763
  "learning_rate": 1.1968676320483103e-05,
764
  "loss": 0.9684,
765
  "step": 535
766
  },
767
  {
768
  "epoch": 0.4981549815498155,
769
+ "grad_norm": 1.4344785984280697,
770
  "learning_rate": 1.1810473210591882e-05,
771
  "loss": 0.9852,
772
  "step": 540
773
  },
774
  {
775
  "epoch": 0.5027675276752768,
776
+ "grad_norm": 1.754080538395525,
777
  "learning_rate": 1.1651800192275197e-05,
778
+ "loss": 0.98,
779
  "step": 545
780
  },
781
  {
782
  "epoch": 0.507380073800738,
783
+ "grad_norm": 1.4043415597711335,
784
  "learning_rate": 1.1492698449124042e-05,
785
+ "loss": 0.9885,
786
  "step": 550
787
  },
788
  {
789
  "epoch": 0.5119926199261993,
790
+ "grad_norm": 1.3966005324424933,
791
  "learning_rate": 1.1333209276004959e-05,
792
+ "loss": 0.9963,
793
  "step": 555
794
  },
795
  {
796
  "epoch": 0.5166051660516605,
797
+ "grad_norm": 1.547684566936734,
798
  "learning_rate": 1.1173374068341962e-05,
799
+ "loss": 1.0059,
800
  "step": 560
801
  },
802
  {
803
  "epoch": 0.5212177121771218,
804
+ "grad_norm": 1.7439931213982827,
805
  "learning_rate": 1.1013234311372353e-05,
806
+ "loss": 0.9885,
807
  "step": 565
808
  },
809
  {
810
  "epoch": 0.525830258302583,
811
+ "grad_norm": 1.5974123883312268,
812
  "learning_rate": 1.0852831569379217e-05,
813
+ "loss": 0.9919,
814
  "step": 570
815
  },
816
  {
817
  "epoch": 0.5304428044280443,
818
+ "grad_norm": 1.5564454777648937,
819
  "learning_rate": 1.0692207474903421e-05,
820
+ "loss": 0.9908,
821
  "step": 575
822
  },
823
  {
824
  "epoch": 0.5350553505535055,
825
+ "grad_norm": 1.380805738943329,
826
  "learning_rate": 1.0531403717937888e-05,
827
+ "loss": 0.9888,
828
  "step": 580
829
  },
830
  {
831
  "epoch": 0.5396678966789668,
832
+ "grad_norm": 1.5491179141261517,
833
  "learning_rate": 1.037046203510694e-05,
834
+ "loss": 0.9762,
835
  "step": 585
836
  },
837
  {
838
  "epoch": 0.544280442804428,
839
+ "grad_norm": 1.4181650465011963,
840
  "learning_rate": 1.0209424198833571e-05,
841
+ "loss": 0.9862,
842
  "step": 590
843
  },
844
  {
845
  "epoch": 0.5488929889298892,
846
+ "grad_norm": 1.5960637409599259,
847
  "learning_rate": 1.0048332006497406e-05,
848
+ "loss": 0.9933,
849
  "step": 595
850
  },
851
  {
852
  "epoch": 0.5535055350553506,
853
+ "grad_norm": 1.4796470839876057,
854
  "learning_rate": 9.887227269586184e-06,
855
+ "loss": 0.9802,
856
  "step": 600
857
  },
858
  {
859
  "epoch": 0.5581180811808119,
860
+ "grad_norm": 1.4873847151380262,
861
  "learning_rate": 9.7261518028436e-06,
862
+ "loss": 0.9897,
863
  "step": 605
864
  },
865
  {
866
  "epoch": 0.5627306273062731,
867
+ "grad_norm": 1.41471946282249,
868
  "learning_rate": 9.565147413416266e-06,
869
+ "loss": 0.9779,
870
  "step": 610
871
  },
872
  {
873
  "epoch": 0.5673431734317343,
874
+ "grad_norm": 1.3655515654525507,
875
  "learning_rate": 9.404255890002677e-06,
876
+ "loss": 0.9965,
877
  "step": 615
878
  },
879
  {
880
  "epoch": 0.5719557195571956,
881
+ "grad_norm": 1.335274063215355,
882
  "learning_rate": 9.243518992006944e-06,
883
+ "loss": 0.9731,
884
  "step": 620
885
  },
886
  {
887
  "epoch": 0.5765682656826568,
888
+ "grad_norm": 1.3554465338690787,
889
  "learning_rate": 9.082978438700138e-06,
890
+ "loss": 0.9676,
891
  "step": 625
892
  },
893
  {
894
  "epoch": 0.5811808118081181,
895
+ "grad_norm": 1.4235930731304616,
896
  "learning_rate": 8.922675898392072e-06,
897
  "loss": 0.9666,
898
  "step": 630
899
  },
900
  {
901
  "epoch": 0.5857933579335793,
902
+ "grad_norm": 5.2910626769080595,
903
  "learning_rate": 8.762652977616258e-06,
904
+ "loss": 0.986,
905
  "step": 635
906
  },
907
  {
908
  "epoch": 0.5904059040590406,
909
+ "grad_norm": 1.3831211598811353,
910
  "learning_rate": 8.602951210330942e-06,
911
+ "loss": 0.9666,
912
  "step": 640
913
  },
914
  {
915
  "epoch": 0.5950184501845018,
916
+ "grad_norm": 1.384620337979853,
917
  "learning_rate": 8.443612047138965e-06,
918
+ "loss": 0.9828,
919
  "step": 645
920
  },
921
  {
922
  "epoch": 0.5996309963099631,
923
+ "grad_norm": 1.4337206214300657,
924
  "learning_rate": 8.284676844529258e-06,
925
+ "loss": 0.9454,
926
  "step": 650
927
  },
928
  {
929
  "epoch": 0.6042435424354243,
930
+ "grad_norm": 1.3016737402975005,
931
  "learning_rate": 8.126186854142752e-06,
932
+ "loss": 0.9526,
933
  "step": 655
934
  },
935
  {
936
  "epoch": 0.6088560885608856,
937
+ "grad_norm": 1.339333065167964,
938
  "learning_rate": 7.968183212065537e-06,
939
+ "loss": 0.9645,
940
  "step": 660
941
  },
942
  {
943
  "epoch": 0.6134686346863468,
944
+ "grad_norm": 1.3518100579415706,
945
  "learning_rate": 7.81070692815195e-06,
946
+ "loss": 0.9611,
947
  "step": 665
948
  },
949
  {
950
  "epoch": 0.6180811808118081,
951
+ "grad_norm": 1.3732804507416974,
952
  "learning_rate": 7.6537988753805e-06,
953
+ "loss": 0.9561,
954
  "step": 670
955
  },
956
  {
957
  "epoch": 0.6226937269372693,
958
+ "grad_norm": 1.3297074852238435,
959
  "learning_rate": 7.497499779245268e-06,
960
+ "loss": 0.9647,
961
  "step": 675
962
  },
963
  {
964
  "epoch": 0.6273062730627307,
965
+ "grad_norm": 1.3980138453757798,
966
  "learning_rate": 7.3418502071856004e-06,
967
+ "loss": 0.9731,
968
  "step": 680
969
  },
970
  {
971
  "epoch": 0.6319188191881919,
972
+ "grad_norm": 1.3624541986139007,
973
  "learning_rate": 7.186890558056836e-06,
974
+ "loss": 0.953,
975
  "step": 685
976
  },
977
  {
978
  "epoch": 0.6365313653136532,
979
+ "grad_norm": 1.4662848790274252,
980
  "learning_rate": 7.0326610516447825e-06,
981
+ "loss": 0.9521,
982
  "step": 690
983
  },
984
  {
985
  "epoch": 0.6411439114391144,
986
+ "grad_norm": 1.4634625847212066,
987
  "learning_rate": 6.879201718226658e-06,
988
+ "loss": 0.9435,
989
  "step": 695
990
  },
991
  {
992
  "epoch": 0.6457564575645757,
993
+ "grad_norm": 1.2725239803843122,
994
  "learning_rate": 6.7265523881812335e-06,
995
+ "loss": 0.9563,
996
  "step": 700
997
  },
998
  {
999
  "epoch": 0.6503690036900369,
1000
+ "grad_norm": 1.2859613538989152,
1001
  "learning_rate": 6.574752681650864e-06,
1002
+ "loss": 0.9425,
1003
  "step": 705
1004
  },
1005
  {
1006
  "epoch": 0.6549815498154982,
1007
+ "grad_norm": 1.2989719564279842,
1008
  "learning_rate": 6.423841998258069e-06,
1009
+ "loss": 0.9486,
1010
  "step": 710
1011
  },
1012
  {
1013
  "epoch": 0.6595940959409594,
1014
+ "grad_norm": 1.3253272023108238,
1015
  "learning_rate": 6.273859506879365e-06,
1016
+ "loss": 0.968,
1017
  "step": 715
1018
  },
1019
  {
1020
  "epoch": 0.6642066420664207,
1021
+ "grad_norm": 1.3394701310088128,
1022
  "learning_rate": 6.124844135478971e-06,
1023
+ "loss": 0.965,
1024
  "step": 720
1025
  },
1026
  {
1027
  "epoch": 0.6688191881918819,
1028
+ "grad_norm": 1.340333856091027,
1029
  "learning_rate": 5.976834561005069e-06,
1030
+ "loss": 0.9595,
1031
  "step": 725
1032
  },
1033
  {
1034
  "epoch": 0.6734317343173432,
1035
+ "grad_norm": 1.3468413770308945,
1036
  "learning_rate": 5.829869199351188e-06,
1037
+ "loss": 0.9489,
1038
  "step": 730
1039
  },
1040
  {
1041
  "epoch": 0.6780442804428044,
1042
+ "grad_norm": 1.4216747763305098,
1043
  "learning_rate": 5.68398619538536e-06,
1044
+ "loss": 0.943,
1045
  "step": 735
1046
  },
1047
  {
1048
  "epoch": 0.6826568265682657,
1049
+ "grad_norm": 1.3645056097489368,
1050
  "learning_rate": 5.53922341304961e-06,
1051
+ "loss": 0.9379,
1052
  "step": 740
1053
  },
1054
  {
1055
  "epoch": 0.6872693726937269,
1056
+ "grad_norm": 1.3794510779283775,
1057
  "learning_rate": 5.39561842553239e-06,
1058
+ "loss": 0.9761,
1059
  "step": 745
1060
  },
1061
  {
1062
  "epoch": 0.6918819188191881,
1063
+ "grad_norm": 1.350781271725051,
1064
  "learning_rate": 5.2532085055164205e-06,
1065
+ "loss": 0.9496,
1066
  "step": 750
1067
  },
1068
  {
1069
  "epoch": 0.6964944649446494,
1070
+ "grad_norm": 1.3103824946112994,
1071
  "learning_rate": 5.112030615504601e-06,
1072
+ "loss": 0.9525,
1073
  "step": 755
1074
  },
1075
  {
1076
  "epoch": 0.7011070110701108,
1077
+ "grad_norm": 1.2630745586647596,
1078
  "learning_rate": 4.972121398226371e-06,
1079
  "loss": 0.9389,
1080
  "step": 760
1081
  },
1082
  {
1083
  "epoch": 0.705719557195572,
1084
+ "grad_norm": 1.3222994315961483,
1085
  "learning_rate": 4.833517167127077e-06,
1086
+ "loss": 0.953,
1087
  "step": 765
1088
  },
1089
  {
1090
  "epoch": 0.7103321033210332,
1091
+ "grad_norm": 1.2932817703941415,
1092
  "learning_rate": 4.6962538969428416e-06,
1093
+ "loss": 0.9518,
1094
  "step": 770
1095
  },
1096
  {
1097
  "epoch": 0.7149446494464945,
1098
+ "grad_norm": 1.273698406266777,
1099
  "learning_rate": 4.560367214363295e-06,
1100
+ "loss": 0.9465,
1101
  "step": 775
1102
  },
1103
  {
1104
  "epoch": 0.7195571955719557,
1105
+ "grad_norm": 1.3922401933210866,
1106
  "learning_rate": 4.425892388784681e-06,
1107
+ "loss": 0.9484,
1108
  "step": 780
1109
  },
1110
  {
1111
  "epoch": 0.724169741697417,
1112
+ "grad_norm": 1.3825399178312323,
1113
  "learning_rate": 4.292864323155684e-06,
1114
+ "loss": 0.9356,
1115
  "step": 785
1116
  },
1117
  {
1118
  "epoch": 0.7287822878228782,
1119
+ "grad_norm": 1.3059433185103952,
1120
  "learning_rate": 4.161317544918345e-06,
1121
  "loss": 0.9389,
1122
  "step": 790
1123
  },
1124
  {
1125
  "epoch": 0.7333948339483395,
1126
+ "grad_norm": 1.2856292890537573,
1127
  "learning_rate": 4.031286197046493e-06,
1128
  "loss": 0.9423,
1129
  "step": 795
1130
  },
1131
  {
1132
  "epoch": 0.7380073800738007,
1133
+ "grad_norm": 1.3259995341033894,
1134
  "learning_rate": 3.902804029183907e-06,
1135
+ "loss": 0.9452,
1136
  "step": 800
1137
  },
1138
  {
1139
  "epoch": 0.742619926199262,
1140
+ "grad_norm": 1.3497689286484875,
1141
  "learning_rate": 3.775904388884618e-06,
1142
+ "loss": 0.9426,
1143
  "step": 805
1144
  },
1145
  {
1146
  "epoch": 0.7472324723247232,
1147
+ "grad_norm": 1.2944563230099602,
1148
  "learning_rate": 3.650620212957524e-06,
1149
+ "loss": 0.9408,
1150
  "step": 810
1151
  },
1152
  {
1153
  "epoch": 0.7518450184501845,
1154
+ "grad_norm": 1.461614769681836,
1155
  "learning_rate": 3.5269840189176616e-06,
1156
+ "loss": 0.9283,
1157
  "step": 815
1158
  },
1159
  {
1160
  "epoch": 0.7564575645756457,
1161
+ "grad_norm": 1.309936562816762,
1162
  "learning_rate": 3.405027896546277e-06,
1163
+ "loss": 0.9264,
1164
  "step": 820
1165
  },
1166
  {
1167
  "epoch": 0.761070110701107,
1168
+ "grad_norm": 1.3200989144993005,
1169
  "learning_rate": 3.2847834995619067e-06,
1170
+ "loss": 0.9451,
1171
  "step": 825
1172
  },
1173
  {
1174
  "epoch": 0.7656826568265682,
1175
+ "grad_norm": 1.2485940930123258,
1176
  "learning_rate": 3.1662820374046776e-06,
1177
+ "loss": 0.9354,
1178
  "step": 830
1179
  },
1180
  {
1181
  "epoch": 0.7702952029520295,
1182
+ "grad_norm": 1.3042562154959694,
1183
  "learning_rate": 3.0495542671358745e-06,
1184
+ "loss": 0.9293,
1185
  "step": 835
1186
  },
1187
  {
1188
  "epoch": 0.7749077490774908,
1189
+ "grad_norm": 1.2841878078681912,
1190
  "learning_rate": 2.934630485454948e-06,
1191
+ "loss": 0.9354,
1192
  "step": 840
1193
  },
1194
  {
1195
  "epoch": 0.7795202952029521,
1196
+ "grad_norm": 1.2965638770448034,
1197
  "learning_rate": 2.8215405208360237e-06,
1198
  "loss": 0.9399,
1199
  "step": 845
1200
  },
1201
  {
1202
  "epoch": 0.7841328413284133,
1203
+ "grad_norm": 1.309994639049123,
1204
  "learning_rate": 2.7103137257858867e-06,
1205
+ "loss": 0.9356,
1206
  "step": 850
1207
  },
1208
  {
1209
  "epoch": 0.7887453874538746,
1210
+ "grad_norm": 1.363624295286568,
1211
  "learning_rate": 2.600978969225558e-06,
1212
+ "loss": 0.9404,
1213
  "step": 855
1214
  },
1215
  {
1216
  "epoch": 0.7933579335793358,
1217
+ "grad_norm": 1.286633306272539,
1218
  "learning_rate": 2.493564628997369e-06,
1219
+ "loss": 0.9477,
1220
  "step": 860
1221
  },
1222
  {
1223
  "epoch": 0.7979704797047971,
1224
+ "grad_norm": 1.3124608084944471,
1225
  "learning_rate": 2.3880985844994674e-06,
1226
+ "loss": 0.9285,
1227
  "step": 865
1228
  },
1229
  {
1230
  "epoch": 0.8025830258302583,
1231
+ "grad_norm": 1.3039218311971004,
1232
  "learning_rate": 2.284608209449746e-06,
1233
+ "loss": 0.9362,
1234
  "step": 870
1235
  },
1236
  {
1237
  "epoch": 0.8071955719557196,
1238
+ "grad_norm": 1.360146108241008,
1239
  "learning_rate": 2.183120364780975e-06,
1240
+ "loss": 0.9391,
1241
  "step": 875
1242
  },
1243
  {
1244
  "epoch": 0.8118081180811808,
1245
+ "grad_norm": 1.2717837199196151,
1246
  "learning_rate": 2.083661391669043e-06,
1247
  "loss": 0.9255,
1248
  "step": 880
1249
  },
1250
  {
1251
  "epoch": 0.816420664206642,
1252
+ "grad_norm": 1.348042003672959,
1253
  "learning_rate": 1.986257104696121e-06,
1254
+ "loss": 0.9369,
1255
  "step": 885
1256
  },
1257
  {
1258
  "epoch": 0.8210332103321033,
1259
+ "grad_norm": 1.4427511726488103,
1260
  "learning_rate": 1.8909327851504633e-06,
1261
+ "loss": 0.953,
1262
  "step": 890
1263
  },
1264
  {
1265
  "epoch": 0.8256457564575646,
1266
+ "grad_norm": 1.3362385077260535,
1267
  "learning_rate": 1.7977131744646724e-06,
1268
+ "loss": 0.9327,
1269
  "step": 895
1270
  },
1271
  {
1272
  "epoch": 0.8302583025830258,
1273
+ "grad_norm": 1.3126899684942004,
1274
  "learning_rate": 1.7066224677940313e-06,
1275
+ "loss": 0.9438,
1276
  "step": 900
1277
  },
1278
  {
1279
  "epoch": 0.834870848708487,
1280
+ "grad_norm": 1.263580902374999,
1281
  "learning_rate": 1.6176843077366755e-06,
1282
  "loss": 0.9139,
1283
  "step": 905
1284
  },
1285
  {
1286
  "epoch": 0.8394833948339483,
1287
+ "grad_norm": 1.3323883748204894,
1288
  "learning_rate": 1.5309217781971419e-06,
1289
+ "loss": 0.9137,
1290
  "step": 910
1291
  },
1292
  {
1293
  "epoch": 0.8440959409594095,
1294
+ "grad_norm": 1.3049631322592083,
1295
  "learning_rate": 1.446357398394934e-06,
1296
+ "loss": 0.9232,
1297
  "step": 915
1298
  },
1299
  {
1300
  "epoch": 0.8487084870848709,
1301
+ "grad_norm": 1.3184435732078046,
1302
  "learning_rate": 1.3640131170196758e-06,
1303
+ "loss": 0.9163,
1304
  "step": 920
1305
  },
1306
  {
1307
  "epoch": 0.8533210332103321,
1308
+ "grad_norm": 1.9046899055595279,
1309
  "learning_rate": 1.2839103065343084e-06,
1310
+ "loss": 0.9283,
1311
  "step": 925
1312
  },
1313
  {
1314
  "epoch": 0.8579335793357934,
1315
+ "grad_norm": 1.3121941017876284,
1316
  "learning_rate": 1.2060697576278812e-06,
1317
+ "loss": 0.9335,
1318
  "step": 930
1319
  },
1320
  {
1321
  "epoch": 0.8625461254612546,
1322
+ "grad_norm": 1.2845805796103378,
1323
  "learning_rate": 1.1305116738193211e-06,
1324
+ "loss": 0.9345,
1325
  "step": 935
1326
  },
1327
  {
1328
  "epoch": 0.8671586715867159,
1329
+ "grad_norm": 1.3753219299344046,
1330
  "learning_rate": 1.0572556662136036e-06,
1331
+ "loss": 0.9303,
1332
  "step": 940
1333
  },
1334
  {
1335
  "epoch": 0.8717712177121771,
1336
+ "grad_norm": 1.281057437073945,
1337
  "learning_rate": 9.863207484116987e-07,
1338
+ "loss": 0.9183,
1339
  "step": 945
1340
  },
1341
  {
1342
  "epoch": 0.8763837638376384,
1343
+ "grad_norm": 1.2706888106388057,
1344
  "learning_rate": 9.177253315755796e-07,
1345
+ "loss": 0.9381,
1346
  "step": 950
1347
  },
1348
  {
1349
  "epoch": 0.8809963099630996,
1350
+ "grad_norm": 1.2538061234568678,
1351
  "learning_rate": 8.514872196496182e-07,
1352
  "loss": 0.935,
1353
  "step": 955
1354
  },
1355
  {
1356
  "epoch": 0.8856088560885609,
1357
+ "grad_norm": 1.248939276086923,
1358
  "learning_rate": 7.876236047395525e-07,
1359
+ "loss": 0.9262,
1360
  "step": 960
1361
  },
1362
  {
1363
  "epoch": 0.8902214022140221,
1364
+ "grad_norm": 1.282623537519779,
1365
  "learning_rate": 7.26151062650291e-07,
1366
+ "loss": 0.9341,
1367
  "step": 965
1368
  },
1369
  {
1370
  "epoch": 0.8948339483394834,
1371
+ "grad_norm": 1.3210618564039815,
1372
  "learning_rate": 6.670855485836525e-07,
1373
+ "loss": 0.9282,
1374
  "step": 970
1375
  },
1376
  {
1377
  "epoch": 0.8994464944649446,
1378
+ "grad_norm": 1.258506432298463,
1379
  "learning_rate": 6.104423929971948e-07,
1380
+ "loss": 0.9324,
1381
  "step": 975
1382
  },
1383
  {
1384
  "epoch": 0.9040590405904059,
1385
+ "grad_norm": 1.26397810078899,
1386
  "learning_rate": 5.562362976251901e-07,
1387
+ "loss": 0.9378,
1388
  "step": 980
1389
  },
1390
  {
1391
  "epoch": 0.9086715867158671,
1392
+ "grad_norm": 1.2198200085641175,
1393
  "learning_rate": 5.044813316627994e-07,
1394
+ "loss": 0.9271,
1395
  "step": 985
1396
  },
1397
  {
1398
  "epoch": 0.9132841328413284,
1399
+ "grad_norm": 1.2895766328734444,
1400
  "learning_rate": 4.5519092811439627e-07,
1401
+ "loss": 0.9252,
1402
  "step": 990
1403
  },
1404
  {
1405
  "epoch": 0.9178966789667896,
1406
+ "grad_norm": 1.3194370085413256,
1407
  "learning_rate": 4.083778803070504e-07,
1408
+ "loss": 0.9439,
1409
  "step": 995
1410
  },
1411
  {
1412
  "epoch": 0.922509225092251,
1413
+ "grad_norm": 1.2409895536435456,
1414
  "learning_rate": 3.6405433856999684e-07,
1415
+ "loss": 0.9568,
1416
  "step": 1000
1417
  },
1418
  {
1419
  "epoch": 0.9271217712177122,
1420
+ "grad_norm": 1.3179871393791862,
1421
  "learning_rate": 3.2223180708102933e-07,
1422
+ "loss": 0.9325,
1423
  "step": 1005
1424
  },
1425
  {
1426
  "epoch": 0.9317343173431735,
1427
+ "grad_norm": 1.285178507362633,
1428
  "learning_rate": 2.829211408805932e-07,
1429
+ "loss": 0.9178,
1430
  "step": 1010
1431
  },
1432
  {
1433
  "epoch": 0.9363468634686347,
1434
+ "grad_norm": 1.2573019061474504,
1435
  "learning_rate": 2.461325430543482e-07,
1436
+ "loss": 0.9315,
1437
  "step": 1015
1438
  },
1439
  {
1440
  "epoch": 0.940959409594096,
1441
+ "grad_norm": 1.22778494354884,
1442
  "learning_rate": 2.1187556208496885e-07,
1443
+ "loss": 0.921,
1444
  "step": 1020
1445
  },
1446
  {
1447
  "epoch": 0.9455719557195572,
1448
+ "grad_norm": 1.24393857404682,
1449
  "learning_rate": 1.8015908937382587e-07,
1450
+ "loss": 0.933,
1451
  "step": 1025
1452
  },
1453
  {
1454
  "epoch": 0.9501845018450185,
1455
+ "grad_norm": 1.285512101839501,
1456
  "learning_rate": 1.5099135693322776e-07,
1457
+ "loss": 0.9338,
1458
  "step": 1030
1459
  },
1460
  {
1461
  "epoch": 0.9547970479704797,
1462
+ "grad_norm": 1.2209551692205383,
1463
  "learning_rate": 1.2437993524979984e-07,
1464
+ "loss": 0.9032,
1465
  "step": 1035
1466
  },
1467
  {
1468
  "epoch": 0.959409594095941,
1469
+ "grad_norm": 1.2927034232954018,
1470
  "learning_rate": 1.0033173131956175e-07,
1471
+ "loss": 0.9212,
1472
  "step": 1040
1473
  },
1474
  {
1475
  "epoch": 0.9640221402214022,
1476
+ "grad_norm": 1.237696624670866,
1477
  "learning_rate": 7.885298685522235e-08,
1478
+ "loss": 0.9155,
1479
  "step": 1045
1480
  },
1481
  {
1482
  "epoch": 0.9686346863468634,
1483
+ "grad_norm": 1.2747731851750532,
1484
  "learning_rate": 5.99492766661347e-08,
1485
+ "loss": 0.9212,
1486
  "step": 1050
1487
  },
1488
  {
1489
  "epoch": 0.9732472324723247,
1490
+ "grad_norm": 1.2758933589184267,
1491
  "learning_rate": 4.362550721136338e-08,
1492
+ "loss": 0.9255,
1493
  "step": 1055
1494
  },
1495
  {
1496
  "epoch": 0.977859778597786,
1497
+ "grad_norm": 1.240622077510343,
1498
  "learning_rate": 2.988591532620322e-08,
1499
+ "loss": 0.9159,
1500
  "step": 1060
1501
  },
1502
  {
1503
  "epoch": 0.9824723247232472,
1504
+ "grad_norm": 1.2735821034434907,
1505
  "learning_rate": 1.8734067122514464e-08,
1506
+ "loss": 0.9223,
1507
  "step": 1065
1508
  },
1509
  {
1510
  "epoch": 0.9870848708487084,
1511
+ "grad_norm": 1.4488806574709552,
1512
  "learning_rate": 1.0172857063137643e-08,
1513
+ "loss": 0.9252,
1514
  "step": 1070
1515
  },
1516
  {
1517
  "epoch": 0.9916974169741697,
1518
+ "grad_norm": 1.2832307201478261,
1519
  "learning_rate": 4.204507210633368e-09,
1520
+ "loss": 0.9462,
1521
  "step": 1075
1522
  },
1523
  {
1524
  "epoch": 0.996309963099631,
1525
+ "grad_norm": 1.2810843286313107,
1526
  "learning_rate": 8.30566650548148e-10,
1527
+ "loss": 0.9404,
1528
  "step": 1080
1529
  },
1530
  {
1531
  "epoch": 1.0,
1532
+ "eval_loss": 0.9414571523666382,
1533
+ "eval_runtime": 141.0114,
1534
+ "eval_samples_per_second": 108.856,
1535
+ "eval_steps_per_second": 1.702,
1536
  "step": 1084
1537
  },
1538
  {
1539
  "epoch": 1.0,
1540
  "step": 1084,
1541
  "total_flos": 453935093514240.0,
1542
+ "train_loss": 0.9826947098728476,
1543
+ "train_runtime": 5906.6996,
1544
+ "train_samples_per_second": 23.48,
1545
+ "train_steps_per_second": 0.184
1546
  }
1547
  ],
1548
  "logging_steps": 5,