qubvel-hf HF staff committed on
Commit
8e16b25
1 Parent(s): 7ea2ef2

End of training

Browse files
README.md CHANGED
@@ -3,6 +3,8 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: timm/resnet101.a1_in1k
5
  tags:
 
 
6
  - generated_from_trainer
7
  metrics:
8
  - accuracy
@@ -16,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # vit-base-beans
18
 
19
- This model is a fine-tuned version of [timm/resnet101.a1_in1k](https://huggingface.co/timm/resnet101.a1_in1k) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.5027
22
  - Accuracy: 0.8571
23
 
24
  ## Model description
 
3
  license: apache-2.0
4
  base_model: timm/resnet101.a1_in1k
5
  tags:
6
+ - image-classification
7
+ - vision
8
  - generated_from_trainer
9
  metrics:
10
  - accuracy
 
18
 
19
  # vit-base-beans
20
 
21
+ This model is a fine-tuned version of [timm/resnet101.a1_in1k](https://huggingface.co/timm/resnet101.a1_in1k) on the beans dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 0.4913
24
  - Accuracy: 0.8571
25
 
26
  ## Model description
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 20.0,
3
- "eval_accuracy": 0.8646616541353384,
4
- "eval_loss": 0.6874601244926453,
5
- "eval_runtime": 0.7676,
6
- "eval_samples_per_second": 173.27,
7
- "eval_steps_per_second": 22.147,
8
- "total_flos": 2.0877820672794624e+17,
9
- "train_loss": 0.19350949709232038,
10
- "train_runtime": 49.806,
11
- "train_samples_per_second": 415.211,
12
- "train_steps_per_second": 52.203
13
  }
 
1
  {
2
  "epoch": 20.0,
3
+ "eval_accuracy": 0.8571428571428571,
4
+ "eval_loss": 0.49125248193740845,
5
+ "eval_runtime": 0.945,
6
+ "eval_samples_per_second": 140.745,
7
+ "eval_steps_per_second": 17.99,
8
+ "total_flos": 7.939121542823117e+17,
9
+ "train_loss": 0.8216404274793772,
10
+ "train_runtime": 338.1702,
11
+ "train_samples_per_second": 61.153,
12
+ "train_steps_per_second": 7.688
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 20.0,
3
- "eval_accuracy": 0.8646616541353384,
4
- "eval_loss": 0.6874601244926453,
5
- "eval_runtime": 0.7676,
6
- "eval_samples_per_second": 173.27,
7
- "eval_steps_per_second": 22.147
8
  }
 
1
  {
2
  "epoch": 20.0,
3
+ "eval_accuracy": 0.8571428571428571,
4
+ "eval_loss": 0.49125248193740845,
5
+ "eval_runtime": 0.945,
6
+ "eval_samples_per_second": 140.745,
7
+ "eval_steps_per_second": 17.99
8
  }
runs/Dec05_11-28-22_ip-10-90-1-182/events.out.tfevents.1733398455.ip-10-90-1-182.93862.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:868519f0e84b5fcf3194b3a9f32e6a75ef4fea3aa53c5005eaa76b23482456a0
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 20.0,
3
- "total_flos": 2.0877820672794624e+17,
4
- "train_loss": 0.19350949709232038,
5
- "train_runtime": 49.806,
6
- "train_samples_per_second": 415.211,
7
- "train_steps_per_second": 52.203
8
  }
 
1
  {
2
  "epoch": 20.0,
3
+ "total_flos": 7.939121542823117e+17,
4
+ "train_loss": 0.8216404274793772,
5
+ "train_runtime": 338.1702,
6
+ "train_samples_per_second": 61.153,
7
+ "train_steps_per_second": 7.688
8
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 0.6874601244926453,
3
- "best_model_checkpoint": "./beans_outputs/checkpoint-2600",
4
  "epoch": 20.0,
5
  "eval_steps": 500,
6
  "global_step": 2600,
@@ -10,2012 +10,2012 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.07692307692307693,
13
- "grad_norm": 2.136049747467041,
14
- "learning_rate": 1.98974358974359e-05,
15
- "loss": 1.1239,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.15384615384615385,
20
- "grad_norm": 1.8187670707702637,
21
- "learning_rate": 1.9794871794871798e-05,
22
- "loss": 1.1221,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.23076923076923078,
27
- "grad_norm": 2.0219993591308594,
28
- "learning_rate": 1.9692307692307696e-05,
29
- "loss": 1.1164,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.3076923076923077,
34
- "grad_norm": 2.4619803428649902,
35
- "learning_rate": 1.958974358974359e-05,
36
- "loss": 1.1044,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.38461538461538464,
41
- "grad_norm": 1.6733014583587646,
42
- "learning_rate": 1.9487179487179488e-05,
43
- "loss": 1.1082,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.46153846153846156,
48
- "grad_norm": 1.4969494342803955,
49
- "learning_rate": 1.9384615384615386e-05,
50
- "loss": 1.1043,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.5384615384615384,
55
- "grad_norm": 2.6347556114196777,
56
- "learning_rate": 1.9282051282051284e-05,
57
- "loss": 1.1028,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.6153846153846154,
62
- "grad_norm": 2.5843420028686523,
63
- "learning_rate": 1.9179487179487182e-05,
64
- "loss": 1.0908,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.6923076923076923,
69
- "grad_norm": 2.0522916316986084,
70
- "learning_rate": 1.907692307692308e-05,
71
- "loss": 1.094,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.7692307692307693,
76
- "grad_norm": 2.4885082244873047,
77
- "learning_rate": 1.8974358974358975e-05,
78
- "loss": 1.0912,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.8461538461538461,
83
- "grad_norm": 1.7014166116714478,
84
- "learning_rate": 1.8871794871794873e-05,
85
- "loss": 1.0949,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.9230769230769231,
90
- "grad_norm": 2.294283866882324,
91
- "learning_rate": 1.876923076923077e-05,
92
- "loss": 1.0992,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 1.0,
97
- "grad_norm": 4.129885673522949,
98
- "learning_rate": 1.866666666666667e-05,
99
- "loss": 1.0864,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 1.0,
104
- "eval_accuracy": 0.42857142857142855,
105
- "eval_loss": 1.0877832174301147,
106
- "eval_runtime": 0.7833,
107
- "eval_samples_per_second": 169.792,
108
- "eval_steps_per_second": 21.703,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 1.0769230769230769,
113
- "grad_norm": 2.331717014312744,
114
- "learning_rate": 1.8564102564102567e-05,
115
- "loss": 1.0774,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 1.1538461538461537,
120
- "grad_norm": 2.5262138843536377,
121
- "learning_rate": 1.8461538461538465e-05,
122
- "loss": 1.0719,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.2307692307692308,
127
- "grad_norm": 1.5971320867538452,
128
- "learning_rate": 1.835897435897436e-05,
129
- "loss": 1.0781,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.3076923076923077,
134
- "grad_norm": 2.383288860321045,
135
- "learning_rate": 1.8256410256410257e-05,
136
- "loss": 1.0929,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.3846153846153846,
141
- "grad_norm": 2.169706106185913,
142
- "learning_rate": 1.8153846153846155e-05,
143
- "loss": 1.0805,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.4615384615384617,
148
- "grad_norm": 2.1174418926239014,
149
- "learning_rate": 1.8051282051282053e-05,
150
- "loss": 1.08,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.5384615384615383,
155
- "grad_norm": 1.7236179113388062,
156
- "learning_rate": 1.794871794871795e-05,
157
- "loss": 1.0766,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.6153846153846154,
162
- "grad_norm": 1.7772722244262695,
163
- "learning_rate": 1.784615384615385e-05,
164
- "loss": 1.0676,
165
  "step": 210
166
  },
167
  {
168
  "epoch": 1.6923076923076923,
169
- "grad_norm": 3.53834867477417,
170
- "learning_rate": 1.7743589743589744e-05,
171
- "loss": 1.0695,
172
  "step": 220
173
  },
174
  {
175
  "epoch": 1.7692307692307692,
176
- "grad_norm": 2.0417070388793945,
177
- "learning_rate": 1.7641025641025642e-05,
178
- "loss": 1.0706,
179
  "step": 230
180
  },
181
  {
182
  "epoch": 1.8461538461538463,
183
- "grad_norm": 1.9734611511230469,
184
- "learning_rate": 1.753846153846154e-05,
185
- "loss": 1.0863,
186
  "step": 240
187
  },
188
  {
189
  "epoch": 1.9230769230769231,
190
- "grad_norm": 1.9997600317001343,
191
- "learning_rate": 1.7435897435897438e-05,
192
- "loss": 1.068,
193
  "step": 250
194
  },
195
  {
196
  "epoch": 2.0,
197
- "grad_norm": 3.0024373531341553,
198
- "learning_rate": 1.7333333333333336e-05,
199
- "loss": 1.0629,
200
  "step": 260
201
  },
202
  {
203
  "epoch": 2.0,
204
- "eval_accuracy": 0.5488721804511278,
205
- "eval_loss": 1.0593525171279907,
206
- "eval_runtime": 0.7442,
207
- "eval_samples_per_second": 178.706,
208
- "eval_steps_per_second": 22.842,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 2.076923076923077,
213
- "grad_norm": 1.977807641029358,
214
- "learning_rate": 1.7230769230769234e-05,
215
- "loss": 1.0711,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 2.1538461538461537,
220
- "grad_norm": 2.3906102180480957,
221
- "learning_rate": 1.7128205128205128e-05,
222
- "loss": 1.0597,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 2.230769230769231,
227
- "grad_norm": 2.3670897483825684,
228
- "learning_rate": 1.7025641025641026e-05,
229
- "loss": 1.0576,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 2.3076923076923075,
234
- "grad_norm": 3.026155948638916,
235
- "learning_rate": 1.6923076923076924e-05,
236
- "loss": 1.0434,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 2.3846153846153846,
241
- "grad_norm": 1.9082350730895996,
242
- "learning_rate": 1.6820512820512822e-05,
243
- "loss": 1.0566,
244
  "step": 310
245
  },
246
  {
247
  "epoch": 2.4615384615384617,
248
- "grad_norm": 1.9187153577804565,
249
- "learning_rate": 1.671794871794872e-05,
250
- "loss": 1.0476,
251
  "step": 320
252
  },
253
  {
254
  "epoch": 2.5384615384615383,
255
- "grad_norm": 1.4435549974441528,
256
- "learning_rate": 1.6615384615384618e-05,
257
- "loss": 1.032,
258
  "step": 330
259
  },
260
  {
261
  "epoch": 2.6153846153846154,
262
- "grad_norm": 2.1457245349884033,
263
- "learning_rate": 1.6512820512820513e-05,
264
- "loss": 1.0475,
265
  "step": 340
266
  },
267
  {
268
  "epoch": 2.6923076923076925,
269
- "grad_norm": 1.9391709566116333,
270
- "learning_rate": 1.641025641025641e-05,
271
- "loss": 1.0486,
272
  "step": 350
273
  },
274
  {
275
  "epoch": 2.769230769230769,
276
- "grad_norm": 1.8148127794265747,
277
- "learning_rate": 1.630769230769231e-05,
278
- "loss": 1.0407,
279
  "step": 360
280
  },
281
  {
282
  "epoch": 2.8461538461538463,
283
- "grad_norm": 2.444157123565674,
284
- "learning_rate": 1.6205128205128207e-05,
285
- "loss": 1.0356,
286
  "step": 370
287
  },
288
  {
289
  "epoch": 2.9230769230769234,
290
- "grad_norm": 1.9061695337295532,
291
- "learning_rate": 1.6102564102564105e-05,
292
- "loss": 1.0239,
293
  "step": 380
294
  },
295
  {
296
  "epoch": 3.0,
297
- "grad_norm": 4.859686851501465,
298
- "learning_rate": 1.6000000000000003e-05,
299
- "loss": 1.0434,
300
  "step": 390
301
  },
302
  {
303
  "epoch": 3.0,
304
- "eval_accuracy": 0.6766917293233082,
305
- "eval_loss": 1.0230107307434082,
306
- "eval_runtime": 0.7471,
307
- "eval_samples_per_second": 178.027,
308
- "eval_steps_per_second": 22.755,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 3.076923076923077,
313
- "grad_norm": 2.2021689414978027,
314
- "learning_rate": 1.5897435897435897e-05,
315
- "loss": 1.0424,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 3.1538461538461537,
320
- "grad_norm": 1.8670283555984497,
321
- "learning_rate": 1.5794871794871795e-05,
322
- "loss": 1.0299,
323
  "step": 410
324
  },
325
  {
326
  "epoch": 3.230769230769231,
327
- "grad_norm": 2.193986654281616,
328
- "learning_rate": 1.5692307692307693e-05,
329
- "loss": 1.0369,
330
  "step": 420
331
  },
332
  {
333
  "epoch": 3.3076923076923075,
334
- "grad_norm": 2.26470685005188,
335
- "learning_rate": 1.558974358974359e-05,
336
- "loss": 1.0159,
337
  "step": 430
338
  },
339
  {
340
  "epoch": 3.3846153846153846,
341
- "grad_norm": 2.18507981300354,
342
- "learning_rate": 1.548717948717949e-05,
343
- "loss": 1.0282,
344
  "step": 440
345
  },
346
  {
347
  "epoch": 3.4615384615384617,
348
- "grad_norm": 1.8047341108322144,
349
- "learning_rate": 1.5384615384615387e-05,
350
- "loss": 1.0381,
351
  "step": 450
352
  },
353
  {
354
  "epoch": 3.5384615384615383,
355
- "grad_norm": 2.6463704109191895,
356
- "learning_rate": 1.5282051282051282e-05,
357
- "loss": 1.0322,
358
  "step": 460
359
  },
360
  {
361
  "epoch": 3.6153846153846154,
362
- "grad_norm": 1.6456215381622314,
363
- "learning_rate": 1.517948717948718e-05,
364
- "loss": 1.0049,
365
  "step": 470
366
  },
367
  {
368
  "epoch": 3.6923076923076925,
369
- "grad_norm": 2.774256706237793,
370
- "learning_rate": 1.5076923076923078e-05,
371
- "loss": 1.0091,
372
  "step": 480
373
  },
374
  {
375
  "epoch": 3.769230769230769,
376
- "grad_norm": 1.572251319885254,
377
- "learning_rate": 1.4974358974358976e-05,
378
- "loss": 0.998,
379
  "step": 490
380
  },
381
  {
382
  "epoch": 3.8461538461538463,
383
- "grad_norm": 1.5640805959701538,
384
- "learning_rate": 1.4871794871794874e-05,
385
- "loss": 1.0222,
386
  "step": 500
387
  },
388
  {
389
  "epoch": 3.9230769230769234,
390
- "grad_norm": 1.9231537580490112,
391
- "learning_rate": 1.4769230769230772e-05,
392
- "loss": 0.9979,
393
  "step": 510
394
  },
395
  {
396
  "epoch": 4.0,
397
- "grad_norm": 5.481942176818848,
398
- "learning_rate": 1.4666666666666666e-05,
399
- "loss": 1.0214,
400
  "step": 520
401
  },
402
  {
403
  "epoch": 4.0,
404
- "eval_accuracy": 0.6766917293233082,
405
- "eval_loss": 0.9964542388916016,
406
- "eval_runtime": 0.7616,
407
- "eval_samples_per_second": 174.638,
408
- "eval_steps_per_second": 22.322,
409
  "step": 520
410
  },
411
  {
412
  "epoch": 4.076923076923077,
413
- "grad_norm": 2.7514402866363525,
414
- "learning_rate": 1.4564102564102564e-05,
415
- "loss": 1.0128,
416
  "step": 530
417
  },
418
  {
419
  "epoch": 4.153846153846154,
420
- "grad_norm": 1.8411396741867065,
421
- "learning_rate": 1.4461538461538462e-05,
422
- "loss": 1.0145,
423
  "step": 540
424
  },
425
  {
426
  "epoch": 4.230769230769231,
427
- "grad_norm": 2.670154571533203,
428
- "learning_rate": 1.435897435897436e-05,
429
- "loss": 1.0227,
430
  "step": 550
431
  },
432
  {
433
  "epoch": 4.3076923076923075,
434
- "grad_norm": 2.1951498985290527,
435
- "learning_rate": 1.4256410256410258e-05,
436
- "loss": 1.0321,
437
  "step": 560
438
  },
439
  {
440
  "epoch": 4.384615384615385,
441
- "grad_norm": 1.9692825078964233,
442
- "learning_rate": 1.4153846153846156e-05,
443
- "loss": 0.9829,
444
  "step": 570
445
  },
446
  {
447
  "epoch": 4.461538461538462,
448
- "grad_norm": 2.611340284347534,
449
- "learning_rate": 1.405128205128205e-05,
450
- "loss": 0.9918,
451
  "step": 580
452
  },
453
  {
454
  "epoch": 4.538461538461538,
455
- "grad_norm": 2.4288899898529053,
456
- "learning_rate": 1.3948717948717949e-05,
457
- "loss": 0.9879,
458
  "step": 590
459
  },
460
  {
461
  "epoch": 4.615384615384615,
462
- "grad_norm": 1.7537823915481567,
463
- "learning_rate": 1.3846153846153847e-05,
464
- "loss": 0.9793,
465
  "step": 600
466
  },
467
  {
468
  "epoch": 4.6923076923076925,
469
- "grad_norm": 3.1588003635406494,
470
- "learning_rate": 1.3743589743589745e-05,
471
- "loss": 1.0002,
472
  "step": 610
473
  },
474
  {
475
  "epoch": 4.769230769230769,
476
- "grad_norm": 2.2472622394561768,
477
- "learning_rate": 1.3641025641025643e-05,
478
- "loss": 1.0094,
479
  "step": 620
480
  },
481
  {
482
  "epoch": 4.846153846153846,
483
- "grad_norm": 1.7958937883377075,
484
- "learning_rate": 1.353846153846154e-05,
485
- "loss": 0.9703,
486
  "step": 630
487
  },
488
  {
489
  "epoch": 4.923076923076923,
490
- "grad_norm": 2.415766477584839,
491
- "learning_rate": 1.3435897435897435e-05,
492
- "loss": 0.9703,
493
  "step": 640
494
  },
495
  {
496
  "epoch": 5.0,
497
- "grad_norm": 4.948933124542236,
498
- "learning_rate": 1.3333333333333333e-05,
499
- "loss": 1.0026,
500
  "step": 650
501
  },
502
  {
503
  "epoch": 5.0,
504
- "eval_accuracy": 0.7443609022556391,
505
- "eval_loss": 0.9569369554519653,
506
- "eval_runtime": 0.7647,
507
- "eval_samples_per_second": 173.928,
508
- "eval_steps_per_second": 22.231,
509
  "step": 650
510
  },
511
  {
512
  "epoch": 5.076923076923077,
513
- "grad_norm": 2.1397032737731934,
514
- "learning_rate": 1.3230769230769231e-05,
515
- "loss": 0.9645,
516
  "step": 660
517
  },
518
  {
519
  "epoch": 5.153846153846154,
520
- "grad_norm": 2.7277321815490723,
521
- "learning_rate": 1.312820512820513e-05,
522
- "loss": 1.0063,
523
  "step": 670
524
  },
525
  {
526
  "epoch": 5.230769230769231,
527
- "grad_norm": 2.391350030899048,
528
- "learning_rate": 1.3025641025641027e-05,
529
- "loss": 0.9918,
530
  "step": 680
531
  },
532
  {
533
  "epoch": 5.3076923076923075,
534
- "grad_norm": 2.751174211502075,
535
- "learning_rate": 1.2923076923076925e-05,
536
- "loss": 0.9849,
537
  "step": 690
538
  },
539
  {
540
  "epoch": 5.384615384615385,
541
- "grad_norm": 2.77424693107605,
542
- "learning_rate": 1.2820512820512823e-05,
543
- "loss": 0.9745,
544
  "step": 700
545
  },
546
  {
547
  "epoch": 5.461538461538462,
548
- "grad_norm": 1.9156702756881714,
549
- "learning_rate": 1.2717948717948718e-05,
550
- "loss": 0.9684,
551
  "step": 710
552
  },
553
  {
554
  "epoch": 5.538461538461538,
555
- "grad_norm": 1.9521454572677612,
556
- "learning_rate": 1.2615384615384616e-05,
557
- "loss": 0.9503,
558
  "step": 720
559
  },
560
  {
561
  "epoch": 5.615384615384615,
562
- "grad_norm": 2.468419313430786,
563
- "learning_rate": 1.2512820512820514e-05,
564
- "loss": 0.9641,
565
  "step": 730
566
  },
567
  {
568
  "epoch": 5.6923076923076925,
569
- "grad_norm": 2.520923614501953,
570
- "learning_rate": 1.2410256410256412e-05,
571
- "loss": 0.9471,
572
  "step": 740
573
  },
574
  {
575
  "epoch": 5.769230769230769,
576
- "grad_norm": 2.1003758907318115,
577
- "learning_rate": 1.230769230769231e-05,
578
- "loss": 0.9513,
579
  "step": 750
580
  },
581
  {
582
  "epoch": 5.846153846153846,
583
- "grad_norm": 2.192279100418091,
584
- "learning_rate": 1.2205128205128208e-05,
585
- "loss": 0.9527,
586
  "step": 760
587
  },
588
  {
589
  "epoch": 5.923076923076923,
590
- "grad_norm": 3.8428618907928467,
591
- "learning_rate": 1.2102564102564102e-05,
592
- "loss": 0.938,
593
  "step": 770
594
  },
595
  {
596
  "epoch": 6.0,
597
- "grad_norm": 4.9151530265808105,
598
- "learning_rate": 1.2e-05,
599
- "loss": 0.9753,
600
  "step": 780
601
  },
602
  {
603
  "epoch": 6.0,
604
  "eval_accuracy": 0.7819548872180451,
605
- "eval_loss": 0.9288201332092285,
606
- "eval_runtime": 0.7499,
607
- "eval_samples_per_second": 177.349,
608
- "eval_steps_per_second": 22.669,
609
  "step": 780
610
  },
611
  {
612
  "epoch": 6.076923076923077,
613
- "grad_norm": 2.7967398166656494,
614
- "learning_rate": 1.1897435897435898e-05,
615
- "loss": 0.9428,
616
  "step": 790
617
  },
618
  {
619
  "epoch": 6.153846153846154,
620
- "grad_norm": 2.5342345237731934,
621
- "learning_rate": 1.1794871794871796e-05,
622
- "loss": 0.9406,
623
  "step": 800
624
  },
625
  {
626
  "epoch": 6.230769230769231,
627
- "grad_norm": 1.877543330192566,
628
- "learning_rate": 1.1692307692307694e-05,
629
- "loss": 0.9319,
630
  "step": 810
631
  },
632
  {
633
  "epoch": 6.3076923076923075,
634
- "grad_norm": 2.4524621963500977,
635
- "learning_rate": 1.1589743589743592e-05,
636
- "loss": 0.9332,
637
  "step": 820
638
  },
639
  {
640
  "epoch": 6.384615384615385,
641
- "grad_norm": 2.4967362880706787,
642
- "learning_rate": 1.1487179487179487e-05,
643
- "loss": 0.9367,
644
  "step": 830
645
  },
646
  {
647
  "epoch": 6.461538461538462,
648
- "grad_norm": 3.2078776359558105,
649
- "learning_rate": 1.1384615384615385e-05,
650
- "loss": 0.9339,
651
  "step": 840
652
  },
653
  {
654
  "epoch": 6.538461538461538,
655
- "grad_norm": 2.926706075668335,
656
- "learning_rate": 1.1282051282051283e-05,
657
- "loss": 0.9416,
658
  "step": 850
659
  },
660
  {
661
  "epoch": 6.615384615384615,
662
- "grad_norm": 1.8625017404556274,
663
- "learning_rate": 1.117948717948718e-05,
664
- "loss": 0.9111,
665
  "step": 860
666
  },
667
  {
668
  "epoch": 6.6923076923076925,
669
- "grad_norm": 2.7141189575195312,
670
- "learning_rate": 1.1076923076923079e-05,
671
- "loss": 0.9574,
672
  "step": 870
673
  },
674
  {
675
  "epoch": 6.769230769230769,
676
- "grad_norm": 2.307347536087036,
677
- "learning_rate": 1.0974358974358977e-05,
678
- "loss": 0.9259,
679
  "step": 880
680
  },
681
  {
682
  "epoch": 6.846153846153846,
683
- "grad_norm": 2.3937132358551025,
684
- "learning_rate": 1.0871794871794871e-05,
685
- "loss": 0.9207,
686
  "step": 890
687
  },
688
  {
689
  "epoch": 6.923076923076923,
690
- "grad_norm": 3.0794668197631836,
691
- "learning_rate": 1.076923076923077e-05,
692
- "loss": 0.9418,
693
  "step": 900
694
  },
695
  {
696
  "epoch": 7.0,
697
- "grad_norm": 4.111669063568115,
698
- "learning_rate": 1.0666666666666667e-05,
699
- "loss": 0.9252,
700
  "step": 910
701
  },
702
  {
703
  "epoch": 7.0,
704
- "eval_accuracy": 0.7969924812030075,
705
- "eval_loss": 0.8874692916870117,
706
- "eval_runtime": 0.7823,
707
- "eval_samples_per_second": 170.013,
708
- "eval_steps_per_second": 21.731,
709
  "step": 910
710
  },
711
  {
712
  "epoch": 7.076923076923077,
713
- "grad_norm": 2.7561662197113037,
714
- "learning_rate": 1.0564102564102565e-05,
715
- "loss": 0.911,
716
  "step": 920
717
  },
718
  {
719
  "epoch": 7.153846153846154,
720
- "grad_norm": 3.2020223140716553,
721
- "learning_rate": 1.0461538461538463e-05,
722
- "loss": 0.912,
723
  "step": 930
724
  },
725
  {
726
  "epoch": 7.230769230769231,
727
- "grad_norm": 3.459304094314575,
728
- "learning_rate": 1.0358974358974361e-05,
729
- "loss": 0.8994,
730
  "step": 940
731
  },
732
  {
733
  "epoch": 7.3076923076923075,
734
- "grad_norm": 2.774078369140625,
735
- "learning_rate": 1.0256410256410256e-05,
736
- "loss": 0.9079,
737
  "step": 950
738
  },
739
  {
740
  "epoch": 7.384615384615385,
741
- "grad_norm": 2.7169668674468994,
742
- "learning_rate": 1.0153846153846154e-05,
743
- "loss": 0.9256,
744
  "step": 960
745
  },
746
  {
747
  "epoch": 7.461538461538462,
748
- "grad_norm": 2.171323299407959,
749
- "learning_rate": 1.0051282051282052e-05,
750
- "loss": 0.8898,
751
  "step": 970
752
  },
753
  {
754
  "epoch": 7.538461538461538,
755
- "grad_norm": 2.7350351810455322,
756
- "learning_rate": 9.94871794871795e-06,
757
- "loss": 0.9243,
758
  "step": 980
759
  },
760
  {
761
  "epoch": 7.615384615384615,
762
- "grad_norm": 2.3926539421081543,
763
- "learning_rate": 9.846153846153848e-06,
764
- "loss": 0.8868,
765
  "step": 990
766
  },
767
  {
768
  "epoch": 7.6923076923076925,
769
- "grad_norm": 2.0602715015411377,
770
- "learning_rate": 9.743589743589744e-06,
771
- "loss": 0.8837,
772
  "step": 1000
773
  },
774
  {
775
  "epoch": 7.769230769230769,
776
- "grad_norm": 2.885303497314453,
777
- "learning_rate": 9.641025641025642e-06,
778
- "loss": 0.8827,
779
  "step": 1010
780
  },
781
  {
782
  "epoch": 7.846153846153846,
783
- "grad_norm": 2.261361837387085,
784
- "learning_rate": 9.53846153846154e-06,
785
- "loss": 0.9047,
786
  "step": 1020
787
  },
788
  {
789
  "epoch": 7.923076923076923,
790
- "grad_norm": 2.6180179119110107,
791
- "learning_rate": 9.435897435897436e-06,
792
- "loss": 0.861,
793
  "step": 1030
794
  },
795
  {
796
  "epoch": 8.0,
797
- "grad_norm": 4.225304126739502,
798
- "learning_rate": 9.333333333333334e-06,
799
- "loss": 0.9192,
800
  "step": 1040
801
  },
802
  {
803
  "epoch": 8.0,
804
- "eval_accuracy": 0.8120300751879699,
805
- "eval_loss": 0.850643515586853,
806
- "eval_runtime": 0.756,
807
- "eval_samples_per_second": 175.92,
808
- "eval_steps_per_second": 22.486,
809
  "step": 1040
810
  },
811
  {
812
  "epoch": 8.076923076923077,
813
- "grad_norm": 2.1875813007354736,
814
- "learning_rate": 9.230769230769232e-06,
815
- "loss": 0.8953,
816
  "step": 1050
817
  },
818
  {
819
  "epoch": 8.153846153846153,
820
- "grad_norm": 2.1640567779541016,
821
- "learning_rate": 9.128205128205129e-06,
822
- "loss": 0.8658,
823
  "step": 1060
824
  },
825
  {
826
  "epoch": 8.23076923076923,
827
- "grad_norm": 2.660614490509033,
828
- "learning_rate": 9.025641025641027e-06,
829
- "loss": 0.8995,
830
  "step": 1070
831
  },
832
  {
833
  "epoch": 8.307692307692308,
834
- "grad_norm": 2.104029417037964,
835
- "learning_rate": 8.923076923076925e-06,
836
- "loss": 0.8569,
837
  "step": 1080
838
  },
839
  {
840
  "epoch": 8.384615384615385,
841
- "grad_norm": 2.2643303871154785,
842
- "learning_rate": 8.820512820512821e-06,
843
- "loss": 0.8972,
844
  "step": 1090
845
  },
846
  {
847
  "epoch": 8.461538461538462,
848
- "grad_norm": 2.632410764694214,
849
- "learning_rate": 8.717948717948719e-06,
850
- "loss": 0.8715,
851
  "step": 1100
852
  },
853
  {
854
  "epoch": 8.538461538461538,
855
- "grad_norm": 1.6500084400177002,
856
- "learning_rate": 8.615384615384617e-06,
857
- "loss": 0.8716,
858
  "step": 1110
859
  },
860
  {
861
  "epoch": 8.615384615384615,
862
- "grad_norm": 6.204855442047119,
863
- "learning_rate": 8.512820512820513e-06,
864
- "loss": 0.8985,
865
  "step": 1120
866
  },
867
  {
868
  "epoch": 8.692307692307692,
869
- "grad_norm": 3.729611873626709,
870
- "learning_rate": 8.410256410256411e-06,
871
- "loss": 0.8837,
872
  "step": 1130
873
  },
874
  {
875
  "epoch": 8.76923076923077,
876
- "grad_norm": 3.685739278793335,
877
- "learning_rate": 8.307692307692309e-06,
878
- "loss": 0.8865,
879
  "step": 1140
880
  },
881
  {
882
  "epoch": 8.846153846153847,
883
- "grad_norm": 2.7028560638427734,
884
- "learning_rate": 8.205128205128205e-06,
885
- "loss": 0.875,
886
  "step": 1150
887
  },
888
  {
889
  "epoch": 8.923076923076923,
890
- "grad_norm": 2.7692482471466064,
891
- "learning_rate": 8.102564102564103e-06,
892
- "loss": 0.8867,
893
  "step": 1160
894
  },
895
  {
896
  "epoch": 9.0,
897
- "grad_norm": 3.9854462146759033,
898
- "learning_rate": 8.000000000000001e-06,
899
- "loss": 0.9008,
900
  "step": 1170
901
  },
902
  {
903
  "epoch": 9.0,
904
- "eval_accuracy": 0.8045112781954887,
905
- "eval_loss": 0.8337866067886353,
906
- "eval_runtime": 0.7963,
907
- "eval_samples_per_second": 167.03,
908
- "eval_steps_per_second": 21.35,
909
  "step": 1170
910
  },
911
  {
912
  "epoch": 9.076923076923077,
913
- "grad_norm": 1.9381572008132935,
914
- "learning_rate": 7.897435897435898e-06,
915
- "loss": 0.8969,
916
  "step": 1180
917
  },
918
  {
919
  "epoch": 9.153846153846153,
920
- "grad_norm": 2.219219446182251,
921
- "learning_rate": 7.794871794871796e-06,
922
- "loss": 0.8412,
923
  "step": 1190
924
  },
925
  {
926
  "epoch": 9.23076923076923,
927
- "grad_norm": 2.1302294731140137,
928
- "learning_rate": 7.692307692307694e-06,
929
- "loss": 0.8483,
930
  "step": 1200
931
  },
932
  {
933
  "epoch": 9.307692307692308,
934
- "grad_norm": 2.541210174560547,
935
- "learning_rate": 7.58974358974359e-06,
936
- "loss": 0.8536,
937
  "step": 1210
938
  },
939
  {
940
  "epoch": 9.384615384615385,
941
- "grad_norm": 1.952871322631836,
942
- "learning_rate": 7.487179487179488e-06,
943
- "loss": 0.8707,
944
  "step": 1220
945
  },
946
  {
947
  "epoch": 9.461538461538462,
948
- "grad_norm": 3.273028612136841,
949
- "learning_rate": 7.384615384615386e-06,
950
- "loss": 0.8547,
951
  "step": 1230
952
  },
953
  {
954
  "epoch": 9.538461538461538,
955
- "grad_norm": 2.6495628356933594,
956
- "learning_rate": 7.282051282051282e-06,
957
- "loss": 0.8709,
958
  "step": 1240
959
  },
960
  {
961
  "epoch": 9.615384615384615,
962
- "grad_norm": 1.998024582862854,
963
- "learning_rate": 7.17948717948718e-06,
964
- "loss": 0.8278,
965
  "step": 1250
966
  },
967
  {
968
  "epoch": 9.692307692307692,
969
- "grad_norm": 2.7621707916259766,
970
- "learning_rate": 7.076923076923078e-06,
971
- "loss": 0.8544,
972
  "step": 1260
973
  },
974
  {
975
  "epoch": 9.76923076923077,
976
- "grad_norm": 1.844375491142273,
977
- "learning_rate": 6.974358974358974e-06,
978
- "loss": 0.8324,
979
  "step": 1270
980
  },
981
  {
982
  "epoch": 9.846153846153847,
983
- "grad_norm": 2.149479866027832,
984
- "learning_rate": 6.871794871794872e-06,
985
- "loss": 0.8146,
986
  "step": 1280
987
  },
988
  {
989
  "epoch": 9.923076923076923,
990
- "grad_norm": 2.2224795818328857,
991
- "learning_rate": 6.76923076923077e-06,
992
- "loss": 0.8367,
993
  "step": 1290
994
  },
995
  {
996
  "epoch": 10.0,
997
- "grad_norm": 3.8497843742370605,
998
- "learning_rate": 6.666666666666667e-06,
999
- "loss": 0.8079,
1000
  "step": 1300
1001
  },
1002
  {
1003
  "epoch": 10.0,
1004
- "eval_accuracy": 0.8421052631578947,
1005
- "eval_loss": 0.8103837370872498,
1006
- "eval_runtime": 0.7593,
1007
- "eval_samples_per_second": 175.164,
1008
- "eval_steps_per_second": 22.389,
1009
  "step": 1300
1010
  },
1011
  {
1012
  "epoch": 10.076923076923077,
1013
- "grad_norm": 2.0343823432922363,
1014
- "learning_rate": 6.564102564102565e-06,
1015
- "loss": 0.8408,
1016
  "step": 1310
1017
  },
1018
  {
1019
  "epoch": 10.153846153846153,
1020
- "grad_norm": 2.4245193004608154,
1021
- "learning_rate": 6.461538461538463e-06,
1022
- "loss": 0.899,
1023
  "step": 1320
1024
  },
1025
  {
1026
  "epoch": 10.23076923076923,
1027
- "grad_norm": 2.3912925720214844,
1028
- "learning_rate": 6.358974358974359e-06,
1029
- "loss": 0.8758,
1030
  "step": 1330
1031
  },
1032
  {
1033
  "epoch": 10.307692307692308,
1034
- "grad_norm": 2.1387076377868652,
1035
- "learning_rate": 6.256410256410257e-06,
1036
- "loss": 0.8295,
1037
  "step": 1340
1038
  },
1039
  {
1040
  "epoch": 10.384615384615385,
1041
- "grad_norm": 2.142160415649414,
1042
- "learning_rate": 6.153846153846155e-06,
1043
- "loss": 0.8075,
1044
  "step": 1350
1045
  },
1046
  {
1047
  "epoch": 10.461538461538462,
1048
- "grad_norm": 2.6838831901550293,
1049
- "learning_rate": 6.051282051282051e-06,
1050
- "loss": 0.8448,
1051
  "step": 1360
1052
  },
1053
  {
1054
  "epoch": 10.538461538461538,
1055
- "grad_norm": 2.476369857788086,
1056
- "learning_rate": 5.948717948717949e-06,
1057
- "loss": 0.817,
1058
  "step": 1370
1059
  },
1060
  {
1061
  "epoch": 10.615384615384615,
1062
- "grad_norm": 3.031463861465454,
1063
- "learning_rate": 5.846153846153847e-06,
1064
- "loss": 0.8177,
1065
  "step": 1380
1066
  },
1067
  {
1068
  "epoch": 10.692307692307692,
1069
- "grad_norm": 2.2818636894226074,
1070
- "learning_rate": 5.743589743589743e-06,
1071
- "loss": 0.8124,
1072
  "step": 1390
1073
  },
1074
  {
1075
  "epoch": 10.76923076923077,
1076
- "grad_norm": 3.245805263519287,
1077
- "learning_rate": 5.641025641025641e-06,
1078
- "loss": 0.8674,
1079
  "step": 1400
1080
  },
1081
  {
1082
  "epoch": 10.846153846153847,
1083
- "grad_norm": 2.194627046585083,
1084
- "learning_rate": 5.538461538461539e-06,
1085
- "loss": 0.831,
1086
  "step": 1410
1087
  },
1088
  {
1089
  "epoch": 10.923076923076923,
1090
- "grad_norm": 1.8149436712265015,
1091
- "learning_rate": 5.435897435897436e-06,
1092
- "loss": 0.8391,
1093
  "step": 1420
1094
  },
1095
  {
1096
  "epoch": 11.0,
1097
- "grad_norm": 4.0584821701049805,
1098
- "learning_rate": 5.333333333333334e-06,
1099
- "loss": 0.8332,
1100
  "step": 1430
1101
  },
1102
  {
1103
  "epoch": 11.0,
1104
- "eval_accuracy": 0.8345864661654135,
1105
- "eval_loss": 0.7806060314178467,
1106
- "eval_runtime": 0.742,
1107
- "eval_samples_per_second": 179.256,
1108
- "eval_steps_per_second": 22.912,
1109
  "step": 1430
1110
  },
1111
  {
1112
  "epoch": 11.076923076923077,
1113
- "grad_norm": 1.9833248853683472,
1114
- "learning_rate": 5.230769230769232e-06,
1115
- "loss": 0.8484,
1116
  "step": 1440
1117
  },
1118
  {
1119
  "epoch": 11.153846153846153,
1120
- "grad_norm": 5.478232383728027,
1121
- "learning_rate": 5.128205128205128e-06,
1122
- "loss": 0.8308,
1123
  "step": 1450
1124
  },
1125
  {
1126
  "epoch": 11.23076923076923,
1127
- "grad_norm": 2.5792922973632812,
1128
- "learning_rate": 5.025641025641026e-06,
1129
- "loss": 0.802,
1130
  "step": 1460
1131
  },
1132
  {
1133
  "epoch": 11.307692307692308,
1134
- "grad_norm": 2.730989694595337,
1135
- "learning_rate": 4.923076923076924e-06,
1136
- "loss": 0.8225,
1137
  "step": 1470
1138
  },
1139
  {
1140
  "epoch": 11.384615384615385,
1141
- "grad_norm": 2.7447853088378906,
1142
- "learning_rate": 4.820512820512821e-06,
1143
- "loss": 0.8176,
1144
  "step": 1480
1145
  },
1146
  {
1147
  "epoch": 11.461538461538462,
1148
- "grad_norm": 2.6465837955474854,
1149
- "learning_rate": 4.717948717948718e-06,
1150
- "loss": 0.8471,
1151
  "step": 1490
1152
  },
1153
  {
1154
  "epoch": 11.538461538461538,
1155
- "grad_norm": 2.4876015186309814,
1156
- "learning_rate": 4.615384615384616e-06,
1157
- "loss": 0.8349,
1158
  "step": 1500
1159
  },
1160
  {
1161
  "epoch": 11.615384615384615,
1162
- "grad_norm": 3.2605788707733154,
1163
- "learning_rate": 4.512820512820513e-06,
1164
- "loss": 0.8285,
1165
  "step": 1510
1166
  },
1167
  {
1168
  "epoch": 11.692307692307692,
1169
- "grad_norm": 3.278341293334961,
1170
- "learning_rate": 4.4102564102564104e-06,
1171
- "loss": 0.8546,
1172
  "step": 1520
1173
  },
1174
  {
1175
  "epoch": 11.76923076923077,
1176
- "grad_norm": 2.0945637226104736,
1177
- "learning_rate": 4.307692307692308e-06,
1178
- "loss": 0.8096,
1179
  "step": 1530
1180
  },
1181
  {
1182
  "epoch": 11.846153846153847,
1183
- "grad_norm": 2.161726474761963,
1184
- "learning_rate": 4.2051282051282055e-06,
1185
- "loss": 0.7938,
1186
  "step": 1540
1187
  },
1188
  {
1189
  "epoch": 11.923076923076923,
1190
- "grad_norm": 2.1052703857421875,
1191
- "learning_rate": 4.102564102564103e-06,
1192
- "loss": 0.8295,
1193
  "step": 1550
1194
  },
1195
  {
1196
  "epoch": 12.0,
1197
- "grad_norm": 3.460094451904297,
1198
- "learning_rate": 4.000000000000001e-06,
1199
- "loss": 0.8103,
1200
  "step": 1560
1201
  },
1202
  {
1203
  "epoch": 12.0,
1204
- "eval_accuracy": 0.8345864661654135,
1205
- "eval_loss": 0.7585543990135193,
1206
- "eval_runtime": 0.7508,
1207
- "eval_samples_per_second": 177.133,
1208
- "eval_steps_per_second": 22.641,
1209
  "step": 1560
1210
  },
1211
  {
1212
  "epoch": 12.076923076923077,
1213
- "grad_norm": 2.943866014480591,
1214
- "learning_rate": 3.897435897435898e-06,
1215
- "loss": 0.7903,
1216
  "step": 1570
1217
  },
1218
  {
1219
  "epoch": 12.153846153846153,
1220
- "grad_norm": 2.6185402870178223,
1221
- "learning_rate": 3.794871794871795e-06,
1222
- "loss": 0.8229,
1223
  "step": 1580
1224
  },
1225
  {
1226
  "epoch": 12.23076923076923,
1227
- "grad_norm": 1.6378310918807983,
1228
- "learning_rate": 3.692307692307693e-06,
1229
- "loss": 0.8246,
1230
  "step": 1590
1231
  },
1232
  {
1233
  "epoch": 12.307692307692308,
1234
- "grad_norm": 2.3109569549560547,
1235
- "learning_rate": 3.58974358974359e-06,
1236
- "loss": 0.8363,
1237
  "step": 1600
1238
  },
1239
  {
1240
  "epoch": 12.384615384615385,
1241
- "grad_norm": 2.3602941036224365,
1242
- "learning_rate": 3.487179487179487e-06,
1243
- "loss": 0.8078,
1244
  "step": 1610
1245
  },
1246
  {
1247
  "epoch": 12.461538461538462,
1248
- "grad_norm": 3.0623390674591064,
1249
- "learning_rate": 3.384615384615385e-06,
1250
- "loss": 0.794,
1251
  "step": 1620
1252
  },
1253
  {
1254
  "epoch": 12.538461538461538,
1255
- "grad_norm": 2.947983741760254,
1256
- "learning_rate": 3.2820512820512823e-06,
1257
- "loss": 0.8033,
1258
  "step": 1630
1259
  },
1260
  {
1261
  "epoch": 12.615384615384615,
1262
- "grad_norm": 1.8083330392837524,
1263
- "learning_rate": 3.1794871794871795e-06,
1264
- "loss": 0.8158,
1265
  "step": 1640
1266
  },
1267
  {
1268
  "epoch": 12.692307692307692,
1269
- "grad_norm": 3.2873637676239014,
1270
- "learning_rate": 3.0769230769230774e-06,
1271
- "loss": 0.7651,
1272
  "step": 1650
1273
  },
1274
  {
1275
  "epoch": 12.76923076923077,
1276
- "grad_norm": 2.3777670860290527,
1277
- "learning_rate": 2.9743589743589746e-06,
1278
- "loss": 0.8566,
1279
  "step": 1660
1280
  },
1281
  {
1282
  "epoch": 12.846153846153847,
1283
- "grad_norm": 1.8692084550857544,
1284
- "learning_rate": 2.8717948717948717e-06,
1285
- "loss": 0.8218,
1286
  "step": 1670
1287
  },
1288
  {
1289
  "epoch": 12.923076923076923,
1290
- "grad_norm": 2.2379138469696045,
1291
- "learning_rate": 2.7692307692307697e-06,
1292
- "loss": 0.7984,
1293
  "step": 1680
1294
  },
1295
  {
1296
  "epoch": 13.0,
1297
- "grad_norm": 4.131476879119873,
1298
- "learning_rate": 2.666666666666667e-06,
1299
- "loss": 0.8149,
1300
  "step": 1690
1301
  },
1302
  {
1303
  "epoch": 13.0,
1304
- "eval_accuracy": 0.8421052631578947,
1305
- "eval_loss": 0.757113516330719,
1306
- "eval_runtime": 0.7762,
1307
- "eval_samples_per_second": 171.337,
1308
- "eval_steps_per_second": 21.9,
1309
  "step": 1690
1310
  },
1311
  {
1312
  "epoch": 13.076923076923077,
1313
- "grad_norm": 2.9936656951904297,
1314
- "learning_rate": 2.564102564102564e-06,
1315
- "loss": 0.7917,
1316
  "step": 1700
1317
  },
1318
  {
1319
  "epoch": 13.153846153846153,
1320
- "grad_norm": 2.5392699241638184,
1321
- "learning_rate": 2.461538461538462e-06,
1322
- "loss": 0.8241,
1323
  "step": 1710
1324
  },
1325
  {
1326
  "epoch": 13.23076923076923,
1327
- "grad_norm": 3.0166265964508057,
1328
- "learning_rate": 2.358974358974359e-06,
1329
- "loss": 0.8117,
1330
  "step": 1720
1331
  },
1332
  {
1333
  "epoch": 13.307692307692308,
1334
- "grad_norm": 1.8728867769241333,
1335
- "learning_rate": 2.2564102564102566e-06,
1336
- "loss": 0.8155,
1337
  "step": 1730
1338
  },
1339
  {
1340
  "epoch": 13.384615384615385,
1341
- "grad_norm": 2.50715708732605,
1342
- "learning_rate": 2.153846153846154e-06,
1343
- "loss": 0.7814,
1344
  "step": 1740
1345
  },
1346
  {
1347
  "epoch": 13.461538461538462,
1348
- "grad_norm": 5.447348594665527,
1349
- "learning_rate": 2.0512820512820513e-06,
1350
- "loss": 0.8253,
1351
  "step": 1750
1352
  },
1353
  {
1354
  "epoch": 13.538461538461538,
1355
- "grad_norm": 2.6522035598754883,
1356
- "learning_rate": 1.948717948717949e-06,
1357
- "loss": 0.8486,
1358
  "step": 1760
1359
  },
1360
  {
1361
  "epoch": 13.615384615384615,
1362
- "grad_norm": 2.1300199031829834,
1363
- "learning_rate": 1.8461538461538465e-06,
1364
- "loss": 0.8027,
1365
  "step": 1770
1366
  },
1367
  {
1368
  "epoch": 13.692307692307692,
1369
- "grad_norm": 2.1135923862457275,
1370
- "learning_rate": 1.7435897435897436e-06,
1371
- "loss": 0.7852,
1372
  "step": 1780
1373
  },
1374
  {
1375
  "epoch": 13.76923076923077,
1376
- "grad_norm": 1.871300220489502,
1377
- "learning_rate": 1.6410256410256412e-06,
1378
- "loss": 0.8224,
1379
  "step": 1790
1380
  },
1381
  {
1382
  "epoch": 13.846153846153847,
1383
- "grad_norm": 3.240356206893921,
1384
- "learning_rate": 1.5384615384615387e-06,
1385
- "loss": 0.7895,
1386
  "step": 1800
1387
  },
1388
  {
1389
  "epoch": 13.923076923076923,
1390
- "grad_norm": 2.5182340145111084,
1391
- "learning_rate": 1.4358974358974359e-06,
1392
- "loss": 0.7316,
1393
  "step": 1810
1394
  },
1395
  {
1396
  "epoch": 14.0,
1397
- "grad_norm": 4.281803607940674,
1398
- "learning_rate": 1.3333333333333334e-06,
1399
- "loss": 0.8186,
1400
  "step": 1820
1401
  },
1402
  {
1403
  "epoch": 14.0,
1404
- "eval_accuracy": 0.8270676691729323,
1405
- "eval_loss": 0.7540305852890015,
1406
- "eval_runtime": 0.7703,
1407
- "eval_samples_per_second": 172.654,
1408
- "eval_steps_per_second": 22.069,
1409
  "step": 1820
1410
  },
1411
  {
1412
  "epoch": 14.076923076923077,
1413
- "grad_norm": 2.050518751144409,
1414
- "learning_rate": 1.230769230769231e-06,
1415
- "loss": 0.8222,
1416
  "step": 1830
1417
  },
1418
  {
1419
  "epoch": 14.153846153846153,
1420
- "grad_norm": 2.051259994506836,
1421
- "learning_rate": 1.1282051282051283e-06,
1422
- "loss": 0.7878,
1423
  "step": 1840
1424
  },
1425
  {
1426
  "epoch": 14.23076923076923,
1427
- "grad_norm": 2.8861193656921387,
1428
- "learning_rate": 1.0256410256410257e-06,
1429
- "loss": 0.78,
1430
  "step": 1850
1431
  },
1432
  {
1433
  "epoch": 14.307692307692308,
1434
- "grad_norm": 4.159270763397217,
1435
- "learning_rate": 9.230769230769232e-07,
1436
- "loss": 0.774,
1437
  "step": 1860
1438
  },
1439
  {
1440
  "epoch": 14.384615384615385,
1441
- "grad_norm": 2.8624985218048096,
1442
- "learning_rate": 8.205128205128206e-07,
1443
- "loss": 0.7882,
1444
  "step": 1870
1445
  },
1446
  {
1447
  "epoch": 14.461538461538462,
1448
- "grad_norm": 2.5051703453063965,
1449
- "learning_rate": 7.179487179487179e-07,
1450
- "loss": 0.7883,
1451
  "step": 1880
1452
  },
1453
  {
1454
  "epoch": 14.538461538461538,
1455
- "grad_norm": 3.003545045852661,
1456
- "learning_rate": 6.153846153846155e-07,
1457
- "loss": 0.7817,
1458
  "step": 1890
1459
  },
1460
  {
1461
  "epoch": 14.615384615384615,
1462
- "grad_norm": 2.8403878211975098,
1463
- "learning_rate": 5.128205128205128e-07,
1464
- "loss": 0.8294,
1465
  "step": 1900
1466
  },
1467
  {
1468
  "epoch": 14.692307692307692,
1469
- "grad_norm": 2.124030590057373,
1470
- "learning_rate": 4.102564102564103e-07,
1471
- "loss": 0.7978,
1472
  "step": 1910
1473
  },
1474
  {
1475
  "epoch": 14.76923076923077,
1476
- "grad_norm": 4.762181758880615,
1477
- "learning_rate": 3.0769230769230774e-07,
1478
- "loss": 0.8038,
1479
  "step": 1920
1480
  },
1481
  {
1482
  "epoch": 14.846153846153847,
1483
- "grad_norm": 3.256133794784546,
1484
- "learning_rate": 2.0512820512820514e-07,
1485
- "loss": 0.8535,
1486
  "step": 1930
1487
  },
1488
  {
1489
  "epoch": 14.923076923076923,
1490
- "grad_norm": 2.355344772338867,
1491
- "learning_rate": 1.0256410256410257e-07,
1492
- "loss": 0.7587,
1493
  "step": 1940
1494
  },
1495
  {
1496
  "epoch": 15.0,
1497
- "grad_norm": 4.202574729919434,
1498
- "learning_rate": 0.0,
1499
- "loss": 0.7929,
1500
  "step": 1950
1501
  },
1502
  {
1503
  "epoch": 15.0,
1504
- "eval_accuracy": 0.8120300751879699,
1505
- "eval_loss": 0.7412300109863281,
1506
- "eval_runtime": 0.8087,
1507
- "eval_samples_per_second": 164.47,
1508
- "eval_steps_per_second": 21.022,
1509
  "step": 1950
1510
  },
1511
  {
1512
  "epoch": 15.076923076923077,
1513
- "grad_norm": 3.3559834957122803,
1514
  "learning_rate": 4.923076923076924e-06,
1515
- "loss": 0.7535,
1516
  "step": 1960
1517
  },
1518
  {
1519
  "epoch": 15.153846153846153,
1520
- "grad_norm": 1.8613739013671875,
1521
  "learning_rate": 4.8461538461538465e-06,
1522
- "loss": 0.7581,
1523
  "step": 1970
1524
  },
1525
  {
1526
  "epoch": 15.23076923076923,
1527
- "grad_norm": 2.3707966804504395,
1528
  "learning_rate": 4.76923076923077e-06,
1529
- "loss": 0.7836,
1530
  "step": 1980
1531
  },
1532
  {
1533
  "epoch": 15.307692307692308,
1534
- "grad_norm": 2.6265199184417725,
1535
  "learning_rate": 4.692307692307693e-06,
1536
- "loss": 0.8334,
1537
  "step": 1990
1538
  },
1539
  {
1540
  "epoch": 15.384615384615385,
1541
- "grad_norm": 2.078848123550415,
1542
  "learning_rate": 4.615384615384616e-06,
1543
- "loss": 0.7772,
1544
  "step": 2000
1545
  },
1546
  {
1547
  "epoch": 15.461538461538462,
1548
- "grad_norm": 2.6433162689208984,
1549
  "learning_rate": 4.538461538461539e-06,
1550
- "loss": 0.7955,
1551
  "step": 2010
1552
  },
1553
  {
1554
  "epoch": 15.538461538461538,
1555
- "grad_norm": 3.458962917327881,
1556
  "learning_rate": 4.461538461538462e-06,
1557
- "loss": 0.787,
1558
  "step": 2020
1559
  },
1560
  {
1561
  "epoch": 15.615384615384615,
1562
- "grad_norm": 5.090147495269775,
1563
  "learning_rate": 4.384615384615385e-06,
1564
- "loss": 0.7875,
1565
  "step": 2030
1566
  },
1567
  {
1568
  "epoch": 15.692307692307692,
1569
- "grad_norm": 1.9066407680511475,
1570
  "learning_rate": 4.307692307692308e-06,
1571
- "loss": 0.7764,
1572
  "step": 2040
1573
  },
1574
  {
1575
  "epoch": 15.76923076923077,
1576
- "grad_norm": 3.097341299057007,
1577
  "learning_rate": 4.230769230769231e-06,
1578
- "loss": 0.7335,
1579
  "step": 2050
1580
  },
1581
  {
1582
  "epoch": 15.846153846153847,
1583
- "grad_norm": 2.7201600074768066,
1584
  "learning_rate": 4.1538461538461545e-06,
1585
- "loss": 0.7747,
1586
  "step": 2060
1587
  },
1588
  {
1589
  "epoch": 15.923076923076923,
1590
- "grad_norm": 2.303032398223877,
1591
  "learning_rate": 4.076923076923077e-06,
1592
- "loss": 0.7738,
1593
  "step": 2070
1594
  },
1595
  {
1596
  "epoch": 16.0,
1597
- "grad_norm": 4.420492172241211,
1598
  "learning_rate": 4.000000000000001e-06,
1599
- "loss": 0.774,
1600
  "step": 2080
1601
  },
1602
  {
1603
  "epoch": 16.0,
1604
- "eval_accuracy": 0.849624060150376,
1605
- "eval_loss": 0.7370420694351196,
1606
- "eval_runtime": 0.776,
1607
- "eval_samples_per_second": 171.384,
1608
- "eval_steps_per_second": 21.906,
1609
  "step": 2080
1610
  },
1611
  {
1612
  "epoch": 16.076923076923077,
1613
- "grad_norm": 3.9969003200531006,
1614
  "learning_rate": 3.923076923076923e-06,
1615
- "loss": 0.8316,
1616
  "step": 2090
1617
  },
1618
  {
1619
  "epoch": 16.153846153846153,
1620
- "grad_norm": 2.3731822967529297,
1621
  "learning_rate": 3.846153846153847e-06,
1622
- "loss": 0.8162,
1623
  "step": 2100
1624
  },
1625
  {
1626
  "epoch": 16.23076923076923,
1627
- "grad_norm": 2.232074737548828,
1628
  "learning_rate": 3.7692307692307694e-06,
1629
- "loss": 0.8138,
1630
  "step": 2110
1631
  },
1632
  {
1633
  "epoch": 16.307692307692307,
1634
- "grad_norm": 2.8799118995666504,
1635
  "learning_rate": 3.692307692307693e-06,
1636
- "loss": 0.8434,
1637
  "step": 2120
1638
  },
1639
  {
1640
  "epoch": 16.384615384615383,
1641
- "grad_norm": 2.2093818187713623,
1642
  "learning_rate": 3.6153846153846156e-06,
1643
- "loss": 0.7886,
1644
  "step": 2130
1645
  },
1646
  {
1647
  "epoch": 16.46153846153846,
1648
- "grad_norm": 1.984840750694275,
1649
  "learning_rate": 3.538461538461539e-06,
1650
- "loss": 0.7682,
1651
  "step": 2140
1652
  },
1653
  {
1654
  "epoch": 16.53846153846154,
1655
- "grad_norm": 2.711601495742798,
1656
  "learning_rate": 3.4615384615384617e-06,
1657
- "loss": 0.7471,
1658
  "step": 2150
1659
  },
1660
  {
1661
  "epoch": 16.615384615384617,
1662
- "grad_norm": 2.130311965942383,
1663
  "learning_rate": 3.384615384615385e-06,
1664
- "loss": 0.7535,
1665
  "step": 2160
1666
  },
1667
  {
1668
  "epoch": 16.692307692307693,
1669
- "grad_norm": 2.327207565307617,
1670
  "learning_rate": 3.307692307692308e-06,
1671
- "loss": 0.718,
1672
  "step": 2170
1673
  },
1674
  {
1675
  "epoch": 16.76923076923077,
1676
- "grad_norm": 2.198944091796875,
1677
  "learning_rate": 3.2307692307692313e-06,
1678
- "loss": 0.8146,
1679
  "step": 2180
1680
  },
1681
  {
1682
  "epoch": 16.846153846153847,
1683
- "grad_norm": 2.388453483581543,
1684
  "learning_rate": 3.153846153846154e-06,
1685
- "loss": 0.8368,
1686
  "step": 2190
1687
  },
1688
  {
1689
  "epoch": 16.923076923076923,
1690
- "grad_norm": 2.2575690746307373,
1691
  "learning_rate": 3.0769230769230774e-06,
1692
- "loss": 0.749,
1693
  "step": 2200
1694
  },
1695
  {
1696
  "epoch": 17.0,
1697
- "grad_norm": 6.020498275756836,
1698
  "learning_rate": 3e-06,
1699
- "loss": 0.7613,
1700
  "step": 2210
1701
  },
1702
  {
1703
  "epoch": 17.0,
1704
- "eval_accuracy": 0.849624060150376,
1705
- "eval_loss": 0.7059224247932434,
1706
- "eval_runtime": 0.7496,
1707
- "eval_samples_per_second": 177.439,
1708
- "eval_steps_per_second": 22.68,
1709
  "step": 2210
1710
  },
1711
  {
1712
  "epoch": 17.076923076923077,
1713
- "grad_norm": 3.134481430053711,
1714
  "learning_rate": 2.9230769230769236e-06,
1715
- "loss": 0.7609,
1716
  "step": 2220
1717
  },
1718
  {
1719
  "epoch": 17.153846153846153,
1720
- "grad_norm": 2.0070559978485107,
1721
  "learning_rate": 2.846153846153846e-06,
1722
- "loss": 0.7483,
1723
  "step": 2230
1724
  },
1725
  {
1726
  "epoch": 17.23076923076923,
1727
- "grad_norm": 3.491682291030884,
1728
  "learning_rate": 2.7692307692307697e-06,
1729
- "loss": 0.7696,
1730
  "step": 2240
1731
  },
1732
  {
1733
  "epoch": 17.307692307692307,
1734
- "grad_norm": 1.9866397380828857,
1735
  "learning_rate": 2.6923076923076923e-06,
1736
- "loss": 0.7609,
1737
  "step": 2250
1738
  },
1739
  {
1740
  "epoch": 17.384615384615383,
1741
- "grad_norm": 3.458582878112793,
1742
  "learning_rate": 2.615384615384616e-06,
1743
- "loss": 0.7813,
1744
  "step": 2260
1745
  },
1746
  {
1747
  "epoch": 17.46153846153846,
1748
- "grad_norm": 2.1126835346221924,
1749
  "learning_rate": 2.5384615384615385e-06,
1750
- "loss": 0.7003,
1751
  "step": 2270
1752
  },
1753
  {
1754
  "epoch": 17.53846153846154,
1755
- "grad_norm": 3.5276880264282227,
1756
  "learning_rate": 2.461538461538462e-06,
1757
- "loss": 0.8305,
1758
  "step": 2280
1759
  },
1760
  {
1761
  "epoch": 17.615384615384617,
1762
- "grad_norm": 2.3967173099517822,
1763
  "learning_rate": 2.384615384615385e-06,
1764
- "loss": 0.7627,
1765
  "step": 2290
1766
  },
1767
  {
1768
  "epoch": 17.692307692307693,
1769
- "grad_norm": 4.473978042602539,
1770
  "learning_rate": 2.307692307692308e-06,
1771
- "loss": 0.7332,
1772
  "step": 2300
1773
  },
1774
  {
1775
  "epoch": 17.76923076923077,
1776
- "grad_norm": 2.1642568111419678,
1777
  "learning_rate": 2.230769230769231e-06,
1778
- "loss": 0.7678,
1779
  "step": 2310
1780
  },
1781
  {
1782
  "epoch": 17.846153846153847,
1783
- "grad_norm": 3.03192138671875,
1784
  "learning_rate": 2.153846153846154e-06,
1785
- "loss": 0.7565,
1786
  "step": 2320
1787
  },
1788
  {
1789
  "epoch": 17.923076923076923,
1790
- "grad_norm": 2.9610419273376465,
1791
  "learning_rate": 2.0769230769230773e-06,
1792
- "loss": 0.7651,
1793
  "step": 2330
1794
  },
1795
  {
1796
  "epoch": 18.0,
1797
- "grad_norm": 4.160178184509277,
1798
  "learning_rate": 2.0000000000000003e-06,
1799
- "loss": 0.7778,
1800
  "step": 2340
1801
  },
1802
  {
1803
  "epoch": 18.0,
1804
- "eval_accuracy": 0.8270676691729323,
1805
- "eval_loss": 0.6930322647094727,
1806
- "eval_runtime": 0.7854,
1807
- "eval_samples_per_second": 169.332,
1808
- "eval_steps_per_second": 21.644,
1809
  "step": 2340
1810
  },
1811
  {
1812
  "epoch": 18.076923076923077,
1813
- "grad_norm": 2.168921947479248,
1814
  "learning_rate": 1.9230769230769234e-06,
1815
- "loss": 0.7234,
1816
  "step": 2350
1817
  },
1818
  {
1819
  "epoch": 18.153846153846153,
1820
- "grad_norm": 3.935608386993408,
1821
  "learning_rate": 1.8461538461538465e-06,
1822
- "loss": 0.8192,
1823
  "step": 2360
1824
  },
1825
  {
1826
  "epoch": 18.23076923076923,
1827
- "grad_norm": 1.8215328454971313,
1828
  "learning_rate": 1.7692307692307695e-06,
1829
- "loss": 0.7271,
1830
  "step": 2370
1831
  },
1832
  {
1833
  "epoch": 18.307692307692307,
1834
- "grad_norm": 2.687016010284424,
1835
  "learning_rate": 1.6923076923076926e-06,
1836
- "loss": 0.8063,
1837
  "step": 2380
1838
  },
1839
  {
1840
  "epoch": 18.384615384615383,
1841
- "grad_norm": 2.3364577293395996,
1842
  "learning_rate": 1.6153846153846157e-06,
1843
- "loss": 0.7699,
1844
  "step": 2390
1845
  },
1846
  {
1847
  "epoch": 18.46153846153846,
1848
- "grad_norm": 2.7465319633483887,
1849
  "learning_rate": 1.5384615384615387e-06,
1850
- "loss": 0.8214,
1851
  "step": 2400
1852
  },
1853
  {
1854
  "epoch": 18.53846153846154,
1855
- "grad_norm": 3.3499436378479004,
1856
  "learning_rate": 1.4615384615384618e-06,
1857
- "loss": 0.7432,
1858
  "step": 2410
1859
  },
1860
  {
1861
  "epoch": 18.615384615384617,
1862
- "grad_norm": 3.7266149520874023,
1863
  "learning_rate": 1.3846153846153848e-06,
1864
- "loss": 0.797,
1865
  "step": 2420
1866
  },
1867
  {
1868
  "epoch": 18.692307692307693,
1869
- "grad_norm": 2.661741256713867,
1870
  "learning_rate": 1.307692307692308e-06,
1871
- "loss": 0.7404,
1872
  "step": 2430
1873
  },
1874
  {
1875
  "epoch": 18.76923076923077,
1876
- "grad_norm": 3.166747808456421,
1877
  "learning_rate": 1.230769230769231e-06,
1878
- "loss": 0.8197,
1879
  "step": 2440
1880
  },
1881
  {
1882
  "epoch": 18.846153846153847,
1883
- "grad_norm": 3.200448989868164,
1884
  "learning_rate": 1.153846153846154e-06,
1885
- "loss": 0.8068,
1886
  "step": 2450
1887
  },
1888
  {
1889
  "epoch": 18.923076923076923,
1890
- "grad_norm": 2.4404191970825195,
1891
  "learning_rate": 1.076923076923077e-06,
1892
- "loss": 0.788,
1893
  "step": 2460
1894
  },
1895
  {
1896
  "epoch": 19.0,
1897
- "grad_norm": 3.8639049530029297,
1898
  "learning_rate": 1.0000000000000002e-06,
1899
- "loss": 0.8081,
1900
  "step": 2470
1901
  },
1902
  {
1903
  "epoch": 19.0,
1904
- "eval_accuracy": 0.8646616541353384,
1905
- "eval_loss": 0.6890266537666321,
1906
- "eval_runtime": 0.7797,
1907
- "eval_samples_per_second": 170.576,
1908
- "eval_steps_per_second": 21.803,
1909
  "step": 2470
1910
  },
1911
  {
1912
  "epoch": 19.076923076923077,
1913
- "grad_norm": 1.7245137691497803,
1914
  "learning_rate": 9.230769230769232e-07,
1915
- "loss": 0.7929,
1916
  "step": 2480
1917
  },
1918
  {
1919
  "epoch": 19.153846153846153,
1920
- "grad_norm": 3.7959182262420654,
1921
  "learning_rate": 8.461538461538463e-07,
1922
- "loss": 0.7397,
1923
  "step": 2490
1924
  },
1925
  {
1926
  "epoch": 19.23076923076923,
1927
- "grad_norm": 2.798788070678711,
1928
  "learning_rate": 7.692307692307694e-07,
1929
- "loss": 0.7928,
1930
  "step": 2500
1931
  },
1932
  {
1933
  "epoch": 19.307692307692307,
1934
- "grad_norm": 2.1275336742401123,
1935
  "learning_rate": 6.923076923076924e-07,
1936
- "loss": 0.7672,
1937
  "step": 2510
1938
  },
1939
  {
1940
  "epoch": 19.384615384615383,
1941
- "grad_norm": 2.9216866493225098,
1942
  "learning_rate": 6.153846153846155e-07,
1943
- "loss": 0.7918,
1944
  "step": 2520
1945
  },
1946
  {
1947
  "epoch": 19.46153846153846,
1948
- "grad_norm": 2.3012797832489014,
1949
  "learning_rate": 5.384615384615386e-07,
1950
- "loss": 0.7418,
1951
  "step": 2530
1952
  },
1953
  {
1954
  "epoch": 19.53846153846154,
1955
- "grad_norm": 2.5353312492370605,
1956
  "learning_rate": 4.615384615384616e-07,
1957
- "loss": 0.8115,
1958
  "step": 2540
1959
  },
1960
  {
1961
  "epoch": 19.615384615384617,
1962
- "grad_norm": 3.469372510910034,
1963
  "learning_rate": 3.846153846153847e-07,
1964
- "loss": 0.7698,
1965
  "step": 2550
1966
  },
1967
  {
1968
  "epoch": 19.692307692307693,
1969
- "grad_norm": 2.3621013164520264,
1970
  "learning_rate": 3.0769230769230774e-07,
1971
- "loss": 0.6997,
1972
  "step": 2560
1973
  },
1974
  {
1975
  "epoch": 19.76923076923077,
1976
- "grad_norm": 1.7231149673461914,
1977
  "learning_rate": 2.307692307692308e-07,
1978
- "loss": 0.7207,
1979
  "step": 2570
1980
  },
1981
  {
1982
  "epoch": 19.846153846153847,
1983
- "grad_norm": 5.3792924880981445,
1984
  "learning_rate": 1.5384615384615387e-07,
1985
- "loss": 0.7656,
1986
  "step": 2580
1987
  },
1988
  {
1989
  "epoch": 19.923076923076923,
1990
- "grad_norm": 1.9618691205978394,
1991
  "learning_rate": 7.692307692307694e-08,
1992
- "loss": 0.6919,
1993
  "step": 2590
1994
  },
1995
  {
1996
  "epoch": 20.0,
1997
- "grad_norm": 4.051193714141846,
1998
  "learning_rate": 0.0,
1999
- "loss": 0.7916,
2000
  "step": 2600
2001
  },
2002
  {
2003
  "epoch": 20.0,
2004
- "eval_accuracy": 0.8646616541353384,
2005
- "eval_loss": 0.6874601244926453,
2006
- "eval_runtime": 0.8096,
2007
- "eval_samples_per_second": 164.271,
2008
- "eval_steps_per_second": 20.997,
2009
  "step": 2600
2010
  },
2011
  {
2012
  "epoch": 20.0,
2013
  "step": 2600,
2014
- "total_flos": 2.0877820672794624e+17,
2015
- "train_loss": 0.19350949709232038,
2016
- "train_runtime": 49.806,
2017
- "train_samples_per_second": 415.211,
2018
- "train_steps_per_second": 52.203
2019
  }
2020
  ],
2021
  "logging_steps": 10,
@@ -2035,7 +2035,7 @@
2035
  "attributes": {}
2036
  }
2037
  },
2038
- "total_flos": 2.0877820672794624e+17,
2039
  "train_batch_size": 8,
2040
  "trial_name": null,
2041
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.49125248193740845,
3
+ "best_model_checkpoint": "./beans_outputs/checkpoint-2340",
4
  "epoch": 20.0,
5
  "eval_steps": 500,
6
  "global_step": 2600,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.07692307692307693,
13
+ "grad_norm": 2.631645917892456,
14
+ "learning_rate": 1.9923076923076926e-05,
15
+ "loss": 1.0806,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.15384615384615385,
20
+ "grad_norm": 2.7555034160614014,
21
+ "learning_rate": 1.9846153846153847e-05,
22
+ "loss": 1.0886,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.23076923076923078,
27
+ "grad_norm": 3.3691883087158203,
28
+ "learning_rate": 1.976923076923077e-05,
29
+ "loss": 1.0765,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.3076923076923077,
34
+ "grad_norm": 3.109200954437256,
35
+ "learning_rate": 1.9692307692307696e-05,
36
+ "loss": 1.0931,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.38461538461538464,
41
+ "grad_norm": 2.8181138038635254,
42
+ "learning_rate": 1.9615384615384617e-05,
43
+ "loss": 1.0809,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.46153846153846156,
48
+ "grad_norm": 2.4915010929107666,
49
+ "learning_rate": 1.953846153846154e-05,
50
+ "loss": 1.0718,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.5384615384615384,
55
+ "grad_norm": 3.8907675743103027,
56
+ "learning_rate": 1.9461538461538462e-05,
57
+ "loss": 1.0905,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.6153846153846154,
62
+ "grad_norm": 3.9334473609924316,
63
+ "learning_rate": 1.9384615384615386e-05,
64
+ "loss": 1.09,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.6923076923076923,
69
+ "grad_norm": 2.9074010848999023,
70
+ "learning_rate": 1.930769230769231e-05,
71
+ "loss": 1.0835,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.7692307692307693,
76
+ "grad_norm": 3.810696840286255,
77
+ "learning_rate": 1.923076923076923e-05,
78
+ "loss": 1.0851,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.8461538461538461,
83
+ "grad_norm": 2.3626232147216797,
84
+ "learning_rate": 1.9153846153846156e-05,
85
+ "loss": 1.0751,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.9230769230769231,
90
+ "grad_norm": 2.9881410598754883,
91
+ "learning_rate": 1.907692307692308e-05,
92
+ "loss": 1.0728,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 1.0,
97
+ "grad_norm": 8.272817611694336,
98
+ "learning_rate": 1.9e-05,
99
+ "loss": 1.07,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 1.0,
104
+ "eval_accuracy": 0.41353383458646614,
105
+ "eval_loss": 1.0682790279388428,
106
+ "eval_runtime": 1.1629,
107
+ "eval_samples_per_second": 114.366,
108
+ "eval_steps_per_second": 14.618,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 1.0769230769230769,
113
+ "grad_norm": 2.923727035522461,
114
+ "learning_rate": 1.8923076923076925e-05,
115
+ "loss": 1.0742,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 1.1538461538461537,
120
+ "grad_norm": 4.396390914916992,
121
+ "learning_rate": 1.8846153846153846e-05,
122
+ "loss": 1.0647,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.2307692307692308,
127
+ "grad_norm": 2.367124080657959,
128
+ "learning_rate": 1.876923076923077e-05,
129
+ "loss": 1.0739,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.3076923076923077,
134
+ "grad_norm": 3.049018621444702,
135
+ "learning_rate": 1.8692307692307695e-05,
136
+ "loss": 1.0518,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.3846153846153846,
141
+ "grad_norm": 3.515366554260254,
142
+ "learning_rate": 1.8615384615384616e-05,
143
+ "loss": 1.0581,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.4615384615384617,
148
+ "grad_norm": 3.3465802669525146,
149
+ "learning_rate": 1.853846153846154e-05,
150
+ "loss": 1.0578,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.5384615384615383,
155
+ "grad_norm": 3.190157175064087,
156
+ "learning_rate": 1.8461538461538465e-05,
157
+ "loss": 1.0674,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.6153846153846154,
162
+ "grad_norm": 4.284377574920654,
163
+ "learning_rate": 1.8384615384615386e-05,
164
+ "loss": 1.0656,
165
  "step": 210
166
  },
167
  {
168
  "epoch": 1.6923076923076923,
169
+ "grad_norm": 4.362184524536133,
170
+ "learning_rate": 1.830769230769231e-05,
171
+ "loss": 1.0597,
172
  "step": 220
173
  },
174
  {
175
  "epoch": 1.7692307692307692,
176
+ "grad_norm": 3.011751651763916,
177
+ "learning_rate": 1.823076923076923e-05,
178
+ "loss": 1.05,
179
  "step": 230
180
  },
181
  {
182
  "epoch": 1.8461538461538463,
183
+ "grad_norm": 2.408613920211792,
184
+ "learning_rate": 1.8153846153846155e-05,
185
+ "loss": 1.0617,
186
  "step": 240
187
  },
188
  {
189
  "epoch": 1.9230769230769231,
190
+ "grad_norm": 16.68673324584961,
191
+ "learning_rate": 1.807692307692308e-05,
192
+ "loss": 1.0474,
193
  "step": 250
194
  },
195
  {
196
  "epoch": 2.0,
197
+ "grad_norm": 6.727813243865967,
198
+ "learning_rate": 1.8e-05,
199
+ "loss": 1.0523,
200
  "step": 260
201
  },
202
  {
203
  "epoch": 2.0,
204
+ "eval_accuracy": 0.6240601503759399,
205
+ "eval_loss": 1.035576581954956,
206
+ "eval_runtime": 0.9256,
207
+ "eval_samples_per_second": 143.683,
208
+ "eval_steps_per_second": 18.365,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 2.076923076923077,
213
+ "grad_norm": 2.9416935443878174,
214
+ "learning_rate": 1.7923076923076925e-05,
215
+ "loss": 1.0505,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 2.1538461538461537,
220
+ "grad_norm": 3.15810489654541,
221
+ "learning_rate": 1.784615384615385e-05,
222
+ "loss": 1.0492,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 2.230769230769231,
227
+ "grad_norm": 3.2049734592437744,
228
+ "learning_rate": 1.776923076923077e-05,
229
+ "loss": 1.0467,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 2.3076923076923075,
234
+ "grad_norm": 3.3393309116363525,
235
+ "learning_rate": 1.7692307692307694e-05,
236
+ "loss": 1.0297,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 2.3846153846153846,
241
+ "grad_norm": 2.938563346862793,
242
+ "learning_rate": 1.7615384615384615e-05,
243
+ "loss": 1.0442,
244
  "step": 310
245
  },
246
  {
247
  "epoch": 2.4615384615384617,
248
+ "grad_norm": 2.6086559295654297,
249
+ "learning_rate": 1.753846153846154e-05,
250
+ "loss": 1.0506,
251
  "step": 320
252
  },
253
  {
254
  "epoch": 2.5384615384615383,
255
+ "grad_norm": 3.588426113128662,
256
+ "learning_rate": 1.7461538461538464e-05,
257
+ "loss": 1.0351,
258
  "step": 330
259
  },
260
  {
261
  "epoch": 2.6153846153846154,
262
+ "grad_norm": 2.821547269821167,
263
+ "learning_rate": 1.7384615384615385e-05,
264
+ "loss": 1.0213,
265
  "step": 340
266
  },
267
  {
268
  "epoch": 2.6923076923076925,
269
+ "grad_norm": 3.074611186981201,
270
+ "learning_rate": 1.730769230769231e-05,
271
+ "loss": 1.0406,
272
  "step": 350
273
  },
274
  {
275
  "epoch": 2.769230769230769,
276
+ "grad_norm": 2.6791045665740967,
277
+ "learning_rate": 1.7230769230769234e-05,
278
+ "loss": 1.0237,
279
  "step": 360
280
  },
281
  {
282
  "epoch": 2.8461538461538463,
283
+ "grad_norm": 2.8163557052612305,
284
+ "learning_rate": 1.7153846153846155e-05,
285
+ "loss": 1.0337,
286
  "step": 370
287
  },
288
  {
289
  "epoch": 2.9230769230769234,
290
+ "grad_norm": 3.1033220291137695,
291
+ "learning_rate": 1.707692307692308e-05,
292
+ "loss": 1.0288,
293
  "step": 380
294
  },
295
  {
296
  "epoch": 3.0,
297
+ "grad_norm": 6.048186302185059,
298
+ "learning_rate": 1.7e-05,
299
+ "loss": 1.0439,
300
  "step": 390
301
  },
302
  {
303
  "epoch": 3.0,
304
+ "eval_accuracy": 0.6616541353383458,
305
+ "eval_loss": 1.0045464038848877,
306
+ "eval_runtime": 0.9488,
307
+ "eval_samples_per_second": 140.174,
308
+ "eval_steps_per_second": 17.917,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 3.076923076923077,
313
+ "grad_norm": 3.378181219100952,
314
+ "learning_rate": 1.6923076923076924e-05,
315
+ "loss": 1.0363,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 3.1538461538461537,
320
+ "grad_norm": 3.51971697807312,
321
+ "learning_rate": 1.684615384615385e-05,
322
+ "loss": 1.0182,
323
  "step": 410
324
  },
325
  {
326
  "epoch": 3.230769230769231,
327
+ "grad_norm": 3.365443706512451,
328
+ "learning_rate": 1.676923076923077e-05,
329
+ "loss": 1.0095,
330
  "step": 420
331
  },
332
  {
333
  "epoch": 3.3076923076923075,
334
+ "grad_norm": 3.7726383209228516,
335
+ "learning_rate": 1.6692307692307694e-05,
336
+ "loss": 1.005,
337
  "step": 430
338
  },
339
  {
340
  "epoch": 3.3846153846153846,
341
+ "grad_norm": 4.8340864181518555,
342
+ "learning_rate": 1.6615384615384618e-05,
343
+ "loss": 1.0283,
344
  "step": 440
345
  },
346
  {
347
  "epoch": 3.4615384615384617,
348
+ "grad_norm": 3.5189855098724365,
349
+ "learning_rate": 1.653846153846154e-05,
350
+ "loss": 1.0159,
351
  "step": 450
352
  },
353
  {
354
  "epoch": 3.5384615384615383,
355
+ "grad_norm": 3.1230361461639404,
356
+ "learning_rate": 1.6461538461538463e-05,
357
+ "loss": 1.0098,
358
  "step": 460
359
  },
360
  {
361
  "epoch": 3.6153846153846154,
362
+ "grad_norm": 2.575528860092163,
363
+ "learning_rate": 1.6384615384615384e-05,
364
+ "loss": 1.0065,
365
  "step": 470
366
  },
367
  {
368
  "epoch": 3.6923076923076925,
369
+ "grad_norm": 3.465209722518921,
370
+ "learning_rate": 1.630769230769231e-05,
371
+ "loss": 1.0066,
372
  "step": 480
373
  },
374
  {
375
  "epoch": 3.769230769230769,
376
+ "grad_norm": 2.744354486465454,
377
+ "learning_rate": 1.6230769230769233e-05,
378
+ "loss": 0.9909,
379
  "step": 490
380
  },
381
  {
382
  "epoch": 3.8461538461538463,
383
+ "grad_norm": 2.649618625640869,
384
+ "learning_rate": 1.6153846153846154e-05,
385
+ "loss": 1.0106,
386
  "step": 500
387
  },
388
  {
389
  "epoch": 3.9230769230769234,
390
+ "grad_norm": 3.7920305728912354,
391
+ "learning_rate": 1.607692307692308e-05,
392
+ "loss": 0.9962,
393
  "step": 510
394
  },
395
  {
396
  "epoch": 4.0,
397
+ "grad_norm": 6.38316535949707,
398
+ "learning_rate": 1.6000000000000003e-05,
399
+ "loss": 1.0056,
400
  "step": 520
401
  },
402
  {
403
  "epoch": 4.0,
404
+ "eval_accuracy": 0.7293233082706767,
405
+ "eval_loss": 0.9671366810798645,
406
+ "eval_runtime": 0.9722,
407
+ "eval_samples_per_second": 136.81,
408
+ "eval_steps_per_second": 17.487,
409
  "step": 520
410
  },
411
  {
412
  "epoch": 4.076923076923077,
413
+ "grad_norm": 4.276334285736084,
414
+ "learning_rate": 1.5923076923076924e-05,
415
+ "loss": 0.9717,
416
  "step": 530
417
  },
418
  {
419
  "epoch": 4.153846153846154,
420
+ "grad_norm": 3.4239840507507324,
421
+ "learning_rate": 1.5846153846153848e-05,
422
+ "loss": 1.0061,
423
  "step": 540
424
  },
425
  {
426
  "epoch": 4.230769230769231,
427
+ "grad_norm": 3.4140846729278564,
428
+ "learning_rate": 1.576923076923077e-05,
429
+ "loss": 1.0057,
430
  "step": 550
431
  },
432
  {
433
  "epoch": 4.3076923076923075,
434
+ "grad_norm": 3.539780855178833,
435
+ "learning_rate": 1.5692307692307693e-05,
436
+ "loss": 1.0197,
437
  "step": 560
438
  },
439
  {
440
  "epoch": 4.384615384615385,
441
+ "grad_norm": 4.03983211517334,
442
+ "learning_rate": 1.5615384615384618e-05,
443
+ "loss": 0.9655,
444
  "step": 570
445
  },
446
  {
447
  "epoch": 4.461538461538462,
448
+ "grad_norm": 3.4873383045196533,
449
+ "learning_rate": 1.553846153846154e-05,
450
+ "loss": 0.9859,
451
  "step": 580
452
  },
453
  {
454
  "epoch": 4.538461538461538,
455
+ "grad_norm": 3.594426393508911,
456
+ "learning_rate": 1.5461538461538463e-05,
457
+ "loss": 0.9938,
458
  "step": 590
459
  },
460
  {
461
  "epoch": 4.615384615384615,
462
+ "grad_norm": 3.7454657554626465,
463
+ "learning_rate": 1.5384615384615387e-05,
464
+ "loss": 0.97,
465
  "step": 600
466
  },
467
  {
468
  "epoch": 4.6923076923076925,
469
+ "grad_norm": 4.500467300415039,
470
+ "learning_rate": 1.5307692307692308e-05,
471
+ "loss": 0.976,
472
  "step": 610
473
  },
474
  {
475
  "epoch": 4.769230769230769,
476
+ "grad_norm": 2.860618829727173,
477
+ "learning_rate": 1.523076923076923e-05,
478
+ "loss": 1.0125,
479
  "step": 620
480
  },
481
  {
482
  "epoch": 4.846153846153846,
483
+ "grad_norm": 4.1896796226501465,
484
+ "learning_rate": 1.5153846153846155e-05,
485
+ "loss": 0.9496,
486
  "step": 630
487
  },
488
  {
489
  "epoch": 4.923076923076923,
490
+ "grad_norm": 3.3220174312591553,
491
+ "learning_rate": 1.5076923076923078e-05,
492
+ "loss": 0.9541,
493
  "step": 640
494
  },
495
  {
496
  "epoch": 5.0,
497
+ "grad_norm": 7.564222812652588,
498
+ "learning_rate": 1.5000000000000002e-05,
499
+ "loss": 0.9853,
500
  "step": 650
501
  },
502
  {
503
  "epoch": 5.0,
504
+ "eval_accuracy": 0.7894736842105263,
505
+ "eval_loss": 0.9245139956474304,
506
+ "eval_runtime": 0.9408,
507
+ "eval_samples_per_second": 141.37,
508
+ "eval_steps_per_second": 18.07,
509
  "step": 650
510
  },
511
  {
512
  "epoch": 5.076923076923077,
513
+ "grad_norm": 3.3493552207946777,
514
+ "learning_rate": 1.4923076923076925e-05,
515
+ "loss": 0.9562,
516
  "step": 660
517
  },
518
  {
519
  "epoch": 5.153846153846154,
520
+ "grad_norm": 4.18095588684082,
521
+ "learning_rate": 1.4846153846153847e-05,
522
+ "loss": 0.9853,
523
  "step": 670
524
  },
525
  {
526
  "epoch": 5.230769230769231,
527
+ "grad_norm": 3.0175259113311768,
528
+ "learning_rate": 1.4769230769230772e-05,
529
+ "loss": 0.9418,
530
  "step": 680
531
  },
532
  {
533
  "epoch": 5.3076923076923075,
534
+ "grad_norm": 4.064915180206299,
535
+ "learning_rate": 1.4692307692307694e-05,
536
+ "loss": 0.9644,
537
  "step": 690
538
  },
539
  {
540
  "epoch": 5.384615384615385,
541
+ "grad_norm": 4.862594127655029,
542
+ "learning_rate": 1.4615384615384615e-05,
543
+ "loss": 0.9521,
544
  "step": 700
545
  },
546
  {
547
  "epoch": 5.461538461538462,
548
+ "grad_norm": 3.188516855239868,
549
+ "learning_rate": 1.453846153846154e-05,
550
+ "loss": 0.9612,
551
  "step": 710
552
  },
553
  {
554
  "epoch": 5.538461538461538,
555
+ "grad_norm": 3.2765607833862305,
556
+ "learning_rate": 1.4461538461538462e-05,
557
+ "loss": 0.9351,
558
  "step": 720
559
  },
560
  {
561
  "epoch": 5.615384615384615,
562
+ "grad_norm": 2.763471841812134,
563
+ "learning_rate": 1.4384615384615387e-05,
564
+ "loss": 0.956,
565
  "step": 730
566
  },
567
  {
568
  "epoch": 5.6923076923076925,
569
+ "grad_norm": 4.148519515991211,
570
+ "learning_rate": 1.430769230769231e-05,
571
+ "loss": 0.9196,
572
  "step": 740
573
  },
574
  {
575
  "epoch": 5.769230769230769,
576
+ "grad_norm": 3.0915586948394775,
577
+ "learning_rate": 1.4230769230769232e-05,
578
+ "loss": 0.9266,
579
  "step": 750
580
  },
581
  {
582
  "epoch": 5.846153846153846,
583
+ "grad_norm": 3.6650705337524414,
584
+ "learning_rate": 1.4153846153846156e-05,
585
+ "loss": 0.9385,
586
  "step": 760
587
  },
588
  {
589
  "epoch": 5.923076923076923,
590
+ "grad_norm": 4.015691757202148,
591
+ "learning_rate": 1.4076923076923079e-05,
592
+ "loss": 0.9356,
593
  "step": 770
594
  },
595
  {
596
  "epoch": 6.0,
597
+ "grad_norm": 11.575014114379883,
598
+ "learning_rate": 1.4e-05,
599
+ "loss": 0.9581,
600
  "step": 780
601
  },
602
  {
603
  "epoch": 6.0,
604
  "eval_accuracy": 0.7819548872180451,
605
+ "eval_loss": 0.8743670582771301,
606
+ "eval_runtime": 0.9201,
607
+ "eval_samples_per_second": 144.551,
608
+ "eval_steps_per_second": 18.477,
609
  "step": 780
610
  },
611
  {
612
  "epoch": 6.076923076923077,
613
+ "grad_norm": 3.6309585571289062,
614
+ "learning_rate": 1.3923076923076924e-05,
615
+ "loss": 0.9073,
616
  "step": 790
617
  },
618
  {
619
  "epoch": 6.153846153846154,
620
+ "grad_norm": 4.175745010375977,
621
+ "learning_rate": 1.3846153846153847e-05,
622
+ "loss": 0.9238,
623
  "step": 800
624
  },
625
  {
626
  "epoch": 6.230769230769231,
627
+ "grad_norm": 3.9674503803253174,
628
+ "learning_rate": 1.3769230769230771e-05,
629
+ "loss": 0.9014,
630
  "step": 810
631
  },
632
  {
633
  "epoch": 6.3076923076923075,
634
+ "grad_norm": 3.084416627883911,
635
+ "learning_rate": 1.3692307692307694e-05,
636
+ "loss": 0.9023,
637
  "step": 820
638
  },
639
  {
640
  "epoch": 6.384615384615385,
641
+ "grad_norm": 4.2650580406188965,
642
+ "learning_rate": 1.3615384615384616e-05,
643
+ "loss": 0.8993,
644
  "step": 830
645
  },
646
  {
647
  "epoch": 6.461538461538462,
648
+ "grad_norm": 4.604613304138184,
649
+ "learning_rate": 1.353846153846154e-05,
650
+ "loss": 0.8902,
651
  "step": 840
652
  },
653
  {
654
  "epoch": 6.538461538461538,
655
+ "grad_norm": 4.572556018829346,
656
+ "learning_rate": 1.3461538461538463e-05,
657
+ "loss": 0.9039,
658
  "step": 850
659
  },
660
  {
661
  "epoch": 6.615384615384615,
662
+ "grad_norm": 4.689060211181641,
663
+ "learning_rate": 1.3384615384615384e-05,
664
+ "loss": 0.8743,
665
  "step": 860
666
  },
667
  {
668
  "epoch": 6.6923076923076925,
669
+ "grad_norm": 4.0536394119262695,
670
+ "learning_rate": 1.3307692307692309e-05,
671
+ "loss": 0.9188,
672
  "step": 870
673
  },
674
  {
675
  "epoch": 6.769230769230769,
676
+ "grad_norm": 3.3945388793945312,
677
+ "learning_rate": 1.3230769230769231e-05,
678
+ "loss": 0.9175,
679
  "step": 880
680
  },
681
  {
682
  "epoch": 6.846153846153846,
683
+ "grad_norm": 4.679390907287598,
684
+ "learning_rate": 1.3153846153846156e-05,
685
+ "loss": 0.8946,
686
  "step": 890
687
  },
688
  {
689
  "epoch": 6.923076923076923,
690
+ "grad_norm": 3.4296305179595947,
691
+ "learning_rate": 1.3076923076923078e-05,
692
+ "loss": 0.8895,
693
  "step": 900
694
  },
695
  {
696
  "epoch": 7.0,
697
+ "grad_norm": 6.18251895904541,
698
+ "learning_rate": 1.3000000000000001e-05,
699
+ "loss": 0.9044,
700
  "step": 910
701
  },
702
  {
703
  "epoch": 7.0,
704
+ "eval_accuracy": 0.7819548872180451,
705
+ "eval_loss": 0.8171929717063904,
706
+ "eval_runtime": 0.9448,
707
+ "eval_samples_per_second": 140.765,
708
+ "eval_steps_per_second": 17.993,
709
  "step": 910
710
  },
711
  {
712
  "epoch": 7.076923076923077,
713
+ "grad_norm": 4.493593692779541,
714
+ "learning_rate": 1.2923076923076925e-05,
715
+ "loss": 0.9023,
716
  "step": 920
717
  },
718
  {
719
  "epoch": 7.153846153846154,
720
+ "grad_norm": 3.8425426483154297,
721
+ "learning_rate": 1.2846153846153848e-05,
722
+ "loss": 0.8691,
723
  "step": 930
724
  },
725
  {
726
  "epoch": 7.230769230769231,
727
+ "grad_norm": 4.417716026306152,
728
+ "learning_rate": 1.2769230769230769e-05,
729
+ "loss": 0.8817,
730
  "step": 940
731
  },
732
  {
733
  "epoch": 7.3076923076923075,
734
+ "grad_norm": 4.091881275177002,
735
+ "learning_rate": 1.2692307692307693e-05,
736
+ "loss": 0.8901,
737
  "step": 950
738
  },
739
  {
740
  "epoch": 7.384615384615385,
741
+ "grad_norm": 4.237718105316162,
742
+ "learning_rate": 1.2615384615384616e-05,
743
+ "loss": 0.863,
744
  "step": 960
745
  },
746
  {
747
  "epoch": 7.461538461538462,
748
+ "grad_norm": 6.6138410568237305,
749
+ "learning_rate": 1.253846153846154e-05,
750
+ "loss": 0.8556,
751
  "step": 970
752
  },
753
  {
754
  "epoch": 7.538461538461538,
755
+ "grad_norm": 4.715096473693848,
756
+ "learning_rate": 1.2461538461538463e-05,
757
+ "loss": 0.8851,
758
  "step": 980
759
  },
760
  {
761
  "epoch": 7.615384615384615,
762
+ "grad_norm": 3.7215723991394043,
763
+ "learning_rate": 1.2384615384615385e-05,
764
+ "loss": 0.8334,
765
  "step": 990
766
  },
767
  {
768
  "epoch": 7.6923076923076925,
769
+ "grad_norm": 3.3076252937316895,
770
+ "learning_rate": 1.230769230769231e-05,
771
+ "loss": 0.8542,
772
  "step": 1000
773
  },
774
  {
775
  "epoch": 7.769230769230769,
776
+ "grad_norm": 4.20552921295166,
777
+ "learning_rate": 1.2230769230769232e-05,
778
+ "loss": 0.8271,
779
  "step": 1010
780
  },
781
  {
782
  "epoch": 7.846153846153846,
783
+ "grad_norm": 4.651803016662598,
784
+ "learning_rate": 1.2153846153846153e-05,
785
+ "loss": 0.8346,
786
  "step": 1020
787
  },
788
  {
789
  "epoch": 7.923076923076923,
790
+ "grad_norm": 4.573400974273682,
791
+ "learning_rate": 1.2076923076923078e-05,
792
+ "loss": 0.8315,
793
  "step": 1030
794
  },
795
  {
796
  "epoch": 8.0,
797
+ "grad_norm": 6.5735182762146,
798
+ "learning_rate": 1.2e-05,
799
+ "loss": 0.869,
800
  "step": 1040
801
  },
802
  {
803
  "epoch": 8.0,
804
+ "eval_accuracy": 0.8270676691729323,
805
+ "eval_loss": 0.773723304271698,
806
+ "eval_runtime": 0.9363,
807
+ "eval_samples_per_second": 142.041,
808
+ "eval_steps_per_second": 18.156,
809
  "step": 1040
810
  },
811
  {
812
  "epoch": 8.076923076923077,
813
+ "grad_norm": 4.215645790100098,
814
+ "learning_rate": 1.1923076923076925e-05,
815
+ "loss": 0.8822,
816
  "step": 1050
817
  },
818
  {
819
  "epoch": 8.153846153846153,
820
+ "grad_norm": 3.5975663661956787,
821
+ "learning_rate": 1.1846153846153847e-05,
822
+ "loss": 0.8305,
823
  "step": 1060
824
  },
825
  {
826
  "epoch": 8.23076923076923,
827
+ "grad_norm": 4.703883647918701,
828
+ "learning_rate": 1.176923076923077e-05,
829
+ "loss": 0.8559,
830
  "step": 1070
831
  },
832
  {
833
  "epoch": 8.307692307692308,
834
+ "grad_norm": 3.2407641410827637,
835
+ "learning_rate": 1.1692307692307694e-05,
836
+ "loss": 0.8117,
837
  "step": 1080
838
  },
839
  {
840
  "epoch": 8.384615384615385,
841
+ "grad_norm": 4.487489700317383,
842
+ "learning_rate": 1.1615384615384617e-05,
843
+ "loss": 0.8241,
844
  "step": 1090
845
  },
846
  {
847
  "epoch": 8.461538461538462,
848
+ "grad_norm": 3.8978068828582764,
849
+ "learning_rate": 1.1538461538461538e-05,
850
+ "loss": 0.8537,
851
  "step": 1100
852
  },
853
  {
854
  "epoch": 8.538461538461538,
855
+ "grad_norm": 2.5178143978118896,
856
+ "learning_rate": 1.1461538461538462e-05,
857
+ "loss": 0.8062,
858
  "step": 1110
859
  },
860
  {
861
  "epoch": 8.615384615384615,
862
+ "grad_norm": 4.473972320556641,
863
+ "learning_rate": 1.1384615384615385e-05,
864
+ "loss": 0.8532,
865
  "step": 1120
866
  },
867
  {
868
  "epoch": 8.692307692307692,
869
+ "grad_norm": 5.469933032989502,
870
+ "learning_rate": 1.1307692307692309e-05,
871
+ "loss": 0.8532,
872
  "step": 1130
873
  },
874
  {
875
  "epoch": 8.76923076923077,
876
+ "grad_norm": 2.7200920581817627,
877
+ "learning_rate": 1.1230769230769232e-05,
878
+ "loss": 0.8203,
879
  "step": 1140
880
  },
881
  {
882
  "epoch": 8.846153846153847,
883
+ "grad_norm": 3.509950876235962,
884
+ "learning_rate": 1.1153846153846154e-05,
885
+ "loss": 0.82,
886
  "step": 1150
887
  },
888
  {
889
  "epoch": 8.923076923076923,
890
+ "grad_norm": 3.90140962600708,
891
+ "learning_rate": 1.1076923076923079e-05,
892
+ "loss": 0.8182,
893
  "step": 1160
894
  },
895
  {
896
  "epoch": 9.0,
897
+ "grad_norm": 16.341764450073242,
898
+ "learning_rate": 1.1000000000000001e-05,
899
+ "loss": 0.8804,
900
  "step": 1170
901
  },
902
  {
903
  "epoch": 9.0,
904
+ "eval_accuracy": 0.8270676691729323,
905
+ "eval_loss": 0.7098143100738525,
906
+ "eval_runtime": 0.9276,
907
+ "eval_samples_per_second": 143.385,
908
+ "eval_steps_per_second": 18.327,
909
  "step": 1170
910
  },
911
  {
912
  "epoch": 9.076923076923077,
913
+ "grad_norm": 5.392989635467529,
914
+ "learning_rate": 1.0923076923076922e-05,
915
+ "loss": 0.8143,
916
  "step": 1180
917
  },
918
  {
919
  "epoch": 9.153846153846153,
920
+ "grad_norm": 3.0304982662200928,
921
+ "learning_rate": 1.0846153846153847e-05,
922
+ "loss": 0.7586,
923
  "step": 1190
924
  },
925
  {
926
  "epoch": 9.23076923076923,
927
+ "grad_norm": 3.1372382640838623,
928
+ "learning_rate": 1.076923076923077e-05,
929
+ "loss": 0.7662,
930
  "step": 1200
931
  },
932
  {
933
  "epoch": 9.307692307692308,
934
+ "grad_norm": 3.1745128631591797,
935
+ "learning_rate": 1.0692307692307694e-05,
936
+ "loss": 0.8316,
937
  "step": 1210
938
  },
939
  {
940
  "epoch": 9.384615384615385,
941
+ "grad_norm": 6.142508029937744,
942
+ "learning_rate": 1.0615384615384616e-05,
943
+ "loss": 0.799,
944
  "step": 1220
945
  },
946
  {
947
  "epoch": 9.461538461538462,
948
+ "grad_norm": 4.660125255584717,
949
+ "learning_rate": 1.0538461538461539e-05,
950
+ "loss": 0.7981,
951
  "step": 1230
952
  },
953
  {
954
  "epoch": 9.538461538461538,
955
+ "grad_norm": 3.9741530418395996,
956
+ "learning_rate": 1.0461538461538463e-05,
957
+ "loss": 0.7887,
958
  "step": 1240
959
  },
960
  {
961
  "epoch": 9.615384615384615,
962
+ "grad_norm": 4.778607368469238,
963
+ "learning_rate": 1.0384615384615386e-05,
964
+ "loss": 0.7304,
965
  "step": 1250
966
  },
967
  {
968
  "epoch": 9.692307692307692,
969
+ "grad_norm": 4.286049842834473,
970
+ "learning_rate": 1.0307692307692307e-05,
971
+ "loss": 0.8002,
972
  "step": 1260
973
  },
974
  {
975
  "epoch": 9.76923076923077,
976
+ "grad_norm": 3.7198238372802734,
977
+ "learning_rate": 1.0230769230769231e-05,
978
+ "loss": 0.7861,
979
  "step": 1270
980
  },
981
  {
982
  "epoch": 9.846153846153847,
983
+ "grad_norm": 3.3311705589294434,
984
+ "learning_rate": 1.0153846153846154e-05,
985
+ "loss": 0.7667,
986
  "step": 1280
987
  },
988
  {
989
  "epoch": 9.923076923076923,
990
+ "grad_norm": 4.488588333129883,
991
+ "learning_rate": 1.0076923076923078e-05,
992
+ "loss": 0.8,
993
  "step": 1290
994
  },
995
  {
996
  "epoch": 10.0,
997
+ "grad_norm": 8.365084648132324,
998
+ "learning_rate": 1e-05,
999
+ "loss": 0.7757,
1000
  "step": 1300
1001
  },
1002
  {
1003
  "epoch": 10.0,
1004
+ "eval_accuracy": 0.8120300751879699,
1005
+ "eval_loss": 0.6705044507980347,
1006
+ "eval_runtime": 0.9309,
1007
+ "eval_samples_per_second": 142.866,
1008
+ "eval_steps_per_second": 18.261,
1009
  "step": 1300
1010
  },
1011
  {
1012
  "epoch": 10.076923076923077,
1013
+ "grad_norm": 4.813228130340576,
1014
+ "learning_rate": 9.923076923076923e-06,
1015
+ "loss": 0.7808,
1016
  "step": 1310
1017
  },
1018
  {
1019
  "epoch": 10.153846153846153,
1020
+ "grad_norm": 3.86871337890625,
1021
+ "learning_rate": 9.846153846153848e-06,
1022
+ "loss": 0.7749,
1023
  "step": 1320
1024
  },
1025
  {
1026
  "epoch": 10.23076923076923,
1027
+ "grad_norm": 4.498291492462158,
1028
+ "learning_rate": 9.76923076923077e-06,
1029
+ "loss": 0.7842,
1030
  "step": 1330
1031
  },
1032
  {
1033
  "epoch": 10.307692307692308,
1034
+ "grad_norm": 7.416966438293457,
1035
+ "learning_rate": 9.692307692307693e-06,
1036
+ "loss": 0.7397,
1037
  "step": 1340
1038
  },
1039
  {
1040
  "epoch": 10.384615384615385,
1041
+ "grad_norm": 3.8194034099578857,
1042
+ "learning_rate": 9.615384615384616e-06,
1043
+ "loss": 0.7101,
1044
  "step": 1350
1045
  },
1046
  {
1047
  "epoch": 10.461538461538462,
1048
+ "grad_norm": 4.42163610458374,
1049
+ "learning_rate": 9.53846153846154e-06,
1050
+ "loss": 0.7694,
1051
  "step": 1360
1052
  },
1053
  {
1054
  "epoch": 10.538461538461538,
1055
+ "grad_norm": 4.710392951965332,
1056
+ "learning_rate": 9.461538461538463e-06,
1057
+ "loss": 0.7536,
1058
  "step": 1370
1059
  },
1060
  {
1061
  "epoch": 10.615384615384615,
1062
+ "grad_norm": 4.0583906173706055,
1063
+ "learning_rate": 9.384615384615385e-06,
1064
+ "loss": 0.7591,
1065
  "step": 1380
1066
  },
1067
  {
1068
  "epoch": 10.692307692307692,
1069
+ "grad_norm": 4.696585655212402,
1070
+ "learning_rate": 9.307692307692308e-06,
1071
+ "loss": 0.7469,
1072
  "step": 1390
1073
  },
1074
  {
1075
  "epoch": 10.76923076923077,
1076
+ "grad_norm": 4.995838642120361,
1077
+ "learning_rate": 9.230769230769232e-06,
1078
+ "loss": 0.776,
1079
  "step": 1400
1080
  },
1081
  {
1082
  "epoch": 10.846153846153847,
1083
+ "grad_norm": 3.7382941246032715,
1084
+ "learning_rate": 9.153846153846155e-06,
1085
+ "loss": 0.7281,
1086
  "step": 1410
1087
  },
1088
  {
1089
  "epoch": 10.923076923076923,
1090
+ "grad_norm": 4.595252513885498,
1091
+ "learning_rate": 9.076923076923078e-06,
1092
+ "loss": 0.7781,
1093
  "step": 1420
1094
  },
1095
  {
1096
  "epoch": 11.0,
1097
+ "grad_norm": 10.765867233276367,
1098
+ "learning_rate": 9e-06,
1099
+ "loss": 0.7694,
1100
  "step": 1430
1101
  },
1102
  {
1103
  "epoch": 11.0,
1104
+ "eval_accuracy": 0.8571428571428571,
1105
+ "eval_loss": 0.638173520565033,
1106
+ "eval_runtime": 0.9524,
1107
+ "eval_samples_per_second": 139.647,
1108
+ "eval_steps_per_second": 17.85,
1109
  "step": 1430
1110
  },
1111
  {
1112
  "epoch": 11.076923076923077,
1113
+ "grad_norm": 4.3198041915893555,
1114
+ "learning_rate": 8.923076923076925e-06,
1115
+ "loss": 0.7603,
1116
  "step": 1440
1117
  },
1118
  {
1119
  "epoch": 11.153846153846153,
1120
+ "grad_norm": 6.03615140914917,
1121
+ "learning_rate": 8.846153846153847e-06,
1122
+ "loss": 0.7594,
1123
  "step": 1450
1124
  },
1125
  {
1126
  "epoch": 11.23076923076923,
1127
+ "grad_norm": 4.512632846832275,
1128
+ "learning_rate": 8.76923076923077e-06,
1129
+ "loss": 0.7111,
1130
  "step": 1460
1131
  },
1132
  {
1133
  "epoch": 11.307692307692308,
1134
+ "grad_norm": 3.5640311241149902,
1135
+ "learning_rate": 8.692307692307692e-06,
1136
+ "loss": 0.7783,
1137
  "step": 1470
1138
  },
1139
  {
1140
  "epoch": 11.384615384615385,
1141
+ "grad_norm": 3.612410306930542,
1142
+ "learning_rate": 8.615384615384617e-06,
1143
+ "loss": 0.7285,
1144
  "step": 1480
1145
  },
1146
  {
1147
  "epoch": 11.461538461538462,
1148
+ "grad_norm": 5.0590996742248535,
1149
+ "learning_rate": 8.53846153846154e-06,
1150
+ "loss": 0.7128,
1151
  "step": 1490
1152
  },
1153
  {
1154
  "epoch": 11.538461538461538,
1155
+ "grad_norm": 4.820272445678711,
1156
+ "learning_rate": 8.461538461538462e-06,
1157
+ "loss": 0.798,
1158
  "step": 1500
1159
  },
1160
  {
1161
  "epoch": 11.615384615384615,
1162
+ "grad_norm": 3.6416468620300293,
1163
+ "learning_rate": 8.384615384615385e-06,
1164
+ "loss": 0.7408,
1165
  "step": 1510
1166
  },
1167
  {
1168
  "epoch": 11.692307692307692,
1169
+ "grad_norm": 4.562481880187988,
1170
+ "learning_rate": 8.307692307692309e-06,
1171
+ "loss": 0.7682,
1172
  "step": 1520
1173
  },
1174
  {
1175
  "epoch": 11.76923076923077,
1176
+ "grad_norm": 3.1171770095825195,
1177
+ "learning_rate": 8.230769230769232e-06,
1178
+ "loss": 0.7337,
1179
  "step": 1530
1180
  },
1181
  {
1182
  "epoch": 11.846153846153847,
1183
+ "grad_norm": 3.2231085300445557,
1184
+ "learning_rate": 8.153846153846154e-06,
1185
+ "loss": 0.7348,
1186
  "step": 1540
1187
  },
1188
  {
1189
  "epoch": 11.923076923076923,
1190
+ "grad_norm": 3.5213522911071777,
1191
+ "learning_rate": 8.076923076923077e-06,
1192
+ "loss": 0.7064,
1193
  "step": 1550
1194
  },
1195
  {
1196
  "epoch": 12.0,
1197
+ "grad_norm": 7.278842449188232,
1198
+ "learning_rate": 8.000000000000001e-06,
1199
+ "loss": 0.7966,
1200
  "step": 1560
1201
  },
1202
  {
1203
  "epoch": 12.0,
1204
+ "eval_accuracy": 0.7894736842105263,
1205
+ "eval_loss": 0.6087508797645569,
1206
+ "eval_runtime": 0.9347,
1207
+ "eval_samples_per_second": 142.285,
1208
+ "eval_steps_per_second": 18.187,
1209
  "step": 1560
1210
  },
1211
  {
1212
  "epoch": 12.076923076923077,
1213
+ "grad_norm": 8.028270721435547,
1214
+ "learning_rate": 7.923076923076924e-06,
1215
+ "loss": 0.7035,
1216
  "step": 1570
1217
  },
1218
  {
1219
  "epoch": 12.153846153846153,
1220
+ "grad_norm": 3.4982223510742188,
1221
+ "learning_rate": 7.846153846153847e-06,
1222
+ "loss": 0.6953,
1223
  "step": 1580
1224
  },
1225
  {
1226
  "epoch": 12.23076923076923,
1227
+ "grad_norm": 3.474436044692993,
1228
+ "learning_rate": 7.76923076923077e-06,
1229
+ "loss": 0.7272,
1230
  "step": 1590
1231
  },
1232
  {
1233
  "epoch": 12.307692307692308,
1234
+ "grad_norm": 5.147262096405029,
1235
+ "learning_rate": 7.692307692307694e-06,
1236
+ "loss": 0.7639,
1237
  "step": 1600
1238
  },
1239
  {
1240
  "epoch": 12.384615384615385,
1241
+ "grad_norm": 3.383554458618164,
1242
+ "learning_rate": 7.615384615384615e-06,
1243
+ "loss": 0.7094,
1244
  "step": 1610
1245
  },
1246
  {
1247
  "epoch": 12.461538461538462,
1248
+ "grad_norm": 3.3019802570343018,
1249
+ "learning_rate": 7.538461538461539e-06,
1250
+ "loss": 0.685,
1251
  "step": 1620
1252
  },
1253
  {
1254
  "epoch": 12.538461538461538,
1255
+ "grad_norm": 3.602853775024414,
1256
+ "learning_rate": 7.461538461538462e-06,
1257
+ "loss": 0.7068,
1258
  "step": 1630
1259
  },
1260
  {
1261
  "epoch": 12.615384615384615,
1262
+ "grad_norm": 3.3312697410583496,
1263
+ "learning_rate": 7.384615384615386e-06,
1264
+ "loss": 0.7045,
1265
  "step": 1640
1266
  },
1267
  {
1268
  "epoch": 12.692307692307692,
1269
+ "grad_norm": 6.843470573425293,
1270
+ "learning_rate": 7.307692307692308e-06,
1271
+ "loss": 0.6643,
1272
  "step": 1650
1273
  },
1274
  {
1275
  "epoch": 12.76923076923077,
1276
+ "grad_norm": 9.701897621154785,
1277
+ "learning_rate": 7.230769230769231e-06,
1278
+ "loss": 0.8052,
1279
  "step": 1660
1280
  },
1281
  {
1282
  "epoch": 12.846153846153847,
1283
+ "grad_norm": 5.923687934875488,
1284
+ "learning_rate": 7.153846153846155e-06,
1285
+ "loss": 0.7118,
1286
  "step": 1670
1287
  },
1288
  {
1289
  "epoch": 12.923076923076923,
1290
+ "grad_norm": 3.330617666244507,
1291
+ "learning_rate": 7.076923076923078e-06,
1292
+ "loss": 0.701,
1293
  "step": 1680
1294
  },
1295
  {
1296
  "epoch": 13.0,
1297
+ "grad_norm": 10.613458633422852,
1298
+ "learning_rate": 7e-06,
1299
+ "loss": 0.7425,
1300
  "step": 1690
1301
  },
1302
  {
1303
  "epoch": 13.0,
1304
+ "eval_accuracy": 0.849624060150376,
1305
+ "eval_loss": 0.572424054145813,
1306
+ "eval_runtime": 0.9506,
1307
+ "eval_samples_per_second": 139.913,
1308
+ "eval_steps_per_second": 17.884,
1309
  "step": 1690
1310
  },
1311
  {
1312
  "epoch": 13.076923076923077,
1313
+ "grad_norm": 4.795916557312012,
1314
+ "learning_rate": 6.923076923076923e-06,
1315
+ "loss": 0.7275,
1316
  "step": 1700
1317
  },
1318
  {
1319
  "epoch": 13.153846153846153,
1320
+ "grad_norm": 3.3990397453308105,
1321
+ "learning_rate": 6.846153846153847e-06,
1322
+ "loss": 0.737,
1323
  "step": 1710
1324
  },
1325
  {
1326
  "epoch": 13.23076923076923,
1327
+ "grad_norm": 5.3640851974487305,
1328
+ "learning_rate": 6.76923076923077e-06,
1329
+ "loss": 0.7653,
1330
  "step": 1720
1331
  },
1332
  {
1333
  "epoch": 13.307692307692308,
1334
+ "grad_norm": 2.9321601390838623,
1335
+ "learning_rate": 6.692307692307692e-06,
1336
+ "loss": 0.741,
1337
  "step": 1730
1338
  },
1339
  {
1340
  "epoch": 13.384615384615385,
1341
+ "grad_norm": 5.113746166229248,
1342
+ "learning_rate": 6.615384615384616e-06,
1343
+ "loss": 0.6832,
1344
  "step": 1740
1345
  },
1346
  {
1347
  "epoch": 13.461538461538462,
1348
+ "grad_norm": 4.589268207550049,
1349
+ "learning_rate": 6.538461538461539e-06,
1350
+ "loss": 0.7463,
1351
  "step": 1750
1352
  },
1353
  {
1354
  "epoch": 13.538461538461538,
1355
+ "grad_norm": 3.7653889656066895,
1356
+ "learning_rate": 6.461538461538463e-06,
1357
+ "loss": 0.7569,
1358
  "step": 1760
1359
  },
1360
  {
1361
  "epoch": 13.615384615384615,
1362
+ "grad_norm": 4.248018264770508,
1363
+ "learning_rate": 6.384615384615384e-06,
1364
+ "loss": 0.6853,
1365
  "step": 1770
1366
  },
1367
  {
1368
  "epoch": 13.692307692307692,
1369
+ "grad_norm": 4.481900691986084,
1370
+ "learning_rate": 6.307692307692308e-06,
1371
+ "loss": 0.6679,
1372
  "step": 1780
1373
  },
1374
  {
1375
  "epoch": 13.76923076923077,
1376
+ "grad_norm": 3.7759058475494385,
1377
+ "learning_rate": 6.230769230769231e-06,
1378
+ "loss": 0.7159,
1379
  "step": 1790
1380
  },
1381
  {
1382
  "epoch": 13.846153846153847,
1383
+ "grad_norm": 7.013620853424072,
1384
+ "learning_rate": 6.153846153846155e-06,
1385
+ "loss": 0.6663,
1386
  "step": 1800
1387
  },
1388
  {
1389
  "epoch": 13.923076923076923,
1390
+ "grad_norm": 3.7434396743774414,
1391
+ "learning_rate": 6.076923076923077e-06,
1392
+ "loss": 0.6043,
1393
  "step": 1810
1394
  },
1395
  {
1396
  "epoch": 14.0,
1397
+ "grad_norm": 8.63290023803711,
1398
+ "learning_rate": 6e-06,
1399
+ "loss": 0.7698,
1400
  "step": 1820
1401
  },
1402
  {
1403
  "epoch": 14.0,
1404
+ "eval_accuracy": 0.8195488721804511,
1405
+ "eval_loss": 0.5665194392204285,
1406
+ "eval_runtime": 0.9506,
1407
+ "eval_samples_per_second": 139.918,
1408
+ "eval_steps_per_second": 17.884,
1409
  "step": 1820
1410
  },
1411
  {
1412
  "epoch": 14.076923076923077,
1413
+ "grad_norm": 6.073098659515381,
1414
+ "learning_rate": 5.923076923076924e-06,
1415
+ "loss": 0.701,
1416
  "step": 1830
1417
  },
1418
  {
1419
  "epoch": 14.153846153846153,
1420
+ "grad_norm": 4.466890811920166,
1421
+ "learning_rate": 5.846153846153847e-06,
1422
+ "loss": 0.712,
1423
  "step": 1840
1424
  },
1425
  {
1426
  "epoch": 14.23076923076923,
1427
+ "grad_norm": 4.844086170196533,
1428
+ "learning_rate": 5.769230769230769e-06,
1429
+ "loss": 0.6637,
1430
  "step": 1850
1431
  },
1432
  {
1433
  "epoch": 14.307692307692308,
1434
+ "grad_norm": 5.5982184410095215,
1435
+ "learning_rate": 5.692307692307692e-06,
1436
+ "loss": 0.6735,
1437
  "step": 1860
1438
  },
1439
  {
1440
  "epoch": 14.384615384615385,
1441
+ "grad_norm": 6.027699947357178,
1442
+ "learning_rate": 5.615384615384616e-06,
1443
+ "loss": 0.672,
1444
  "step": 1870
1445
  },
1446
  {
1447
  "epoch": 14.461538461538462,
1448
+ "grad_norm": 3.128363847732544,
1449
+ "learning_rate": 5.538461538461539e-06,
1450
+ "loss": 0.6864,
1451
  "step": 1880
1452
  },
1453
  {
1454
  "epoch": 14.538461538461538,
1455
+ "grad_norm": 5.279370307922363,
1456
+ "learning_rate": 5.461538461538461e-06,
1457
+ "loss": 0.6765,
1458
  "step": 1890
1459
  },
1460
  {
1461
  "epoch": 14.615384615384615,
1462
+ "grad_norm": 3.779651165008545,
1463
+ "learning_rate": 5.384615384615385e-06,
1464
+ "loss": 0.6902,
1465
  "step": 1900
1466
  },
1467
  {
1468
  "epoch": 14.692307692307692,
1469
+ "grad_norm": 7.5796003341674805,
1470
+ "learning_rate": 5.307692307692308e-06,
1471
+ "loss": 0.6971,
1472
  "step": 1910
1473
  },
1474
  {
1475
  "epoch": 14.76923076923077,
1476
+ "grad_norm": 5.17158842086792,
1477
+ "learning_rate": 5.230769230769232e-06,
1478
+ "loss": 0.6997,
1479
  "step": 1920
1480
  },
1481
  {
1482
  "epoch": 14.846153846153847,
1483
+ "grad_norm": 3.0330209732055664,
1484
+ "learning_rate": 5.1538461538461534e-06,
1485
+ "loss": 0.778,
1486
  "step": 1930
1487
  },
1488
  {
1489
  "epoch": 14.923076923076923,
1490
+ "grad_norm": 7.733983993530273,
1491
+ "learning_rate": 5.076923076923077e-06,
1492
+ "loss": 0.6478,
1493
  "step": 1940
1494
  },
1495
  {
1496
  "epoch": 15.0,
1497
+ "grad_norm": 7.919536590576172,
1498
+ "learning_rate": 5e-06,
1499
+ "loss": 0.6632,
1500
  "step": 1950
1501
  },
1502
  {
1503
  "epoch": 15.0,
1504
+ "eval_accuracy": 0.8571428571428571,
1505
+ "eval_loss": 0.5307806730270386,
1506
+ "eval_runtime": 0.9229,
1507
+ "eval_samples_per_second": 144.113,
1508
+ "eval_steps_per_second": 18.421,
1509
  "step": 1950
1510
  },
1511
  {
1512
  "epoch": 15.076923076923077,
1513
+ "grad_norm": 4.707977771759033,
1514
  "learning_rate": 4.923076923076924e-06,
1515
+ "loss": 0.6156,
1516
  "step": 1960
1517
  },
1518
  {
1519
  "epoch": 15.153846153846153,
1520
+ "grad_norm": 5.254363536834717,
1521
  "learning_rate": 4.8461538461538465e-06,
1522
+ "loss": 0.6478,
1523
  "step": 1970
1524
  },
1525
  {
1526
  "epoch": 15.23076923076923,
1527
+ "grad_norm": 4.477289199829102,
1528
  "learning_rate": 4.76923076923077e-06,
1529
+ "loss": 0.6786,
1530
  "step": 1980
1531
  },
1532
  {
1533
  "epoch": 15.307692307692308,
1534
+ "grad_norm": 6.381091117858887,
1535
  "learning_rate": 4.692307692307693e-06,
1536
+ "loss": 0.7183,
1537
  "step": 1990
1538
  },
1539
  {
1540
  "epoch": 15.384615384615385,
1541
+ "grad_norm": 3.8760223388671875,
1542
  "learning_rate": 4.615384615384616e-06,
1543
+ "loss": 0.6391,
1544
  "step": 2000
1545
  },
1546
  {
1547
  "epoch": 15.461538461538462,
1548
+ "grad_norm": 7.011289119720459,
1549
  "learning_rate": 4.538461538461539e-06,
1550
+ "loss": 0.6135,
1551
  "step": 2010
1552
  },
1553
  {
1554
  "epoch": 15.538461538461538,
1555
+ "grad_norm": 5.04637336730957,
1556
  "learning_rate": 4.461538461538462e-06,
1557
+ "loss": 0.628,
1558
  "step": 2020
1559
  },
1560
  {
1561
  "epoch": 15.615384615384615,
1562
+ "grad_norm": 7.818150520324707,
1563
  "learning_rate": 4.384615384615385e-06,
1564
+ "loss": 0.6537,
1565
  "step": 2030
1566
  },
1567
  {
1568
  "epoch": 15.692307692307692,
1569
+ "grad_norm": 3.9413349628448486,
1570
  "learning_rate": 4.307692307692308e-06,
1571
+ "loss": 0.6662,
1572
  "step": 2040
1573
  },
1574
  {
1575
  "epoch": 15.76923076923077,
1576
+ "grad_norm": 5.911675930023193,
1577
  "learning_rate": 4.230769230769231e-06,
1578
+ "loss": 0.6284,
1579
  "step": 2050
1580
  },
1581
  {
1582
  "epoch": 15.846153846153847,
1583
+ "grad_norm": 4.451909065246582,
1584
  "learning_rate": 4.1538461538461545e-06,
1585
+ "loss": 0.6906,
1586
  "step": 2060
1587
  },
1588
  {
1589
  "epoch": 15.923076923076923,
1590
+ "grad_norm": 5.271285057067871,
1591
  "learning_rate": 4.076923076923077e-06,
1592
+ "loss": 0.6453,
1593
  "step": 2070
1594
  },
1595
  {
1596
  "epoch": 16.0,
1597
+ "grad_norm": 8.229835510253906,
1598
  "learning_rate": 4.000000000000001e-06,
1599
+ "loss": 0.6162,
1600
  "step": 2080
1601
  },
1602
  {
1603
  "epoch": 16.0,
1604
+ "eval_accuracy": 0.8345864661654135,
1605
+ "eval_loss": 0.5261984467506409,
1606
+ "eval_runtime": 0.9205,
1607
+ "eval_samples_per_second": 144.487,
1608
+ "eval_steps_per_second": 18.468,
1609
  "step": 2080
1610
  },
1611
  {
1612
  "epoch": 16.076923076923077,
1613
+ "grad_norm": 12.219758033752441,
1614
  "learning_rate": 3.923076923076923e-06,
1615
+ "loss": 0.7381,
1616
  "step": 2090
1617
  },
1618
  {
1619
  "epoch": 16.153846153846153,
1620
+ "grad_norm": 4.024055480957031,
1621
  "learning_rate": 3.846153846153847e-06,
1622
+ "loss": 0.708,
1623
  "step": 2100
1624
  },
1625
  {
1626
  "epoch": 16.23076923076923,
1627
+ "grad_norm": 5.345643043518066,
1628
  "learning_rate": 3.7692307692307694e-06,
1629
+ "loss": 0.7214,
1630
  "step": 2110
1631
  },
1632
  {
1633
  "epoch": 16.307692307692307,
1634
+ "grad_norm": 7.9586992263793945,
1635
  "learning_rate": 3.692307692307693e-06,
1636
+ "loss": 0.7163,
1637
  "step": 2120
1638
  },
1639
  {
1640
  "epoch": 16.384615384615383,
1641
+ "grad_norm": 7.007926940917969,
1642
  "learning_rate": 3.6153846153846156e-06,
1643
+ "loss": 0.6692,
1644
  "step": 2130
1645
  },
1646
  {
1647
  "epoch": 16.46153846153846,
1648
+ "grad_norm": 8.54269027709961,
1649
  "learning_rate": 3.538461538461539e-06,
1650
+ "loss": 0.661,
1651
  "step": 2140
1652
  },
1653
  {
1654
  "epoch": 16.53846153846154,
1655
+ "grad_norm": 6.417664051055908,
1656
  "learning_rate": 3.4615384615384617e-06,
1657
+ "loss": 0.6186,
1658
  "step": 2150
1659
  },
1660
  {
1661
  "epoch": 16.615384615384617,
1662
+ "grad_norm": 3.766784429550171,
1663
  "learning_rate": 3.384615384615385e-06,
1664
+ "loss": 0.6254,
1665
  "step": 2160
1666
  },
1667
  {
1668
  "epoch": 16.692307692307693,
1669
+ "grad_norm": 4.219268798828125,
1670
  "learning_rate": 3.307692307692308e-06,
1671
+ "loss": 0.6308,
1672
  "step": 2170
1673
  },
1674
  {
1675
  "epoch": 16.76923076923077,
1676
+ "grad_norm": 6.51754903793335,
1677
  "learning_rate": 3.2307692307692313e-06,
1678
+ "loss": 0.7395,
1679
  "step": 2180
1680
  },
1681
  {
1682
  "epoch": 16.846153846153847,
1683
+ "grad_norm": 4.084061622619629,
1684
  "learning_rate": 3.153846153846154e-06,
1685
+ "loss": 0.7093,
1686
  "step": 2190
1687
  },
1688
  {
1689
  "epoch": 16.923076923076923,
1690
+ "grad_norm": 2.852893590927124,
1691
  "learning_rate": 3.0769230769230774e-06,
1692
+ "loss": 0.6213,
1693
  "step": 2200
1694
  },
1695
  {
1696
  "epoch": 17.0,
1697
+ "grad_norm": 9.559004783630371,
1698
  "learning_rate": 3e-06,
1699
+ "loss": 0.6128,
1700
  "step": 2210
1701
  },
1702
  {
1703
  "epoch": 17.0,
1704
+ "eval_accuracy": 0.8421052631578947,
1705
+ "eval_loss": 0.5081294178962708,
1706
+ "eval_runtime": 0.9208,
1707
+ "eval_samples_per_second": 144.443,
1708
+ "eval_steps_per_second": 18.463,
1709
  "step": 2210
1710
  },
1711
  {
1712
  "epoch": 17.076923076923077,
1713
+ "grad_norm": 5.92500114440918,
1714
  "learning_rate": 2.9230769230769236e-06,
1715
+ "loss": 0.6286,
1716
  "step": 2220
1717
  },
1718
  {
1719
  "epoch": 17.153846153846153,
1720
+ "grad_norm": 5.656437397003174,
1721
  "learning_rate": 2.846153846153846e-06,
1722
+ "loss": 0.6561,
1723
  "step": 2230
1724
  },
1725
  {
1726
  "epoch": 17.23076923076923,
1727
+ "grad_norm": 5.492016792297363,
1728
  "learning_rate": 2.7692307692307697e-06,
1729
+ "loss": 0.6089,
1730
  "step": 2240
1731
  },
1732
  {
1733
  "epoch": 17.307692307692307,
1734
+ "grad_norm": 5.833240985870361,
1735
  "learning_rate": 2.6923076923076923e-06,
1736
+ "loss": 0.6818,
1737
  "step": 2250
1738
  },
1739
  {
1740
  "epoch": 17.384615384615383,
1741
+ "grad_norm": 3.1228320598602295,
1742
  "learning_rate": 2.615384615384616e-06,
1743
+ "loss": 0.6544,
1744
  "step": 2260
1745
  },
1746
  {
1747
  "epoch": 17.46153846153846,
1748
+ "grad_norm": 5.364021301269531,
1749
  "learning_rate": 2.5384615384615385e-06,
1750
+ "loss": 0.5678,
1751
  "step": 2270
1752
  },
1753
  {
1754
  "epoch": 17.53846153846154,
1755
+ "grad_norm": 5.820409297943115,
1756
  "learning_rate": 2.461538461538462e-06,
1757
+ "loss": 0.7722,
1758
  "step": 2280
1759
  },
1760
  {
1761
  "epoch": 17.615384615384617,
1762
+ "grad_norm": 4.316230297088623,
1763
  "learning_rate": 2.384615384615385e-06,
1764
+ "loss": 0.6169,
1765
  "step": 2290
1766
  },
1767
  {
1768
  "epoch": 17.692307692307693,
1769
+ "grad_norm": 4.034291744232178,
1770
  "learning_rate": 2.307692307692308e-06,
1771
+ "loss": 0.616,
1772
  "step": 2300
1773
  },
1774
  {
1775
  "epoch": 17.76923076923077,
1776
+ "grad_norm": 9.319421768188477,
1777
  "learning_rate": 2.230769230769231e-06,
1778
+ "loss": 0.6772,
1779
  "step": 2310
1780
  },
1781
  {
1782
  "epoch": 17.846153846153847,
1783
+ "grad_norm": 4.886960029602051,
1784
  "learning_rate": 2.153846153846154e-06,
1785
+ "loss": 0.6425,
1786
  "step": 2320
1787
  },
1788
  {
1789
  "epoch": 17.923076923076923,
1790
+ "grad_norm": 3.6336989402770996,
1791
  "learning_rate": 2.0769230769230773e-06,
1792
+ "loss": 0.6684,
1793
  "step": 2330
1794
  },
1795
  {
1796
  "epoch": 18.0,
1797
+ "grad_norm": 6.341444492340088,
1798
  "learning_rate": 2.0000000000000003e-06,
1799
+ "loss": 0.685,
1800
  "step": 2340
1801
  },
1802
  {
1803
  "epoch": 18.0,
1804
+ "eval_accuracy": 0.8571428571428571,
1805
+ "eval_loss": 0.49125248193740845,
1806
+ "eval_runtime": 0.9184,
1807
+ "eval_samples_per_second": 144.82,
1808
+ "eval_steps_per_second": 18.511,
1809
  "step": 2340
1810
  },
1811
  {
1812
  "epoch": 18.076923076923077,
1813
+ "grad_norm": 4.5043158531188965,
1814
  "learning_rate": 1.9230769230769234e-06,
1815
+ "loss": 0.5884,
1816
  "step": 2350
1817
  },
1818
  {
1819
  "epoch": 18.153846153846153,
1820
+ "grad_norm": 5.0124592781066895,
1821
  "learning_rate": 1.8461538461538465e-06,
1822
+ "loss": 0.7235,
1823
  "step": 2360
1824
  },
1825
  {
1826
  "epoch": 18.23076923076923,
1827
+ "grad_norm": 3.5005030632019043,
1828
  "learning_rate": 1.7692307692307695e-06,
1829
+ "loss": 0.6181,
1830
  "step": 2370
1831
  },
1832
  {
1833
  "epoch": 18.307692307692307,
1834
+ "grad_norm": 5.270286560058594,
1835
  "learning_rate": 1.6923076923076926e-06,
1836
+ "loss": 0.6754,
1837
  "step": 2380
1838
  },
1839
  {
1840
  "epoch": 18.384615384615383,
1841
+ "grad_norm": 3.9892470836639404,
1842
  "learning_rate": 1.6153846153846157e-06,
1843
+ "loss": 0.6778,
1844
  "step": 2390
1845
  },
1846
  {
1847
  "epoch": 18.46153846153846,
1848
+ "grad_norm": 4.945601940155029,
1849
  "learning_rate": 1.5384615384615387e-06,
1850
+ "loss": 0.6981,
1851
  "step": 2400
1852
  },
1853
  {
1854
  "epoch": 18.53846153846154,
1855
+ "grad_norm": 6.190303802490234,
1856
  "learning_rate": 1.4615384615384618e-06,
1857
+ "loss": 0.6147,
1858
  "step": 2410
1859
  },
1860
  {
1861
  "epoch": 18.615384615384617,
1862
+ "grad_norm": 11.984601974487305,
1863
  "learning_rate": 1.3846153846153848e-06,
1864
+ "loss": 0.695,
1865
  "step": 2420
1866
  },
1867
  {
1868
  "epoch": 18.692307692307693,
1869
+ "grad_norm": 5.748422622680664,
1870
  "learning_rate": 1.307692307692308e-06,
1871
+ "loss": 0.6945,
1872
  "step": 2430
1873
  },
1874
  {
1875
  "epoch": 18.76923076923077,
1876
+ "grad_norm": 7.194711685180664,
1877
  "learning_rate": 1.230769230769231e-06,
1878
+ "loss": 0.7725,
1879
  "step": 2440
1880
  },
1881
  {
1882
  "epoch": 18.846153846153847,
1883
+ "grad_norm": 4.310044288635254,
1884
  "learning_rate": 1.153846153846154e-06,
1885
+ "loss": 0.6847,
1886
  "step": 2450
1887
  },
1888
  {
1889
  "epoch": 18.923076923076923,
1890
+ "grad_norm": 4.310553073883057,
1891
  "learning_rate": 1.076923076923077e-06,
1892
+ "loss": 0.7078,
1893
  "step": 2460
1894
  },
1895
  {
1896
  "epoch": 19.0,
1897
+ "grad_norm": 7.30472993850708,
1898
  "learning_rate": 1.0000000000000002e-06,
1899
+ "loss": 0.6614,
1900
  "step": 2470
1901
  },
1902
  {
1903
  "epoch": 19.0,
1904
+ "eval_accuracy": 0.849624060150376,
1905
+ "eval_loss": 0.49367815256118774,
1906
+ "eval_runtime": 0.9215,
1907
+ "eval_samples_per_second": 144.328,
1908
+ "eval_steps_per_second": 18.448,
1909
  "step": 2470
1910
  },
1911
  {
1912
  "epoch": 19.076923076923077,
1913
+ "grad_norm": 3.58013653755188,
1914
  "learning_rate": 9.230769230769232e-07,
1915
+ "loss": 0.621,
1916
  "step": 2480
1917
  },
1918
  {
1919
  "epoch": 19.153846153846153,
1920
+ "grad_norm": 5.2532453536987305,
1921
  "learning_rate": 8.461538461538463e-07,
1922
+ "loss": 0.6367,
1923
  "step": 2490
1924
  },
1925
  {
1926
  "epoch": 19.23076923076923,
1927
+ "grad_norm": 11.078712463378906,
1928
  "learning_rate": 7.692307692307694e-07,
1929
+ "loss": 0.6835,
1930
  "step": 2500
1931
  },
1932
  {
1933
  "epoch": 19.307692307692307,
1934
+ "grad_norm": 4.118062973022461,
1935
  "learning_rate": 6.923076923076924e-07,
1936
+ "loss": 0.6719,
1937
  "step": 2510
1938
  },
1939
  {
1940
  "epoch": 19.384615384615383,
1941
+ "grad_norm": 4.630733966827393,
1942
  "learning_rate": 6.153846153846155e-07,
1943
+ "loss": 0.6546,
1944
  "step": 2520
1945
  },
1946
  {
1947
  "epoch": 19.46153846153846,
1948
+ "grad_norm": 4.452598571777344,
1949
  "learning_rate": 5.384615384615386e-07,
1950
+ "loss": 0.6052,
1951
  "step": 2530
1952
  },
1953
  {
1954
  "epoch": 19.53846153846154,
1955
+ "grad_norm": 5.847692966461182,
1956
  "learning_rate": 4.615384615384616e-07,
1957
+ "loss": 0.6786,
1958
  "step": 2540
1959
  },
1960
  {
1961
  "epoch": 19.615384615384617,
1962
+ "grad_norm": 5.957764625549316,
1963
  "learning_rate": 3.846153846153847e-07,
1964
+ "loss": 0.677,
1965
  "step": 2550
1966
  },
1967
  {
1968
  "epoch": 19.692307692307693,
1969
+ "grad_norm": 6.010245323181152,
1970
  "learning_rate": 3.0769230769230774e-07,
1971
+ "loss": 0.6106,
1972
  "step": 2560
1973
  },
1974
  {
1975
  "epoch": 19.76923076923077,
1976
+ "grad_norm": 5.3892412185668945,
1977
  "learning_rate": 2.307692307692308e-07,
1978
+ "loss": 0.5836,
1979
  "step": 2570
1980
  },
1981
  {
1982
  "epoch": 19.846153846153847,
1983
+ "grad_norm": 4.214204788208008,
1984
  "learning_rate": 1.5384615384615387e-07,
1985
+ "loss": 0.6448,
1986
  "step": 2580
1987
  },
1988
  {
1989
  "epoch": 19.923076923076923,
1990
+ "grad_norm": 4.230797290802002,
1991
  "learning_rate": 7.692307692307694e-08,
1992
+ "loss": 0.5617,
1993
  "step": 2590
1994
  },
1995
  {
1996
  "epoch": 20.0,
1997
+ "grad_norm": 10.618528366088867,
1998
  "learning_rate": 0.0,
1999
+ "loss": 0.6934,
2000
  "step": 2600
2001
  },
2002
  {
2003
  "epoch": 20.0,
2004
+ "eval_accuracy": 0.8571428571428571,
2005
+ "eval_loss": 0.5026500821113586,
2006
+ "eval_runtime": 1.0302,
2007
+ "eval_samples_per_second": 129.097,
2008
+ "eval_steps_per_second": 16.501,
2009
  "step": 2600
2010
  },
2011
  {
2012
  "epoch": 20.0,
2013
  "step": 2600,
2014
+ "total_flos": 7.939121542823117e+17,
2015
+ "train_loss": 0.8216404274793772,
2016
+ "train_runtime": 338.1702,
2017
+ "train_samples_per_second": 61.153,
2018
+ "train_steps_per_second": 7.688
2019
  }
2020
  ],
2021
  "logging_steps": 10,
 
2035
  "attributes": {}
2036
  }
2037
  },
2038
+ "total_flos": 7.939121542823117e+17,
2039
  "train_batch_size": 8,
2040
  "trial_name": null,
2041
  "trial_params": null