nemik commited on
Commit
70ae379
·
verified ·
1 Parent(s): ddb3152

frostsolutions/frost-vision-v2-google_vit-base-patch16-224-v2024-11-14

Browse files
README.md CHANGED
@@ -26,16 +26,16 @@ model-index:
26
  metrics:
27
  - name: Accuracy
28
  type: accuracy
29
- value: 0.9401234567901234
30
  - name: F1
31
  type: f1
32
- value: 0.847723704866562
33
  - name: Precision
34
  type: precision
35
- value: 0.864
36
  - name: Recall
37
  type: recall
38
- value: 0.8320493066255779
39
  ---
40
 
41
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -45,11 +45,11 @@ should probably proofread and complete it, then remove this comment. -->
45
 
46
  This model is a fine-tuned version of [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) on the webdataset dataset.
47
  It achieves the following results on the evaluation set:
48
- - Loss: 0.1817
49
- - Accuracy: 0.9401
50
- - F1: 0.8477
51
- - Precision: 0.864
52
- - Recall: 0.8320
53
 
54
  ## Model description
55
 
 
26
  metrics:
27
  - name: Accuracy
28
  type: accuracy
29
+ value: 0.9388888888888889
30
  - name: F1
31
  type: f1
32
+ value: 0.8436018957345972
33
  - name: Precision
34
  type: precision
35
+ value: 0.8654781199351702
36
  - name: Recall
37
  type: recall
38
+ value: 0.8228043143297381
39
  ---
40
 
41
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
45
 
46
  This model is a fine-tuned version of [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) on the webdataset dataset.
47
  It achieves the following results on the evaluation set:
48
+ - Loss: 0.1577
49
+ - Accuracy: 0.9389
50
+ - F1: 0.8436
51
+ - Precision: 0.8655
52
+ - Recall: 0.8228
53
 
54
  ## Model description
55
 
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
  "epoch": 30.0,
3
- "eval_accuracy": 0.924074074074074,
4
- "eval_f1": 0.8069073783359497,
5
- "eval_loss": 0.18580639362335205,
6
- "eval_precision": 0.8524046434494196,
7
- "eval_recall": 0.7660208643815202,
8
- "eval_runtime": 3.2436,
9
- "eval_samples_per_second": 99.889,
10
- "eval_steps_per_second": 12.64,
11
  "total_flos": 3.008454731998249e+18,
12
- "train_loss": 0.08984789192185971,
13
- "train_runtime": 670.5653,
14
- "train_samples_per_second": 57.891,
15
- "train_steps_per_second": 3.624
16
  }
 
1
  {
2
  "epoch": 30.0,
3
+ "eval_accuracy": 0.9388888888888889,
4
+ "eval_f1": 0.8436018957345972,
5
+ "eval_loss": 0.15773314237594604,
6
+ "eval_precision": 0.8654781199351702,
7
+ "eval_recall": 0.8228043143297381,
8
+ "eval_runtime": 2.635,
9
+ "eval_samples_per_second": 122.962,
10
+ "eval_steps_per_second": 15.56,
11
  "total_flos": 3.008454731998249e+18,
12
+ "train_loss": 0.09584075045070531,
13
+ "train_runtime": 684.1699,
14
+ "train_samples_per_second": 56.74,
15
+ "train_steps_per_second": 3.552
16
  }
eval_results.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
  "epoch": 30.0,
3
- "eval_accuracy": 0.924074074074074,
4
- "eval_f1": 0.8069073783359497,
5
- "eval_loss": 0.18580639362335205,
6
- "eval_precision": 0.8524046434494196,
7
- "eval_recall": 0.7660208643815202,
8
- "eval_runtime": 3.2436,
9
- "eval_samples_per_second": 99.889,
10
- "eval_steps_per_second": 12.64
11
  }
 
1
  {
2
  "epoch": 30.0,
3
+ "eval_accuracy": 0.9388888888888889,
4
+ "eval_f1": 0.8436018957345972,
5
+ "eval_loss": 0.15773314237594604,
6
+ "eval_precision": 0.8654781199351702,
7
+ "eval_recall": 0.8228043143297381,
8
+ "eval_runtime": 2.635,
9
+ "eval_samples_per_second": 122.962,
10
+ "eval_steps_per_second": 15.56
11
  }
runs/Nov14_23-23-05_ba4b501b14a9/events.out.tfevents.1731627314.ba4b501b14a9.833.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06c4a31a750ad5b1bfddbbed253fc1582e33e701c2089178daa31f1a37ba4142
3
+ size 560
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 30.0,
3
  "total_flos": 3.008454731998249e+18,
4
- "train_loss": 0.08984789192185971,
5
- "train_runtime": 670.5653,
6
- "train_samples_per_second": 57.891,
7
- "train_steps_per_second": 3.624
8
  }
 
1
  {
2
  "epoch": 30.0,
3
  "total_flos": 3.008454731998249e+18,
4
+ "train_loss": 0.09584075045070531,
5
+ "train_runtime": 684.1699,
6
+ "train_samples_per_second": 56.74,
7
+ "train_steps_per_second": 3.552
8
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 0.18580639362335205,
3
- "best_model_checkpoint": "frostsolutions/frost-vision-v2-google_vit-base-patch16-224-v2024-11-14/checkpoint-500",
4
  "epoch": 30.0,
5
  "eval_steps": 100,
6
  "global_step": 2430,
@@ -10,2001 +10,2001 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.12345679012345678,
13
- "grad_norm": 1.8872668743133545,
14
- "learning_rate": 8.23045267489712e-06,
15
- "loss": 0.7179,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.24691358024691357,
20
- "grad_norm": 1.4793915748596191,
21
- "learning_rate": 1.646090534979424e-05,
22
- "loss": 0.632,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.37037037037037035,
27
- "grad_norm": 1.0972111225128174,
28
- "learning_rate": 2.4691358024691357e-05,
29
- "loss": 0.5057,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.49382716049382713,
34
- "grad_norm": 1.1177233457565308,
35
- "learning_rate": 3.292181069958848e-05,
36
- "loss": 0.4289,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.6172839506172839,
41
- "grad_norm": 1.0670571327209473,
42
- "learning_rate": 4.11522633744856e-05,
43
- "loss": 0.3715,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.7407407407407407,
48
- "grad_norm": 1.502959132194519,
49
- "learning_rate": 4.938271604938271e-05,
50
- "loss": 0.3202,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.8641975308641975,
55
- "grad_norm": 1.198879361152649,
56
- "learning_rate": 5.761316872427984e-05,
57
- "loss": 0.3032,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.9876543209876543,
62
- "grad_norm": 1.1655505895614624,
63
- "learning_rate": 6.584362139917696e-05,
64
- "loss": 0.2917,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 1.1111111111111112,
69
- "grad_norm": 1.1428577899932861,
70
- "learning_rate": 7.407407407407407e-05,
71
- "loss": 0.2521,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 1.2345679012345678,
76
- "grad_norm": 0.6600608825683594,
77
- "learning_rate": 8.23045267489712e-05,
78
- "loss": 0.2436,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 1.2345679012345678,
83
- "eval_accuracy": 0.9012345679012346,
84
- "eval_f1": 0.744408945686901,
85
- "eval_loss": 0.24551859498023987,
86
- "eval_precision": 0.802065404475043,
87
- "eval_recall": 0.6944858420268256,
88
- "eval_runtime": 3.0062,
89
- "eval_samples_per_second": 107.778,
90
- "eval_steps_per_second": 13.639,
91
  "step": 100
92
  },
93
  {
94
  "epoch": 1.3580246913580247,
95
- "grad_norm": 0.9017321467399597,
96
- "learning_rate": 9.053497942386831e-05,
97
- "loss": 0.2218,
98
  "step": 110
99
  },
100
  {
101
  "epoch": 1.4814814814814814,
102
- "grad_norm": 0.7535004615783691,
103
- "learning_rate": 9.876543209876543e-05,
104
- "loss": 0.2229,
105
  "step": 120
106
  },
107
  {
108
  "epoch": 1.6049382716049383,
109
- "grad_norm": 0.7849411964416504,
110
- "learning_rate": 0.00010699588477366255,
111
- "loss": 0.2478,
112
  "step": 130
113
  },
114
  {
115
  "epoch": 1.7283950617283952,
116
- "grad_norm": 1.008712649345398,
117
- "learning_rate": 0.00011522633744855968,
118
- "loss": 0.237,
119
  "step": 140
120
  },
121
  {
122
  "epoch": 1.8518518518518519,
123
- "grad_norm": 0.7667405605316162,
124
- "learning_rate": 0.0001234567901234568,
125
- "loss": 0.2143,
126
  "step": 150
127
  },
128
  {
129
  "epoch": 1.9753086419753085,
130
- "grad_norm": 1.147663950920105,
131
- "learning_rate": 0.00013168724279835392,
132
- "loss": 0.2208,
133
  "step": 160
134
  },
135
  {
136
  "epoch": 2.0987654320987654,
137
- "grad_norm": 0.746328592300415,
138
- "learning_rate": 0.00013991769547325105,
139
- "loss": 0.1848,
140
  "step": 170
141
  },
142
  {
143
  "epoch": 2.2222222222222223,
144
- "grad_norm": 1.367189884185791,
145
- "learning_rate": 0.00014814814814814815,
146
- "loss": 0.1901,
147
  "step": 180
148
  },
149
  {
150
  "epoch": 2.3456790123456788,
151
- "grad_norm": 0.8272459506988525,
152
- "learning_rate": 0.00015637860082304527,
153
- "loss": 0.1811,
154
  "step": 190
155
  },
156
  {
157
  "epoch": 2.4691358024691357,
158
- "grad_norm": 0.7679163217544556,
159
- "learning_rate": 0.0001646090534979424,
160
- "loss": 0.1639,
161
  "step": 200
162
  },
163
  {
164
  "epoch": 2.4691358024691357,
165
- "eval_accuracy": 0.917283950617284,
166
- "eval_f1": 0.7981927710843374,
167
- "eval_loss": 0.2006644308567047,
168
- "eval_precision": 0.806697108066971,
169
- "eval_recall": 0.789865871833085,
170
- "eval_runtime": 2.6836,
171
- "eval_samples_per_second": 120.731,
172
- "eval_steps_per_second": 15.278,
173
  "step": 200
174
  },
175
  {
176
  "epoch": 2.5925925925925926,
177
- "grad_norm": 1.0779305696487427,
178
- "learning_rate": 0.0001728395061728395,
179
- "loss": 0.1977,
180
  "step": 210
181
  },
182
  {
183
  "epoch": 2.7160493827160495,
184
- "grad_norm": 1.054652452468872,
185
- "learning_rate": 0.00018106995884773663,
186
- "loss": 0.2195,
187
  "step": 220
188
  },
189
  {
190
  "epoch": 2.8395061728395063,
191
- "grad_norm": 0.7771775722503662,
192
- "learning_rate": 0.00018930041152263375,
193
- "loss": 0.1997,
194
  "step": 230
195
  },
196
  {
197
  "epoch": 2.962962962962963,
198
- "grad_norm": 1.1053166389465332,
199
- "learning_rate": 0.00019753086419753085,
200
- "loss": 0.2222,
201
  "step": 240
202
  },
203
  {
204
  "epoch": 3.0864197530864197,
205
- "grad_norm": 0.7908689975738525,
206
- "learning_rate": 0.00019935985368084132,
207
- "loss": 0.2012,
208
  "step": 250
209
  },
210
  {
211
  "epoch": 3.2098765432098766,
212
- "grad_norm": 1.0789804458618164,
213
- "learning_rate": 0.0001984453589391861,
214
- "loss": 0.1826,
215
  "step": 260
216
  },
217
  {
218
  "epoch": 3.3333333333333335,
219
- "grad_norm": 0.8242624998092651,
220
- "learning_rate": 0.00019753086419753085,
221
- "loss": 0.2012,
222
  "step": 270
223
  },
224
  {
225
  "epoch": 3.45679012345679,
226
- "grad_norm": 1.4627937078475952,
227
- "learning_rate": 0.00019661636945587566,
228
- "loss": 0.2098,
229
  "step": 280
230
  },
231
  {
232
  "epoch": 3.580246913580247,
233
- "grad_norm": 1.009103536605835,
234
- "learning_rate": 0.0001957018747142204,
235
- "loss": 0.1744,
236
  "step": 290
237
  },
238
  {
239
  "epoch": 3.7037037037037037,
240
- "grad_norm": 0.6586538553237915,
241
- "learning_rate": 0.0001947873799725652,
242
- "loss": 0.188,
243
  "step": 300
244
  },
245
  {
246
  "epoch": 3.7037037037037037,
247
- "eval_accuracy": 0.9219135802469136,
248
- "eval_f1": 0.8064269319051263,
249
- "eval_loss": 0.19134539365768433,
250
- "eval_precision": 0.8286163522012578,
251
- "eval_recall": 0.7853949329359166,
252
- "eval_runtime": 2.376,
253
- "eval_samples_per_second": 136.362,
254
- "eval_steps_per_second": 17.256,
255
  "step": 300
256
  },
257
  {
258
  "epoch": 3.8271604938271606,
259
- "grad_norm": 0.6922177076339722,
260
- "learning_rate": 0.00019387288523090994,
261
- "loss": 0.1915,
262
  "step": 310
263
  },
264
  {
265
  "epoch": 3.950617283950617,
266
- "grad_norm": 0.9769504070281982,
267
- "learning_rate": 0.0001929583904892547,
268
- "loss": 0.1663,
269
  "step": 320
270
  },
271
  {
272
  "epoch": 4.074074074074074,
273
- "grad_norm": 0.6337828636169434,
274
- "learning_rate": 0.00019204389574759947,
275
- "loss": 0.159,
276
  "step": 330
277
  },
278
  {
279
  "epoch": 4.197530864197531,
280
- "grad_norm": 0.7267969846725464,
281
- "learning_rate": 0.00019112940100594422,
282
- "loss": 0.1347,
283
  "step": 340
284
  },
285
  {
286
  "epoch": 4.320987654320987,
287
- "grad_norm": 1.197481632232666,
288
- "learning_rate": 0.000190214906264289,
289
- "loss": 0.1392,
290
  "step": 350
291
  },
292
  {
293
  "epoch": 4.444444444444445,
294
- "grad_norm": 1.4104204177856445,
295
- "learning_rate": 0.00018930041152263375,
296
- "loss": 0.1655,
297
  "step": 360
298
  },
299
  {
300
  "epoch": 4.567901234567901,
301
- "grad_norm": 1.115108609199524,
302
- "learning_rate": 0.00018838591678097853,
303
- "loss": 0.1792,
304
  "step": 370
305
  },
306
  {
307
  "epoch": 4.6913580246913575,
308
- "grad_norm": 1.7156143188476562,
309
- "learning_rate": 0.00018747142203932328,
310
- "loss": 0.1826,
311
  "step": 380
312
  },
313
  {
314
  "epoch": 4.814814814814815,
315
- "grad_norm": 0.7819073796272278,
316
- "learning_rate": 0.00018655692729766806,
317
- "loss": 0.2012,
318
  "step": 390
319
  },
320
  {
321
  "epoch": 4.938271604938271,
322
- "grad_norm": 0.9515486359596252,
323
- "learning_rate": 0.00018564243255601281,
324
- "loss": 0.1695,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 4.938271604938271,
329
- "eval_accuracy": 0.9243827160493827,
330
- "eval_f1": 0.8167539267015707,
331
- "eval_loss": 0.1824890375137329,
332
- "eval_precision": 0.8198198198198198,
333
- "eval_recall": 0.8137108792846498,
334
- "eval_runtime": 2.4365,
335
- "eval_samples_per_second": 132.979,
336
- "eval_steps_per_second": 16.828,
337
  "step": 400
338
  },
339
  {
340
  "epoch": 5.061728395061729,
341
- "grad_norm": 0.7488430738449097,
342
- "learning_rate": 0.00018472793781435757,
343
- "loss": 0.1436,
344
  "step": 410
345
  },
346
  {
347
  "epoch": 5.185185185185185,
348
- "grad_norm": 0.7038572430610657,
349
- "learning_rate": 0.00018381344307270234,
350
- "loss": 0.132,
351
  "step": 420
352
  },
353
  {
354
  "epoch": 5.308641975308642,
355
- "grad_norm": 0.8166447877883911,
356
- "learning_rate": 0.0001828989483310471,
357
- "loss": 0.1565,
358
  "step": 430
359
  },
360
  {
361
  "epoch": 5.432098765432099,
362
- "grad_norm": 1.1276097297668457,
363
- "learning_rate": 0.00018198445358939187,
364
- "loss": 0.1642,
365
  "step": 440
366
  },
367
  {
368
  "epoch": 5.555555555555555,
369
- "grad_norm": 0.6070886850357056,
370
- "learning_rate": 0.00018106995884773663,
371
- "loss": 0.1353,
372
  "step": 450
373
  },
374
  {
375
  "epoch": 5.679012345679013,
376
- "grad_norm": 0.8393697738647461,
377
- "learning_rate": 0.0001801554641060814,
378
- "loss": 0.1169,
379
  "step": 460
380
  },
381
  {
382
  "epoch": 5.802469135802469,
383
- "grad_norm": 1.5119699239730835,
384
- "learning_rate": 0.00017924096936442616,
385
- "loss": 0.1269,
386
  "step": 470
387
  },
388
  {
389
  "epoch": 5.925925925925926,
390
- "grad_norm": 0.9335212707519531,
391
- "learning_rate": 0.00017832647462277094,
392
- "loss": 0.1603,
393
  "step": 480
394
  },
395
  {
396
  "epoch": 6.049382716049383,
397
- "grad_norm": 0.6569867730140686,
398
- "learning_rate": 0.0001774119798811157,
399
- "loss": 0.123,
400
  "step": 490
401
  },
402
  {
403
  "epoch": 6.172839506172839,
404
- "grad_norm": 0.5314657092094421,
405
- "learning_rate": 0.00017649748513946044,
406
- "loss": 0.1204,
407
  "step": 500
408
  },
409
  {
410
  "epoch": 6.172839506172839,
411
- "eval_accuracy": 0.924074074074074,
412
- "eval_f1": 0.8069073783359497,
413
- "eval_loss": 0.18580639362335205,
414
- "eval_precision": 0.8524046434494196,
415
- "eval_recall": 0.7660208643815202,
416
- "eval_runtime": 3.2628,
417
- "eval_samples_per_second": 99.302,
418
- "eval_steps_per_second": 12.566,
419
  "step": 500
420
  },
421
  {
422
  "epoch": 6.296296296296296,
423
- "grad_norm": 0.8376681804656982,
424
- "learning_rate": 0.00017558299039780522,
425
- "loss": 0.1327,
426
  "step": 510
427
  },
428
  {
429
  "epoch": 6.419753086419753,
430
- "grad_norm": 0.7419152855873108,
431
- "learning_rate": 0.00017466849565614997,
432
- "loss": 0.1355,
433
  "step": 520
434
  },
435
  {
436
  "epoch": 6.54320987654321,
437
- "grad_norm": 1.5098567008972168,
438
- "learning_rate": 0.00017375400091449475,
439
- "loss": 0.1223,
440
  "step": 530
441
  },
442
  {
443
  "epoch": 6.666666666666667,
444
- "grad_norm": 0.4890337884426117,
445
- "learning_rate": 0.0001728395061728395,
446
- "loss": 0.1307,
447
  "step": 540
448
  },
449
  {
450
  "epoch": 6.790123456790123,
451
- "grad_norm": 0.9887601137161255,
452
- "learning_rate": 0.00017192501143118428,
453
- "loss": 0.1118,
454
  "step": 550
455
  },
456
  {
457
  "epoch": 6.91358024691358,
458
- "grad_norm": 0.7607052326202393,
459
- "learning_rate": 0.00017101051668952906,
460
- "loss": 0.1177,
461
  "step": 560
462
  },
463
  {
464
  "epoch": 7.037037037037037,
465
- "grad_norm": 0.6813036799430847,
466
- "learning_rate": 0.0001700960219478738,
467
- "loss": 0.1367,
468
  "step": 570
469
  },
470
  {
471
  "epoch": 7.160493827160494,
472
- "grad_norm": 0.5105597972869873,
473
- "learning_rate": 0.0001691815272062186,
474
- "loss": 0.1151,
475
  "step": 580
476
  },
477
  {
478
  "epoch": 7.283950617283951,
479
- "grad_norm": 0.7654514312744141,
480
- "learning_rate": 0.00016826703246456334,
481
- "loss": 0.1483,
482
  "step": 590
483
  },
484
  {
485
  "epoch": 7.407407407407407,
486
- "grad_norm": 0.48647335171699524,
487
- "learning_rate": 0.00016735253772290812,
488
- "loss": 0.1268,
489
  "step": 600
490
  },
491
  {
492
  "epoch": 7.407407407407407,
493
- "eval_accuracy": 0.9222222222222223,
494
- "eval_f1": 0.8061538461538461,
495
- "eval_loss": 0.18393439054489136,
496
- "eval_precision": 0.8330683624801272,
497
- "eval_recall": 0.7809239940387481,
498
- "eval_runtime": 2.3802,
499
- "eval_samples_per_second": 136.124,
500
- "eval_steps_per_second": 17.226,
501
  "step": 600
502
  },
503
  {
504
  "epoch": 7.530864197530864,
505
- "grad_norm": 0.6816988587379456,
506
- "learning_rate": 0.00016643804298125287,
507
- "loss": 0.1027,
508
  "step": 610
509
  },
510
  {
511
  "epoch": 7.654320987654321,
512
- "grad_norm": 0.8071882128715515,
513
- "learning_rate": 0.00016552354823959765,
514
- "loss": 0.1145,
515
  "step": 620
516
  },
517
  {
518
  "epoch": 7.777777777777778,
519
- "grad_norm": 0.532025933265686,
520
- "learning_rate": 0.0001646090534979424,
521
- "loss": 0.1179,
522
  "step": 630
523
  },
524
  {
525
  "epoch": 7.901234567901234,
526
- "grad_norm": 0.7129656076431274,
527
- "learning_rate": 0.00016369455875628715,
528
- "loss": 0.1304,
529
  "step": 640
530
  },
531
  {
532
  "epoch": 8.024691358024691,
533
- "grad_norm": 1.0503504276275635,
534
- "learning_rate": 0.00016278006401463193,
535
- "loss": 0.1319,
536
  "step": 650
537
  },
538
  {
539
  "epoch": 8.148148148148149,
540
- "grad_norm": 0.8545478582382202,
541
- "learning_rate": 0.00016186556927297668,
542
- "loss": 0.1009,
543
  "step": 660
544
  },
545
  {
546
  "epoch": 8.271604938271604,
547
- "grad_norm": 0.39897826313972473,
548
- "learning_rate": 0.00016095107453132146,
549
- "loss": 0.0829,
550
  "step": 670
551
  },
552
  {
553
  "epoch": 8.395061728395062,
554
- "grad_norm": 0.5581258535385132,
555
- "learning_rate": 0.0001600365797896662,
556
- "loss": 0.1089,
557
  "step": 680
558
  },
559
  {
560
  "epoch": 8.518518518518519,
561
- "grad_norm": 0.7780981659889221,
562
- "learning_rate": 0.000159122085048011,
563
- "loss": 0.116,
564
  "step": 690
565
  },
566
  {
567
  "epoch": 8.641975308641975,
568
- "grad_norm": 0.9378405213356018,
569
- "learning_rate": 0.00015820759030635574,
570
- "loss": 0.1089,
571
  "step": 700
572
  },
573
  {
574
  "epoch": 8.641975308641975,
575
- "eval_accuracy": 0.9271604938271605,
576
- "eval_f1": 0.8214826021180031,
577
- "eval_loss": 0.18152755498886108,
578
- "eval_precision": 0.8341013824884793,
579
- "eval_recall": 0.8092399403874814,
580
- "eval_runtime": 3.9346,
581
- "eval_samples_per_second": 82.345,
582
- "eval_steps_per_second": 10.42,
583
  "step": 700
584
  },
585
  {
586
  "epoch": 8.765432098765432,
587
- "grad_norm": 0.8875961899757385,
588
- "learning_rate": 0.00015729309556470052,
589
- "loss": 0.0944,
590
  "step": 710
591
  },
592
  {
593
  "epoch": 8.88888888888889,
594
- "grad_norm": 0.5635790824890137,
595
- "learning_rate": 0.00015637860082304527,
596
- "loss": 0.1047,
597
  "step": 720
598
  },
599
  {
600
  "epoch": 9.012345679012345,
601
- "grad_norm": 0.5476753115653992,
602
- "learning_rate": 0.00015546410608139002,
603
- "loss": 0.0961,
604
  "step": 730
605
  },
606
  {
607
  "epoch": 9.135802469135802,
608
- "grad_norm": 0.895350456237793,
609
- "learning_rate": 0.0001545496113397348,
610
- "loss": 0.097,
611
  "step": 740
612
  },
613
  {
614
  "epoch": 9.25925925925926,
615
- "grad_norm": 0.5731819868087769,
616
- "learning_rate": 0.00015363511659807956,
617
- "loss": 0.093,
618
  "step": 750
619
  },
620
  {
621
  "epoch": 9.382716049382717,
622
- "grad_norm": 0.7692080140113831,
623
- "learning_rate": 0.00015272062185642433,
624
- "loss": 0.1054,
625
  "step": 760
626
  },
627
  {
628
  "epoch": 9.506172839506172,
629
- "grad_norm": 0.5154545903205872,
630
- "learning_rate": 0.00015180612711476909,
631
- "loss": 0.083,
632
  "step": 770
633
  },
634
  {
635
  "epoch": 9.62962962962963,
636
- "grad_norm": 0.9912018179893494,
637
- "learning_rate": 0.00015089163237311386,
638
- "loss": 0.079,
639
  "step": 780
640
  },
641
  {
642
  "epoch": 9.753086419753087,
643
- "grad_norm": 0.6497861742973328,
644
- "learning_rate": 0.00014997713763145862,
645
- "loss": 0.0813,
646
  "step": 790
647
  },
648
  {
649
  "epoch": 9.876543209876543,
650
- "grad_norm": 0.36326050758361816,
651
- "learning_rate": 0.0001490626428898034,
652
- "loss": 0.0863,
653
  "step": 800
654
  },
655
  {
656
  "epoch": 9.876543209876543,
657
- "eval_accuracy": 0.924074074074074,
658
- "eval_f1": 0.8095975232198143,
659
- "eval_loss": 0.20340517163276672,
660
- "eval_precision": 0.8421900161030595,
661
- "eval_recall": 0.7794336810730254,
662
- "eval_runtime": 2.5219,
663
- "eval_samples_per_second": 128.472,
664
- "eval_steps_per_second": 16.257,
665
  "step": 800
666
  },
667
  {
668
  "epoch": 10.0,
669
- "grad_norm": 1.1620311737060547,
670
- "learning_rate": 0.00014814814814814815,
671
- "loss": 0.1242,
672
  "step": 810
673
  },
674
  {
675
  "epoch": 10.123456790123457,
676
- "grad_norm": 0.5626382827758789,
677
- "learning_rate": 0.0001472336534064929,
678
- "loss": 0.0817,
679
  "step": 820
680
  },
681
  {
682
  "epoch": 10.246913580246913,
683
- "grad_norm": 0.4754534363746643,
684
- "learning_rate": 0.00014631915866483768,
685
- "loss": 0.0885,
686
  "step": 830
687
  },
688
  {
689
  "epoch": 10.37037037037037,
690
- "grad_norm": 0.8991501927375793,
691
- "learning_rate": 0.00014540466392318243,
692
- "loss": 0.1016,
693
  "step": 840
694
  },
695
  {
696
  "epoch": 10.493827160493828,
697
- "grad_norm": 0.4675813317298889,
698
- "learning_rate": 0.00014449016918152723,
699
- "loss": 0.0988,
700
  "step": 850
701
  },
702
  {
703
  "epoch": 10.617283950617283,
704
- "grad_norm": 0.8737104535102844,
705
- "learning_rate": 0.00014357567443987199,
706
- "loss": 0.0838,
707
  "step": 860
708
  },
709
  {
710
  "epoch": 10.74074074074074,
711
- "grad_norm": 0.7189881205558777,
712
- "learning_rate": 0.00014266117969821676,
713
- "loss": 0.0733,
714
  "step": 870
715
  },
716
  {
717
  "epoch": 10.864197530864198,
718
- "grad_norm": 1.0447670221328735,
719
- "learning_rate": 0.00014174668495656152,
720
- "loss": 0.1154,
721
  "step": 880
722
  },
723
  {
724
  "epoch": 10.987654320987655,
725
- "grad_norm": 0.6584729552268982,
726
- "learning_rate": 0.00014083219021490627,
727
- "loss": 0.0786,
728
  "step": 890
729
  },
730
  {
731
  "epoch": 11.11111111111111,
732
- "grad_norm": 0.4896683096885681,
733
- "learning_rate": 0.00013991769547325105,
734
- "loss": 0.0758,
735
  "step": 900
736
  },
737
  {
738
  "epoch": 11.11111111111111,
739
- "eval_accuracy": 0.929320987654321,
740
- "eval_f1": 0.8287210172026926,
741
- "eval_loss": 0.193252295255661,
742
- "eval_precision": 0.8318318318318318,
743
- "eval_recall": 0.8256333830104322,
744
- "eval_runtime": 2.417,
745
- "eval_samples_per_second": 134.049,
746
- "eval_steps_per_second": 16.963,
747
  "step": 900
748
  },
749
  {
750
  "epoch": 11.234567901234568,
751
- "grad_norm": 0.4879882037639618,
752
- "learning_rate": 0.0001390032007315958,
753
  "loss": 0.0758,
754
  "step": 910
755
  },
756
  {
757
  "epoch": 11.358024691358025,
758
- "grad_norm": 0.40832993388175964,
759
- "learning_rate": 0.00013808870598994058,
760
- "loss": 0.0659,
761
  "step": 920
762
  },
763
  {
764
  "epoch": 11.481481481481481,
765
- "grad_norm": 0.36086103320121765,
766
- "learning_rate": 0.00013717421124828533,
767
- "loss": 0.0564,
768
  "step": 930
769
  },
770
  {
771
  "epoch": 11.604938271604938,
772
- "grad_norm": 0.4690726101398468,
773
- "learning_rate": 0.0001362597165066301,
774
- "loss": 0.0853,
775
  "step": 940
776
  },
777
  {
778
  "epoch": 11.728395061728396,
779
- "grad_norm": 0.4532856047153473,
780
- "learning_rate": 0.00013534522176497486,
781
- "loss": 0.082,
782
  "step": 950
783
  },
784
  {
785
  "epoch": 11.851851851851851,
786
- "grad_norm": 0.4193461835384369,
787
- "learning_rate": 0.00013443072702331964,
788
- "loss": 0.0692,
789
  "step": 960
790
  },
791
  {
792
  "epoch": 11.975308641975309,
793
- "grad_norm": 0.7810946702957153,
794
- "learning_rate": 0.0001335162322816644,
795
- "loss": 0.0724,
796
  "step": 970
797
  },
798
  {
799
  "epoch": 12.098765432098766,
800
- "grad_norm": 0.49589404463768005,
801
- "learning_rate": 0.00013260173754000914,
802
- "loss": 0.058,
803
  "step": 980
804
  },
805
  {
806
  "epoch": 12.222222222222221,
807
- "grad_norm": 0.43445077538490295,
808
- "learning_rate": 0.00013168724279835392,
809
- "loss": 0.0657,
810
  "step": 990
811
  },
812
  {
813
  "epoch": 12.345679012345679,
814
- "grad_norm": 0.4995817542076111,
815
- "learning_rate": 0.00013077274805669867,
816
- "loss": 0.0605,
817
  "step": 1000
818
  },
819
  {
820
  "epoch": 12.345679012345679,
821
- "eval_accuracy": 0.9302469135802469,
822
- "eval_f1": 0.8293051359516617,
823
- "eval_loss": 0.1941601037979126,
824
- "eval_precision": 0.8407350689127105,
825
- "eval_recall": 0.8181818181818182,
826
- "eval_runtime": 2.5849,
827
- "eval_samples_per_second": 125.345,
828
- "eval_steps_per_second": 15.862,
829
  "step": 1000
830
  },
831
  {
832
  "epoch": 12.469135802469136,
833
- "grad_norm": 0.6132990121841431,
834
- "learning_rate": 0.00012985825331504345,
835
- "loss": 0.0618,
836
  "step": 1010
837
  },
838
  {
839
  "epoch": 12.592592592592592,
840
- "grad_norm": 1.0372841358184814,
841
- "learning_rate": 0.0001289437585733882,
842
- "loss": 0.0839,
843
  "step": 1020
844
  },
845
  {
846
  "epoch": 12.716049382716049,
847
- "grad_norm": 0.7716336846351624,
848
- "learning_rate": 0.00012802926383173298,
849
- "loss": 0.0868,
850
  "step": 1030
851
  },
852
  {
853
  "epoch": 12.839506172839506,
854
- "grad_norm": 0.8065176010131836,
855
- "learning_rate": 0.00012711476909007773,
856
- "loss": 0.082,
857
  "step": 1040
858
  },
859
  {
860
  "epoch": 12.962962962962964,
861
- "grad_norm": 0.5325743556022644,
862
- "learning_rate": 0.0001262002743484225,
863
- "loss": 0.0621,
864
  "step": 1050
865
  },
866
  {
867
  "epoch": 13.08641975308642,
868
- "grad_norm": 0.595810055732727,
869
- "learning_rate": 0.00012528577960676726,
870
- "loss": 0.0704,
871
  "step": 1060
872
  },
873
  {
874
  "epoch": 13.209876543209877,
875
- "grad_norm": 0.43285462260246277,
876
- "learning_rate": 0.00012437128486511201,
877
- "loss": 0.0731,
878
  "step": 1070
879
  },
880
  {
881
  "epoch": 13.333333333333334,
882
- "grad_norm": 0.6610433459281921,
883
- "learning_rate": 0.0001234567901234568,
884
- "loss": 0.0883,
885
  "step": 1080
886
  },
887
  {
888
  "epoch": 13.45679012345679,
889
- "grad_norm": 0.4961569011211395,
890
- "learning_rate": 0.00012254229538180154,
891
- "loss": 0.0628,
892
  "step": 1090
893
  },
894
  {
895
  "epoch": 13.580246913580247,
896
- "grad_norm": 1.3226776123046875,
897
- "learning_rate": 0.00012162780064014631,
898
- "loss": 0.0726,
899
  "step": 1100
900
  },
901
  {
902
  "epoch": 13.580246913580247,
903
- "eval_accuracy": 0.9271604938271605,
904
- "eval_f1": 0.8238805970149253,
905
- "eval_loss": 0.19768880307674408,
906
- "eval_precision": 0.8251121076233184,
907
- "eval_recall": 0.8226527570789866,
908
- "eval_runtime": 2.5461,
909
- "eval_samples_per_second": 127.251,
910
- "eval_steps_per_second": 16.103,
911
  "step": 1100
912
  },
913
  {
914
  "epoch": 13.703703703703704,
915
- "grad_norm": 0.36743324995040894,
916
- "learning_rate": 0.00012071330589849108,
917
- "loss": 0.0654,
918
  "step": 1110
919
  },
920
  {
921
  "epoch": 13.82716049382716,
922
- "grad_norm": 0.251310259103775,
923
- "learning_rate": 0.00011979881115683584,
924
- "loss": 0.061,
925
  "step": 1120
926
  },
927
  {
928
  "epoch": 13.950617283950617,
929
- "grad_norm": 0.29951387643814087,
930
- "learning_rate": 0.0001188843164151806,
931
- "loss": 0.0623,
932
  "step": 1130
933
  },
934
  {
935
  "epoch": 14.074074074074074,
936
- "grad_norm": 0.18914303183555603,
937
- "learning_rate": 0.0001179698216735254,
938
- "loss": 0.0481,
939
  "step": 1140
940
  },
941
  {
942
  "epoch": 14.197530864197532,
943
- "grad_norm": 0.5847985744476318,
944
- "learning_rate": 0.00011705532693187016,
945
- "loss": 0.0665,
946
  "step": 1150
947
  },
948
  {
949
  "epoch": 14.320987654320987,
950
- "grad_norm": 0.8701656460762024,
951
- "learning_rate": 0.00011614083219021491,
952
- "loss": 0.0717,
953
  "step": 1160
954
  },
955
  {
956
  "epoch": 14.444444444444445,
957
- "grad_norm": 0.7420238256454468,
958
- "learning_rate": 0.00011522633744855968,
959
- "loss": 0.075,
960
  "step": 1170
961
  },
962
  {
963
  "epoch": 14.567901234567902,
964
- "grad_norm": 0.6891819834709167,
965
- "learning_rate": 0.00011431184270690445,
966
- "loss": 0.0625,
967
  "step": 1180
968
  },
969
  {
970
  "epoch": 14.691358024691358,
971
- "grad_norm": 0.6125108003616333,
972
- "learning_rate": 0.00011339734796524921,
973
- "loss": 0.0635,
974
  "step": 1190
975
  },
976
  {
977
  "epoch": 14.814814814814815,
978
- "grad_norm": 0.6992429494857788,
979
- "learning_rate": 0.00011248285322359398,
980
- "loss": 0.0637,
981
  "step": 1200
982
  },
983
  {
984
  "epoch": 14.814814814814815,
985
- "eval_accuracy": 0.9311728395061728,
986
- "eval_f1": 0.8324567993989481,
987
- "eval_loss": 0.2040136605501175,
988
- "eval_precision": 0.8393939393939394,
989
- "eval_recall": 0.8256333830104322,
990
- "eval_runtime": 2.5198,
991
- "eval_samples_per_second": 128.58,
992
- "eval_steps_per_second": 16.271,
993
  "step": 1200
994
  },
995
  {
996
  "epoch": 14.938271604938272,
997
- "grad_norm": 0.37562182545661926,
998
- "learning_rate": 0.00011156835848193874,
999
- "loss": 0.0494,
1000
  "step": 1210
1001
  },
1002
  {
1003
  "epoch": 15.061728395061728,
1004
- "grad_norm": 0.6514123678207397,
1005
- "learning_rate": 0.0001106538637402835,
1006
- "loss": 0.0513,
1007
  "step": 1220
1008
  },
1009
  {
1010
  "epoch": 15.185185185185185,
1011
- "grad_norm": 0.3561560809612274,
1012
- "learning_rate": 0.00010973936899862827,
1013
- "loss": 0.069,
1014
  "step": 1230
1015
  },
1016
  {
1017
  "epoch": 15.308641975308642,
1018
- "grad_norm": 1.3583102226257324,
1019
- "learning_rate": 0.00010882487425697304,
1020
- "loss": 0.0714,
1021
  "step": 1240
1022
  },
1023
  {
1024
  "epoch": 15.432098765432098,
1025
- "grad_norm": 0.3257612884044647,
1026
- "learning_rate": 0.00010791037951531779,
1027
- "loss": 0.0494,
1028
  "step": 1250
1029
  },
1030
  {
1031
  "epoch": 15.555555555555555,
1032
- "grad_norm": 0.4650997519493103,
1033
- "learning_rate": 0.00010699588477366255,
1034
- "loss": 0.0464,
1035
  "step": 1260
1036
  },
1037
  {
1038
  "epoch": 15.679012345679013,
1039
- "grad_norm": 0.7076647281646729,
1040
- "learning_rate": 0.00010608139003200732,
1041
- "loss": 0.0575,
1042
  "step": 1270
1043
  },
1044
  {
1045
  "epoch": 15.802469135802468,
1046
- "grad_norm": 0.3548375070095062,
1047
- "learning_rate": 0.00010516689529035208,
1048
- "loss": 0.0605,
1049
  "step": 1280
1050
  },
1051
  {
1052
  "epoch": 15.925925925925926,
1053
- "grad_norm": 0.3160780966281891,
1054
- "learning_rate": 0.00010425240054869685,
1055
- "loss": 0.0695,
1056
  "step": 1290
1057
  },
1058
  {
1059
  "epoch": 16.049382716049383,
1060
- "grad_norm": 0.43579068779945374,
1061
- "learning_rate": 0.00010333790580704161,
1062
- "loss": 0.0549,
1063
  "step": 1300
1064
  },
1065
  {
1066
  "epoch": 16.049382716049383,
1067
- "eval_accuracy": 0.9308641975308642,
1068
- "eval_f1": 0.8325859491778774,
1069
- "eval_loss": 0.21728986501693726,
1070
- "eval_precision": 0.8350824587706147,
1071
- "eval_recall": 0.8301043219076006,
1072
- "eval_runtime": 3.2646,
1073
- "eval_samples_per_second": 99.247,
1074
- "eval_steps_per_second": 12.559,
1075
  "step": 1300
1076
  },
1077
  {
1078
  "epoch": 16.17283950617284,
1079
- "grad_norm": 0.6600210070610046,
1080
- "learning_rate": 0.00010242341106538638,
1081
- "loss": 0.0348,
1082
  "step": 1310
1083
  },
1084
  {
1085
  "epoch": 16.296296296296298,
1086
- "grad_norm": 0.45880362391471863,
1087
- "learning_rate": 0.00010150891632373114,
1088
- "loss": 0.053,
1089
  "step": 1320
1090
  },
1091
  {
1092
  "epoch": 16.419753086419753,
1093
- "grad_norm": 0.25807011127471924,
1094
- "learning_rate": 0.00010059442158207591,
1095
  "loss": 0.0534,
1096
  "step": 1330
1097
  },
1098
  {
1099
  "epoch": 16.54320987654321,
1100
- "grad_norm": 0.7079629898071289,
1101
- "learning_rate": 9.967992684042066e-05,
1102
- "loss": 0.0519,
1103
  "step": 1340
1104
  },
1105
  {
1106
  "epoch": 16.666666666666668,
1107
- "grad_norm": 0.5823590159416199,
1108
- "learning_rate": 9.876543209876543e-05,
1109
- "loss": 0.0532,
1110
  "step": 1350
1111
  },
1112
  {
1113
  "epoch": 16.790123456790123,
1114
- "grad_norm": 0.8557542562484741,
1115
- "learning_rate": 9.78509373571102e-05,
1116
- "loss": 0.0554,
1117
  "step": 1360
1118
  },
1119
  {
1120
  "epoch": 16.91358024691358,
1121
- "grad_norm": 0.35440272092819214,
1122
- "learning_rate": 9.693644261545497e-05,
1123
- "loss": 0.0447,
1124
  "step": 1370
1125
  },
1126
  {
1127
  "epoch": 17.037037037037038,
1128
- "grad_norm": 0.7405946850776672,
1129
- "learning_rate": 9.602194787379974e-05,
1130
- "loss": 0.0502,
1131
  "step": 1380
1132
  },
1133
  {
1134
  "epoch": 17.160493827160494,
1135
- "grad_norm": 0.350297212600708,
1136
- "learning_rate": 9.51074531321445e-05,
1137
- "loss": 0.039,
1138
  "step": 1390
1139
  },
1140
  {
1141
  "epoch": 17.28395061728395,
1142
- "grad_norm": 0.6312181353569031,
1143
- "learning_rate": 9.419295839048927e-05,
1144
- "loss": 0.0566,
1145
  "step": 1400
1146
  },
1147
  {
1148
  "epoch": 17.28395061728395,
1149
- "eval_accuracy": 0.9314814814814815,
1150
- "eval_f1": 0.8353115727002968,
1151
- "eval_loss": 0.20715734362602234,
1152
- "eval_precision": 0.8316100443131462,
1153
- "eval_recall": 0.8390461997019374,
1154
- "eval_runtime": 2.7588,
1155
- "eval_samples_per_second": 117.44,
1156
- "eval_steps_per_second": 14.861,
1157
  "step": 1400
1158
  },
1159
  {
1160
  "epoch": 17.40740740740741,
1161
- "grad_norm": 0.17453396320343018,
1162
- "learning_rate": 9.327846364883403e-05,
1163
- "loss": 0.0498,
1164
  "step": 1410
1165
  },
1166
  {
1167
  "epoch": 17.530864197530864,
1168
- "grad_norm": 0.36732375621795654,
1169
- "learning_rate": 9.236396890717878e-05,
1170
- "loss": 0.0549,
1171
  "step": 1420
1172
  },
1173
  {
1174
  "epoch": 17.65432098765432,
1175
- "grad_norm": 0.7602197527885437,
1176
- "learning_rate": 9.144947416552355e-05,
1177
- "loss": 0.0447,
1178
  "step": 1430
1179
  },
1180
  {
1181
  "epoch": 17.77777777777778,
1182
- "grad_norm": 0.8259577751159668,
1183
- "learning_rate": 9.053497942386831e-05,
1184
- "loss": 0.0584,
1185
  "step": 1440
1186
  },
1187
  {
1188
  "epoch": 17.901234567901234,
1189
- "grad_norm": 0.43447449803352356,
1190
- "learning_rate": 8.962048468221308e-05,
1191
- "loss": 0.0359,
1192
  "step": 1450
1193
  },
1194
  {
1195
  "epoch": 18.02469135802469,
1196
- "grad_norm": 0.6444792151451111,
1197
- "learning_rate": 8.870598994055784e-05,
1198
- "loss": 0.0437,
1199
  "step": 1460
1200
  },
1201
  {
1202
  "epoch": 18.14814814814815,
1203
- "grad_norm": 0.9808084964752197,
1204
- "learning_rate": 8.779149519890261e-05,
1205
- "loss": 0.0428,
1206
  "step": 1470
1207
  },
1208
  {
1209
  "epoch": 18.271604938271604,
1210
- "grad_norm": 0.4970827102661133,
1211
- "learning_rate": 8.687700045724737e-05,
1212
- "loss": 0.0325,
1213
  "step": 1480
1214
  },
1215
  {
1216
  "epoch": 18.395061728395063,
1217
- "grad_norm": 0.3154457211494446,
1218
- "learning_rate": 8.596250571559214e-05,
1219
- "loss": 0.0443,
1220
  "step": 1490
1221
  },
1222
  {
1223
  "epoch": 18.51851851851852,
1224
- "grad_norm": 0.38939017057418823,
1225
- "learning_rate": 8.50480109739369e-05,
1226
- "loss": 0.0491,
1227
  "step": 1500
1228
  },
1229
  {
1230
  "epoch": 18.51851851851852,
1231
- "eval_accuracy": 0.928395061728395,
1232
- "eval_f1": 0.824773413897281,
1233
- "eval_loss": 0.2156645506620407,
1234
- "eval_precision": 0.8361408882082695,
1235
- "eval_recall": 0.8137108792846498,
1236
- "eval_runtime": 2.3248,
1237
- "eval_samples_per_second": 139.37,
1238
- "eval_steps_per_second": 17.636,
1239
  "step": 1500
1240
  },
1241
  {
1242
  "epoch": 18.641975308641975,
1243
- "grad_norm": 0.7808548808097839,
1244
- "learning_rate": 8.413351623228167e-05,
1245
- "loss": 0.0342,
1246
  "step": 1510
1247
  },
1248
  {
1249
  "epoch": 18.765432098765434,
1250
- "grad_norm": 0.11940323561429977,
1251
- "learning_rate": 8.321902149062643e-05,
1252
- "loss": 0.0528,
1253
  "step": 1520
1254
  },
1255
  {
1256
  "epoch": 18.88888888888889,
1257
- "grad_norm": 0.6686835885047913,
1258
- "learning_rate": 8.23045267489712e-05,
1259
- "loss": 0.037,
1260
  "step": 1530
1261
  },
1262
  {
1263
  "epoch": 19.012345679012345,
1264
- "grad_norm": 0.21024620532989502,
1265
- "learning_rate": 8.139003200731597e-05,
1266
- "loss": 0.0505,
1267
  "step": 1540
1268
  },
1269
  {
1270
  "epoch": 19.135802469135804,
1271
- "grad_norm": 0.5132074952125549,
1272
- "learning_rate": 8.047553726566073e-05,
1273
  "loss": 0.0404,
1274
  "step": 1550
1275
  },
1276
  {
1277
  "epoch": 19.25925925925926,
1278
- "grad_norm": 0.32788872718811035,
1279
- "learning_rate": 7.95610425240055e-05,
1280
- "loss": 0.0513,
1281
  "step": 1560
1282
  },
1283
  {
1284
  "epoch": 19.382716049382715,
1285
- "grad_norm": 0.31631699204444885,
1286
- "learning_rate": 7.864654778235026e-05,
1287
- "loss": 0.0343,
1288
  "step": 1570
1289
  },
1290
  {
1291
  "epoch": 19.506172839506174,
1292
- "grad_norm": 0.6206240057945251,
1293
- "learning_rate": 7.773205304069501e-05,
1294
- "loss": 0.044,
1295
  "step": 1580
1296
  },
1297
  {
1298
  "epoch": 19.62962962962963,
1299
- "grad_norm": 0.46290040016174316,
1300
- "learning_rate": 7.681755829903978e-05,
1301
- "loss": 0.0456,
1302
  "step": 1590
1303
  },
1304
  {
1305
  "epoch": 19.753086419753085,
1306
- "grad_norm": 0.5506296753883362,
1307
- "learning_rate": 7.590306355738454e-05,
1308
- "loss": 0.0496,
1309
  "step": 1600
1310
  },
1311
  {
1312
  "epoch": 19.753086419753085,
1313
- "eval_accuracy": 0.9351851851851852,
1314
- "eval_f1": 0.84375,
1315
- "eval_loss": 0.19679902493953705,
1316
- "eval_precision": 0.8424962852897474,
1317
- "eval_recall": 0.8450074515648286,
1318
- "eval_runtime": 3.3713,
1319
- "eval_samples_per_second": 96.105,
1320
- "eval_steps_per_second": 12.161,
1321
  "step": 1600
1322
  },
1323
  {
1324
  "epoch": 19.876543209876544,
1325
- "grad_norm": 0.48592284321784973,
1326
- "learning_rate": 7.498856881572931e-05,
1327
- "loss": 0.042,
1328
  "step": 1610
1329
  },
1330
  {
1331
  "epoch": 20.0,
1332
- "grad_norm": 0.3968632221221924,
1333
- "learning_rate": 7.407407407407407e-05,
1334
- "loss": 0.0434,
1335
  "step": 1620
1336
  },
1337
  {
1338
  "epoch": 20.123456790123456,
1339
- "grad_norm": 0.16500796377658844,
1340
- "learning_rate": 7.315957933241884e-05,
1341
- "loss": 0.0479,
1342
  "step": 1630
1343
  },
1344
  {
1345
  "epoch": 20.246913580246915,
1346
- "grad_norm": 0.12196658551692963,
1347
- "learning_rate": 7.224508459076362e-05,
1348
- "loss": 0.0309,
1349
  "step": 1640
1350
  },
1351
  {
1352
  "epoch": 20.37037037037037,
1353
- "grad_norm": 0.46755895018577576,
1354
- "learning_rate": 7.133058984910838e-05,
1355
- "loss": 0.0448,
1356
  "step": 1650
1357
  },
1358
  {
1359
  "epoch": 20.493827160493826,
1360
- "grad_norm": 0.26269474625587463,
1361
- "learning_rate": 7.041609510745313e-05,
1362
- "loss": 0.0281,
1363
  "step": 1660
1364
  },
1365
  {
1366
  "epoch": 20.617283950617285,
1367
- "grad_norm": 0.28748786449432373,
1368
- "learning_rate": 6.95016003657979e-05,
1369
- "loss": 0.0419,
1370
  "step": 1670
1371
  },
1372
  {
1373
  "epoch": 20.74074074074074,
1374
- "grad_norm": 0.48799601197242737,
1375
- "learning_rate": 6.858710562414266e-05,
1376
- "loss": 0.0408,
1377
  "step": 1680
1378
  },
1379
  {
1380
  "epoch": 20.864197530864196,
1381
- "grad_norm": 0.259390264749527,
1382
- "learning_rate": 6.767261088248743e-05,
1383
- "loss": 0.0285,
1384
  "step": 1690
1385
  },
1386
  {
1387
  "epoch": 20.987654320987655,
1388
- "grad_norm": 0.29502254724502563,
1389
- "learning_rate": 6.67581161408322e-05,
1390
- "loss": 0.0317,
1391
  "step": 1700
1392
  },
1393
  {
1394
  "epoch": 20.987654320987655,
1395
- "eval_accuracy": 0.9361111111111111,
1396
- "eval_f1": 0.8456375838926175,
1397
- "eval_loss": 0.2053374946117401,
1398
- "eval_precision": 0.8462686567164179,
1399
- "eval_recall": 0.8450074515648286,
1400
- "eval_runtime": 2.8712,
1401
- "eval_samples_per_second": 112.844,
1402
- "eval_steps_per_second": 14.28,
1403
  "step": 1700
1404
  },
1405
  {
1406
  "epoch": 21.11111111111111,
1407
- "grad_norm": 0.5067708492279053,
1408
- "learning_rate": 6.584362139917696e-05,
1409
- "loss": 0.0398,
1410
  "step": 1710
1411
  },
1412
  {
1413
  "epoch": 21.234567901234566,
1414
- "grad_norm": 0.3558381199836731,
1415
- "learning_rate": 6.492912665752173e-05,
1416
- "loss": 0.0396,
1417
  "step": 1720
1418
  },
1419
  {
1420
  "epoch": 21.358024691358025,
1421
- "grad_norm": 0.1958431601524353,
1422
- "learning_rate": 6.401463191586649e-05,
1423
- "loss": 0.0284,
1424
  "step": 1730
1425
  },
1426
  {
1427
  "epoch": 21.48148148148148,
1428
- "grad_norm": 0.12868855893611908,
1429
- "learning_rate": 6.310013717421126e-05,
1430
- "loss": 0.0249,
1431
  "step": 1740
1432
  },
1433
  {
1434
  "epoch": 21.604938271604937,
1435
- "grad_norm": 0.1328994780778885,
1436
- "learning_rate": 6.218564243255601e-05,
1437
- "loss": 0.0336,
1438
  "step": 1750
1439
  },
1440
  {
1441
  "epoch": 21.728395061728396,
1442
- "grad_norm": 0.2639734745025635,
1443
- "learning_rate": 6.127114769090077e-05,
1444
- "loss": 0.0281,
1445
  "step": 1760
1446
  },
1447
  {
1448
  "epoch": 21.85185185185185,
1449
- "grad_norm": 0.20830489695072174,
1450
- "learning_rate": 6.035665294924554e-05,
1451
- "loss": 0.0371,
1452
  "step": 1770
1453
  },
1454
  {
1455
  "epoch": 21.97530864197531,
1456
- "grad_norm": 0.3154972195625305,
1457
- "learning_rate": 5.94421582075903e-05,
1458
- "loss": 0.0449,
1459
  "step": 1780
1460
  },
1461
  {
1462
  "epoch": 22.098765432098766,
1463
- "grad_norm": 0.15233543515205383,
1464
- "learning_rate": 5.852766346593508e-05,
1465
- "loss": 0.033,
1466
  "step": 1790
1467
  },
1468
  {
1469
  "epoch": 22.22222222222222,
1470
- "grad_norm": 0.2932221293449402,
1471
- "learning_rate": 5.761316872427984e-05,
1472
- "loss": 0.0326,
1473
  "step": 1800
1474
  },
1475
  {
1476
  "epoch": 22.22222222222222,
1477
- "eval_accuracy": 0.9305555555555556,
1478
- "eval_f1": 0.8288973384030418,
1479
- "eval_loss": 0.21322904527187347,
1480
- "eval_precision": 0.8462732919254659,
1481
- "eval_recall": 0.812220566318927,
1482
- "eval_runtime": 2.3717,
1483
- "eval_samples_per_second": 136.612,
1484
- "eval_steps_per_second": 17.287,
1485
  "step": 1800
1486
  },
1487
  {
1488
  "epoch": 22.34567901234568,
1489
- "grad_norm": 0.2980441153049469,
1490
- "learning_rate": 5.6698673982624605e-05,
1491
- "loss": 0.0293,
1492
  "step": 1810
1493
  },
1494
  {
1495
  "epoch": 22.469135802469136,
1496
- "grad_norm": 0.28522634506225586,
1497
- "learning_rate": 5.578417924096937e-05,
1498
- "loss": 0.0334,
1499
  "step": 1820
1500
  },
1501
  {
1502
  "epoch": 22.59259259259259,
1503
- "grad_norm": 0.40435144305229187,
1504
- "learning_rate": 5.4869684499314136e-05,
1505
- "loss": 0.0377,
1506
  "step": 1830
1507
  },
1508
  {
1509
  "epoch": 22.71604938271605,
1510
- "grad_norm": 0.43217939138412476,
1511
- "learning_rate": 5.3955189757658894e-05,
1512
- "loss": 0.0288,
1513
  "step": 1840
1514
  },
1515
  {
1516
  "epoch": 22.839506172839506,
1517
- "grad_norm": 0.30265820026397705,
1518
- "learning_rate": 5.304069501600366e-05,
1519
- "loss": 0.0359,
1520
  "step": 1850
1521
  },
1522
  {
1523
  "epoch": 22.962962962962962,
1524
- "grad_norm": 0.0610734224319458,
1525
- "learning_rate": 5.2126200274348424e-05,
1526
- "loss": 0.037,
1527
  "step": 1860
1528
  },
1529
  {
1530
  "epoch": 23.08641975308642,
1531
- "grad_norm": 0.5076401233673096,
1532
- "learning_rate": 5.121170553269319e-05,
1533
- "loss": 0.0344,
1534
  "step": 1870
1535
  },
1536
  {
1537
  "epoch": 23.209876543209877,
1538
- "grad_norm": 0.26529771089553833,
1539
- "learning_rate": 5.0297210791037955e-05,
1540
- "loss": 0.0321,
1541
  "step": 1880
1542
  },
1543
  {
1544
  "epoch": 23.333333333333332,
1545
- "grad_norm": 0.46441182494163513,
1546
- "learning_rate": 4.938271604938271e-05,
1547
- "loss": 0.0255,
1548
  "step": 1890
1549
  },
1550
  {
1551
  "epoch": 23.45679012345679,
1552
- "grad_norm": 0.32864171266555786,
1553
- "learning_rate": 4.8468221307727485e-05,
1554
- "loss": 0.0307,
1555
  "step": 1900
1556
  },
1557
  {
1558
  "epoch": 23.45679012345679,
1559
- "eval_accuracy": 0.9308641975308642,
1560
- "eval_f1": 0.8330849478390462,
1561
- "eval_loss": 0.21188341081142426,
1562
- "eval_precision": 0.8330849478390462,
1563
- "eval_recall": 0.8330849478390462,
1564
- "eval_runtime": 2.4849,
1565
- "eval_samples_per_second": 130.385,
1566
- "eval_steps_per_second": 16.499,
1567
  "step": 1900
1568
  },
1569
  {
1570
  "epoch": 23.580246913580247,
1571
- "grad_norm": 0.06886278837919235,
1572
- "learning_rate": 4.755372656607225e-05,
1573
- "loss": 0.0283,
1574
  "step": 1910
1575
  },
1576
  {
1577
  "epoch": 23.703703703703702,
1578
- "grad_norm": 0.15195724368095398,
1579
- "learning_rate": 4.6639231824417016e-05,
1580
- "loss": 0.0274,
1581
  "step": 1920
1582
  },
1583
  {
1584
  "epoch": 23.82716049382716,
1585
- "grad_norm": 0.2511798143386841,
1586
- "learning_rate": 4.5724737082761774e-05,
1587
- "loss": 0.0254,
1588
  "step": 1930
1589
  },
1590
  {
1591
  "epoch": 23.950617283950617,
1592
- "grad_norm": 0.41059139370918274,
1593
- "learning_rate": 4.481024234110654e-05,
1594
- "loss": 0.0322,
1595
  "step": 1940
1596
  },
1597
  {
1598
  "epoch": 24.074074074074073,
1599
- "grad_norm": 0.22714923322200775,
1600
- "learning_rate": 4.3895747599451304e-05,
1601
- "loss": 0.0245,
1602
  "step": 1950
1603
  },
1604
  {
1605
  "epoch": 24.19753086419753,
1606
- "grad_norm": 0.4791244566440582,
1607
- "learning_rate": 4.298125285779607e-05,
1608
- "loss": 0.037,
1609
  "step": 1960
1610
  },
1611
  {
1612
  "epoch": 24.320987654320987,
1613
- "grad_norm": 0.24066051840782166,
1614
- "learning_rate": 4.2066758116140835e-05,
1615
- "loss": 0.0269,
1616
  "step": 1970
1617
  },
1618
  {
1619
  "epoch": 24.444444444444443,
1620
- "grad_norm": 0.43945303559303284,
1621
- "learning_rate": 4.11522633744856e-05,
1622
- "loss": 0.0355,
1623
  "step": 1980
1624
  },
1625
  {
1626
  "epoch": 24.567901234567902,
1627
- "grad_norm": 0.21818186342716217,
1628
- "learning_rate": 4.0237768632830365e-05,
1629
- "loss": 0.0223,
1630
  "step": 1990
1631
  },
1632
  {
1633
  "epoch": 24.691358024691358,
1634
- "grad_norm": 0.18842598795890808,
1635
- "learning_rate": 3.932327389117513e-05,
1636
- "loss": 0.0147,
1637
  "step": 2000
1638
  },
1639
  {
1640
  "epoch": 24.691358024691358,
1641
- "eval_accuracy": 0.9358024691358025,
1642
- "eval_f1": 0.844776119402985,
1643
- "eval_loss": 0.2153269648551941,
1644
- "eval_precision": 0.8460388639760837,
1645
- "eval_recall": 0.8435171385991058,
1646
- "eval_runtime": 3.6738,
1647
- "eval_samples_per_second": 88.193,
1648
- "eval_steps_per_second": 11.16,
1649
  "step": 2000
1650
  },
1651
  {
1652
  "epoch": 24.814814814814813,
1653
- "grad_norm": 0.13068807125091553,
1654
- "learning_rate": 3.840877914951989e-05,
1655
- "loss": 0.0217,
1656
  "step": 2010
1657
  },
1658
  {
1659
  "epoch": 24.938271604938272,
1660
- "grad_norm": 0.17009228467941284,
1661
- "learning_rate": 3.7494284407864654e-05,
1662
- "loss": 0.0217,
1663
  "step": 2020
1664
  },
1665
  {
1666
  "epoch": 25.061728395061728,
1667
- "grad_norm": 0.2452479898929596,
1668
- "learning_rate": 3.657978966620942e-05,
1669
- "loss": 0.0235,
1670
  "step": 2030
1671
  },
1672
  {
1673
  "epoch": 25.185185185185187,
1674
- "grad_norm": 0.09778738021850586,
1675
- "learning_rate": 3.566529492455419e-05,
1676
- "loss": 0.0222,
1677
  "step": 2040
1678
  },
1679
  {
1680
  "epoch": 25.308641975308642,
1681
- "grad_norm": 0.3846670687198639,
1682
- "learning_rate": 3.475080018289895e-05,
1683
- "loss": 0.0311,
1684
  "step": 2050
1685
  },
1686
  {
1687
  "epoch": 25.432098765432098,
1688
- "grad_norm": 0.3299199640750885,
1689
- "learning_rate": 3.3836305441243715e-05,
1690
- "loss": 0.0282,
1691
  "step": 2060
1692
  },
1693
  {
1694
  "epoch": 25.555555555555557,
1695
- "grad_norm": 0.33067819476127625,
1696
- "learning_rate": 3.292181069958848e-05,
1697
- "loss": 0.0342,
1698
  "step": 2070
1699
  },
1700
  {
1701
  "epoch": 25.679012345679013,
1702
- "grad_norm": 0.1282743513584137,
1703
- "learning_rate": 3.2007315957933245e-05,
1704
- "loss": 0.0228,
1705
  "step": 2080
1706
  },
1707
  {
1708
  "epoch": 25.80246913580247,
1709
- "grad_norm": 0.1972442865371704,
1710
- "learning_rate": 3.1092821216278004e-05,
1711
- "loss": 0.0217,
1712
  "step": 2090
1713
  },
1714
  {
1715
  "epoch": 25.925925925925927,
1716
- "grad_norm": 0.6959353089332581,
1717
- "learning_rate": 3.017832647462277e-05,
1718
- "loss": 0.03,
1719
  "step": 2100
1720
  },
1721
  {
1722
  "epoch": 25.925925925925927,
1723
- "eval_accuracy": 0.9342592592592592,
1724
- "eval_f1": 0.8421052631578947,
1725
- "eval_loss": 0.21478094160556793,
1726
- "eval_precision": 0.8377581120943953,
1727
- "eval_recall": 0.8464977645305514,
1728
- "eval_runtime": 2.688,
1729
- "eval_samples_per_second": 120.534,
1730
- "eval_steps_per_second": 15.253,
1731
  "step": 2100
1732
  },
1733
  {
1734
  "epoch": 26.049382716049383,
1735
- "grad_norm": 0.18116125464439392,
1736
- "learning_rate": 2.926383173296754e-05,
1737
- "loss": 0.0392,
1738
  "step": 2110
1739
  },
1740
  {
1741
  "epoch": 26.17283950617284,
1742
- "grad_norm": 0.12619613111019135,
1743
- "learning_rate": 2.8349336991312303e-05,
1744
- "loss": 0.0189,
1745
  "step": 2120
1746
  },
1747
  {
1748
  "epoch": 26.296296296296298,
1749
- "grad_norm": 0.041821230202913284,
1750
- "learning_rate": 2.7434842249657068e-05,
1751
- "loss": 0.0334,
1752
  "step": 2130
1753
  },
1754
  {
1755
  "epoch": 26.419753086419753,
1756
- "grad_norm": 0.25579944252967834,
1757
- "learning_rate": 2.652034750800183e-05,
1758
- "loss": 0.0252,
1759
  "step": 2140
1760
  },
1761
  {
1762
  "epoch": 26.54320987654321,
1763
- "grad_norm": 0.5486050844192505,
1764
- "learning_rate": 2.5605852766346595e-05,
1765
- "loss": 0.022,
1766
  "step": 2150
1767
  },
1768
  {
1769
  "epoch": 26.666666666666668,
1770
- "grad_norm": 0.204714834690094,
1771
- "learning_rate": 2.4691358024691357e-05,
1772
- "loss": 0.0242,
1773
  "step": 2160
1774
  },
1775
  {
1776
  "epoch": 26.790123456790123,
1777
- "grad_norm": 0.9011967182159424,
1778
- "learning_rate": 2.3776863283036125e-05,
1779
- "loss": 0.0416,
1780
  "step": 2170
1781
  },
1782
  {
1783
  "epoch": 26.91358024691358,
1784
- "grad_norm": 0.072830930352211,
1785
- "learning_rate": 2.2862368541380887e-05,
1786
- "loss": 0.0148,
1787
  "step": 2180
1788
  },
1789
  {
1790
  "epoch": 27.037037037037038,
1791
- "grad_norm": 0.13628140091896057,
1792
- "learning_rate": 2.1947873799725652e-05,
1793
- "loss": 0.0256,
1794
  "step": 2190
1795
  },
1796
  {
1797
  "epoch": 27.160493827160494,
1798
- "grad_norm": 0.48816823959350586,
1799
- "learning_rate": 2.1033379058070417e-05,
1800
- "loss": 0.0228,
1801
  "step": 2200
1802
  },
1803
  {
1804
  "epoch": 27.160493827160494,
1805
- "eval_accuracy": 0.9385802469135802,
1806
- "eval_f1": 0.8511593118922962,
1807
- "eval_loss": 0.20911623537540436,
1808
- "eval_precision": 0.8543543543543544,
1809
- "eval_recall": 0.8479880774962743,
1810
- "eval_runtime": 3.05,
1811
- "eval_samples_per_second": 106.231,
1812
- "eval_steps_per_second": 13.443,
1813
  "step": 2200
1814
  },
1815
  {
1816
  "epoch": 27.28395061728395,
1817
- "grad_norm": 0.17725728452205658,
1818
- "learning_rate": 2.0118884316415183e-05,
1819
- "loss": 0.0367,
1820
  "step": 2210
1821
  },
1822
  {
1823
  "epoch": 27.40740740740741,
1824
- "grad_norm": 0.14292789995670319,
1825
- "learning_rate": 1.9204389574759944e-05,
1826
- "loss": 0.0179,
1827
  "step": 2220
1828
  },
1829
  {
1830
  "epoch": 27.530864197530864,
1831
- "grad_norm": 0.5695340037345886,
1832
- "learning_rate": 1.828989483310471e-05,
1833
- "loss": 0.025,
1834
  "step": 2230
1835
  },
1836
  {
1837
  "epoch": 27.65432098765432,
1838
- "grad_norm": 0.12956739962100983,
1839
- "learning_rate": 1.7375400091449475e-05,
1840
- "loss": 0.024,
1841
  "step": 2240
1842
  },
1843
  {
1844
  "epoch": 27.77777777777778,
1845
- "grad_norm": 0.07494452595710754,
1846
- "learning_rate": 1.646090534979424e-05,
1847
- "loss": 0.0313,
1848
  "step": 2250
1849
  },
1850
  {
1851
  "epoch": 27.901234567901234,
1852
- "grad_norm": 0.1356113702058792,
1853
- "learning_rate": 1.5546410608139002e-05,
1854
- "loss": 0.0146,
1855
  "step": 2260
1856
  },
1857
  {
1858
  "epoch": 28.02469135802469,
1859
- "grad_norm": 0.08553437143564224,
1860
- "learning_rate": 1.463191586648377e-05,
1861
- "loss": 0.0184,
1862
  "step": 2270
1863
  },
1864
  {
1865
  "epoch": 28.14814814814815,
1866
- "grad_norm": 0.6898983120918274,
1867
- "learning_rate": 1.3717421124828534e-05,
1868
- "loss": 0.0215,
1869
  "step": 2280
1870
  },
1871
  {
1872
  "epoch": 28.271604938271604,
1873
- "grad_norm": 0.150452122092247,
1874
- "learning_rate": 1.2802926383173297e-05,
1875
- "loss": 0.0289,
1876
  "step": 2290
1877
  },
1878
  {
1879
  "epoch": 28.395061728395063,
1880
- "grad_norm": 0.23294192552566528,
1881
- "learning_rate": 1.1888431641518063e-05,
1882
- "loss": 0.0167,
1883
  "step": 2300
1884
  },
1885
  {
1886
  "epoch": 28.395061728395063,
1887
- "eval_accuracy": 0.9361111111111111,
1888
- "eval_f1": 0.8447111777944486,
1889
- "eval_loss": 0.21039289236068726,
1890
- "eval_precision": 0.850453172205438,
1891
- "eval_recall": 0.8390461997019374,
1892
- "eval_runtime": 2.8019,
1893
- "eval_samples_per_second": 115.635,
1894
- "eval_steps_per_second": 14.633,
1895
  "step": 2300
1896
  },
1897
  {
1898
  "epoch": 28.51851851851852,
1899
- "grad_norm": 0.04379408434033394,
1900
- "learning_rate": 1.0973936899862826e-05,
1901
- "loss": 0.0171,
1902
  "step": 2310
1903
  },
1904
  {
1905
  "epoch": 28.641975308641975,
1906
- "grad_norm": 0.35769739747047424,
1907
- "learning_rate": 1.0059442158207591e-05,
1908
- "loss": 0.0185,
1909
  "step": 2320
1910
  },
1911
  {
1912
  "epoch": 28.765432098765434,
1913
- "grad_norm": 0.1033988893032074,
1914
- "learning_rate": 9.144947416552355e-06,
1915
- "loss": 0.0225,
1916
  "step": 2330
1917
  },
1918
  {
1919
  "epoch": 28.88888888888889,
1920
- "grad_norm": 0.26813939213752747,
1921
- "learning_rate": 8.23045267489712e-06,
1922
- "loss": 0.0187,
1923
  "step": 2340
1924
  },
1925
  {
1926
  "epoch": 29.012345679012345,
1927
- "grad_norm": 0.23706835508346558,
1928
- "learning_rate": 7.315957933241885e-06,
1929
- "loss": 0.0169,
1930
  "step": 2350
1931
  },
1932
  {
1933
  "epoch": 29.135802469135804,
1934
- "grad_norm": 0.2444809526205063,
1935
- "learning_rate": 6.401463191586649e-06,
1936
- "loss": 0.0125,
1937
  "step": 2360
1938
  },
1939
  {
1940
  "epoch": 29.25925925925926,
1941
- "grad_norm": 0.22470110654830933,
1942
- "learning_rate": 5.486968449931413e-06,
1943
- "loss": 0.0291,
1944
  "step": 2370
1945
  },
1946
  {
1947
  "epoch": 29.382716049382715,
1948
- "grad_norm": 0.10346284508705139,
1949
- "learning_rate": 4.572473708276177e-06,
1950
- "loss": 0.0234,
1951
  "step": 2380
1952
  },
1953
  {
1954
  "epoch": 29.506172839506174,
1955
- "grad_norm": 0.3980329930782318,
1956
- "learning_rate": 3.6579789666209426e-06,
1957
- "loss": 0.0175,
1958
  "step": 2390
1959
  },
1960
  {
1961
  "epoch": 29.62962962962963,
1962
- "grad_norm": 0.19435462355613708,
1963
- "learning_rate": 2.7434842249657065e-06,
1964
- "loss": 0.0273,
1965
  "step": 2400
1966
  },
1967
  {
1968
  "epoch": 29.62962962962963,
1969
- "eval_accuracy": 0.9364197530864198,
1970
- "eval_f1": 0.8458083832335329,
1971
- "eval_loss": 0.20891080796718597,
1972
- "eval_precision": 0.849624060150376,
1973
- "eval_recall": 0.842026825633383,
1974
- "eval_runtime": 2.5442,
1975
- "eval_samples_per_second": 127.348,
1976
- "eval_steps_per_second": 16.115,
1977
  "step": 2400
1978
  },
1979
  {
1980
  "epoch": 29.753086419753085,
1981
- "grad_norm": 0.3585284352302551,
1982
- "learning_rate": 1.8289894833104713e-06,
1983
- "loss": 0.0206,
1984
  "step": 2410
1985
  },
1986
  {
1987
  "epoch": 29.876543209876544,
1988
- "grad_norm": 0.11663785576820374,
1989
- "learning_rate": 9.144947416552356e-07,
1990
- "loss": 0.0169,
1991
  "step": 2420
1992
  },
1993
  {
1994
  "epoch": 30.0,
1995
- "grad_norm": 0.14282989501953125,
1996
  "learning_rate": 0.0,
1997
- "loss": 0.0191,
1998
  "step": 2430
1999
  },
2000
  {
2001
  "epoch": 30.0,
2002
  "step": 2430,
2003
  "total_flos": 3.008454731998249e+18,
2004
- "train_loss": 0.08984789192185971,
2005
- "train_runtime": 670.5653,
2006
- "train_samples_per_second": 57.891,
2007
- "train_steps_per_second": 3.624
2008
  }
2009
  ],
2010
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.15773314237594604,
3
+ "best_model_checkpoint": "frostsolutions/frost-vision-v2-google_vit-base-patch16-224-v2024-11-14/checkpoint-1000",
4
  "epoch": 30.0,
5
  "eval_steps": 100,
6
  "global_step": 2430,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.12345679012345678,
13
+ "grad_norm": 2.0258209705352783,
14
+ "learning_rate": 2.05761316872428e-06,
15
+ "loss": 0.7157,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.24691358024691357,
20
+ "grad_norm": 1.8047471046447754,
21
+ "learning_rate": 4.11522633744856e-06,
22
+ "loss": 0.6777,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.37037037037037035,
27
+ "grad_norm": 1.81009840965271,
28
+ "learning_rate": 6.172839506172839e-06,
29
+ "loss": 0.6489,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.49382716049382713,
34
+ "grad_norm": 1.4138026237487793,
35
+ "learning_rate": 8.23045267489712e-06,
36
+ "loss": 0.5874,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.6172839506172839,
41
+ "grad_norm": 1.4049081802368164,
42
+ "learning_rate": 1.02880658436214e-05,
43
+ "loss": 0.5259,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.7407407407407407,
48
+ "grad_norm": 1.0964142084121704,
49
+ "learning_rate": 1.2345679012345678e-05,
50
+ "loss": 0.4743,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.8641975308641975,
55
+ "grad_norm": 0.8998202681541443,
56
+ "learning_rate": 1.440329218106996e-05,
57
+ "loss": 0.4258,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.9876543209876543,
62
+ "grad_norm": 1.0498803853988647,
63
+ "learning_rate": 1.646090534979424e-05,
64
+ "loss": 0.4061,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 1.1111111111111112,
69
+ "grad_norm": 0.8372374773025513,
70
+ "learning_rate": 1.8518518518518518e-05,
71
+ "loss": 0.3647,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 1.2345679012345678,
76
+ "grad_norm": 0.9855860471725464,
77
+ "learning_rate": 2.05761316872428e-05,
78
+ "loss": 0.3381,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 1.2345679012345678,
83
+ "eval_accuracy": 0.8660493827160494,
84
+ "eval_f1": 0.5668662674650699,
85
+ "eval_loss": 0.3270835876464844,
86
+ "eval_precision": 0.8045325779036827,
87
+ "eval_recall": 0.43759630200308164,
88
+ "eval_runtime": 2.3341,
89
+ "eval_samples_per_second": 138.812,
90
+ "eval_steps_per_second": 17.566,
91
  "step": 100
92
  },
93
  {
94
  "epoch": 1.3580246913580247,
95
+ "grad_norm": 0.8622191548347473,
96
+ "learning_rate": 2.2633744855967078e-05,
97
+ "loss": 0.3148,
98
  "step": 110
99
  },
100
  {
101
  "epoch": 1.4814814814814814,
102
+ "grad_norm": 0.9625290036201477,
103
+ "learning_rate": 2.4691358024691357e-05,
104
+ "loss": 0.3021,
105
  "step": 120
106
  },
107
  {
108
  "epoch": 1.6049382716049383,
109
+ "grad_norm": 0.9755824208259583,
110
+ "learning_rate": 2.6748971193415638e-05,
111
+ "loss": 0.2781,
112
  "step": 130
113
  },
114
  {
115
  "epoch": 1.7283950617283952,
116
+ "grad_norm": 0.7279321551322937,
117
+ "learning_rate": 2.880658436213992e-05,
118
+ "loss": 0.245,
119
  "step": 140
120
  },
121
  {
122
  "epoch": 1.8518518518518519,
123
+ "grad_norm": 0.8370109796524048,
124
+ "learning_rate": 3.08641975308642e-05,
125
+ "loss": 0.2747,
126
  "step": 150
127
  },
128
  {
129
  "epoch": 1.9753086419753085,
130
+ "grad_norm": 0.9776009917259216,
131
+ "learning_rate": 3.292181069958848e-05,
132
+ "loss": 0.2741,
133
  "step": 160
134
  },
135
  {
136
  "epoch": 2.0987654320987654,
137
+ "grad_norm": 0.7717245221138,
138
+ "learning_rate": 3.497942386831276e-05,
139
+ "loss": 0.2285,
140
  "step": 170
141
  },
142
  {
143
  "epoch": 2.2222222222222223,
144
+ "grad_norm": 0.9406857490539551,
145
+ "learning_rate": 3.7037037037037037e-05,
146
+ "loss": 0.2179,
147
  "step": 180
148
  },
149
  {
150
  "epoch": 2.3456790123456788,
151
+ "grad_norm": 0.9723733067512512,
152
+ "learning_rate": 3.909465020576132e-05,
153
+ "loss": 0.2213,
154
  "step": 190
155
  },
156
  {
157
  "epoch": 2.4691358024691357,
158
+ "grad_norm": 1.2755147218704224,
159
+ "learning_rate": 4.11522633744856e-05,
160
+ "loss": 0.2067,
161
  "step": 200
162
  },
163
  {
164
  "epoch": 2.4691358024691357,
165
+ "eval_accuracy": 0.9194444444444444,
166
+ "eval_f1": 0.7826810990840966,
167
+ "eval_loss": 0.20802636444568634,
168
+ "eval_precision": 0.8514492753623188,
169
+ "eval_recall": 0.724191063174114,
170
+ "eval_runtime": 2.6756,
171
+ "eval_samples_per_second": 121.096,
172
+ "eval_steps_per_second": 15.324,
173
  "step": 200
174
  },
175
  {
176
  "epoch": 2.5925925925925926,
177
+ "grad_norm": 0.6742503643035889,
178
+ "learning_rate": 4.3209876543209875e-05,
179
+ "loss": 0.1916,
180
  "step": 210
181
  },
182
  {
183
  "epoch": 2.7160493827160495,
184
+ "grad_norm": 0.9410703182220459,
185
+ "learning_rate": 4.5267489711934157e-05,
186
+ "loss": 0.1932,
187
  "step": 220
188
  },
189
  {
190
  "epoch": 2.8395061728395063,
191
+ "grad_norm": 1.5295264720916748,
192
+ "learning_rate": 4.732510288065844e-05,
193
+ "loss": 0.1991,
194
  "step": 230
195
  },
196
  {
197
  "epoch": 2.962962962962963,
198
+ "grad_norm": 1.7802011966705322,
199
+ "learning_rate": 4.938271604938271e-05,
200
+ "loss": 0.2272,
201
  "step": 240
202
  },
203
  {
204
  "epoch": 3.0864197530864197,
205
+ "grad_norm": 1.6310603618621826,
206
+ "learning_rate": 4.983996342021033e-05,
207
+ "loss": 0.1905,
208
  "step": 250
209
  },
210
  {
211
  "epoch": 3.2098765432098766,
212
+ "grad_norm": 0.588090181350708,
213
+ "learning_rate": 4.9611339734796525e-05,
214
+ "loss": 0.1577,
215
  "step": 260
216
  },
217
  {
218
  "epoch": 3.3333333333333335,
219
+ "grad_norm": 0.7364725470542908,
220
+ "learning_rate": 4.938271604938271e-05,
221
+ "loss": 0.1972,
222
  "step": 270
223
  },
224
  {
225
  "epoch": 3.45679012345679,
226
+ "grad_norm": 0.9553163647651672,
227
+ "learning_rate": 4.9154092363968915e-05,
228
+ "loss": 0.1468,
229
  "step": 280
230
  },
231
  {
232
  "epoch": 3.580246913580247,
233
+ "grad_norm": 0.9130911827087402,
234
+ "learning_rate": 4.89254686785551e-05,
235
+ "loss": 0.1894,
236
  "step": 290
237
  },
238
  {
239
  "epoch": 3.7037037037037037,
240
+ "grad_norm": 0.895966649055481,
241
+ "learning_rate": 4.86968449931413e-05,
242
+ "loss": 0.1745,
243
  "step": 300
244
  },
245
  {
246
  "epoch": 3.7037037037037037,
247
+ "eval_accuracy": 0.9228395061728395,
248
+ "eval_f1": 0.8003194888178914,
249
+ "eval_loss": 0.18639631569385529,
250
+ "eval_precision": 0.8308457711442786,
251
+ "eval_recall": 0.7719568567026194,
252
+ "eval_runtime": 3.3226,
253
+ "eval_samples_per_second": 97.513,
254
+ "eval_steps_per_second": 12.34,
255
  "step": 300
256
  },
257
  {
258
  "epoch": 3.8271604938271606,
259
+ "grad_norm": 0.9509402513504028,
260
+ "learning_rate": 4.8468221307727485e-05,
261
+ "loss": 0.1948,
262
  "step": 310
263
  },
264
  {
265
  "epoch": 3.950617283950617,
266
+ "grad_norm": 0.959931492805481,
267
+ "learning_rate": 4.823959762231367e-05,
268
+ "loss": 0.1824,
269
  "step": 320
270
  },
271
  {
272
  "epoch": 4.074074074074074,
273
+ "grad_norm": 0.6884809732437134,
274
+ "learning_rate": 4.801097393689987e-05,
275
+ "loss": 0.156,
276
  "step": 330
277
  },
278
  {
279
  "epoch": 4.197530864197531,
280
+ "grad_norm": 0.9225603342056274,
281
+ "learning_rate": 4.7782350251486056e-05,
282
+ "loss": 0.1333,
283
  "step": 340
284
  },
285
  {
286
  "epoch": 4.320987654320987,
287
+ "grad_norm": 1.0396263599395752,
288
+ "learning_rate": 4.755372656607225e-05,
289
+ "loss": 0.1715,
290
  "step": 350
291
  },
292
  {
293
  "epoch": 4.444444444444445,
294
+ "grad_norm": 0.6264840364456177,
295
+ "learning_rate": 4.732510288065844e-05,
296
+ "loss": 0.1491,
297
  "step": 360
298
  },
299
  {
300
  "epoch": 4.567901234567901,
301
+ "grad_norm": 0.756226658821106,
302
+ "learning_rate": 4.709647919524463e-05,
303
+ "loss": 0.1563,
304
  "step": 370
305
  },
306
  {
307
  "epoch": 4.6913580246913575,
308
+ "grad_norm": 0.9274685382843018,
309
+ "learning_rate": 4.686785550983082e-05,
310
+ "loss": 0.1746,
311
  "step": 380
312
  },
313
  {
314
  "epoch": 4.814814814814815,
315
+ "grad_norm": 0.8389601111412048,
316
+ "learning_rate": 4.6639231824417016e-05,
317
+ "loss": 0.14,
318
  "step": 390
319
  },
320
  {
321
  "epoch": 4.938271604938271,
322
+ "grad_norm": 0.7658799290657043,
323
+ "learning_rate": 4.6410608139003203e-05,
324
+ "loss": 0.1724,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 4.938271604938271,
329
+ "eval_accuracy": 0.9299382716049382,
330
+ "eval_f1": 0.8188347964884277,
331
+ "eval_loss": 0.17921936511993408,
332
+ "eval_precision": 0.8493377483443708,
333
+ "eval_recall": 0.7904468412942989,
334
+ "eval_runtime": 2.4889,
335
+ "eval_samples_per_second": 130.176,
336
+ "eval_steps_per_second": 16.473,
337
  "step": 400
338
  },
339
  {
340
  "epoch": 5.061728395061729,
341
+ "grad_norm": 0.47751790285110474,
342
+ "learning_rate": 4.618198445358939e-05,
343
+ "loss": 0.1397,
344
  "step": 410
345
  },
346
  {
347
  "epoch": 5.185185185185185,
348
+ "grad_norm": 0.8206377625465393,
349
+ "learning_rate": 4.5953360768175586e-05,
350
+ "loss": 0.1332,
351
  "step": 420
352
  },
353
  {
354
  "epoch": 5.308641975308642,
355
+ "grad_norm": 1.053655743598938,
356
+ "learning_rate": 4.5724737082761774e-05,
357
+ "loss": 0.1524,
358
  "step": 430
359
  },
360
  {
361
  "epoch": 5.432098765432099,
362
+ "grad_norm": 0.8656085729598999,
363
+ "learning_rate": 4.549611339734797e-05,
364
+ "loss": 0.1215,
365
  "step": 440
366
  },
367
  {
368
  "epoch": 5.555555555555555,
369
+ "grad_norm": 0.7634124755859375,
370
+ "learning_rate": 4.5267489711934157e-05,
371
+ "loss": 0.1291,
372
  "step": 450
373
  },
374
  {
375
  "epoch": 5.679012345679013,
376
+ "grad_norm": 0.6214159727096558,
377
+ "learning_rate": 4.503886602652035e-05,
378
+ "loss": 0.1314,
379
  "step": 460
380
  },
381
  {
382
  "epoch": 5.802469135802469,
383
+ "grad_norm": 1.1487807035446167,
384
+ "learning_rate": 4.481024234110654e-05,
385
+ "loss": 0.1372,
386
  "step": 470
387
  },
388
  {
389
  "epoch": 5.925925925925926,
390
+ "grad_norm": 0.7788066864013672,
391
+ "learning_rate": 4.4581618655692734e-05,
392
+ "loss": 0.1399,
393
  "step": 480
394
  },
395
  {
396
  "epoch": 6.049382716049383,
397
+ "grad_norm": 0.6658753156661987,
398
+ "learning_rate": 4.435299497027892e-05,
399
+ "loss": 0.1463,
400
  "step": 490
401
  },
402
  {
403
  "epoch": 6.172839506172839,
404
+ "grad_norm": 1.125616192817688,
405
+ "learning_rate": 4.412437128486511e-05,
406
+ "loss": 0.128,
407
  "step": 500
408
  },
409
  {
410
  "epoch": 6.172839506172839,
411
+ "eval_accuracy": 0.932716049382716,
412
+ "eval_f1": 0.829153605015674,
413
+ "eval_loss": 0.17359426617622375,
414
+ "eval_precision": 0.8437001594896332,
415
+ "eval_recall": 0.8151001540832049,
416
+ "eval_runtime": 2.3597,
417
+ "eval_samples_per_second": 137.303,
418
+ "eval_steps_per_second": 17.375,
419
  "step": 500
420
  },
421
  {
422
  "epoch": 6.296296296296296,
423
+ "grad_norm": 0.8468999862670898,
424
+ "learning_rate": 4.3895747599451304e-05,
425
+ "loss": 0.1085,
426
  "step": 510
427
  },
428
  {
429
  "epoch": 6.419753086419753,
430
+ "grad_norm": 0.8319602012634277,
431
+ "learning_rate": 4.366712391403749e-05,
432
+ "loss": 0.1436,
433
  "step": 520
434
  },
435
  {
436
  "epoch": 6.54320987654321,
437
+ "grad_norm": 1.0661511421203613,
438
+ "learning_rate": 4.343850022862369e-05,
439
+ "loss": 0.1258,
440
  "step": 530
441
  },
442
  {
443
  "epoch": 6.666666666666667,
444
+ "grad_norm": 1.0158193111419678,
445
+ "learning_rate": 4.3209876543209875e-05,
446
+ "loss": 0.1173,
447
  "step": 540
448
  },
449
  {
450
  "epoch": 6.790123456790123,
451
+ "grad_norm": 0.996032178401947,
452
+ "learning_rate": 4.298125285779607e-05,
453
+ "loss": 0.123,
454
  "step": 550
455
  },
456
  {
457
  "epoch": 6.91358024691358,
458
+ "grad_norm": 0.8598793148994446,
459
+ "learning_rate": 4.2752629172382264e-05,
460
+ "loss": 0.0955,
461
  "step": 560
462
  },
463
  {
464
  "epoch": 7.037037037037037,
465
+ "grad_norm": 0.7240075469017029,
466
+ "learning_rate": 4.252400548696845e-05,
467
+ "loss": 0.1133,
468
  "step": 570
469
  },
470
  {
471
  "epoch": 7.160493827160494,
472
+ "grad_norm": 1.5954258441925049,
473
+ "learning_rate": 4.229538180155465e-05,
474
+ "loss": 0.1092,
475
  "step": 580
476
  },
477
  {
478
  "epoch": 7.283950617283951,
479
+ "grad_norm": 0.41076725721359253,
480
+ "learning_rate": 4.2066758116140835e-05,
481
+ "loss": 0.1059,
482
  "step": 590
483
  },
484
  {
485
  "epoch": 7.407407407407407,
486
+ "grad_norm": 1.1242218017578125,
487
+ "learning_rate": 4.183813443072703e-05,
488
+ "loss": 0.1034,
489
  "step": 600
490
  },
491
  {
492
  "epoch": 7.407407407407407,
493
+ "eval_accuracy": 0.9354938271604938,
494
+ "eval_f1": 0.8347826086956521,
495
+ "eval_loss": 0.16716818511486053,
496
+ "eval_precision": 0.8571428571428571,
497
+ "eval_recall": 0.8135593220338984,
498
+ "eval_runtime": 2.3423,
499
+ "eval_samples_per_second": 138.328,
500
+ "eval_steps_per_second": 17.504,
501
  "step": 600
502
  },
503
  {
504
  "epoch": 7.530864197530864,
505
+ "grad_norm": 0.97056645154953,
506
+ "learning_rate": 4.160951074531322e-05,
507
+ "loss": 0.1239,
508
  "step": 610
509
  },
510
  {
511
  "epoch": 7.654320987654321,
512
+ "grad_norm": 0.8477165102958679,
513
+ "learning_rate": 4.138088705989941e-05,
514
+ "loss": 0.1164,
515
  "step": 620
516
  },
517
  {
518
  "epoch": 7.777777777777778,
519
+ "grad_norm": 0.7572548389434814,
520
+ "learning_rate": 4.11522633744856e-05,
521
+ "loss": 0.1156,
522
  "step": 630
523
  },
524
  {
525
  "epoch": 7.901234567901234,
526
+ "grad_norm": 1.5163884162902832,
527
+ "learning_rate": 4.092363968907179e-05,
528
+ "loss": 0.1195,
529
  "step": 640
530
  },
531
  {
532
  "epoch": 8.024691358024691,
533
+ "grad_norm": 0.7944018840789795,
534
+ "learning_rate": 4.069501600365798e-05,
535
+ "loss": 0.1086,
536
  "step": 650
537
  },
538
  {
539
  "epoch": 8.148148148148149,
540
+ "grad_norm": 1.1152986288070679,
541
+ "learning_rate": 4.046639231824417e-05,
542
+ "loss": 0.1101,
543
  "step": 660
544
  },
545
  {
546
  "epoch": 8.271604938271604,
547
+ "grad_norm": 0.7614450454711914,
548
+ "learning_rate": 4.0237768632830365e-05,
549
+ "loss": 0.1004,
550
  "step": 670
551
  },
552
  {
553
  "epoch": 8.395061728395062,
554
+ "grad_norm": 0.6931539177894592,
555
+ "learning_rate": 4.000914494741655e-05,
556
+ "loss": 0.1047,
557
  "step": 680
558
  },
559
  {
560
  "epoch": 8.518518518518519,
561
+ "grad_norm": 0.6190685629844666,
562
+ "learning_rate": 3.978052126200275e-05,
563
+ "loss": 0.0811,
564
  "step": 690
565
  },
566
  {
567
  "epoch": 8.641975308641975,
568
+ "grad_norm": 0.814577579498291,
569
+ "learning_rate": 3.9551897576588936e-05,
570
+ "loss": 0.0944,
571
  "step": 700
572
  },
573
  {
574
  "epoch": 8.641975308641975,
575
+ "eval_accuracy": 0.9391975308641975,
576
+ "eval_f1": 0.845247446975648,
577
+ "eval_loss": 0.15788604319095612,
578
+ "eval_precision": 0.8621794871794872,
579
+ "eval_recall": 0.8289676425269645,
580
+ "eval_runtime": 2.3023,
581
+ "eval_samples_per_second": 140.727,
582
+ "eval_steps_per_second": 17.808,
583
  "step": 700
584
  },
585
  {
586
  "epoch": 8.765432098765432,
587
+ "grad_norm": 0.6677612066268921,
588
+ "learning_rate": 3.932327389117513e-05,
589
+ "loss": 0.0907,
590
  "step": 710
591
  },
592
  {
593
  "epoch": 8.88888888888889,
594
+ "grad_norm": 1.435887098312378,
595
+ "learning_rate": 3.909465020576132e-05,
596
+ "loss": 0.0941,
597
  "step": 720
598
  },
599
  {
600
  "epoch": 9.012345679012345,
601
+ "grad_norm": 0.8256314992904663,
602
+ "learning_rate": 3.8866026520347506e-05,
603
+ "loss": 0.1155,
604
  "step": 730
605
  },
606
  {
607
  "epoch": 9.135802469135802,
608
+ "grad_norm": 0.7372011542320251,
609
+ "learning_rate": 3.86374028349337e-05,
610
+ "loss": 0.0868,
611
  "step": 740
612
  },
613
  {
614
  "epoch": 9.25925925925926,
615
+ "grad_norm": 0.6247950196266174,
616
+ "learning_rate": 3.840877914951989e-05,
617
+ "loss": 0.0791,
618
  "step": 750
619
  },
620
  {
621
  "epoch": 9.382716049382717,
622
+ "grad_norm": 0.7330453395843506,
623
+ "learning_rate": 3.8180155464106083e-05,
624
+ "loss": 0.0766,
625
  "step": 760
626
  },
627
  {
628
  "epoch": 9.506172839506172,
629
+ "grad_norm": 0.946180522441864,
630
+ "learning_rate": 3.795153177869227e-05,
631
+ "loss": 0.092,
632
  "step": 770
633
  },
634
  {
635
  "epoch": 9.62962962962963,
636
+ "grad_norm": 0.6986804008483887,
637
+ "learning_rate": 3.7722908093278466e-05,
638
+ "loss": 0.0695,
639
  "step": 780
640
  },
641
  {
642
  "epoch": 9.753086419753087,
643
+ "grad_norm": 0.4678453505039215,
644
+ "learning_rate": 3.7494284407864654e-05,
645
+ "loss": 0.0896,
646
  "step": 790
647
  },
648
  {
649
  "epoch": 9.876543209876543,
650
+ "grad_norm": 0.8175441026687622,
651
+ "learning_rate": 3.726566072245085e-05,
652
+ "loss": 0.0919,
653
  "step": 800
654
  },
655
  {
656
  "epoch": 9.876543209876543,
657
+ "eval_accuracy": 0.9364197530864198,
658
+ "eval_f1": 0.8346709470304976,
659
+ "eval_loss": 0.16307669878005981,
660
+ "eval_precision": 0.8710217755443886,
661
+ "eval_recall": 0.8012326656394453,
662
+ "eval_runtime": 2.5754,
663
+ "eval_samples_per_second": 125.805,
664
+ "eval_steps_per_second": 15.92,
665
  "step": 800
666
  },
667
  {
668
  "epoch": 10.0,
669
+ "grad_norm": 0.7374185919761658,
670
+ "learning_rate": 3.7037037037037037e-05,
671
+ "loss": 0.1001,
672
  "step": 810
673
  },
674
  {
675
  "epoch": 10.123456790123457,
676
+ "grad_norm": 0.5478447079658508,
677
+ "learning_rate": 3.6808413351623224e-05,
678
+ "loss": 0.0939,
679
  "step": 820
680
  },
681
  {
682
  "epoch": 10.246913580246913,
683
+ "grad_norm": 0.8081673979759216,
684
+ "learning_rate": 3.657978966620942e-05,
685
+ "loss": 0.0704,
686
  "step": 830
687
  },
688
  {
689
  "epoch": 10.37037037037037,
690
+ "grad_norm": 0.7327490448951721,
691
+ "learning_rate": 3.635116598079561e-05,
692
+ "loss": 0.0742,
693
  "step": 840
694
  },
695
  {
696
  "epoch": 10.493827160493828,
697
+ "grad_norm": 0.6625596284866333,
698
+ "learning_rate": 3.612254229538181e-05,
699
+ "loss": 0.0784,
700
  "step": 850
701
  },
702
  {
703
  "epoch": 10.617283950617283,
704
+ "grad_norm": 1.724616289138794,
705
+ "learning_rate": 3.5893918609967996e-05,
706
+ "loss": 0.0848,
707
  "step": 860
708
  },
709
  {
710
  "epoch": 10.74074074074074,
711
+ "grad_norm": 0.6218889951705933,
712
+ "learning_rate": 3.566529492455419e-05,
713
+ "loss": 0.0752,
714
  "step": 870
715
  },
716
  {
717
  "epoch": 10.864197530864198,
718
+ "grad_norm": 1.1755985021591187,
719
+ "learning_rate": 3.543667123914038e-05,
720
+ "loss": 0.0751,
721
  "step": 880
722
  },
723
  {
724
  "epoch": 10.987654320987655,
725
+ "grad_norm": 1.2024366855621338,
726
+ "learning_rate": 3.520804755372657e-05,
727
+ "loss": 0.0827,
728
  "step": 890
729
  },
730
  {
731
  "epoch": 11.11111111111111,
732
+ "grad_norm": 1.065391182899475,
733
+ "learning_rate": 3.497942386831276e-05,
734
+ "loss": 0.0791,
735
  "step": 900
736
  },
737
  {
738
  "epoch": 11.11111111111111,
739
+ "eval_accuracy": 0.937962962962963,
740
+ "eval_f1": 0.8382944489139179,
741
+ "eval_loss": 0.15917660295963287,
742
+ "eval_precision": 0.877104377104377,
743
+ "eval_recall": 0.802773497688752,
744
+ "eval_runtime": 2.965,
745
+ "eval_samples_per_second": 109.275,
746
+ "eval_steps_per_second": 13.828,
747
  "step": 900
748
  },
749
  {
750
  "epoch": 11.234567901234568,
751
+ "grad_norm": 0.6627580523490906,
752
+ "learning_rate": 3.475080018289895e-05,
753
  "loss": 0.0758,
754
  "step": 910
755
  },
756
  {
757
  "epoch": 11.358024691358025,
758
+ "grad_norm": 0.5924692749977112,
759
+ "learning_rate": 3.4522176497485144e-05,
760
+ "loss": 0.0672,
761
  "step": 920
762
  },
763
  {
764
  "epoch": 11.481481481481481,
765
+ "grad_norm": 0.5990743637084961,
766
+ "learning_rate": 3.429355281207133e-05,
767
+ "loss": 0.0822,
768
  "step": 930
769
  },
770
  {
771
  "epoch": 11.604938271604938,
772
+ "grad_norm": 0.9133488535881042,
773
+ "learning_rate": 3.406492912665753e-05,
774
+ "loss": 0.0883,
775
  "step": 940
776
  },
777
  {
778
  "epoch": 11.728395061728396,
779
+ "grad_norm": 1.091486930847168,
780
+ "learning_rate": 3.3836305441243715e-05,
781
+ "loss": 0.0639,
782
  "step": 950
783
  },
784
  {
785
  "epoch": 11.851851851851851,
786
+ "grad_norm": 0.3895137310028076,
787
+ "learning_rate": 3.360768175582991e-05,
788
+ "loss": 0.0687,
789
  "step": 960
790
  },
791
  {
792
  "epoch": 11.975308641975309,
793
+ "grad_norm": 0.5844981074333191,
794
+ "learning_rate": 3.33790580704161e-05,
795
+ "loss": 0.0576,
796
  "step": 970
797
  },
798
  {
799
  "epoch": 12.098765432098766,
800
+ "grad_norm": 0.3059285879135132,
801
+ "learning_rate": 3.3150434385002285e-05,
802
+ "loss": 0.0689,
803
  "step": 980
804
  },
805
  {
806
  "epoch": 12.222222222222221,
807
+ "grad_norm": 0.6075615286827087,
808
+ "learning_rate": 3.292181069958848e-05,
809
+ "loss": 0.0525,
810
  "step": 990
811
  },
812
  {
813
  "epoch": 12.345679012345679,
814
+ "grad_norm": 0.8529097437858582,
815
+ "learning_rate": 3.269318701417467e-05,
816
+ "loss": 0.0684,
817
  "step": 1000
818
  },
819
  {
820
  "epoch": 12.345679012345679,
821
+ "eval_accuracy": 0.9388888888888889,
822
+ "eval_f1": 0.8436018957345972,
823
+ "eval_loss": 0.15773314237594604,
824
+ "eval_precision": 0.8654781199351702,
825
+ "eval_recall": 0.8228043143297381,
826
+ "eval_runtime": 3.556,
827
+ "eval_samples_per_second": 91.114,
828
+ "eval_steps_per_second": 11.53,
829
  "step": 1000
830
  },
831
  {
832
  "epoch": 12.469135802469136,
833
+ "grad_norm": 0.2871643602848053,
834
+ "learning_rate": 3.246456332876086e-05,
835
+ "loss": 0.0721,
836
  "step": 1010
837
  },
838
  {
839
  "epoch": 12.592592592592592,
840
+ "grad_norm": 1.031972050666809,
841
+ "learning_rate": 3.223593964334705e-05,
842
+ "loss": 0.0762,
843
  "step": 1020
844
  },
845
  {
846
  "epoch": 12.716049382716049,
847
+ "grad_norm": 0.8951663374900818,
848
+ "learning_rate": 3.2007315957933245e-05,
849
+ "loss": 0.078,
850
  "step": 1030
851
  },
852
  {
853
  "epoch": 12.839506172839506,
854
+ "grad_norm": 0.38367488980293274,
855
+ "learning_rate": 3.177869227251943e-05,
856
+ "loss": 0.0629,
857
  "step": 1040
858
  },
859
  {
860
  "epoch": 12.962962962962964,
861
+ "grad_norm": 1.0400196313858032,
862
+ "learning_rate": 3.155006858710563e-05,
863
+ "loss": 0.0755,
864
  "step": 1050
865
  },
866
  {
867
  "epoch": 13.08641975308642,
868
+ "grad_norm": 0.9411855340003967,
869
+ "learning_rate": 3.1321444901691816e-05,
870
+ "loss": 0.0624,
871
  "step": 1060
872
  },
873
  {
874
  "epoch": 13.209876543209877,
875
+ "grad_norm": 0.9051506519317627,
876
+ "learning_rate": 3.1092821216278004e-05,
877
+ "loss": 0.0573,
878
  "step": 1070
879
  },
880
  {
881
  "epoch": 13.333333333333334,
882
+ "grad_norm": 1.214126706123352,
883
+ "learning_rate": 3.08641975308642e-05,
884
+ "loss": 0.0658,
885
  "step": 1080
886
  },
887
  {
888
  "epoch": 13.45679012345679,
889
+ "grad_norm": 0.8709390163421631,
890
+ "learning_rate": 3.0635573845450386e-05,
891
+ "loss": 0.0626,
892
  "step": 1090
893
  },
894
  {
895
  "epoch": 13.580246913580247,
896
+ "grad_norm": 1.0249199867248535,
897
+ "learning_rate": 3.0406950160036577e-05,
898
+ "loss": 0.0737,
899
  "step": 1100
900
  },
901
  {
902
  "epoch": 13.580246913580247,
903
+ "eval_accuracy": 0.937962962962963,
904
+ "eval_f1": 0.8416075650118203,
905
+ "eval_loss": 0.16777929663658142,
906
+ "eval_precision": 0.8612903225806452,
907
+ "eval_recall": 0.8228043143297381,
908
+ "eval_runtime": 2.5879,
909
+ "eval_samples_per_second": 125.2,
910
+ "eval_steps_per_second": 15.843,
911
  "step": 1100
912
  },
913
  {
914
  "epoch": 13.703703703703704,
915
+ "grad_norm": 0.6229693293571472,
916
+ "learning_rate": 3.017832647462277e-05,
917
+ "loss": 0.0611,
918
  "step": 1110
919
  },
920
  {
921
  "epoch": 13.82716049382716,
922
+ "grad_norm": 0.29374879598617554,
923
+ "learning_rate": 2.994970278920896e-05,
924
+ "loss": 0.0616,
925
  "step": 1120
926
  },
927
  {
928
  "epoch": 13.950617283950617,
929
+ "grad_norm": 0.5640215277671814,
930
+ "learning_rate": 2.972107910379515e-05,
931
+ "loss": 0.0703,
932
  "step": 1130
933
  },
934
  {
935
  "epoch": 14.074074074074074,
936
+ "grad_norm": 0.44930121302604675,
937
+ "learning_rate": 2.949245541838135e-05,
938
+ "loss": 0.0523,
939
  "step": 1140
940
  },
941
  {
942
  "epoch": 14.197530864197532,
943
+ "grad_norm": 0.3680154085159302,
944
+ "learning_rate": 2.926383173296754e-05,
945
+ "loss": 0.0608,
946
  "step": 1150
947
  },
948
  {
949
  "epoch": 14.320987654320987,
950
+ "grad_norm": 0.5697309374809265,
951
+ "learning_rate": 2.903520804755373e-05,
952
+ "loss": 0.0581,
953
  "step": 1160
954
  },
955
  {
956
  "epoch": 14.444444444444445,
957
+ "grad_norm": 0.6320594549179077,
958
+ "learning_rate": 2.880658436213992e-05,
959
+ "loss": 0.0557,
960
  "step": 1170
961
  },
962
  {
963
  "epoch": 14.567901234567902,
964
+ "grad_norm": 0.8799903988838196,
965
+ "learning_rate": 2.857796067672611e-05,
966
+ "loss": 0.0556,
967
  "step": 1180
968
  },
969
  {
970
  "epoch": 14.691358024691358,
971
+ "grad_norm": 0.7066503167152405,
972
+ "learning_rate": 2.8349336991312303e-05,
973
+ "loss": 0.0574,
974
  "step": 1190
975
  },
976
  {
977
  "epoch": 14.814814814814815,
978
+ "grad_norm": 1.6003211736679077,
979
+ "learning_rate": 2.8120713305898494e-05,
980
+ "loss": 0.0625,
981
  "step": 1200
982
  },
983
  {
984
  "epoch": 14.814814814814815,
985
+ "eval_accuracy": 0.9425925925925925,
986
+ "eval_f1": 0.85423197492163,
987
+ "eval_loss": 0.16456876695156097,
988
+ "eval_precision": 0.8692185007974481,
989
+ "eval_recall": 0.8397534668721109,
990
+ "eval_runtime": 3.5694,
991
+ "eval_samples_per_second": 90.771,
992
+ "eval_steps_per_second": 11.486,
993
  "step": 1200
994
  },
995
  {
996
  "epoch": 14.938271604938272,
997
+ "grad_norm": 0.6639634966850281,
998
+ "learning_rate": 2.7892089620484685e-05,
999
+ "loss": 0.0612,
1000
  "step": 1210
1001
  },
1002
  {
1003
  "epoch": 15.061728395061728,
1004
+ "grad_norm": 1.4719339609146118,
1005
+ "learning_rate": 2.7663465935070876e-05,
1006
+ "loss": 0.0537,
1007
  "step": 1220
1008
  },
1009
  {
1010
  "epoch": 15.185185185185185,
1011
+ "grad_norm": 0.6944316625595093,
1012
+ "learning_rate": 2.7434842249657068e-05,
1013
+ "loss": 0.0669,
1014
  "step": 1230
1015
  },
1016
  {
1017
  "epoch": 15.308641975308642,
1018
+ "grad_norm": 0.43684977293014526,
1019
+ "learning_rate": 2.720621856424326e-05,
1020
+ "loss": 0.0475,
1021
  "step": 1240
1022
  },
1023
  {
1024
  "epoch": 15.432098765432098,
1025
+ "grad_norm": 0.3456471264362335,
1026
+ "learning_rate": 2.6977594878829447e-05,
1027
+ "loss": 0.0391,
1028
  "step": 1250
1029
  },
1030
  {
1031
  "epoch": 15.555555555555555,
1032
+ "grad_norm": 0.9245675206184387,
1033
+ "learning_rate": 2.6748971193415638e-05,
1034
+ "loss": 0.0564,
1035
  "step": 1260
1036
  },
1037
  {
1038
  "epoch": 15.679012345679013,
1039
+ "grad_norm": 1.1026465892791748,
1040
+ "learning_rate": 2.652034750800183e-05,
1041
+ "loss": 0.0628,
1042
  "step": 1270
1043
  },
1044
  {
1045
  "epoch": 15.802469135802468,
1046
+ "grad_norm": 0.5103587508201599,
1047
+ "learning_rate": 2.629172382258802e-05,
1048
+ "loss": 0.0554,
1049
  "step": 1280
1050
  },
1051
  {
1052
  "epoch": 15.925925925925926,
1053
+ "grad_norm": 0.6337056159973145,
1054
+ "learning_rate": 2.6063100137174212e-05,
1055
+ "loss": 0.0569,
1056
  "step": 1290
1057
  },
1058
  {
1059
  "epoch": 16.049382716049383,
1060
+ "grad_norm": 1.0704419612884521,
1061
+ "learning_rate": 2.5834476451760403e-05,
1062
+ "loss": 0.0591,
1063
  "step": 1300
1064
  },
1065
  {
1066
  "epoch": 16.049382716049383,
1067
+ "eval_accuracy": 0.9432098765432099,
1068
+ "eval_f1": 0.8548895899053628,
1069
+ "eval_loss": 0.16249051690101624,
1070
+ "eval_precision": 0.875605815831987,
1071
+ "eval_recall": 0.8351309707241911,
1072
+ "eval_runtime": 2.3147,
1073
+ "eval_samples_per_second": 139.976,
1074
+ "eval_steps_per_second": 17.713,
1075
  "step": 1300
1076
  },
1077
  {
1078
  "epoch": 16.17283950617284,
1079
+ "grad_norm": 1.0367881059646606,
1080
+ "learning_rate": 2.5605852766346595e-05,
1081
+ "loss": 0.0414,
1082
  "step": 1310
1083
  },
1084
  {
1085
  "epoch": 16.296296296296298,
1086
+ "grad_norm": 0.8111926317214966,
1087
+ "learning_rate": 2.5377229080932786e-05,
1088
+ "loss": 0.0649,
1089
  "step": 1320
1090
  },
1091
  {
1092
  "epoch": 16.419753086419753,
1093
+ "grad_norm": 0.42399314045906067,
1094
+ "learning_rate": 2.5148605395518977e-05,
1095
  "loss": 0.0534,
1096
  "step": 1330
1097
  },
1098
  {
1099
  "epoch": 16.54320987654321,
1100
+ "grad_norm": 0.4055217206478119,
1101
+ "learning_rate": 2.4919981710105165e-05,
1102
+ "loss": 0.0451,
1103
  "step": 1340
1104
  },
1105
  {
1106
  "epoch": 16.666666666666668,
1107
+ "grad_norm": 0.33903083205223083,
1108
+ "learning_rate": 2.4691358024691357e-05,
1109
+ "loss": 0.0599,
1110
  "step": 1350
1111
  },
1112
  {
1113
  "epoch": 16.790123456790123,
1114
+ "grad_norm": 0.4832461476325989,
1115
+ "learning_rate": 2.446273433927755e-05,
1116
+ "loss": 0.0388,
1117
  "step": 1360
1118
  },
1119
  {
1120
  "epoch": 16.91358024691358,
1121
+ "grad_norm": 1.0101662874221802,
1122
+ "learning_rate": 2.4234110653863743e-05,
1123
+ "loss": 0.0523,
1124
  "step": 1370
1125
  },
1126
  {
1127
  "epoch": 17.037037037037038,
1128
+ "grad_norm": 0.7684808969497681,
1129
+ "learning_rate": 2.4005486968449934e-05,
1130
+ "loss": 0.0569,
1131
  "step": 1380
1132
  },
1133
  {
1134
  "epoch": 17.160493827160494,
1135
+ "grad_norm": 0.5816587209701538,
1136
+ "learning_rate": 2.3776863283036125e-05,
1137
+ "loss": 0.0471,
1138
  "step": 1390
1139
  },
1140
  {
1141
  "epoch": 17.28395061728395,
1142
+ "grad_norm": 0.5327761769294739,
1143
+ "learning_rate": 2.3548239597622316e-05,
1144
+ "loss": 0.0464,
1145
  "step": 1400
1146
  },
1147
  {
1148
  "epoch": 17.28395061728395,
1149
+ "eval_accuracy": 0.9385802469135802,
1150
+ "eval_f1": 0.8421887390959556,
1151
+ "eval_loss": 0.1721687614917755,
1152
+ "eval_precision": 0.8676470588235294,
1153
+ "eval_recall": 0.8181818181818182,
1154
+ "eval_runtime": 2.3252,
1155
+ "eval_samples_per_second": 139.345,
1156
+ "eval_steps_per_second": 17.633,
1157
  "step": 1400
1158
  },
1159
  {
1160
  "epoch": 17.40740740740741,
1161
+ "grad_norm": 1.4246679544448853,
1162
+ "learning_rate": 2.3319615912208508e-05,
1163
+ "loss": 0.0593,
1164
  "step": 1410
1165
  },
1166
  {
1167
  "epoch": 17.530864197530864,
1168
+ "grad_norm": 0.6948336958885193,
1169
+ "learning_rate": 2.3090992226794696e-05,
1170
+ "loss": 0.0368,
1171
  "step": 1420
1172
  },
1173
  {
1174
  "epoch": 17.65432098765432,
1175
+ "grad_norm": 0.7839669585227966,
1176
+ "learning_rate": 2.2862368541380887e-05,
1177
+ "loss": 0.0662,
1178
  "step": 1430
1179
  },
1180
  {
1181
  "epoch": 17.77777777777778,
1182
+ "grad_norm": 0.6608372330665588,
1183
+ "learning_rate": 2.2633744855967078e-05,
1184
+ "loss": 0.0457,
1185
  "step": 1440
1186
  },
1187
  {
1188
  "epoch": 17.901234567901234,
1189
+ "grad_norm": 1.2672737836837769,
1190
+ "learning_rate": 2.240512117055327e-05,
1191
+ "loss": 0.0517,
1192
  "step": 1450
1193
  },
1194
  {
1195
  "epoch": 18.02469135802469,
1196
+ "grad_norm": 0.8855928778648376,
1197
+ "learning_rate": 2.217649748513946e-05,
1198
+ "loss": 0.0524,
1199
  "step": 1460
1200
  },
1201
  {
1202
  "epoch": 18.14814814814815,
1203
+ "grad_norm": 0.7090869545936584,
1204
+ "learning_rate": 2.1947873799725652e-05,
1205
+ "loss": 0.0449,
1206
  "step": 1470
1207
  },
1208
  {
1209
  "epoch": 18.271604938271604,
1210
+ "grad_norm": 0.5972793698310852,
1211
+ "learning_rate": 2.1719250114311843e-05,
1212
+ "loss": 0.0358,
1213
  "step": 1480
1214
  },
1215
  {
1216
  "epoch": 18.395061728395063,
1217
+ "grad_norm": 0.2597953677177429,
1218
+ "learning_rate": 2.1490626428898035e-05,
1219
+ "loss": 0.0599,
1220
  "step": 1490
1221
  },
1222
  {
1223
  "epoch": 18.51851851851852,
1224
+ "grad_norm": 0.5771777629852295,
1225
+ "learning_rate": 2.1262002743484226e-05,
1226
+ "loss": 0.048,
1227
  "step": 1500
1228
  },
1229
  {
1230
  "epoch": 18.51851851851852,
1231
+ "eval_accuracy": 0.9401234567901234,
1232
+ "eval_f1": 0.8472440944881889,
1233
+ "eval_loss": 0.16935202479362488,
1234
+ "eval_precision": 0.8663446054750402,
1235
+ "eval_recall": 0.8289676425269645,
1236
+ "eval_runtime": 2.5549,
1237
+ "eval_samples_per_second": 126.816,
1238
+ "eval_steps_per_second": 16.048,
1239
  "step": 1500
1240
  },
1241
  {
1242
  "epoch": 18.641975308641975,
1243
+ "grad_norm": 1.0612865686416626,
1244
+ "learning_rate": 2.1033379058070417e-05,
1245
+ "loss": 0.0458,
1246
  "step": 1510
1247
  },
1248
  {
1249
  "epoch": 18.765432098765434,
1250
+ "grad_norm": 1.0192160606384277,
1251
+ "learning_rate": 2.080475537265661e-05,
1252
+ "loss": 0.045,
1253
  "step": 1520
1254
  },
1255
  {
1256
  "epoch": 18.88888888888889,
1257
+ "grad_norm": 0.8331177830696106,
1258
+ "learning_rate": 2.05761316872428e-05,
1259
+ "loss": 0.0458,
1260
  "step": 1530
1261
  },
1262
  {
1263
  "epoch": 19.012345679012345,
1264
+ "grad_norm": 0.5867314338684082,
1265
+ "learning_rate": 2.034750800182899e-05,
1266
+ "loss": 0.0462,
1267
  "step": 1540
1268
  },
1269
  {
1270
  "epoch": 19.135802469135804,
1271
+ "grad_norm": 0.2448338121175766,
1272
+ "learning_rate": 2.0118884316415183e-05,
1273
  "loss": 0.0404,
1274
  "step": 1550
1275
  },
1276
  {
1277
  "epoch": 19.25925925925926,
1278
+ "grad_norm": 0.7374415397644043,
1279
+ "learning_rate": 1.9890260631001374e-05,
1280
+ "loss": 0.0442,
1281
  "step": 1560
1282
  },
1283
  {
1284
  "epoch": 19.382716049382715,
1285
+ "grad_norm": 0.6374390125274658,
1286
+ "learning_rate": 1.9661636945587565e-05,
1287
+ "loss": 0.0435,
1288
  "step": 1570
1289
  },
1290
  {
1291
  "epoch": 19.506172839506174,
1292
+ "grad_norm": 0.6881589889526367,
1293
+ "learning_rate": 1.9433013260173753e-05,
1294
+ "loss": 0.0498,
1295
  "step": 1580
1296
  },
1297
  {
1298
  "epoch": 19.62962962962963,
1299
+ "grad_norm": 0.26705750823020935,
1300
+ "learning_rate": 1.9204389574759944e-05,
1301
+ "loss": 0.0415,
1302
  "step": 1590
1303
  },
1304
  {
1305
  "epoch": 19.753086419753085,
1306
+ "grad_norm": 0.7690613269805908,
1307
+ "learning_rate": 1.8975765889346136e-05,
1308
+ "loss": 0.0353,
1309
  "step": 1600
1310
  },
1311
  {
1312
  "epoch": 19.753086419753085,
1313
+ "eval_accuracy": 0.9391975308641975,
1314
+ "eval_f1": 0.8462138953942233,
1315
+ "eval_loss": 0.17146864533424377,
1316
+ "eval_precision": 0.8575949367088608,
1317
+ "eval_recall": 0.8351309707241911,
1318
+ "eval_runtime": 2.9802,
1319
+ "eval_samples_per_second": 108.717,
1320
+ "eval_steps_per_second": 13.757,
1321
  "step": 1600
1322
  },
1323
  {
1324
  "epoch": 19.876543209876544,
1325
+ "grad_norm": 0.630096971988678,
1326
+ "learning_rate": 1.8747142203932327e-05,
1327
+ "loss": 0.0385,
1328
  "step": 1610
1329
  },
1330
  {
1331
  "epoch": 20.0,
1332
+ "grad_norm": 0.6776329874992371,
1333
+ "learning_rate": 1.8518518518518518e-05,
1334
+ "loss": 0.0465,
1335
  "step": 1620
1336
  },
1337
  {
1338
  "epoch": 20.123456790123456,
1339
+ "grad_norm": 0.8204653859138489,
1340
+ "learning_rate": 1.828989483310471e-05,
1341
+ "loss": 0.037,
1342
  "step": 1630
1343
  },
1344
  {
1345
  "epoch": 20.246913580246915,
1346
+ "grad_norm": 0.11897799372673035,
1347
+ "learning_rate": 1.8061271147690904e-05,
1348
+ "loss": 0.0308,
1349
  "step": 1640
1350
  },
1351
  {
1352
  "epoch": 20.37037037037037,
1353
+ "grad_norm": 0.6061619520187378,
1354
+ "learning_rate": 1.7832647462277096e-05,
1355
+ "loss": 0.0456,
1356
  "step": 1650
1357
  },
1358
  {
1359
  "epoch": 20.493827160493826,
1360
+ "grad_norm": 0.24529418349266052,
1361
+ "learning_rate": 1.7604023776863283e-05,
1362
+ "loss": 0.0374,
1363
  "step": 1660
1364
  },
1365
  {
1366
  "epoch": 20.617283950617285,
1367
+ "grad_norm": 0.9412081241607666,
1368
+ "learning_rate": 1.7375400091449475e-05,
1369
+ "loss": 0.0455,
1370
  "step": 1670
1371
  },
1372
  {
1373
  "epoch": 20.74074074074074,
1374
+ "grad_norm": 0.39813077449798584,
1375
+ "learning_rate": 1.7146776406035666e-05,
1376
+ "loss": 0.0395,
1377
  "step": 1680
1378
  },
1379
  {
1380
  "epoch": 20.864197530864196,
1381
+ "grad_norm": 0.36594846844673157,
1382
+ "learning_rate": 1.6918152720621857e-05,
1383
+ "loss": 0.0376,
1384
  "step": 1690
1385
  },
1386
  {
1387
  "epoch": 20.987654320987655,
1388
+ "grad_norm": 0.7182625532150269,
1389
+ "learning_rate": 1.668952903520805e-05,
1390
+ "loss": 0.0434,
1391
  "step": 1700
1392
  },
1393
  {
1394
  "epoch": 20.987654320987655,
1395
+ "eval_accuracy": 0.937037037037037,
1396
+ "eval_f1": 0.8386075949367089,
1397
+ "eval_loss": 0.18171226978302002,
1398
+ "eval_precision": 0.8617886178861789,
1399
+ "eval_recall": 0.8166409861325116,
1400
+ "eval_runtime": 2.6381,
1401
+ "eval_samples_per_second": 122.816,
1402
+ "eval_steps_per_second": 15.541,
1403
  "step": 1700
1404
  },
1405
  {
1406
  "epoch": 21.11111111111111,
1407
+ "grad_norm": 0.6655898094177246,
1408
+ "learning_rate": 1.646090534979424e-05,
1409
+ "loss": 0.0327,
1410
  "step": 1710
1411
  },
1412
  {
1413
  "epoch": 21.234567901234566,
1414
+ "grad_norm": 0.5218392610549927,
1415
+ "learning_rate": 1.623228166438043e-05,
1416
+ "loss": 0.0452,
1417
  "step": 1720
1418
  },
1419
  {
1420
  "epoch": 21.358024691358025,
1421
+ "grad_norm": 0.6255172491073608,
1422
+ "learning_rate": 1.6003657978966623e-05,
1423
+ "loss": 0.033,
1424
  "step": 1730
1425
  },
1426
  {
1427
  "epoch": 21.48148148148148,
1428
+ "grad_norm": 0.6865390539169312,
1429
+ "learning_rate": 1.5775034293552814e-05,
1430
+ "loss": 0.046,
1431
  "step": 1740
1432
  },
1433
  {
1434
  "epoch": 21.604938271604937,
1435
+ "grad_norm": 0.577601432800293,
1436
+ "learning_rate": 1.5546410608139002e-05,
1437
+ "loss": 0.0367,
1438
  "step": 1750
1439
  },
1440
  {
1441
  "epoch": 21.728395061728396,
1442
+ "grad_norm": 0.5450471639633179,
1443
+ "learning_rate": 1.5317786922725193e-05,
1444
+ "loss": 0.038,
1445
  "step": 1760
1446
  },
1447
  {
1448
  "epoch": 21.85185185185185,
1449
+ "grad_norm": 0.8480527997016907,
1450
+ "learning_rate": 1.5089163237311384e-05,
1451
+ "loss": 0.0399,
1452
  "step": 1770
1453
  },
1454
  {
1455
  "epoch": 21.97530864197531,
1456
+ "grad_norm": 0.3345847427845001,
1457
+ "learning_rate": 1.4860539551897576e-05,
1458
+ "loss": 0.0333,
1459
  "step": 1780
1460
  },
1461
  {
1462
  "epoch": 22.098765432098766,
1463
+ "grad_norm": 0.47660622000694275,
1464
+ "learning_rate": 1.463191586648377e-05,
1465
+ "loss": 0.0404,
1466
  "step": 1790
1467
  },
1468
  {
1469
  "epoch": 22.22222222222222,
1470
+ "grad_norm": 0.42594680190086365,
1471
+ "learning_rate": 1.440329218106996e-05,
1472
+ "loss": 0.0332,
1473
  "step": 1800
1474
  },
1475
  {
1476
  "epoch": 22.22222222222222,
1477
+ "eval_accuracy": 0.9382716049382716,
1478
+ "eval_f1": 0.8422712933753943,
1479
+ "eval_loss": 0.17968803644180298,
1480
+ "eval_precision": 0.8626817447495961,
1481
+ "eval_recall": 0.8228043143297381,
1482
+ "eval_runtime": 2.3118,
1483
+ "eval_samples_per_second": 140.152,
1484
+ "eval_steps_per_second": 17.735,
1485
  "step": 1800
1486
  },
1487
  {
1488
  "epoch": 22.34567901234568,
1489
+ "grad_norm": 0.23526711761951447,
1490
+ "learning_rate": 1.4174668495656151e-05,
1491
+ "loss": 0.0336,
1492
  "step": 1810
1493
  },
1494
  {
1495
  "epoch": 22.469135802469136,
1496
+ "grad_norm": 0.30145469307899475,
1497
+ "learning_rate": 1.3946044810242343e-05,
1498
+ "loss": 0.033,
1499
  "step": 1820
1500
  },
1501
  {
1502
  "epoch": 22.59259259259259,
1503
+ "grad_norm": 0.4943447709083557,
1504
+ "learning_rate": 1.3717421124828534e-05,
1505
+ "loss": 0.0336,
1506
  "step": 1830
1507
  },
1508
  {
1509
  "epoch": 22.71604938271605,
1510
+ "grad_norm": 0.5769455432891846,
1511
+ "learning_rate": 1.3488797439414723e-05,
1512
+ "loss": 0.0328,
1513
  "step": 1840
1514
  },
1515
  {
1516
  "epoch": 22.839506172839506,
1517
+ "grad_norm": 1.038840651512146,
1518
+ "learning_rate": 1.3260173754000915e-05,
1519
+ "loss": 0.0382,
1520
  "step": 1850
1521
  },
1522
  {
1523
  "epoch": 22.962962962962962,
1524
+ "grad_norm": 0.4028635323047638,
1525
+ "learning_rate": 1.3031550068587106e-05,
1526
+ "loss": 0.0315,
1527
  "step": 1860
1528
  },
1529
  {
1530
  "epoch": 23.08641975308642,
1531
+ "grad_norm": 0.10601099580526352,
1532
+ "learning_rate": 1.2802926383173297e-05,
1533
+ "loss": 0.0282,
1534
  "step": 1870
1535
  },
1536
  {
1537
  "epoch": 23.209876543209877,
1538
+ "grad_norm": 0.5546613931655884,
1539
+ "learning_rate": 1.2574302697759489e-05,
1540
+ "loss": 0.0506,
1541
  "step": 1880
1542
  },
1543
  {
1544
  "epoch": 23.333333333333332,
1545
+ "grad_norm": 0.3614629805088043,
1546
+ "learning_rate": 1.2345679012345678e-05,
1547
+ "loss": 0.0248,
1548
  "step": 1890
1549
  },
1550
  {
1551
  "epoch": 23.45679012345679,
1552
+ "grad_norm": 0.47462597489356995,
1553
+ "learning_rate": 1.2117055326931871e-05,
1554
+ "loss": 0.0283,
1555
  "step": 1900
1556
  },
1557
  {
1558
  "epoch": 23.45679012345679,
1559
+ "eval_accuracy": 0.9401234567901234,
1560
+ "eval_f1": 0.8482003129890454,
1561
+ "eval_loss": 0.18097253143787384,
1562
+ "eval_precision": 0.8616852146263911,
1563
+ "eval_recall": 0.8351309707241911,
1564
+ "eval_runtime": 3.0694,
1565
+ "eval_samples_per_second": 105.557,
1566
+ "eval_steps_per_second": 13.358,
1567
  "step": 1900
1568
  },
1569
  {
1570
  "epoch": 23.580246913580247,
1571
+ "grad_norm": 0.18128247559070587,
1572
+ "learning_rate": 1.1888431641518063e-05,
1573
+ "loss": 0.0338,
1574
  "step": 1910
1575
  },
1576
  {
1577
  "epoch": 23.703703703703702,
1578
+ "grad_norm": 0.23011884093284607,
1579
+ "learning_rate": 1.1659807956104254e-05,
1580
+ "loss": 0.0453,
1581
  "step": 1920
1582
  },
1583
  {
1584
  "epoch": 23.82716049382716,
1585
+ "grad_norm": 0.3289709985256195,
1586
+ "learning_rate": 1.1431184270690443e-05,
1587
+ "loss": 0.0279,
1588
  "step": 1930
1589
  },
1590
  {
1591
  "epoch": 23.950617283950617,
1592
+ "grad_norm": 0.13935135304927826,
1593
+ "learning_rate": 1.1202560585276635e-05,
1594
+ "loss": 0.0307,
1595
  "step": 1940
1596
  },
1597
  {
1598
  "epoch": 24.074074074074073,
1599
+ "grad_norm": 0.6951057314872742,
1600
+ "learning_rate": 1.0973936899862826e-05,
1601
+ "loss": 0.0382,
1602
  "step": 1950
1603
  },
1604
  {
1605
  "epoch": 24.19753086419753,
1606
+ "grad_norm": 1.6290189027786255,
1607
+ "learning_rate": 1.0745313214449017e-05,
1608
+ "loss": 0.0388,
1609
  "step": 1960
1610
  },
1611
  {
1612
  "epoch": 24.320987654320987,
1613
+ "grad_norm": 0.18432094156742096,
1614
+ "learning_rate": 1.0516689529035209e-05,
1615
+ "loss": 0.0303,
1616
  "step": 1970
1617
  },
1618
  {
1619
  "epoch": 24.444444444444443,
1620
+ "grad_norm": 0.47055086493492126,
1621
+ "learning_rate": 1.02880658436214e-05,
1622
+ "loss": 0.0308,
1623
  "step": 1980
1624
  },
1625
  {
1626
  "epoch": 24.567901234567902,
1627
+ "grad_norm": 0.41197624802589417,
1628
+ "learning_rate": 1.0059442158207591e-05,
1629
+ "loss": 0.0338,
1630
  "step": 1990
1631
  },
1632
  {
1633
  "epoch": 24.691358024691358,
1634
+ "grad_norm": 0.4777185916900635,
1635
+ "learning_rate": 9.830818472793783e-06,
1636
+ "loss": 0.0474,
1637
  "step": 2000
1638
  },
1639
  {
1640
  "epoch": 24.691358024691358,
1641
+ "eval_accuracy": 0.9398148148148148,
1642
+ "eval_f1": 0.845360824742268,
1643
+ "eval_loss": 0.17648009955883026,
1644
+ "eval_precision": 0.8709150326797386,
1645
+ "eval_recall": 0.8212634822804314,
1646
+ "eval_runtime": 2.9138,
1647
+ "eval_samples_per_second": 111.195,
1648
+ "eval_steps_per_second": 14.071,
1649
  "step": 2000
1650
  },
1651
  {
1652
  "epoch": 24.814814814814813,
1653
+ "grad_norm": 0.45768871903419495,
1654
+ "learning_rate": 9.602194787379972e-06,
1655
+ "loss": 0.0269,
1656
  "step": 2010
1657
  },
1658
  {
1659
  "epoch": 24.938271604938272,
1660
+ "grad_norm": 0.9567142128944397,
1661
+ "learning_rate": 9.373571101966163e-06,
1662
+ "loss": 0.0335,
1663
  "step": 2020
1664
  },
1665
  {
1666
  "epoch": 25.061728395061728,
1667
+ "grad_norm": 0.5306533575057983,
1668
+ "learning_rate": 9.144947416552355e-06,
1669
+ "loss": 0.0229,
1670
  "step": 2030
1671
  },
1672
  {
1673
  "epoch": 25.185185185185187,
1674
+ "grad_norm": 0.5475009083747864,
1675
+ "learning_rate": 8.916323731138548e-06,
1676
+ "loss": 0.0245,
1677
  "step": 2040
1678
  },
1679
  {
1680
  "epoch": 25.308641975308642,
1681
+ "grad_norm": 0.611080527305603,
1682
+ "learning_rate": 8.687700045724737e-06,
1683
+ "loss": 0.029,
1684
  "step": 2050
1685
  },
1686
  {
1687
  "epoch": 25.432098765432098,
1688
+ "grad_norm": 0.4131525158882141,
1689
+ "learning_rate": 8.459076360310929e-06,
1690
+ "loss": 0.034,
1691
  "step": 2060
1692
  },
1693
  {
1694
  "epoch": 25.555555555555557,
1695
+ "grad_norm": 1.0216596126556396,
1696
+ "learning_rate": 8.23045267489712e-06,
1697
+ "loss": 0.0345,
1698
  "step": 2070
1699
  },
1700
  {
1701
  "epoch": 25.679012345679013,
1702
+ "grad_norm": 0.6425824165344238,
1703
+ "learning_rate": 8.001828989483311e-06,
1704
+ "loss": 0.0416,
1705
  "step": 2080
1706
  },
1707
  {
1708
  "epoch": 25.80246913580247,
1709
+ "grad_norm": 0.9126815795898438,
1710
+ "learning_rate": 7.773205304069501e-06,
1711
+ "loss": 0.0402,
1712
  "step": 2090
1713
  },
1714
  {
1715
  "epoch": 25.925925925925927,
1716
+ "grad_norm": 0.35245445370674133,
1717
+ "learning_rate": 7.544581618655692e-06,
1718
+ "loss": 0.0365,
1719
  "step": 2100
1720
  },
1721
  {
1722
  "epoch": 25.925925925925927,
1723
+ "eval_accuracy": 0.941358024691358,
1724
+ "eval_f1": 0.8515625,
1725
+ "eval_loss": 0.18350541591644287,
1726
+ "eval_precision": 0.8637083993660856,
1727
+ "eval_recall": 0.8397534668721109,
1728
+ "eval_runtime": 3.2348,
1729
+ "eval_samples_per_second": 100.161,
1730
+ "eval_steps_per_second": 12.675,
1731
  "step": 2100
1732
  },
1733
  {
1734
  "epoch": 26.049382716049383,
1735
+ "grad_norm": 0.654412031173706,
1736
+ "learning_rate": 7.315957933241885e-06,
1737
+ "loss": 0.0312,
1738
  "step": 2110
1739
  },
1740
  {
1741
  "epoch": 26.17283950617284,
1742
+ "grad_norm": 1.0009406805038452,
1743
+ "learning_rate": 7.087334247828076e-06,
1744
+ "loss": 0.0336,
1745
  "step": 2120
1746
  },
1747
  {
1748
  "epoch": 26.296296296296298,
1749
+ "grad_norm": 0.28231924772262573,
1750
+ "learning_rate": 6.858710562414267e-06,
1751
+ "loss": 0.0408,
1752
  "step": 2130
1753
  },
1754
  {
1755
  "epoch": 26.419753086419753,
1756
+ "grad_norm": 0.6254132390022278,
1757
+ "learning_rate": 6.630086877000457e-06,
1758
+ "loss": 0.0298,
1759
  "step": 2140
1760
  },
1761
  {
1762
  "epoch": 26.54320987654321,
1763
+ "grad_norm": 1.2474095821380615,
1764
+ "learning_rate": 6.401463191586649e-06,
1765
+ "loss": 0.0382,
1766
  "step": 2150
1767
  },
1768
  {
1769
  "epoch": 26.666666666666668,
1770
+ "grad_norm": 0.5490561723709106,
1771
+ "learning_rate": 6.172839506172839e-06,
1772
+ "loss": 0.0283,
1773
  "step": 2160
1774
  },
1775
  {
1776
  "epoch": 26.790123456790123,
1777
+ "grad_norm": 0.9913358688354492,
1778
+ "learning_rate": 5.944215820759031e-06,
1779
+ "loss": 0.0339,
1780
  "step": 2170
1781
  },
1782
  {
1783
  "epoch": 26.91358024691358,
1784
+ "grad_norm": 0.9252423644065857,
1785
+ "learning_rate": 5.715592135345222e-06,
1786
+ "loss": 0.0286,
1787
  "step": 2180
1788
  },
1789
  {
1790
  "epoch": 27.037037037037038,
1791
+ "grad_norm": 0.5318057537078857,
1792
+ "learning_rate": 5.486968449931413e-06,
1793
+ "loss": 0.0316,
1794
  "step": 2190
1795
  },
1796
  {
1797
  "epoch": 27.160493827160494,
1798
+ "grad_norm": 0.3603754937648773,
1799
+ "learning_rate": 5.258344764517604e-06,
1800
+ "loss": 0.0244,
1801
  "step": 2200
1802
  },
1803
  {
1804
  "epoch": 27.160493827160494,
1805
+ "eval_accuracy": 0.9404320987654321,
1806
+ "eval_f1": 0.8479117415287628,
1807
+ "eval_loss": 0.18215233087539673,
1808
+ "eval_precision": 0.867741935483871,
1809
+ "eval_recall": 0.8289676425269645,
1810
+ "eval_runtime": 3.1643,
1811
+ "eval_samples_per_second": 102.391,
1812
+ "eval_steps_per_second": 12.957,
1813
  "step": 2200
1814
  },
1815
  {
1816
  "epoch": 27.28395061728395,
1817
+ "grad_norm": 0.6729068756103516,
1818
+ "learning_rate": 5.029721079103796e-06,
1819
+ "loss": 0.0396,
1820
  "step": 2210
1821
  },
1822
  {
1823
  "epoch": 27.40740740740741,
1824
+ "grad_norm": 0.8200917840003967,
1825
+ "learning_rate": 4.801097393689986e-06,
1826
+ "loss": 0.0297,
1827
  "step": 2220
1828
  },
1829
  {
1830
  "epoch": 27.530864197530864,
1831
+ "grad_norm": 0.9590497612953186,
1832
+ "learning_rate": 4.572473708276177e-06,
1833
+ "loss": 0.0371,
1834
  "step": 2230
1835
  },
1836
  {
1837
  "epoch": 27.65432098765432,
1838
+ "grad_norm": 0.23443204164505005,
1839
+ "learning_rate": 4.343850022862369e-06,
1840
+ "loss": 0.0305,
1841
  "step": 2240
1842
  },
1843
  {
1844
  "epoch": 27.77777777777778,
1845
+ "grad_norm": 0.18787816166877747,
1846
+ "learning_rate": 4.11522633744856e-06,
1847
+ "loss": 0.0298,
1848
  "step": 2250
1849
  },
1850
  {
1851
  "epoch": 27.901234567901234,
1852
+ "grad_norm": 0.32972452044487,
1853
+ "learning_rate": 3.8866026520347504e-06,
1854
+ "loss": 0.023,
1855
  "step": 2260
1856
  },
1857
  {
1858
  "epoch": 28.02469135802469,
1859
+ "grad_norm": 0.16079440712928772,
1860
+ "learning_rate": 3.6579789666209426e-06,
1861
+ "loss": 0.0225,
1862
  "step": 2270
1863
  },
1864
  {
1865
  "epoch": 28.14814814814815,
1866
+ "grad_norm": 1.2685606479644775,
1867
+ "learning_rate": 3.4293552812071335e-06,
1868
+ "loss": 0.0329,
1869
  "step": 2280
1870
  },
1871
  {
1872
  "epoch": 28.271604938271604,
1873
+ "grad_norm": 0.6398904323577881,
1874
+ "learning_rate": 3.2007315957933243e-06,
1875
+ "loss": 0.0307,
1876
  "step": 2290
1877
  },
1878
  {
1879
  "epoch": 28.395061728395063,
1880
+ "grad_norm": 0.6520938873291016,
1881
+ "learning_rate": 2.9721079103795156e-06,
1882
+ "loss": 0.0242,
1883
  "step": 2300
1884
  },
1885
  {
1886
  "epoch": 28.395061728395063,
1887
+ "eval_accuracy": 0.9407407407407408,
1888
+ "eval_f1": 0.8483412322274881,
1889
+ "eval_loss": 0.18079863488674164,
1890
+ "eval_precision": 0.8703403565640194,
1891
+ "eval_recall": 0.827426810477658,
1892
+ "eval_runtime": 2.3949,
1893
+ "eval_samples_per_second": 135.29,
1894
+ "eval_steps_per_second": 17.12,
1895
  "step": 2300
1896
  },
1897
  {
1898
  "epoch": 28.51851851851852,
1899
+ "grad_norm": 0.20304135978221893,
1900
+ "learning_rate": 2.7434842249657065e-06,
1901
+ "loss": 0.0225,
1902
  "step": 2310
1903
  },
1904
  {
1905
  "epoch": 28.641975308641975,
1906
+ "grad_norm": 0.9393714070320129,
1907
+ "learning_rate": 2.514860539551898e-06,
1908
+ "loss": 0.0294,
1909
  "step": 2320
1910
  },
1911
  {
1912
  "epoch": 28.765432098765434,
1913
+ "grad_norm": 0.3730199933052063,
1914
+ "learning_rate": 2.2862368541380887e-06,
1915
+ "loss": 0.0339,
1916
  "step": 2330
1917
  },
1918
  {
1919
  "epoch": 28.88888888888889,
1920
+ "grad_norm": 0.6354162096977234,
1921
+ "learning_rate": 2.05761316872428e-06,
1922
+ "loss": 0.028,
1923
  "step": 2340
1924
  },
1925
  {
1926
  "epoch": 29.012345679012345,
1927
+ "grad_norm": 0.3703112006187439,
1928
+ "learning_rate": 1.8289894833104713e-06,
1929
+ "loss": 0.0314,
1930
  "step": 2350
1931
  },
1932
  {
1933
  "epoch": 29.135802469135804,
1934
+ "grad_norm": 0.5848723649978638,
1935
+ "learning_rate": 1.6003657978966622e-06,
1936
+ "loss": 0.0361,
1937
  "step": 2360
1938
  },
1939
  {
1940
  "epoch": 29.25925925925926,
1941
+ "grad_norm": 0.5978838205337524,
1942
+ "learning_rate": 1.3717421124828533e-06,
1943
+ "loss": 0.0406,
1944
  "step": 2370
1945
  },
1946
  {
1947
  "epoch": 29.382716049382715,
1948
+ "grad_norm": 0.280225545167923,
1949
+ "learning_rate": 1.1431184270690443e-06,
1950
+ "loss": 0.0319,
1951
  "step": 2380
1952
  },
1953
  {
1954
  "epoch": 29.506172839506174,
1955
+ "grad_norm": 0.39151689410209656,
1956
+ "learning_rate": 9.144947416552356e-07,
1957
+ "loss": 0.0263,
1958
  "step": 2390
1959
  },
1960
  {
1961
  "epoch": 29.62962962962963,
1962
+ "grad_norm": 0.5079048871994019,
1963
+ "learning_rate": 6.858710562414266e-07,
1964
+ "loss": 0.0296,
1965
  "step": 2400
1966
  },
1967
  {
1968
  "epoch": 29.62962962962963,
1969
+ "eval_accuracy": 0.9401234567901234,
1970
+ "eval_f1": 0.847723704866562,
1971
+ "eval_loss": 0.18171092867851257,
1972
+ "eval_precision": 0.864,
1973
+ "eval_recall": 0.8320493066255779,
1974
+ "eval_runtime": 2.4278,
1975
+ "eval_samples_per_second": 133.453,
1976
+ "eval_steps_per_second": 16.888,
1977
  "step": 2400
1978
  },
1979
  {
1980
  "epoch": 29.753086419753085,
1981
+ "grad_norm": 0.5158771276473999,
1982
+ "learning_rate": 4.572473708276178e-07,
1983
+ "loss": 0.026,
1984
  "step": 2410
1985
  },
1986
  {
1987
  "epoch": 29.876543209876544,
1988
+ "grad_norm": 0.7846933603286743,
1989
+ "learning_rate": 2.286236854138089e-07,
1990
+ "loss": 0.0275,
1991
  "step": 2420
1992
  },
1993
  {
1994
  "epoch": 30.0,
1995
+ "grad_norm": 0.403104305267334,
1996
  "learning_rate": 0.0,
1997
+ "loss": 0.0277,
1998
  "step": 2430
1999
  },
2000
  {
2001
  "epoch": 30.0,
2002
  "step": 2430,
2003
  "total_flos": 3.008454731998249e+18,
2004
+ "train_loss": 0.09584075045070531,
2005
+ "train_runtime": 684.1699,
2006
+ "train_samples_per_second": 56.74,
2007
+ "train_steps_per_second": 3.552
2008
  }
2009
  ],
2010
  "logging_steps": 10,