Ben10x committed on
Commit
b5efbb6
·
verified ·
1 Parent(s): f56accb

End of training

Browse files
README.md CHANGED
@@ -4,6 +4,8 @@ license: apache-2.0
4
  base_model: bert-base-uncased
5
  tags:
6
  - generated_from_trainer
 
 
7
  metrics:
8
  - precision
9
  - recall
@@ -11,7 +13,26 @@ metrics:
11
  - accuracy
12
  model-index:
13
  - name: bert-base-medmentions
14
- results: []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  ---
16
 
17
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -19,13 +40,13 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # bert-base-medmentions
21
 
22
- This model is a fine-tuned version of [bert-base-uncased](https://huggingface.co/bert-base-uncased) on an unknown dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 1.6247
25
- - Precision: 0.6473
26
- - Recall: 0.6735
27
- - F1: 0.6601
28
- - Accuracy: 0.8847
29
 
30
  ## Model description
31
 
 
4
  base_model: bert-base-uncased
5
  tags:
6
  - generated_from_trainer
7
+ datasets:
8
+ - Ben10x/MedMentions-NER
9
  metrics:
10
  - precision
11
  - recall
 
13
  - accuracy
14
  model-index:
15
  - name: bert-base-medmentions
16
+ results:
17
+ - task:
18
+ name: Token Classification
19
+ type: token-classification
20
+ dataset:
21
+ name: Ben10x/MedMentions-NER
22
+ type: Ben10x/MedMentions-NER
23
+ metrics:
24
+ - name: Precision
25
+ type: precision
26
+ value: 0.5820728291316527
27
+ - name: Recall
28
+ type: recall
29
+ value: 0.6344207955338451
30
+ - name: F1
31
+ type: f1
32
+ value: 0.6071204975165909
33
+ - name: Accuracy
34
+ type: accuracy
35
+ value: 0.8688595400463357
36
  ---
37
 
38
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
40
 
41
  # bert-base-medmentions
42
 
43
+ This model is a fine-tuned version of [bert-base-uncased](https://huggingface.co/bert-base-uncased) on the Ben10x/MedMentions-NER dataset.
44
  It achieves the following results on the evaluation set:
45
+ - Loss: 1.5156
46
+ - Precision: 0.5821
47
+ - Recall: 0.6344
48
+ - F1: 0.6071
49
+ - Accuracy: 0.8689
50
 
51
  ## Model description
52
 
all_results.json CHANGED
@@ -1,26 +1,26 @@
1
  {
2
- "epoch": 40.0,
3
- "eval_accuracy": 0.865718137671959,
4
- "eval_f1": 0.6036826135749616,
5
- "eval_loss": 1.9494872093200684,
6
- "eval_precision": 0.5765780071456927,
7
- "eval_recall": 0.6334612700628053,
8
- "eval_runtime": 4.5356,
9
  "eval_samples": 2910,
10
- "eval_samples_per_second": 641.59,
11
- "eval_steps_per_second": 80.254,
12
- "predict_accuracy": 0.8719908892175985,
13
- "predict_f1": 0.6101030325783173,
14
- "predict_loss": 1.9410734176635742,
15
- "predict_precision": 0.5870594846271173,
16
- "predict_recall": 0.6350295241403265,
17
- "predict_runtime": 4.1755,
18
- "predict_samples_per_second": 697.404,
19
- "predict_steps_per_second": 87.175,
20
- "total_flos": 3.172672952125471e+16,
21
- "train_loss": 1.7340915796560996,
22
- "train_runtime": 6267.5858,
23
  "train_samples": 23285,
24
- "train_samples_per_second": 148.606,
25
- "train_steps_per_second": 18.578
26
  }
 
1
  {
2
+ "epoch": 15.0,
3
+ "eval_accuracy": 0.8688595400463357,
4
+ "eval_f1": 0.6071204975165909,
5
+ "eval_loss": 1.5156257152557373,
6
+ "eval_precision": 0.5820728291316527,
7
+ "eval_recall": 0.6344207955338451,
8
+ "eval_runtime": 4.3275,
9
  "eval_samples": 2910,
10
+ "eval_samples_per_second": 672.436,
11
+ "eval_steps_per_second": 84.112,
12
+ "predict_accuracy": 0.8737973374523844,
13
+ "predict_f1": 0.6134236041457707,
14
+ "predict_loss": 1.508852243423462,
15
+ "predict_precision": 0.5912826297131808,
16
+ "predict_recall": 0.6372872525182355,
17
+ "predict_runtime": 4.1494,
18
+ "predict_samples_per_second": 701.789,
19
+ "predict_steps_per_second": 87.724,
20
+ "total_flos": 1.1901430945516224e+16,
21
+ "train_loss": 1.311661433704009,
22
+ "train_runtime": 2299.911,
23
  "train_samples": 23285,
24
+ "train_samples_per_second": 151.865,
25
+ "train_steps_per_second": 18.986
26
  }
eval_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
- "epoch": 40.0,
3
- "eval_accuracy": 0.865718137671959,
4
- "eval_f1": 0.6036826135749616,
5
- "eval_loss": 1.9494872093200684,
6
- "eval_precision": 0.5765780071456927,
7
- "eval_recall": 0.6334612700628053,
8
- "eval_runtime": 4.5356,
9
  "eval_samples": 2910,
10
- "eval_samples_per_second": 641.59,
11
- "eval_steps_per_second": 80.254
12
  }
 
1
  {
2
+ "epoch": 15.0,
3
+ "eval_accuracy": 0.8688595400463357,
4
+ "eval_f1": 0.6071204975165909,
5
+ "eval_loss": 1.5156257152557373,
6
+ "eval_precision": 0.5820728291316527,
7
+ "eval_recall": 0.6344207955338451,
8
+ "eval_runtime": 4.3275,
9
  "eval_samples": 2910,
10
+ "eval_samples_per_second": 672.436,
11
+ "eval_steps_per_second": 84.112
12
  }
predict_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "predict_accuracy": 0.8719908892175985,
3
- "predict_f1": 0.6101030325783173,
4
- "predict_loss": 1.9410734176635742,
5
- "predict_precision": 0.5870594846271173,
6
- "predict_recall": 0.6350295241403265,
7
- "predict_runtime": 4.1755,
8
- "predict_samples_per_second": 697.404,
9
- "predict_steps_per_second": 87.175
10
  }
 
1
  {
2
+ "predict_accuracy": 0.8737973374523844,
3
+ "predict_f1": 0.6134236041457707,
4
+ "predict_loss": 1.508852243423462,
5
+ "predict_precision": 0.5912826297131808,
6
+ "predict_recall": 0.6372872525182355,
7
+ "predict_runtime": 4.1494,
8
+ "predict_samples_per_second": 701.789,
9
+ "predict_steps_per_second": 87.724
10
  }
predictions.txt CHANGED
The diff for this file is too large to render. See raw diff
 
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 40.0,
3
- "total_flos": 3.172672952125471e+16,
4
- "train_loss": 1.7340915796560996,
5
- "train_runtime": 6267.5858,
6
  "train_samples": 23285,
7
- "train_samples_per_second": 148.606,
8
- "train_steps_per_second": 18.578
9
  }
 
1
  {
2
+ "epoch": 15.0,
3
+ "total_flos": 1.1901430945516224e+16,
4
+ "train_loss": 1.311661433704009,
5
+ "train_runtime": 2299.911,
6
  "train_samples": 23285,
7
+ "train_samples_per_second": 151.865,
8
+ "train_steps_per_second": 18.986
9
  }
trainer_state.json CHANGED
@@ -1,2132 +1,817 @@
1
  {
2
  "best_global_step": 5822,
3
- "best_metric": 1.9494872093200684,
4
  "best_model_checkpoint": "./output/bert-base-medmentions/checkpoint-5822",
5
- "epoch": 40.0,
6
  "eval_steps": 500,
7
- "global_step": 116440,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.1717622810030917,
14
- "grad_norm": 1.5916364192962646,
15
- "learning_rate": 4.978529714874613e-05,
16
- "loss": 2.1139,
17
  "step": 500
18
  },
19
  {
20
  "epoch": 0.3435245620061834,
21
- "grad_norm": 1.203733205795288,
22
- "learning_rate": 4.9570594297492275e-05,
23
- "loss": 2.0281,
24
  "step": 1000
25
  },
26
  {
27
  "epoch": 0.5152868430092752,
28
- "grad_norm": 1.7044838666915894,
29
- "learning_rate": 4.9355891446238405e-05,
30
- "loss": 2.008,
31
  "step": 1500
32
  },
33
  {
34
  "epoch": 0.6870491240123668,
35
- "grad_norm": 1.8281738758087158,
36
- "learning_rate": 4.914118859498454e-05,
37
- "loss": 1.9914,
38
  "step": 2000
39
  },
40
  {
41
  "epoch": 0.8588114050154586,
42
- "grad_norm": 1.2349461317062378,
43
- "learning_rate": 4.892648574373068e-05,
44
- "loss": 1.9885,
45
  "step": 2500
46
  },
47
  {
48
  "epoch": 1.0,
49
- "eval_accuracy": 0.8572232620845822,
50
- "eval_f1": 0.5660144689246958,
51
- "eval_loss": 1.9655100107192993,
52
- "eval_precision": 0.535214552238806,
53
- "eval_recall": 0.6005757152826239,
54
- "eval_runtime": 4.686,
55
- "eval_samples_per_second": 620.994,
56
- "eval_steps_per_second": 77.678,
57
  "step": 2911
58
  },
59
  {
60
  "epoch": 1.0305736860185504,
61
- "grad_norm": 1.0518437623977661,
62
- "learning_rate": 4.8711782892476815e-05,
63
- "loss": 1.9721,
64
  "step": 3000
65
  },
66
  {
67
  "epoch": 1.202335967021642,
68
- "grad_norm": 0.796907901763916,
69
- "learning_rate": 4.8497080041222944e-05,
70
- "loss": 1.918,
71
  "step": 3500
72
  },
73
  {
74
  "epoch": 1.3740982480247337,
75
- "grad_norm": 1.3571090698242188,
76
- "learning_rate": 4.828237718996909e-05,
77
- "loss": 1.9244,
78
  "step": 4000
79
  },
80
  {
81
  "epoch": 1.5458605290278253,
82
- "grad_norm": 2.4624578952789307,
83
- "learning_rate": 4.806767433871522e-05,
84
- "loss": 1.9224,
85
  "step": 4500
86
  },
87
  {
88
  "epoch": 1.7176228100309172,
89
- "grad_norm": 1.8667396306991577,
90
- "learning_rate": 4.7852971487461354e-05,
91
- "loss": 1.9167,
92
  "step": 5000
93
  },
94
  {
95
  "epoch": 1.889385091034009,
96
- "grad_norm": 2.3962111473083496,
97
- "learning_rate": 4.763826863620749e-05,
98
- "loss": 1.9172,
99
  "step": 5500
100
  },
101
  {
102
  "epoch": 2.0,
103
- "eval_accuracy": 0.865718137671959,
104
- "eval_f1": 0.6036826135749616,
105
- "eval_loss": 1.9494872093200684,
106
- "eval_precision": 0.5765780071456927,
107
- "eval_recall": 0.6334612700628053,
108
- "eval_runtime": 4.6866,
109
- "eval_samples_per_second": 620.919,
110
- "eval_steps_per_second": 77.668,
111
  "step": 5822
112
  },
113
  {
114
  "epoch": 2.0611473720371007,
115
- "grad_norm": 4.303598880767822,
116
- "learning_rate": 4.742356578495363e-05,
117
- "loss": 1.9,
118
  "step": 6000
119
  },
120
  {
121
  "epoch": 2.2329096530401924,
122
- "grad_norm": 1.4852184057235718,
123
- "learning_rate": 4.720886293369976e-05,
124
- "loss": 1.8561,
125
  "step": 6500
126
  },
127
  {
128
  "epoch": 2.404671934043284,
129
- "grad_norm": 2.375478506088257,
130
- "learning_rate": 4.69941600824459e-05,
131
- "loss": 1.8591,
132
  "step": 7000
133
  },
134
  {
135
  "epoch": 2.5764342150463757,
136
- "grad_norm": 2.7883598804473877,
137
- "learning_rate": 4.677945723119203e-05,
138
- "loss": 1.8607,
139
  "step": 7500
140
  },
141
  {
142
  "epoch": 2.7481964960494674,
143
- "grad_norm": 1.723681092262268,
144
- "learning_rate": 4.6564754379938166e-05,
145
- "loss": 1.8603,
146
  "step": 8000
147
  },
148
  {
149
  "epoch": 2.9199587770525595,
150
- "grad_norm": 2.383392572402954,
151
- "learning_rate": 4.63500515286843e-05,
152
- "loss": 1.8613,
153
  "step": 8500
154
  },
155
  {
156
  "epoch": 3.0,
157
- "eval_accuracy": 0.8701422793492062,
158
- "eval_f1": 0.616918280275681,
159
- "eval_loss": 1.9503422975540161,
160
- "eval_precision": 0.5823265179677819,
161
- "eval_recall": 0.6558792742498255,
162
- "eval_runtime": 4.6746,
163
- "eval_samples_per_second": 622.517,
164
- "eval_steps_per_second": 77.868,
165
  "step": 8733
166
  },
167
  {
168
  "epoch": 3.091721058055651,
169
- "grad_norm": 1.104688286781311,
170
- "learning_rate": 4.613534867743044e-05,
171
- "loss": 1.8344,
172
  "step": 9000
173
  },
174
  {
175
  "epoch": 3.2634833390587428,
176
- "grad_norm": 1.6872458457946777,
177
- "learning_rate": 4.5920645826176575e-05,
178
- "loss": 1.814,
179
  "step": 9500
180
  },
181
  {
182
  "epoch": 3.4352456200618344,
183
- "grad_norm": 2.0609402656555176,
184
- "learning_rate": 4.570594297492271e-05,
185
- "loss": 1.8143,
186
  "step": 10000
187
  },
188
  {
189
  "epoch": 3.607007901064926,
190
- "grad_norm": 2.683795690536499,
191
- "learning_rate": 4.549124012366884e-05,
192
- "loss": 1.8154,
193
  "step": 10500
194
  },
195
  {
196
  "epoch": 3.7787701820680177,
197
- "grad_norm": 2.599900484085083,
198
- "learning_rate": 4.5276537272414985e-05,
199
- "loss": 1.8159,
200
  "step": 11000
201
  },
202
  {
203
  "epoch": 3.9505324630711094,
204
- "grad_norm": 1.868220567703247,
205
- "learning_rate": 4.5061834421161115e-05,
206
- "loss": 1.8187,
207
  "step": 11500
208
  },
209
  {
210
  "epoch": 4.0,
211
- "eval_accuracy": 0.875679001034045,
212
- "eval_f1": 0.6262617585155276,
213
- "eval_loss": 1.9548965692520142,
214
- "eval_precision": 0.6169078446306169,
215
- "eval_recall": 0.6359036985345429,
216
- "eval_runtime": 4.7153,
217
- "eval_samples_per_second": 617.136,
218
- "eval_steps_per_second": 77.195,
219
  "step": 11644
220
  },
221
  {
222
  "epoch": 4.1222947440742015,
223
- "grad_norm": 1.7461206912994385,
224
- "learning_rate": 4.484713156990725e-05,
225
- "loss": 1.7947,
226
  "step": 12000
227
  },
228
  {
229
  "epoch": 4.294057025077293,
230
- "grad_norm": 1.3571993112564087,
231
- "learning_rate": 4.463242871865339e-05,
232
- "loss": 1.7845,
233
  "step": 12500
234
  },
235
  {
236
  "epoch": 4.465819306080385,
237
- "grad_norm": 2.8684659004211426,
238
- "learning_rate": 4.4417725867399524e-05,
239
- "loss": 1.7824,
240
  "step": 13000
241
  },
242
  {
243
  "epoch": 4.637581587083476,
244
- "grad_norm": 1.0582610368728638,
245
- "learning_rate": 4.4203023016145654e-05,
246
- "loss": 1.7877,
247
  "step": 13500
248
  },
249
  {
250
  "epoch": 4.809343868086568,
251
- "grad_norm": 1.4852385520935059,
252
- "learning_rate": 4.39883201648918e-05,
253
- "loss": 1.789,
254
  "step": 14000
255
  },
256
  {
257
  "epoch": 4.98110614908966,
258
- "grad_norm": 1.4194121360778809,
259
- "learning_rate": 4.377361731363793e-05,
260
- "loss": 1.7887,
261
  "step": 14500
262
  },
263
  {
264
  "epoch": 5.0,
265
- "eval_accuracy": 0.8771318996321942,
266
- "eval_f1": 0.6332744734915371,
267
- "eval_loss": 1.9618757963180542,
268
- "eval_precision": 0.625435984687367,
269
- "eval_recall": 0.6413119330076762,
270
- "eval_runtime": 4.2673,
271
- "eval_samples_per_second": 681.923,
272
- "eval_steps_per_second": 85.299,
273
  "step": 14555
274
  },
275
  {
276
  "epoch": 5.152868430092751,
277
- "grad_norm": 0.8063040375709534,
278
- "learning_rate": 4.355891446238406e-05,
279
- "loss": 1.7641,
280
  "step": 15000
281
  },
282
  {
283
  "epoch": 5.3246307110958435,
284
- "grad_norm": 1.3164275884628296,
285
- "learning_rate": 4.33442116111302e-05,
286
- "loss": 1.765,
287
  "step": 15500
288
  },
289
  {
290
  "epoch": 5.496392992098935,
291
- "grad_norm": 1.073299527168274,
292
- "learning_rate": 4.3129508759876336e-05,
293
- "loss": 1.7674,
294
  "step": 16000
295
  },
296
  {
297
  "epoch": 5.668155273102027,
298
- "grad_norm": 3.724982976913452,
299
- "learning_rate": 4.2914805908622466e-05,
300
- "loss": 1.7675,
301
  "step": 16500
302
  },
303
  {
304
  "epoch": 5.839917554105119,
305
- "grad_norm": 3.8052899837493896,
306
- "learning_rate": 4.27001030573686e-05,
307
- "loss": 1.7659,
308
  "step": 17000
309
  },
310
  {
311
  "epoch": 6.0,
312
- "eval_accuracy": 0.8766214217463579,
313
- "eval_f1": 0.6387774916627971,
314
- "eval_loss": 1.9771723747253418,
315
- "eval_precision": 0.6188957055214724,
316
- "eval_recall": 0.6599790648988136,
317
- "eval_runtime": 4.894,
318
- "eval_samples_per_second": 594.61,
319
- "eval_steps_per_second": 74.377,
320
  "step": 17466
321
  },
322
  {
323
  "epoch": 6.01167983510821,
324
- "grad_norm": 1.649048924446106,
325
- "learning_rate": 4.248540020611474e-05,
326
- "loss": 1.7678,
327
  "step": 17500
328
  },
329
  {
330
  "epoch": 6.183442116111302,
331
- "grad_norm": 2.6169393062591553,
332
- "learning_rate": 4.227069735486087e-05,
333
- "loss": 1.7486,
334
  "step": 18000
335
  },
336
  {
337
  "epoch": 6.3552043971143934,
338
- "grad_norm": 2.297182321548462,
339
- "learning_rate": 4.205599450360701e-05,
340
- "loss": 1.7505,
341
  "step": 18500
342
  },
343
  {
344
  "epoch": 6.5269666781174855,
345
- "grad_norm": 4.391541481018066,
346
- "learning_rate": 4.184129165235314e-05,
347
- "loss": 1.7534,
348
  "step": 19000
349
  },
350
  {
351
  "epoch": 6.698728959120577,
352
- "grad_norm": 4.291359901428223,
353
- "learning_rate": 4.162658880109928e-05,
354
- "loss": 1.7531,
355
  "step": 19500
356
  },
357
  {
358
  "epoch": 6.870491240123669,
359
- "grad_norm": 3.1789159774780273,
360
- "learning_rate": 4.1411885949845415e-05,
361
- "loss": 1.7536,
362
  "step": 20000
363
  },
364
  {
365
  "epoch": 7.0,
366
- "eval_accuracy": 0.8789643843505806,
367
- "eval_f1": 0.6413145539906103,
368
- "eval_loss": 1.986953854560852,
369
- "eval_precision": 0.6278622764499415,
370
- "eval_recall": 0.6553558967201675,
371
- "eval_runtime": 4.3455,
372
- "eval_samples_per_second": 669.66,
373
- "eval_steps_per_second": 83.765,
374
  "step": 20377
375
  },
376
  {
377
  "epoch": 7.042253521126761,
378
- "grad_norm": 2.1500258445739746,
379
- "learning_rate": 4.119718309859155e-05,
380
- "loss": 1.7523,
381
  "step": 20500
382
  },
383
  {
384
  "epoch": 7.214015802129852,
385
- "grad_norm": 2.0542051792144775,
386
- "learning_rate": 4.098248024733769e-05,
387
- "loss": 1.7398,
388
  "step": 21000
389
  },
390
  {
391
  "epoch": 7.385778083132944,
392
- "grad_norm": 1.6763787269592285,
393
- "learning_rate": 4.0767777396083824e-05,
394
- "loss": 1.7412,
395
  "step": 21500
396
  },
397
  {
398
  "epoch": 7.5575403641360355,
399
- "grad_norm": 2.3270750045776367,
400
- "learning_rate": 4.0553074544829954e-05,
401
- "loss": 1.7436,
402
  "step": 22000
403
  },
404
  {
405
  "epoch": 7.729302645139128,
406
- "grad_norm": 4.410123825073242,
407
- "learning_rate": 4.033837169357609e-05,
408
- "loss": 1.7422,
409
  "step": 22500
410
  },
411
  {
412
  "epoch": 7.901064926142219,
413
- "grad_norm": 3.391580581665039,
414
- "learning_rate": 4.012366884232223e-05,
415
- "loss": 1.7473,
416
  "step": 23000
417
  },
418
  {
419
  "epoch": 8.0,
420
- "eval_accuracy": 0.8807314231861674,
421
- "eval_f1": 0.6478665800303227,
422
- "eval_loss": 1.9853944778442383,
423
- "eval_precision": 0.64349023319852,
424
- "eval_recall": 0.6523028611304955,
425
- "eval_runtime": 4.2788,
426
- "eval_samples_per_second": 680.101,
427
- "eval_steps_per_second": 85.071,
428
  "step": 23288
429
  },
430
  {
431
  "epoch": 8.07282720714531,
432
- "grad_norm": 2.603978395462036,
433
- "learning_rate": 3.9908965991068363e-05,
434
- "loss": 1.7421,
435
  "step": 23500
436
  },
437
  {
438
  "epoch": 8.244589488148403,
439
- "grad_norm": 1.5481454133987427,
440
- "learning_rate": 3.96942631398145e-05,
441
- "loss": 1.7339,
442
  "step": 24000
443
  },
444
  {
445
  "epoch": 8.416351769151495,
446
- "grad_norm": 1.4789228439331055,
447
- "learning_rate": 3.9479560288560636e-05,
448
- "loss": 1.737,
449
  "step": 24500
450
  },
451
  {
452
  "epoch": 8.588114050154585,
453
- "grad_norm": 1.512890100479126,
454
- "learning_rate": 3.9264857437306766e-05,
455
- "loss": 1.7348,
456
  "step": 25000
457
  },
458
  {
459
  "epoch": 8.759876331157677,
460
- "grad_norm": 4.724124908447266,
461
- "learning_rate": 3.905015458605291e-05,
462
- "loss": 1.7352,
463
  "step": 25500
464
  },
465
  {
466
  "epoch": 8.93163861216077,
467
- "grad_norm": 1.5042279958724976,
468
- "learning_rate": 3.883545173479904e-05,
469
- "loss": 1.7393,
470
  "step": 26000
471
  },
472
  {
473
  "epoch": 9.0,
474
- "eval_accuracy": 0.8795795756488959,
475
- "eval_f1": 0.6482206780374815,
476
- "eval_loss": 1.9974679946899414,
477
- "eval_precision": 0.6266590668512336,
478
- "eval_recall": 0.6713189113747383,
479
- "eval_runtime": 4.2794,
480
- "eval_samples_per_second": 680.003,
481
- "eval_steps_per_second": 85.059,
482
  "step": 26199
483
  },
484
  {
485
  "epoch": 9.103400893163862,
486
- "grad_norm": 2.2786951065063477,
487
- "learning_rate": 3.8620748883545176e-05,
488
- "loss": 1.7312,
489
  "step": 26500
490
  },
491
  {
492
  "epoch": 9.275163174166954,
493
- "grad_norm": 0.895926296710968,
494
- "learning_rate": 3.840604603229131e-05,
495
- "loss": 1.7294,
496
  "step": 27000
497
  },
498
  {
499
  "epoch": 9.446925455170044,
500
- "grad_norm": 1.9498018026351929,
501
- "learning_rate": 3.819134318103745e-05,
502
- "loss": 1.7296,
503
  "step": 27500
504
  },
505
  {
506
  "epoch": 9.618687736173136,
507
- "grad_norm": 1.1043105125427246,
508
- "learning_rate": 3.797664032978358e-05,
509
- "loss": 1.7319,
510
  "step": 28000
511
  },
512
  {
513
  "epoch": 9.790450017176228,
514
- "grad_norm": 2.098552703857422,
515
- "learning_rate": 3.776193747852972e-05,
516
- "loss": 1.7317,
517
  "step": 28500
518
  },
519
  {
520
  "epoch": 9.96221229817932,
521
- "grad_norm": 0.8599863648414612,
522
- "learning_rate": 3.754723462727585e-05,
523
- "loss": 1.7315,
524
  "step": 29000
525
  },
526
  {
527
  "epoch": 10.0,
528
- "eval_accuracy": 0.8807968690689669,
529
- "eval_f1": 0.647167461237813,
530
- "eval_loss": 2.002537965774536,
531
- "eval_precision": 0.6374481766646924,
532
- "eval_recall": 0.6571877180739707,
533
- "eval_runtime": 4.6846,
534
- "eval_samples_per_second": 621.184,
535
- "eval_steps_per_second": 77.701,
536
  "step": 29110
537
  },
538
  {
539
  "epoch": 10.13397457918241,
540
- "grad_norm": 4.416456699371338,
541
- "learning_rate": 3.733253177602199e-05,
542
- "loss": 1.7251,
543
  "step": 29500
544
  },
545
  {
546
  "epoch": 10.305736860185503,
547
- "grad_norm": 0.6309936046600342,
548
- "learning_rate": 3.7117828924768124e-05,
549
- "loss": 1.724,
550
  "step": 30000
551
  },
552
  {
553
  "epoch": 10.477499141188595,
554
- "grad_norm": 22.35858154296875,
555
- "learning_rate": 3.690312607351426e-05,
556
- "loss": 1.7271,
557
  "step": 30500
558
  },
559
  {
560
  "epoch": 10.649261422191687,
561
- "grad_norm": 9.670055389404297,
562
- "learning_rate": 3.668842322226039e-05,
563
- "loss": 1.7277,
564
  "step": 31000
565
  },
566
  {
567
  "epoch": 10.82102370319478,
568
- "grad_norm": 0.8170812129974365,
569
- "learning_rate": 3.6473720371006534e-05,
570
- "loss": 1.7271,
571
  "step": 31500
572
  },
573
  {
574
  "epoch": 10.99278598419787,
575
- "grad_norm": 0.1065647155046463,
576
- "learning_rate": 3.6259017519752663e-05,
577
- "loss": 1.7277,
578
  "step": 32000
579
  },
580
  {
581
  "epoch": 11.0,
582
- "eval_accuracy": 0.88172620060472,
583
- "eval_f1": 0.652366302836192,
584
- "eval_loss": 2.00553560256958,
585
- "eval_precision": 0.6310640032613127,
586
- "eval_recall": 0.6751570132588974,
587
- "eval_runtime": 4.3275,
588
- "eval_samples_per_second": 672.442,
589
- "eval_steps_per_second": 84.113,
590
  "step": 32021
591
  },
592
  {
593
  "epoch": 11.164548265200962,
594
- "grad_norm": 0.7106145620346069,
595
- "learning_rate": 3.60443146684988e-05,
596
- "loss": 1.7222,
597
  "step": 32500
598
  },
599
  {
600
  "epoch": 11.336310546204054,
601
- "grad_norm": 3.1519761085510254,
602
- "learning_rate": 3.5829611817244936e-05,
603
- "loss": 1.721,
604
  "step": 33000
605
  },
606
  {
607
  "epoch": 11.508072827207146,
608
- "grad_norm": 2.842960834503174,
609
- "learning_rate": 3.5614908965991066e-05,
610
- "loss": 1.7213,
611
  "step": 33500
612
  },
613
  {
614
  "epoch": 11.679835108210238,
615
- "grad_norm": 2.1760663986206055,
616
- "learning_rate": 3.54002061147372e-05,
617
- "loss": 1.7222,
618
  "step": 34000
619
  },
620
  {
621
  "epoch": 11.851597389213328,
622
- "grad_norm": 1.5963988304138184,
623
- "learning_rate": 3.518550326348334e-05,
624
- "loss": 1.7231,
625
  "step": 34500
626
  },
627
  {
628
  "epoch": 12.0,
629
- "eval_accuracy": 0.8824853728451943,
630
- "eval_f1": 0.6555833837118922,
631
- "eval_loss": 2.0041558742523193,
632
- "eval_precision": 0.6491363092184026,
633
- "eval_recall": 0.6621598046057222,
634
- "eval_runtime": 4.5128,
635
- "eval_samples_per_second": 644.839,
636
- "eval_steps_per_second": 80.66,
637
  "step": 34932
638
  },
639
  {
640
  "epoch": 12.02335967021642,
641
- "grad_norm": 0.8488190770149231,
642
- "learning_rate": 3.4970800412229476e-05,
643
- "loss": 1.7234,
644
  "step": 35000
645
  },
646
  {
647
  "epoch": 12.195121951219512,
648
- "grad_norm": 0.992290198802948,
649
- "learning_rate": 3.475609756097561e-05,
650
- "loss": 1.7179,
651
  "step": 35500
652
  },
653
  {
654
  "epoch": 12.366884232222604,
655
- "grad_norm": 2.4747726917266846,
656
- "learning_rate": 3.454139470972175e-05,
657
- "loss": 1.7192,
658
  "step": 36000
659
  },
660
  {
661
  "epoch": 12.538646513225697,
662
- "grad_norm": 2.668823480606079,
663
- "learning_rate": 3.432669185846788e-05,
664
- "loss": 1.7197,
665
  "step": 36500
666
  },
667
  {
668
  "epoch": 12.710408794228787,
669
- "grad_norm": 1.2698637247085571,
670
- "learning_rate": 3.4111989007214015e-05,
671
- "loss": 1.7183,
672
  "step": 37000
673
  },
674
  {
675
  "epoch": 12.882171075231879,
676
- "grad_norm": 2.6843957901000977,
677
- "learning_rate": 3.389728615596015e-05,
678
- "loss": 1.7225,
679
  "step": 37500
680
  },
681
  {
682
  "epoch": 13.0,
683
- "eval_accuracy": 0.88171311142816,
684
- "eval_f1": 0.6531850353892821,
685
- "eval_loss": 2.009732484817505,
686
- "eval_precision": 0.6316818774445893,
687
- "eval_recall": 0.6762037683182135,
688
- "eval_runtime": 4.3542,
689
- "eval_samples_per_second": 668.323,
690
- "eval_steps_per_second": 83.598,
691
  "step": 37843
692
  },
693
  {
694
  "epoch": 13.053933356234971,
695
- "grad_norm": 1.9299542903900146,
696
- "learning_rate": 3.368258330470629e-05,
697
- "loss": 1.7195,
698
  "step": 38000
699
  },
700
  {
701
  "epoch": 13.225695637238063,
702
- "grad_norm": 2.0054049491882324,
703
- "learning_rate": 3.3467880453452424e-05,
704
- "loss": 1.7168,
705
  "step": 38500
706
  },
707
  {
708
  "epoch": 13.397457918241154,
709
- "grad_norm": 0.18206368386745453,
710
- "learning_rate": 3.325317760219856e-05,
711
- "loss": 1.7169,
712
  "step": 39000
713
  },
714
  {
715
  "epoch": 13.569220199244246,
716
- "grad_norm": 0.6871322989463806,
717
- "learning_rate": 3.303847475094469e-05,
718
- "loss": 1.7173,
719
  "step": 39500
720
  },
721
  {
722
  "epoch": 13.740982480247338,
723
- "grad_norm": 0.820996105670929,
724
- "learning_rate": 3.2823771899690834e-05,
725
- "loss": 1.7174,
726
  "step": 40000
727
  },
728
  {
729
  "epoch": 13.91274476125043,
730
- "grad_norm": 0.4690723121166229,
731
- "learning_rate": 3.2609069048436964e-05,
732
- "loss": 1.717,
733
  "step": 40500
734
  },
735
  {
736
  "epoch": 14.0,
737
- "eval_accuracy": 0.8813858820141625,
738
- "eval_f1": 0.6556485716751532,
739
- "eval_loss": 2.0131328105926514,
740
- "eval_precision": 0.6487617421007685,
741
- "eval_recall": 0.6626831821353804,
742
- "eval_runtime": 4.3528,
743
- "eval_samples_per_second": 668.531,
744
- "eval_steps_per_second": 83.624,
745
  "step": 40754
746
  },
747
  {
748
  "epoch": 14.084507042253522,
749
- "grad_norm": 0.9285472631454468,
750
- "learning_rate": 3.23943661971831e-05,
751
- "loss": 1.7171,
752
  "step": 41000
753
  },
754
  {
755
  "epoch": 14.256269323256612,
756
- "grad_norm": 0.7423045635223389,
757
- "learning_rate": 3.2179663345929237e-05,
758
- "loss": 1.7146,
759
  "step": 41500
760
  },
761
  {
762
  "epoch": 14.428031604259704,
763
- "grad_norm": 0.6432718634605408,
764
- "learning_rate": 3.196496049467537e-05,
765
- "loss": 1.7151,
766
  "step": 42000
767
  },
768
  {
769
  "epoch": 14.599793885262796,
770
- "grad_norm": 0.06601449102163315,
771
- "learning_rate": 3.17502576434215e-05,
772
- "loss": 1.716,
773
  "step": 42500
774
  },
775
  {
776
  "epoch": 14.771556166265889,
777
- "grad_norm": 1.611905813217163,
778
- "learning_rate": 3.1535554792167646e-05,
779
- "loss": 1.7158,
780
  "step": 43000
781
  },
782
  {
783
  "epoch": 14.943318447268979,
784
- "grad_norm": 0.8380423188209534,
785
- "learning_rate": 3.1320851940913776e-05,
786
- "loss": 1.7152,
787
  "step": 43500
788
  },
789
  {
790
  "epoch": 15.0,
791
- "eval_accuracy": 0.8814775062500818,
792
- "eval_f1": 0.6542948934731146,
793
- "eval_loss": 2.0191421508789062,
794
- "eval_precision": 0.634760498687664,
795
- "eval_recall": 0.6750697836706211,
796
- "eval_runtime": 4.3582,
797
- "eval_samples_per_second": 667.713,
798
- "eval_steps_per_second": 83.521,
799
  "step": 43665
800
  },
801
  {
802
- "epoch": 15.115080728272071,
803
- "grad_norm": 3.277487277984619,
804
- "learning_rate": 3.110614908965991e-05,
805
- "loss": 1.7139,
806
- "step": 44000
807
- },
808
- {
809
- "epoch": 15.286843009275163,
810
- "grad_norm": 3.0774002075195312,
811
- "learning_rate": 3.089144623840605e-05,
812
- "loss": 1.7126,
813
- "step": 44500
814
- },
815
- {
816
- "epoch": 15.458605290278255,
817
- "grad_norm": 0.6177487373352051,
818
- "learning_rate": 3.0676743387152185e-05,
819
- "loss": 1.7136,
820
- "step": 45000
821
- },
822
- {
823
- "epoch": 15.630367571281347,
824
- "grad_norm": 1.545906901359558,
825
- "learning_rate": 3.046204053589832e-05,
826
- "loss": 1.7144,
827
- "step": 45500
828
- },
829
- {
830
- "epoch": 15.802129852284438,
831
- "grad_norm": 0.31795910000801086,
832
- "learning_rate": 3.0247337684644455e-05,
833
- "loss": 1.7142,
834
- "step": 46000
835
- },
836
- {
837
- "epoch": 15.97389213328753,
838
- "grad_norm": 0.0434710867702961,
839
- "learning_rate": 3.0032634833390588e-05,
840
- "loss": 1.7154,
841
- "step": 46500
842
- },
843
- {
844
- "epoch": 16.0,
845
- "eval_accuracy": 0.8817523789578398,
846
- "eval_f1": 0.6559375134114416,
847
- "eval_loss": 2.02302885055542,
848
- "eval_precision": 0.6456027709723747,
849
- "eval_recall": 0.6666085136078158,
850
- "eval_runtime": 4.3521,
851
- "eval_samples_per_second": 668.643,
852
- "eval_steps_per_second": 83.638,
853
- "step": 46576
854
- },
855
- {
856
- "epoch": 16.14565441429062,
857
- "grad_norm": 0.30855801701545715,
858
- "learning_rate": 2.9817931982136728e-05,
859
- "loss": 1.7105,
860
- "step": 47000
861
- },
862
- {
863
- "epoch": 16.317416695293712,
864
- "grad_norm": 0.8510277271270752,
865
- "learning_rate": 2.9603229130882858e-05,
866
- "loss": 1.7127,
867
- "step": 47500
868
- },
869
- {
870
- "epoch": 16.489178976296806,
871
- "grad_norm": 1.7923400402069092,
872
- "learning_rate": 2.9388526279628997e-05,
873
- "loss": 1.7119,
874
- "step": 48000
875
- },
876
- {
877
- "epoch": 16.660941257299896,
878
- "grad_norm": 4.354673862457275,
879
- "learning_rate": 2.917382342837513e-05,
880
- "loss": 1.7133,
881
- "step": 48500
882
- },
883
- {
884
- "epoch": 16.83270353830299,
885
- "grad_norm": 0.20724855363368988,
886
- "learning_rate": 2.8959120577121267e-05,
887
- "loss": 1.7121,
888
- "step": 49000
889
- },
890
- {
891
- "epoch": 17.0,
892
- "eval_accuracy": 0.8831136533200696,
893
- "eval_f1": 0.6562859946248026,
894
- "eval_loss": 2.0242502689361572,
895
- "eval_precision": 0.6422309426400601,
896
- "eval_recall": 0.670969993021633,
897
- "eval_runtime": 4.3469,
898
- "eval_samples_per_second": 669.449,
899
- "eval_steps_per_second": 83.739,
900
- "step": 49487
901
- },
902
- {
903
- "epoch": 17.00446581930608,
904
- "grad_norm": 5.177857398986816,
905
- "learning_rate": 2.87444177258674e-05,
906
- "loss": 1.7127,
907
- "step": 49500
908
- },
909
- {
910
- "epoch": 17.17622810030917,
911
- "grad_norm": 3.216094732284546,
912
- "learning_rate": 2.852971487461354e-05,
913
- "loss": 1.7098,
914
- "step": 50000
915
- },
916
- {
917
- "epoch": 17.347990381312265,
918
- "grad_norm": 2.916612148284912,
919
- "learning_rate": 2.8315012023359673e-05,
920
- "loss": 1.71,
921
- "step": 50500
922
- },
923
- {
924
- "epoch": 17.519752662315355,
925
- "grad_norm": 0.03734961524605751,
926
- "learning_rate": 2.8100309172105803e-05,
927
- "loss": 1.7106,
928
- "step": 51000
929
- },
930
- {
931
- "epoch": 17.69151494331845,
932
- "grad_norm": 1.543533444404602,
933
- "learning_rate": 2.7885606320851943e-05,
934
- "loss": 1.7104,
935
- "step": 51500
936
- },
937
- {
938
- "epoch": 17.86327722432154,
939
- "grad_norm": 0.31598055362701416,
940
- "learning_rate": 2.7670903469598076e-05,
941
- "loss": 1.7114,
942
- "step": 52000
943
- },
944
- {
945
- "epoch": 18.0,
946
- "eval_accuracy": 0.8838466472074242,
947
- "eval_f1": 0.6567732444944403,
948
- "eval_loss": 2.019536018371582,
949
- "eval_precision": 0.6515580736543909,
950
- "eval_recall": 0.6620725750174459,
951
- "eval_runtime": 4.3524,
952
- "eval_samples_per_second": 668.593,
953
- "eval_steps_per_second": 83.632,
954
- "step": 52398
955
- },
956
- {
957
- "epoch": 18.03503950532463,
958
- "grad_norm": 0.3598534166812897,
959
- "learning_rate": 2.7456200618344212e-05,
960
- "loss": 1.7117,
961
- "step": 52500
962
- },
963
- {
964
- "epoch": 18.206801786327723,
965
- "grad_norm": 0.020997876301407814,
966
- "learning_rate": 2.7241497767090345e-05,
967
- "loss": 1.7089,
968
- "step": 53000
969
- },
970
- {
971
- "epoch": 18.378564067330814,
972
- "grad_norm": 1.392232894897461,
973
- "learning_rate": 2.7026794915836485e-05,
974
- "loss": 1.7088,
975
- "step": 53500
976
- },
977
- {
978
- "epoch": 18.550326348333908,
979
- "grad_norm": 0.47013697028160095,
980
- "learning_rate": 2.681209206458262e-05,
981
- "loss": 1.7095,
982
- "step": 54000
983
- },
984
- {
985
- "epoch": 18.722088629336998,
986
- "grad_norm": 1.5354187488555908,
987
- "learning_rate": 2.6597389213328755e-05,
988
- "loss": 1.7097,
989
- "step": 54500
990
- },
991
- {
992
- "epoch": 18.89385091034009,
993
- "grad_norm": 0.10534074902534485,
994
- "learning_rate": 2.6382686362074888e-05,
995
- "loss": 1.7105,
996
- "step": 55000
997
- },
998
- {
999
- "epoch": 19.0,
1000
- "eval_accuracy": 0.8828387806123117,
1001
- "eval_f1": 0.6574315789473685,
1002
- "eval_loss": 2.025520086288452,
1003
- "eval_precision": 0.6354387107276575,
1004
- "eval_recall": 0.6810013956734124,
1005
- "eval_runtime": 4.3482,
1006
- "eval_samples_per_second": 669.243,
1007
- "eval_steps_per_second": 83.713,
1008
- "step": 55309
1009
- },
1010
- {
1011
- "epoch": 19.065613191343182,
1012
- "grad_norm": 0.2419898957014084,
1013
- "learning_rate": 2.6167983510821024e-05,
1014
- "loss": 1.7098,
1015
- "step": 55500
1016
- },
1017
- {
1018
- "epoch": 19.237375472346272,
1019
- "grad_norm": 0.1751754730939865,
1020
- "learning_rate": 2.5953280659567158e-05,
1021
- "loss": 1.7086,
1022
- "step": 56000
1023
- },
1024
- {
1025
- "epoch": 19.409137753349363,
1026
- "grad_norm": 4.463948726654053,
1027
- "learning_rate": 2.5738577808313297e-05,
1028
- "loss": 1.7073,
1029
- "step": 56500
1030
- },
1031
- {
1032
- "epoch": 19.580900034352457,
1033
- "grad_norm": 3.6637113094329834,
1034
- "learning_rate": 2.552387495705943e-05,
1035
- "loss": 1.7085,
1036
- "step": 57000
1037
- },
1038
- {
1039
- "epoch": 19.752662315355547,
1040
- "grad_norm": 2.393986940383911,
1041
- "learning_rate": 2.5309172105805567e-05,
1042
- "loss": 1.7084,
1043
- "step": 57500
1044
- },
1045
- {
1046
- "epoch": 19.92442459635864,
1047
- "grad_norm": 2.718120574951172,
1048
- "learning_rate": 2.50944692545517e-05,
1049
- "loss": 1.7086,
1050
- "step": 58000
1051
- },
1052
- {
1053
- "epoch": 20.0,
1054
- "eval_accuracy": 0.8840691632089426,
1055
- "eval_f1": 0.6621772325641245,
1056
- "eval_loss": 2.0267066955566406,
1057
- "eval_precision": 0.6513924050632911,
1058
- "eval_recall": 0.6733251919050942,
1059
- "eval_runtime": 4.3383,
1060
- "eval_samples_per_second": 670.771,
1061
- "eval_steps_per_second": 83.904,
1062
- "step": 58220
1063
- },
1064
- {
1065
- "epoch": 20.09618687736173,
1066
- "grad_norm": 0.07868649810552597,
1067
- "learning_rate": 2.4879766403297837e-05,
1068
- "loss": 1.7079,
1069
- "step": 58500
1070
- },
1071
- {
1072
- "epoch": 20.26794915836482,
1073
- "grad_norm": 1.5867220163345337,
1074
- "learning_rate": 2.466506355204397e-05,
1075
- "loss": 1.708,
1076
- "step": 59000
1077
- },
1078
- {
1079
- "epoch": 20.439711439367915,
1080
- "grad_norm": 2.7590816020965576,
1081
- "learning_rate": 2.4450360700790106e-05,
1082
- "loss": 1.7076,
1083
- "step": 59500
1084
- },
1085
- {
1086
- "epoch": 20.611473720371006,
1087
- "grad_norm": 0.02333156019449234,
1088
- "learning_rate": 2.4235657849536243e-05,
1089
- "loss": 1.708,
1090
- "step": 60000
1091
- },
1092
- {
1093
- "epoch": 20.7832360013741,
1094
- "grad_norm": 1.262069821357727,
1095
- "learning_rate": 2.4020954998282376e-05,
1096
- "loss": 1.7083,
1097
- "step": 60500
1098
- },
1099
- {
1100
- "epoch": 20.95499828237719,
1101
- "grad_norm": 1.6238000392913818,
1102
- "learning_rate": 2.3806252147028512e-05,
1103
- "loss": 1.7077,
1104
- "step": 61000
1105
- },
1106
- {
1107
- "epoch": 21.0,
1108
- "eval_accuracy": 0.8828387806123117,
1109
- "eval_f1": 0.6590735879414006,
1110
- "eval_loss": 2.0343477725982666,
1111
- "eval_precision": 0.6403653118314958,
1112
- "eval_recall": 0.6789078855547802,
1113
- "eval_runtime": 4.3449,
1114
- "eval_samples_per_second": 669.757,
1115
- "eval_steps_per_second": 83.777,
1116
- "step": 61131
1117
- },
1118
- {
1119
- "epoch": 21.12676056338028,
1120
- "grad_norm": 0.6878411173820496,
1121
- "learning_rate": 2.359154929577465e-05,
1122
- "loss": 1.7075,
1123
- "step": 61500
1124
- },
1125
- {
1126
- "epoch": 21.298522844383374,
1127
- "grad_norm": 0.2594795227050781,
1128
- "learning_rate": 2.3376846444520782e-05,
1129
- "loss": 1.7066,
1130
- "step": 62000
1131
- },
1132
- {
1133
- "epoch": 21.470285125386464,
1134
- "grad_norm": 0.14187024533748627,
1135
- "learning_rate": 2.316214359326692e-05,
1136
- "loss": 1.7067,
1137
- "step": 62500
1138
- },
1139
- {
1140
- "epoch": 21.64204740638956,
1141
- "grad_norm": 1.2114301919937134,
1142
- "learning_rate": 2.2947440742013055e-05,
1143
- "loss": 1.7066,
1144
- "step": 63000
1145
- },
1146
- {
1147
- "epoch": 21.81380968739265,
1148
- "grad_norm": 0.41020047664642334,
1149
- "learning_rate": 2.273273789075919e-05,
1150
- "loss": 1.7069,
1151
- "step": 63500
1152
- },
1153
- {
1154
- "epoch": 21.98557196839574,
1155
- "grad_norm": 1.944765567779541,
1156
- "learning_rate": 2.2518035039505325e-05,
1157
- "loss": 1.7075,
1158
- "step": 64000
1159
- },
1160
- {
1161
- "epoch": 22.0,
1162
- "eval_accuracy": 0.886019450516368,
1163
- "eval_f1": 0.6647068995331143,
1164
- "eval_loss": 2.025860548019409,
1165
- "eval_precision": 0.6588961261570107,
1166
- "eval_recall": 0.6706210746685276,
1167
- "eval_runtime": 4.3463,
1168
- "eval_samples_per_second": 669.53,
1169
- "eval_steps_per_second": 83.749,
1170
- "step": 64042
1171
- },
1172
- {
1173
- "epoch": 22.157334249398833,
1174
- "grad_norm": 0.3925967216491699,
1175
- "learning_rate": 2.230333218825146e-05,
1176
- "loss": 1.7058,
1177
- "step": 64500
1178
- },
1179
- {
1180
- "epoch": 22.329096530401923,
1181
- "grad_norm": 2.5894014835357666,
1182
- "learning_rate": 2.2088629336997598e-05,
1183
- "loss": 1.707,
1184
- "step": 65000
1185
- },
1186
- {
1187
- "epoch": 22.500858811405017,
1188
- "grad_norm": 2.7716064453125,
1189
- "learning_rate": 2.187392648574373e-05,
1190
- "loss": 1.7063,
1191
- "step": 65500
1192
- },
1193
- {
1194
- "epoch": 22.672621092408107,
1195
- "grad_norm": 0.03647352755069733,
1196
- "learning_rate": 2.1659223634489867e-05,
1197
- "loss": 1.7067,
1198
- "step": 66000
1199
- },
1200
- {
1201
- "epoch": 22.844383373411198,
1202
- "grad_norm": 0.1577221304178238,
1203
- "learning_rate": 2.1444520783236004e-05,
1204
- "loss": 1.706,
1205
- "step": 66500
1206
- },
1207
- {
1208
- "epoch": 23.0,
1209
- "eval_accuracy": 0.8844618385057396,
1210
- "eval_f1": 0.662218230035502,
1211
- "eval_loss": 2.02994441986084,
1212
- "eval_precision": 0.6496852706672261,
1213
- "eval_recall": 0.6752442428471738,
1214
- "eval_runtime": 4.5456,
1215
- "eval_samples_per_second": 640.185,
1216
- "eval_steps_per_second": 80.078,
1217
- "step": 66953
1218
- },
1219
- {
1220
- "epoch": 23.01614565441429,
1221
- "grad_norm": 2.8254947662353516,
1222
- "learning_rate": 2.1229817931982137e-05,
1223
- "loss": 1.7066,
1224
- "step": 67000
1225
- },
1226
- {
1227
- "epoch": 23.187907935417382,
1228
- "grad_norm": 0.07582961767911911,
1229
- "learning_rate": 2.1015115080728273e-05,
1230
- "loss": 1.7055,
1231
- "step": 67500
1232
- },
1233
- {
1234
- "epoch": 23.359670216420476,
1235
- "grad_norm": 0.11683762818574905,
1236
- "learning_rate": 2.080041222947441e-05,
1237
- "loss": 1.7052,
1238
- "step": 68000
1239
- },
1240
- {
1241
- "epoch": 23.531432497423566,
1242
- "grad_norm": 0.6018902063369751,
1243
- "learning_rate": 2.0585709378220543e-05,
1244
- "loss": 1.7053,
1245
- "step": 68500
1246
- },
1247
- {
1248
- "epoch": 23.703194778426656,
1249
- "grad_norm": 0.0493299625813961,
1250
- "learning_rate": 2.037100652696668e-05,
1251
- "loss": 1.7052,
1252
- "step": 69000
1253
- },
1254
- {
1255
- "epoch": 23.87495705942975,
1256
- "grad_norm": 0.7263774275779724,
1257
- "learning_rate": 2.0156303675712816e-05,
1258
- "loss": 1.7062,
1259
- "step": 69500
1260
- },
1261
- {
1262
- "epoch": 24.0,
1263
- "eval_accuracy": 0.8855613293367715,
1264
- "eval_f1": 0.6654007164127573,
1265
- "eval_loss": 2.0291478633880615,
1266
- "eval_precision": 0.6584949175706842,
1267
- "eval_recall": 0.6724528960223308,
1268
- "eval_runtime": 4.425,
1269
- "eval_samples_per_second": 657.631,
1270
- "eval_steps_per_second": 82.26,
1271
- "step": 69864
1272
- },
1273
- {
1274
- "epoch": 24.04671934043284,
1275
- "grad_norm": 0.22631505131721497,
1276
- "learning_rate": 1.994160082445895e-05,
1277
- "loss": 1.7058,
1278
- "step": 70000
1279
- },
1280
- {
1281
- "epoch": 24.218481621435934,
1282
- "grad_norm": 0.08843923360109329,
1283
- "learning_rate": 1.9726897973205085e-05,
1284
- "loss": 1.7051,
1285
- "step": 70500
1286
- },
1287
- {
1288
- "epoch": 24.390243902439025,
1289
- "grad_norm": 0.615134060382843,
1290
- "learning_rate": 1.9512195121951222e-05,
1291
- "loss": 1.7052,
1292
- "step": 71000
1293
- },
1294
- {
1295
- "epoch": 24.562006183442115,
1296
- "grad_norm": 0.013932738453149796,
1297
- "learning_rate": 1.9297492270697355e-05,
1298
- "loss": 1.7051,
1299
- "step": 71500
1300
- },
1301
- {
1302
- "epoch": 24.73376846444521,
1303
- "grad_norm": 3.4743807315826416,
1304
- "learning_rate": 1.908278941944349e-05,
1305
- "loss": 1.7058,
1306
- "step": 72000
1307
- },
1308
- {
1309
- "epoch": 24.9055307454483,
1310
- "grad_norm": 0.03590023145079613,
1311
- "learning_rate": 1.8868086568189628e-05,
1312
- "loss": 1.7051,
1313
- "step": 72500
1314
- },
1315
- {
1316
- "epoch": 25.0,
1317
- "eval_accuracy": 0.8849592272150159,
1318
- "eval_f1": 0.6664947124064998,
1319
- "eval_loss": 2.032705783843994,
1320
- "eval_precision": 0.6570605187319885,
1321
- "eval_recall": 0.6762037683182135,
1322
- "eval_runtime": 4.3381,
1323
- "eval_samples_per_second": 670.805,
1324
- "eval_steps_per_second": 83.908,
1325
- "step": 72775
1326
- },
1327
- {
1328
- "epoch": 25.07729302645139,
1329
- "grad_norm": 2.080662250518799,
1330
- "learning_rate": 1.865338371693576e-05,
1331
- "loss": 1.7049,
1332
- "step": 73000
1333
- },
1334
- {
1335
- "epoch": 25.249055307454483,
1336
- "grad_norm": 0.029195208102464676,
1337
- "learning_rate": 1.8438680865681898e-05,
1338
- "loss": 1.7053,
1339
- "step": 73500
1340
- },
1341
- {
1342
- "epoch": 25.420817588457574,
1343
- "grad_norm": 0.31326115131378174,
1344
- "learning_rate": 1.8223978014428034e-05,
1345
- "loss": 1.7049,
1346
- "step": 74000
1347
- },
1348
- {
1349
- "epoch": 25.592579869460668,
1350
- "grad_norm": 0.01428903266787529,
1351
- "learning_rate": 1.800927516317417e-05,
1352
- "loss": 1.7047,
1353
- "step": 74500
1354
- },
1355
- {
1356
- "epoch": 25.764342150463758,
1357
- "grad_norm": 1.265703797340393,
1358
- "learning_rate": 1.7794572311920304e-05,
1359
- "loss": 1.7049,
1360
- "step": 75000
1361
- },
1362
- {
1363
- "epoch": 25.93610443146685,
1364
- "grad_norm": 0.12370016425848007,
1365
- "learning_rate": 1.7579869460666437e-05,
1366
- "loss": 1.7044,
1367
- "step": 75500
1368
- },
1369
- {
1370
- "epoch": 26.0,
1371
- "eval_accuracy": 0.8851293865102946,
1372
- "eval_f1": 0.6667517223781576,
1373
- "eval_loss": 2.034836530685425,
1374
- "eval_precision": 0.650539419087137,
1375
- "eval_recall": 0.6837927424982554,
1376
- "eval_runtime": 4.3502,
1377
- "eval_samples_per_second": 668.933,
1378
- "eval_steps_per_second": 83.674,
1379
- "step": 75686
1380
- },
1381
- {
1382
- "epoch": 26.107866712469942,
1383
- "grad_norm": 0.7563213109970093,
1384
- "learning_rate": 1.7365166609412573e-05,
1385
- "loss": 1.7049,
1386
- "step": 76000
1387
- },
1388
- {
1389
- "epoch": 26.279628993473032,
1390
- "grad_norm": 0.0820002630352974,
1391
- "learning_rate": 1.7150463758158706e-05,
1392
- "loss": 1.7043,
1393
- "step": 76500
1394
- },
1395
- {
1396
- "epoch": 26.451391274476126,
1397
- "grad_norm": 0.0947314128279686,
1398
- "learning_rate": 1.6935760906904843e-05,
1399
- "loss": 1.7039,
1400
- "step": 77000
1401
- },
1402
- {
1403
- "epoch": 26.623153555479217,
1404
- "grad_norm": 0.03729939088225365,
1405
- "learning_rate": 1.672105805565098e-05,
1406
- "loss": 1.7045,
1407
- "step": 77500
1408
- },
1409
- {
1410
- "epoch": 26.794915836482307,
1411
- "grad_norm": 0.022032542154192924,
1412
- "learning_rate": 1.6506355204397116e-05,
1413
- "loss": 1.7046,
1414
- "step": 78000
1415
- },
1416
- {
1417
- "epoch": 26.9666781174854,
1418
- "grad_norm": 0.01787804253399372,
1419
- "learning_rate": 1.629165235314325e-05,
1420
- "loss": 1.704,
1421
- "step": 78500
1422
- },
1423
- {
1424
- "epoch": 27.0,
1425
- "eval_accuracy": 0.885482794277412,
1426
- "eval_f1": 0.6641411967283685,
1427
- "eval_loss": 2.0346953868865967,
1428
- "eval_precision": 0.6556178820329763,
1429
- "eval_recall": 0.6728890439637125,
1430
- "eval_runtime": 4.3364,
1431
- "eval_samples_per_second": 671.067,
1432
- "eval_steps_per_second": 83.941,
1433
- "step": 78597
1434
- },
1435
- {
1436
- "epoch": 27.13844039848849,
1437
- "grad_norm": 0.04579576849937439,
1438
- "learning_rate": 1.6076949501889386e-05,
1439
- "loss": 1.7036,
1440
- "step": 79000
1441
- },
1442
- {
1443
- "epoch": 27.310202679491585,
1444
- "grad_norm": 0.01315494254231453,
1445
- "learning_rate": 1.5862246650635522e-05,
1446
- "loss": 1.704,
1447
- "step": 79500
1448
- },
1449
- {
1450
- "epoch": 27.481964960494675,
1451
- "grad_norm": 0.014564316719770432,
1452
- "learning_rate": 1.5647543799381655e-05,
1453
- "loss": 1.7038,
1454
- "step": 80000
1455
- },
1456
- {
1457
- "epoch": 27.653727241497766,
1458
- "grad_norm": 0.027286505326628685,
1459
- "learning_rate": 1.543284094812779e-05,
1460
- "loss": 1.7042,
1461
- "step": 80500
1462
- },
1463
- {
1464
- "epoch": 27.82548952250086,
1465
- "grad_norm": 0.012170245870947838,
1466
- "learning_rate": 1.5218138096873926e-05,
1467
- "loss": 1.7041,
1468
- "step": 81000
1469
- },
1470
- {
1471
- "epoch": 27.99725180350395,
1472
- "grad_norm": 0.10445314645767212,
1473
- "learning_rate": 1.5003435245620063e-05,
1474
- "loss": 1.7041,
1475
- "step": 81500
1476
- },
1477
- {
1478
- "epoch": 28.0,
1479
- "eval_accuracy": 0.8848414246259768,
1480
- "eval_f1": 0.665781243305771,
1481
- "eval_loss": 2.039051055908203,
1482
- "eval_precision": 0.6542056074766355,
1483
- "eval_recall": 0.6777739009071877,
1484
- "eval_runtime": 4.3543,
1485
- "eval_samples_per_second": 668.309,
1486
- "eval_steps_per_second": 83.596,
1487
- "step": 81508
1488
- },
1489
- {
1490
- "epoch": 28.169014084507044,
1491
- "grad_norm": 0.03070581518113613,
1492
- "learning_rate": 1.4788732394366198e-05,
1493
- "loss": 1.7034,
1494
- "step": 82000
1495
- },
1496
- {
1497
- "epoch": 28.340776365510134,
1498
- "grad_norm": 0.028482601046562195,
1499
- "learning_rate": 1.4574029543112333e-05,
1500
- "loss": 1.7034,
1501
- "step": 82500
1502
- },
1503
- {
1504
- "epoch": 28.512538646513224,
1505
- "grad_norm": 0.020483843982219696,
1506
- "learning_rate": 1.4359326691858469e-05,
1507
- "loss": 1.7037,
1508
- "step": 83000
1509
- },
1510
- {
1511
- "epoch": 28.68430092751632,
1512
- "grad_norm": 0.04633668065071106,
1513
- "learning_rate": 1.4144623840604604e-05,
1514
- "loss": 1.7038,
1515
- "step": 83500
1516
- },
1517
- {
1518
- "epoch": 28.85606320851941,
1519
- "grad_norm": 0.013343285769224167,
1520
- "learning_rate": 1.3929920989350739e-05,
1521
- "loss": 1.7044,
1522
- "step": 84000
1523
- },
1524
- {
1525
- "epoch": 29.0,
1526
- "eval_accuracy": 0.8849592272150159,
1527
- "eval_f1": 0.6683048801517503,
1528
- "eval_loss": 2.0383501052856445,
1529
- "eval_precision": 0.6606716672349131,
1530
- "eval_recall": 0.6761165387299372,
1531
- "eval_runtime": 4.5322,
1532
- "eval_samples_per_second": 642.076,
1533
- "eval_steps_per_second": 80.315,
1534
- "step": 84419
1535
- },
1536
- {
1537
- "epoch": 29.027825489522503,
1538
- "grad_norm": 0.012182236649096012,
1539
- "learning_rate": 1.3715218138096875e-05,
1540
- "loss": 1.7031,
1541
- "step": 84500
1542
- },
1543
- {
1544
- "epoch": 29.199587770525593,
1545
- "grad_norm": 0.022931888699531555,
1546
- "learning_rate": 1.350051528684301e-05,
1547
- "loss": 1.7034,
1548
- "step": 85000
1549
- },
1550
- {
1551
- "epoch": 29.371350051528683,
1552
- "grad_norm": 0.03491847962141037,
1553
- "learning_rate": 1.3285812435589146e-05,
1554
- "loss": 1.7031,
1555
- "step": 85500
1556
- },
1557
- {
1558
- "epoch": 29.543112332531777,
1559
- "grad_norm": 0.7195144295692444,
1560
- "learning_rate": 1.3071109584335281e-05,
1561
- "loss": 1.703,
1562
- "step": 86000
1563
- },
1564
- {
1565
- "epoch": 29.714874613534867,
1566
- "grad_norm": 0.06322001665830612,
1567
- "learning_rate": 1.2856406733081416e-05,
1568
- "loss": 1.7038,
1569
- "step": 86500
1570
- },
1571
- {
1572
- "epoch": 29.886636894537958,
1573
- "grad_norm": 2.88450288772583,
1574
- "learning_rate": 1.2641703881827552e-05,
1575
- "loss": 1.7038,
1576
- "step": 87000
1577
- },
1578
- {
1579
- "epoch": 30.0,
1580
- "eval_accuracy": 0.8862157881647665,
1581
- "eval_f1": 0.6689958592132506,
1582
- "eval_loss": 2.036111354827881,
1583
- "eval_precision": 0.6616894197952219,
1584
- "eval_recall": 0.6764654570830426,
1585
- "eval_runtime": 4.3577,
1586
- "eval_samples_per_second": 667.782,
1587
- "eval_steps_per_second": 83.53,
1588
- "step": 87330
1589
- },
1590
- {
1591
- "epoch": 30.05839917554105,
1592
- "grad_norm": 0.045289408415555954,
1593
- "learning_rate": 1.2427001030573687e-05,
1594
- "loss": 1.7034,
1595
- "step": 87500
1596
- },
1597
- {
1598
- "epoch": 30.230161456544142,
1599
- "grad_norm": 0.08461955934762955,
1600
- "learning_rate": 1.2212298179319822e-05,
1601
- "loss": 1.703,
1602
- "step": 88000
1603
- },
1604
- {
1605
- "epoch": 30.401923737547236,
1606
- "grad_norm": 0.019219454377889633,
1607
- "learning_rate": 1.1997595328065957e-05,
1608
- "loss": 1.7028,
1609
- "step": 88500
1610
- },
1611
- {
1612
- "epoch": 30.573686018550326,
1613
- "grad_norm": 0.08018019050359726,
1614
- "learning_rate": 1.1782892476812092e-05,
1615
- "loss": 1.7035,
1616
- "step": 89000
1617
- },
1618
- {
1619
- "epoch": 30.745448299553416,
1620
- "grad_norm": 0.022569868713617325,
1621
- "learning_rate": 1.1568189625558228e-05,
1622
- "loss": 1.7029,
1623
- "step": 89500
1624
- },
1625
- {
1626
- "epoch": 30.91721058055651,
1627
- "grad_norm": 0.01143190823495388,
1628
- "learning_rate": 1.1353486774304363e-05,
1629
- "loss": 1.7043,
1630
- "step": 90000
1631
- },
1632
- {
1633
- "epoch": 31.0,
1634
- "eval_accuracy": 0.8862157881647665,
1635
- "eval_f1": 0.6686131386861313,
1636
- "eval_loss": 2.036999225616455,
1637
- "eval_precision": 0.6583798410282429,
1638
- "eval_recall": 0.6791695743196092,
1639
- "eval_runtime": 4.3764,
1640
- "eval_samples_per_second": 664.936,
1641
- "eval_steps_per_second": 83.174,
1642
- "step": 90241
1643
- },
1644
- {
1645
- "epoch": 31.0889728615596,
1646
- "grad_norm": 0.020288735628128052,
1647
- "learning_rate": 1.1138783923050498e-05,
1648
- "loss": 1.7027,
1649
- "step": 90500
1650
- },
1651
- {
1652
- "epoch": 31.260735142562694,
1653
- "grad_norm": 1.4007924795150757,
1654
- "learning_rate": 1.0924081071796634e-05,
1655
- "loss": 1.7031,
1656
- "step": 91000
1657
- },
1658
- {
1659
- "epoch": 31.432497423565785,
1660
- "grad_norm": 0.026297271251678467,
1661
- "learning_rate": 1.0709378220542769e-05,
1662
- "loss": 1.7033,
1663
- "step": 91500
1664
- },
1665
- {
1666
- "epoch": 31.604259704568875,
1667
- "grad_norm": 0.01724054105579853,
1668
- "learning_rate": 1.0494675369288904e-05,
1669
- "loss": 1.7033,
1670
- "step": 92000
1671
- },
1672
- {
1673
- "epoch": 31.77602198557197,
1674
- "grad_norm": 0.022584540769457817,
1675
- "learning_rate": 1.027997251803504e-05,
1676
- "loss": 1.7029,
1677
- "step": 92500
1678
- },
1679
- {
1680
- "epoch": 31.94778426657506,
1681
- "grad_norm": 0.020957598462700844,
1682
- "learning_rate": 1.0065269666781175e-05,
1683
- "loss": 1.7027,
1684
- "step": 93000
1685
- },
1686
- {
1687
- "epoch": 32.0,
1688
- "eval_accuracy": 0.887498527467637,
1689
- "eval_f1": 0.6720918279938325,
1690
- "eval_loss": 2.0346181392669678,
1691
- "eval_precision": 0.6602154156849546,
1692
- "eval_recall": 0.6844033496161898,
1693
- "eval_runtime": 4.3442,
1694
- "eval_samples_per_second": 669.861,
1695
- "eval_steps_per_second": 83.79,
1696
- "step": 93152
1697
- },
1698
- {
1699
- "epoch": 32.11954654757815,
1700
- "grad_norm": 0.034752070903778076,
1701
- "learning_rate": 9.850566815527312e-06,
1702
- "loss": 1.7029,
1703
- "step": 93500
1704
- },
1705
- {
1706
- "epoch": 32.29130882858124,
1707
- "grad_norm": 0.029328178614377975,
1708
- "learning_rate": 9.635863964273446e-06,
1709
- "loss": 1.7028,
1710
- "step": 94000
1711
- },
1712
- {
1713
- "epoch": 32.463071109584334,
1714
- "grad_norm": 0.012389196082949638,
1715
- "learning_rate": 9.421161113019581e-06,
1716
- "loss": 1.7029,
1717
- "step": 94500
1718
- },
1719
- {
1720
- "epoch": 32.634833390587424,
1721
- "grad_norm": 0.3913457989692688,
1722
- "learning_rate": 9.206458261765718e-06,
1723
- "loss": 1.703,
1724
- "step": 95000
1725
- },
1726
- {
1727
- "epoch": 32.80659567159052,
1728
- "grad_norm": 0.018091492354869843,
1729
- "learning_rate": 8.991755410511853e-06,
1730
- "loss": 1.703,
1731
- "step": 95500
1732
- },
1733
- {
1734
- "epoch": 32.97835795259361,
1735
- "grad_norm": 0.0414559505879879,
1736
- "learning_rate": 8.777052559257987e-06,
1737
- "loss": 1.7026,
1738
- "step": 96000
1739
- },
1740
- {
1741
- "epoch": 33.0,
1742
- "eval_accuracy": 0.8870796738177201,
1743
- "eval_f1": 0.6719724671972467,
1744
- "eval_loss": 2.0373663902282715,
1745
- "eval_precision": 0.6629318394024276,
1746
- "eval_recall": 0.6812630844382415,
1747
- "eval_runtime": 4.3395,
1748
- "eval_samples_per_second": 670.591,
1749
- "eval_steps_per_second": 83.881,
1750
- "step": 96063
1751
- },
1752
- {
1753
- "epoch": 33.1501202335967,
1754
- "grad_norm": 1.3967928886413574,
1755
- "learning_rate": 8.562349708004122e-06,
1756
- "loss": 1.7026,
1757
- "step": 96500
1758
- },
1759
- {
1760
- "epoch": 33.32188251459979,
1761
- "grad_norm": 0.0278925858438015,
1762
- "learning_rate": 8.347646856750257e-06,
1763
- "loss": 1.7029,
1764
- "step": 97000
1765
- },
1766
- {
1767
- "epoch": 33.49364479560288,
1768
- "grad_norm": 0.0145077770575881,
1769
- "learning_rate": 8.132944005496393e-06,
1770
- "loss": 1.7026,
1771
- "step": 97500
1772
- },
1773
- {
1774
- "epoch": 33.66540707660598,
1775
- "grad_norm": 0.025042984634637833,
1776
- "learning_rate": 7.918241154242528e-06,
1777
- "loss": 1.7027,
1778
- "step": 98000
1779
- },
1780
- {
1781
- "epoch": 33.83716935760907,
1782
- "grad_norm": 0.01291943620890379,
1783
- "learning_rate": 7.703538302988663e-06,
1784
- "loss": 1.7025,
1785
- "step": 98500
1786
- },
1787
- {
1788
- "epoch": 34.0,
1789
- "eval_accuracy": 0.8862026989882067,
1790
- "eval_f1": 0.6707671279685631,
1791
- "eval_loss": 2.038774251937866,
1792
- "eval_precision": 0.6571811181787747,
1793
- "eval_recall": 0.6849267271458479,
1794
- "eval_runtime": 4.3422,
1795
- "eval_samples_per_second": 670.172,
1796
- "eval_steps_per_second": 83.829,
1797
- "step": 98974
1798
- },
1799
- {
1800
- "epoch": 34.00893163861216,
1801
- "grad_norm": 0.014937439002096653,
1802
- "learning_rate": 7.4888354517347995e-06,
1803
- "loss": 1.7027,
1804
- "step": 99000
1805
- },
1806
- {
1807
- "epoch": 34.18069391961525,
1808
- "grad_norm": 0.05787663906812668,
1809
- "learning_rate": 7.274132600480934e-06,
1810
- "loss": 1.7029,
1811
- "step": 99500
1812
- },
1813
- {
1814
- "epoch": 34.35245620061834,
1815
- "grad_norm": 0.12616612017154694,
1816
- "learning_rate": 7.05942974922707e-06,
1817
- "loss": 1.7025,
1818
- "step": 100000
1819
- },
1820
- {
1821
- "epoch": 34.52421848162144,
1822
- "grad_norm": 1.7043280601501465,
1823
- "learning_rate": 6.844726897973206e-06,
1824
- "loss": 1.7026,
1825
- "step": 100500
1826
- },
1827
- {
1828
- "epoch": 34.69598076262453,
1829
- "grad_norm": 0.2174743413925171,
1830
- "learning_rate": 6.6300240467193404e-06,
1831
- "loss": 1.7026,
1832
- "step": 101000
1833
- },
1834
- {
1835
- "epoch": 34.86774304362762,
1836
- "grad_norm": 0.018968598917126656,
1837
- "learning_rate": 6.415321195465476e-06,
1838
- "loss": 1.7024,
1839
- "step": 101500
1840
- },
1841
- {
1842
- "epoch": 35.0,
1843
- "eval_accuracy": 0.8873152789957984,
1844
- "eval_f1": 0.6722725312634755,
1845
- "eval_loss": 2.0370359420776367,
1846
- "eval_precision": 0.6647620672010915,
1847
- "eval_recall": 0.6799546406140963,
1848
- "eval_runtime": 4.5075,
1849
- "eval_samples_per_second": 645.593,
1850
- "eval_steps_per_second": 80.755,
1851
- "step": 101885
1852
- },
1853
- {
1854
- "epoch": 35.03950532463071,
1855
- "grad_norm": 0.031130915507674217,
1856
- "learning_rate": 6.200618344211612e-06,
1857
- "loss": 1.7028,
1858
- "step": 102000
1859
- },
1860
- {
1861
- "epoch": 35.2112676056338,
1862
- "grad_norm": 0.015709536150097847,
1863
- "learning_rate": 5.9859154929577465e-06,
1864
- "loss": 1.7026,
1865
- "step": 102500
1866
- },
1867
- {
1868
- "epoch": 35.3830298866369,
1869
- "grad_norm": 0.013832608237862587,
1870
- "learning_rate": 5.771212641703882e-06,
1871
- "loss": 1.7023,
1872
- "step": 103000
1873
- },
1874
- {
1875
- "epoch": 35.55479216763999,
1876
- "grad_norm": 0.033963147550821304,
1877
- "learning_rate": 5.556509790450017e-06,
1878
- "loss": 1.7025,
1879
- "step": 103500
1880
- },
1881
- {
1882
- "epoch": 35.72655444864308,
1883
- "grad_norm": 0.010256431065499783,
1884
- "learning_rate": 5.341806939196153e-06,
1885
- "loss": 1.7025,
1886
- "step": 104000
1887
- },
1888
- {
1889
- "epoch": 35.89831672964617,
1890
- "grad_norm": 0.01719123311340809,
1891
- "learning_rate": 5.127104087942288e-06,
1892
- "loss": 1.703,
1893
- "step": 104500
1894
- },
1895
- {
1896
- "epoch": 36.0,
1897
- "eval_accuracy": 0.8867524444037226,
1898
- "eval_f1": 0.6718547341115435,
1899
- "eval_loss": 2.0409553050994873,
1900
- "eval_precision": 0.6660380593176753,
1901
- "eval_recall": 0.6777739009071877,
1902
- "eval_runtime": 4.3391,
1903
- "eval_samples_per_second": 670.65,
1904
- "eval_steps_per_second": 83.889,
1905
- "step": 104796
1906
- },
1907
- {
1908
- "epoch": 36.07007901064926,
1909
- "grad_norm": 0.010260261595249176,
1910
- "learning_rate": 4.912401236688424e-06,
1911
- "loss": 1.7023,
1912
- "step": 105000
1913
- },
1914
- {
1915
- "epoch": 36.24184129165236,
1916
- "grad_norm": 0.016782447695732117,
1917
- "learning_rate": 4.697698385434559e-06,
1918
- "loss": 1.7022,
1919
- "step": 105500
1920
- },
1921
- {
1922
- "epoch": 36.41360357265545,
1923
- "grad_norm": 0.017756333574652672,
1924
- "learning_rate": 4.482995534180694e-06,
1925
- "loss": 1.7027,
1926
- "step": 106000
1927
- },
1928
- {
1929
- "epoch": 36.58536585365854,
1930
- "grad_norm": 0.02108193188905716,
1931
- "learning_rate": 4.26829268292683e-06,
1932
- "loss": 1.7027,
1933
- "step": 106500
1934
- },
1935
- {
1936
- "epoch": 36.75712813466163,
1937
- "grad_norm": 0.023750385269522667,
1938
- "learning_rate": 4.053589831672965e-06,
1939
- "loss": 1.7027,
1940
- "step": 107000
1941
- },
1942
- {
1943
- "epoch": 36.92889041566472,
1944
- "grad_norm": 0.01755833625793457,
1945
- "learning_rate": 3.8388869804191e-06,
1946
- "loss": 1.7023,
1947
- "step": 107500
1948
- },
1949
- {
1950
- "epoch": 37.0,
1951
- "eval_accuracy": 0.8871058521708399,
1952
- "eval_f1": 0.6730462519936204,
1953
- "eval_loss": 2.039201498031616,
1954
- "eval_precision": 0.6652748189177674,
1955
- "eval_recall": 0.6810013956734124,
1956
- "eval_runtime": 4.3606,
1957
- "eval_samples_per_second": 667.332,
1958
- "eval_steps_per_second": 83.474,
1959
- "step": 107707
1960
- },
1961
- {
1962
- "epoch": 37.100652696667815,
1963
- "grad_norm": 0.00990867055952549,
1964
- "learning_rate": 3.6241841291652353e-06,
1965
- "loss": 1.7024,
1966
- "step": 108000
1967
- },
1968
- {
1969
- "epoch": 37.272414977670906,
1970
- "grad_norm": 0.011056340299546719,
1971
- "learning_rate": 3.409481277911371e-06,
1972
- "loss": 1.7023,
1973
- "step": 108500
1974
- },
1975
- {
1976
- "epoch": 37.444177258673996,
1977
- "grad_norm": 0.011399239301681519,
1978
- "learning_rate": 3.194778426657506e-06,
1979
- "loss": 1.7025,
1980
- "step": 109000
1981
- },
1982
- {
1983
- "epoch": 37.615939539677086,
1984
- "grad_norm": 0.11473862081766129,
1985
- "learning_rate": 2.9800755754036418e-06,
1986
- "loss": 1.7024,
1987
- "step": 109500
1988
- },
1989
- {
1990
- "epoch": 37.78770182068018,
1991
- "grad_norm": 0.013430794700980186,
1992
- "learning_rate": 2.7653727241497766e-06,
1993
- "loss": 1.7026,
1994
- "step": 110000
1995
- },
1996
- {
1997
- "epoch": 37.95946410168327,
1998
- "grad_norm": 0.6538777351379395,
1999
- "learning_rate": 2.5506698728959122e-06,
2000
- "loss": 1.7022,
2001
- "step": 110500
2002
- },
2003
- {
2004
- "epoch": 38.0,
2005
- "eval_accuracy": 0.8873545465254781,
2006
- "eval_f1": 0.6726804123711341,
2007
- "eval_loss": 2.0390825271606445,
2008
- "eval_precision": 0.6626607989167231,
2009
- "eval_recall": 0.6830076762037683,
2010
- "eval_runtime": 4.3713,
2011
- "eval_samples_per_second": 665.706,
2012
- "eval_steps_per_second": 83.27,
2013
- "step": 110618
2014
- },
2015
- {
2016
- "epoch": 38.131226382686364,
2017
- "grad_norm": 0.022718122228980064,
2018
- "learning_rate": 2.3359670216420474e-06,
2019
- "loss": 1.7021,
2020
- "step": 111000
2021
- },
2022
- {
2023
- "epoch": 38.302988663689455,
2024
- "grad_norm": 0.00782406609505415,
2025
- "learning_rate": 2.121264170388183e-06,
2026
- "loss": 1.7024,
2027
- "step": 111500
2028
- },
2029
- {
2030
- "epoch": 38.474750944692545,
2031
- "grad_norm": 0.06520986557006836,
2032
- "learning_rate": 1.906561319134318e-06,
2033
- "loss": 1.7025,
2034
- "step": 112000
2035
- },
2036
- {
2037
- "epoch": 38.646513225695635,
2038
- "grad_norm": 0.011009753681719303,
2039
- "learning_rate": 1.6918584678804535e-06,
2040
- "loss": 1.7024,
2041
- "step": 112500
2042
- },
2043
- {
2044
- "epoch": 38.818275506698726,
2045
- "grad_norm": 0.01437163446098566,
2046
- "learning_rate": 1.477155616626589e-06,
2047
- "loss": 1.7024,
2048
- "step": 113000
2049
- },
2050
- {
2051
- "epoch": 38.99003778770182,
2052
- "grad_norm": 0.020153211429715157,
2053
- "learning_rate": 1.2624527653727242e-06,
2054
- "loss": 1.7022,
2055
- "step": 113500
2056
- },
2057
- {
2058
- "epoch": 39.0,
2059
- "eval_accuracy": 0.8869356928755612,
2060
- "eval_f1": 0.6716757545833154,
2061
- "eval_loss": 2.039407968521118,
2062
- "eval_precision": 0.6613680561427243,
2063
- "eval_recall": 0.6823098394975575,
2064
- "eval_runtime": 4.3449,
2065
- "eval_samples_per_second": 669.757,
2066
- "eval_steps_per_second": 83.777,
2067
- "step": 113529
2068
- },
2069
- {
2070
- "epoch": 39.16180006870491,
2071
- "grad_norm": 0.020474748685956,
2072
- "learning_rate": 1.0477499141188596e-06,
2073
- "loss": 1.7027,
2074
- "step": 114000
2075
- },
2076
- {
2077
- "epoch": 39.333562349708004,
2078
- "grad_norm": 0.010198526084423065,
2079
- "learning_rate": 8.330470628649948e-07,
2080
- "loss": 1.7023,
2081
- "step": 114500
2082
- },
2083
- {
2084
- "epoch": 39.505324630711094,
2085
- "grad_norm": 0.02069229632616043,
2086
- "learning_rate": 6.183442116111302e-07,
2087
- "loss": 1.7022,
2088
- "step": 115000
2089
- },
2090
- {
2091
- "epoch": 39.677086911714184,
2092
- "grad_norm": 0.04512259364128113,
2093
- "learning_rate": 4.0364136035726557e-07,
2094
- "loss": 1.7024,
2095
- "step": 115500
2096
- },
2097
- {
2098
- "epoch": 39.84884919271728,
2099
- "grad_norm": 0.02132527157664299,
2100
- "learning_rate": 1.889385091034009e-07,
2101
- "loss": 1.7023,
2102
- "step": 116000
2103
- },
2104
- {
2105
- "epoch": 40.0,
2106
- "eval_accuracy": 0.8874069032317177,
2107
- "eval_f1": 0.6728642134710566,
2108
- "eval_loss": 2.0384275913238525,
2109
- "eval_precision": 0.6640897120040778,
2110
- "eval_recall": 0.6818736915561758,
2111
- "eval_runtime": 4.3494,
2112
- "eval_samples_per_second": 669.052,
2113
- "eval_steps_per_second": 83.689,
2114
- "step": 116440
2115
- },
2116
- {
2117
- "epoch": 40.0,
2118
- "step": 116440,
2119
- "total_flos": 3.172672952125471e+16,
2120
- "train_loss": 1.7340915796560996,
2121
- "train_runtime": 6267.5858,
2122
- "train_samples_per_second": 148.606,
2123
- "train_steps_per_second": 18.578
2124
  }
2125
  ],
2126
  "logging_steps": 500,
2127
- "max_steps": 116440,
2128
  "num_input_tokens_seen": 0,
2129
- "num_train_epochs": 40,
2130
  "save_steps": 500,
2131
  "stateful_callbacks": {
2132
  "TrainerControl": {
@@ -2140,7 +825,7 @@
2140
  "attributes": {}
2141
  }
2142
  },
2143
- "total_flos": 3.172672952125471e+16,
2144
  "train_batch_size": 8,
2145
  "trial_name": null,
2146
  "trial_params": null
 
1
  {
2
  "best_global_step": 5822,
3
+ "best_metric": 1.5156257152557373,
4
  "best_model_checkpoint": "./output/bert-base-medmentions/checkpoint-5822",
5
+ "epoch": 15.0,
6
  "eval_steps": 500,
7
+ "global_step": 43665,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.1717622810030917,
14
+ "grad_norm": 1.9404675960540771,
15
+ "learning_rate": 4.9427459063323026e-05,
16
+ "loss": 1.7239,
17
  "step": 500
18
  },
19
  {
20
  "epoch": 0.3435245620061834,
21
+ "grad_norm": 1.6719824075698853,
22
+ "learning_rate": 4.885491812664606e-05,
23
+ "loss": 1.6163,
24
  "step": 1000
25
  },
26
  {
27
  "epoch": 0.5152868430092752,
28
+ "grad_norm": 1.837388515472412,
29
+ "learning_rate": 4.828237718996909e-05,
30
+ "loss": 1.5924,
31
  "step": 1500
32
  },
33
  {
34
  "epoch": 0.6870491240123668,
35
+ "grad_norm": 2.1180434226989746,
36
+ "learning_rate": 4.770983625329211e-05,
37
+ "loss": 1.5717,
38
  "step": 2000
39
  },
40
  {
41
  "epoch": 0.8588114050154586,
42
+ "grad_norm": 1.800995945930481,
43
+ "learning_rate": 4.7137295316615135e-05,
44
+ "loss": 1.5686,
45
  "step": 2500
46
  },
47
  {
48
  "epoch": 1.0,
49
+ "eval_accuracy": 0.8549719237162791,
50
+ "eval_f1": 0.5650458863307036,
51
+ "eval_loss": 1.5439889430999756,
52
+ "eval_precision": 0.5245889387144993,
53
+ "eval_recall": 0.6122644801116539,
54
+ "eval_runtime": 4.1137,
55
+ "eval_samples_per_second": 707.39,
56
+ "eval_steps_per_second": 88.485,
57
  "step": 2911
58
  },
59
  {
60
  "epoch": 1.0305736860185504,
61
+ "grad_norm": 1.0917384624481201,
62
+ "learning_rate": 4.6564754379938166e-05,
63
+ "loss": 1.5484,
64
  "step": 3000
65
  },
66
  {
67
  "epoch": 1.202335967021642,
68
+ "grad_norm": 1.0721220970153809,
69
+ "learning_rate": 4.59922134432612e-05,
70
+ "loss": 1.4824,
71
  "step": 3500
72
  },
73
  {
74
  "epoch": 1.3740982480247337,
75
+ "grad_norm": 1.5045437812805176,
76
+ "learning_rate": 4.541967250658422e-05,
77
+ "loss": 1.4894,
78
  "step": 4000
79
  },
80
  {
81
  "epoch": 1.5458605290278253,
82
+ "grad_norm": 3.3368799686431885,
83
+ "learning_rate": 4.484713156990725e-05,
84
+ "loss": 1.4863,
85
  "step": 4500
86
  },
87
  {
88
  "epoch": 1.7176228100309172,
89
+ "grad_norm": 2.191357374191284,
90
+ "learning_rate": 4.427459063323028e-05,
91
+ "loss": 1.4806,
92
  "step": 5000
93
  },
94
  {
95
  "epoch": 1.889385091034009,
96
+ "grad_norm": 2.575375556945801,
97
+ "learning_rate": 4.3702049696553306e-05,
98
+ "loss": 1.4792,
99
  "step": 5500
100
  },
101
  {
102
  "epoch": 2.0,
103
+ "eval_accuracy": 0.8688595400463357,
104
+ "eval_f1": 0.6071204975165909,
105
+ "eval_loss": 1.5156257152557373,
106
+ "eval_precision": 0.5820728291316527,
107
+ "eval_recall": 0.6344207955338451,
108
+ "eval_runtime": 4.3513,
109
+ "eval_samples_per_second": 668.77,
110
+ "eval_steps_per_second": 83.654,
111
  "step": 5822
112
  },
113
  {
114
  "epoch": 2.0611473720371007,
115
+ "grad_norm": 4.3115081787109375,
116
+ "learning_rate": 4.3129508759876336e-05,
117
+ "loss": 1.4566,
118
  "step": 6000
119
  },
120
  {
121
  "epoch": 2.2329096530401924,
122
+ "grad_norm": 1.5031124353408813,
123
+ "learning_rate": 4.255696782319936e-05,
124
+ "loss": 1.4053,
125
  "step": 6500
126
  },
127
  {
128
  "epoch": 2.404671934043284,
129
+ "grad_norm": 2.4502875804901123,
130
+ "learning_rate": 4.198442688652239e-05,
131
+ "loss": 1.4111,
132
  "step": 7000
133
  },
134
  {
135
  "epoch": 2.5764342150463757,
136
+ "grad_norm": 1.5572278499603271,
137
+ "learning_rate": 4.1411885949845415e-05,
138
+ "loss": 1.4096,
139
  "step": 7500
140
  },
141
  {
142
  "epoch": 2.7481964960494674,
143
+ "grad_norm": 1.3663930892944336,
144
+ "learning_rate": 4.0839345013168445e-05,
145
+ "loss": 1.4097,
146
  "step": 8000
147
  },
148
  {
149
  "epoch": 2.9199587770525595,
150
+ "grad_norm": 3.3840882778167725,
151
+ "learning_rate": 4.026680407649147e-05,
152
+ "loss": 1.4111,
153
  "step": 8500
154
  },
155
  {
156
  "epoch": 3.0,
157
+ "eval_accuracy": 0.8713595727692771,
158
+ "eval_f1": 0.6163334575106585,
159
+ "eval_loss": 1.519059419631958,
160
+ "eval_precision": 0.5864513588026782,
161
+ "eval_recall": 0.6494242847173761,
162
+ "eval_runtime": 4.33,
163
+ "eval_samples_per_second": 672.06,
164
+ "eval_steps_per_second": 84.065,
165
  "step": 8733
166
  },
167
  {
168
  "epoch": 3.091721058055651,
169
+ "grad_norm": 1.4997501373291016,
170
+ "learning_rate": 3.96942631398145e-05,
171
+ "loss": 1.3798,
172
  "step": 9000
173
  },
174
  {
175
  "epoch": 3.2634833390587428,
176
+ "grad_norm": 1.4717656373977661,
177
+ "learning_rate": 3.912172220313753e-05,
178
+ "loss": 1.3517,
179
  "step": 9500
180
  },
181
  {
182
  "epoch": 3.4352456200618344,
183
+ "grad_norm": 2.0151214599609375,
184
+ "learning_rate": 3.8549181266460554e-05,
185
+ "loss": 1.3532,
186
  "step": 10000
187
  },
188
  {
189
  "epoch": 3.607007901064926,
190
+ "grad_norm": 2.7060940265655518,
191
+ "learning_rate": 3.797664032978358e-05,
192
+ "loss": 1.3546,
193
  "step": 10500
194
  },
195
  {
196
  "epoch": 3.7787701820680177,
197
+ "grad_norm": 3.2001101970672607,
198
+ "learning_rate": 3.740409939310661e-05,
199
+ "loss": 1.355,
200
  "step": 11000
201
  },
202
  {
203
  "epoch": 3.9505324630711094,
204
+ "grad_norm": 2.374141216278076,
205
+ "learning_rate": 3.683155845642964e-05,
206
+ "loss": 1.356,
207
  "step": 11500
208
  },
209
  {
210
  "epoch": 4.0,
211
+ "eval_accuracy": 0.8777209125773897,
212
+ "eval_f1": 0.6318054658919734,
213
+ "eval_loss": 1.5293220281600952,
214
+ "eval_precision": 0.6235663919802905,
215
+ "eval_recall": 0.6402651779483601,
216
+ "eval_runtime": 4.3262,
217
+ "eval_samples_per_second": 672.648,
218
+ "eval_steps_per_second": 84.139,
219
  "step": 11644
220
  },
221
  {
222
  "epoch": 4.1222947440742015,
223
+ "grad_norm": 2.5122597217559814,
224
+ "learning_rate": 3.6259017519752663e-05,
225
+ "loss": 1.3298,
226
  "step": 12000
227
  },
228
  {
229
  "epoch": 4.294057025077293,
230
+ "grad_norm": 1.870731234550476,
231
+ "learning_rate": 3.568647658307569e-05,
232
+ "loss": 1.3148,
233
  "step": 12500
234
  },
235
  {
236
  "epoch": 4.465819306080385,
237
+ "grad_norm": 2.1130025386810303,
238
+ "learning_rate": 3.511393564639872e-05,
239
+ "loss": 1.3147,
240
  "step": 13000
241
  },
242
  {
243
  "epoch": 4.637581587083476,
244
+ "grad_norm": 1.9891668558120728,
245
+ "learning_rate": 3.454139470972175e-05,
246
+ "loss": 1.3176,
247
  "step": 13500
248
  },
249
  {
250
  "epoch": 4.809343868086568,
251
+ "grad_norm": 2.144550085067749,
252
+ "learning_rate": 3.396885377304477e-05,
253
+ "loss": 1.3207,
254
  "step": 14000
255
  },
256
  {
257
  "epoch": 4.98110614908966,
258
+ "grad_norm": 1.166013240814209,
259
+ "learning_rate": 3.33963128363678e-05,
260
+ "loss": 1.3182,
261
  "step": 14500
262
  },
263
  {
264
  "epoch": 5.0,
265
+ "eval_accuracy": 0.8789120276443408,
266
+ "eval_f1": 0.6353600689952565,
267
+ "eval_loss": 1.543265461921692,
268
+ "eval_precision": 0.6282619819205185,
269
+ "eval_recall": 0.6426203768318214,
270
+ "eval_runtime": 4.3312,
271
+ "eval_samples_per_second": 671.862,
272
+ "eval_steps_per_second": 84.041,
273
  "step": 14555
274
  },
275
  {
276
  "epoch": 5.152868430092751,
277
+ "grad_norm": 1.4789066314697266,
278
+ "learning_rate": 3.2823771899690834e-05,
279
+ "loss": 1.2897,
280
  "step": 15000
281
  },
282
  {
283
  "epoch": 5.3246307110958435,
284
+ "grad_norm": 2.067422866821289,
285
+ "learning_rate": 3.225123096301386e-05,
286
+ "loss": 1.2908,
287
  "step": 15500
288
  },
289
  {
290
  "epoch": 5.496392992098935,
291
+ "grad_norm": 1.4258556365966797,
292
+ "learning_rate": 3.167869002633688e-05,
293
+ "loss": 1.2946,
294
  "step": 16000
295
  },
296
  {
297
  "epoch": 5.668155273102027,
298
+ "grad_norm": 1.9385099411010742,
299
+ "learning_rate": 3.110614908965991e-05,
300
+ "loss": 1.2914,
301
  "step": 16500
302
  },
303
  {
304
  "epoch": 5.839917554105119,
305
+ "grad_norm": 2.5622832775115967,
306
+ "learning_rate": 3.053360815298294e-05,
307
+ "loss": 1.2919,
308
  "step": 17000
309
  },
310
  {
311
  "epoch": 6.0,
312
+ "eval_accuracy": 0.8793701488239375,
313
+ "eval_f1": 0.6428903837204383,
314
+ "eval_loss": 1.5671014785766602,
315
+ "eval_precision": 0.6241682411895177,
316
+ "eval_recall": 0.6627704117236567,
317
+ "eval_runtime": 4.523,
318
+ "eval_samples_per_second": 643.376,
319
+ "eval_steps_per_second": 80.477,
320
  "step": 17466
321
  },
322
  {
323
  "epoch": 6.01167983510821,
324
+ "grad_norm": 1.9045183658599854,
325
+ "learning_rate": 2.9961067216305967e-05,
326
+ "loss": 1.2899,
327
  "step": 17500
328
  },
329
  {
330
  "epoch": 6.183442116111302,
331
+ "grad_norm": 5.356103420257568,
332
+ "learning_rate": 2.9388526279628997e-05,
333
+ "loss": 1.2716,
334
  "step": 18000
335
  },
336
  {
337
  "epoch": 6.3552043971143934,
338
+ "grad_norm": 1.4505314826965332,
339
+ "learning_rate": 2.8815985342952025e-05,
340
+ "loss": 1.2736,
341
  "step": 18500
342
  },
343
  {
344
  "epoch": 6.5269666781174855,
345
+ "grad_norm": 2.0673441886901855,
346
+ "learning_rate": 2.824344440627505e-05,
347
+ "loss": 1.2755,
348
  "step": 19000
349
  },
350
  {
351
  "epoch": 6.698728959120577,
352
+ "grad_norm": 7.07130765914917,
353
+ "learning_rate": 2.7670903469598076e-05,
354
+ "loss": 1.2739,
355
  "step": 19500
356
  },
357
  {
358
  "epoch": 6.870491240123669,
359
+ "grad_norm": 2.692305326461792,
360
+ "learning_rate": 2.7098362532921106e-05,
361
+ "loss": 1.2743,
362
  "step": 20000
363
  },
364
  {
365
  "epoch": 7.0,
366
+ "eval_accuracy": 0.8809277608345659,
367
+ "eval_f1": 0.6462844646455984,
368
+ "eval_loss": 1.5696512460708618,
369
+ "eval_precision": 0.6355739225773804,
370
+ "eval_recall": 0.6573621772505234,
371
+ "eval_runtime": 4.3283,
372
+ "eval_samples_per_second": 672.312,
373
+ "eval_steps_per_second": 84.097,
374
  "step": 20377
375
  },
376
  {
377
  "epoch": 7.042253521126761,
378
+ "grad_norm": 0.9224966764450073,
379
+ "learning_rate": 2.6525821596244134e-05,
380
+ "loss": 1.2716,
381
  "step": 20500
382
  },
383
  {
384
  "epoch": 7.214015802129852,
385
+ "grad_norm": 2.72609543800354,
386
+ "learning_rate": 2.5953280659567158e-05,
387
+ "loss": 1.2578,
388
  "step": 21000
389
  },
390
  {
391
  "epoch": 7.385778083132944,
392
+ "grad_norm": 1.7019892930984497,
393
+ "learning_rate": 2.538073972289019e-05,
394
+ "loss": 1.2612,
395
  "step": 21500
396
  },
397
  {
398
  "epoch": 7.5575403641360355,
399
+ "grad_norm": 4.017130374908447,
400
+ "learning_rate": 2.4808198786213216e-05,
401
+ "loss": 1.2622,
402
  "step": 22000
403
  },
404
  {
405
  "epoch": 7.729302645139128,
406
+ "grad_norm": 6.522401332855225,
407
+ "learning_rate": 2.4235657849536243e-05,
408
+ "loss": 1.2611,
409
  "step": 22500
410
  },
411
  {
412
  "epoch": 7.901064926142219,
413
+ "grad_norm": 3.355700731277466,
414
+ "learning_rate": 2.366311691285927e-05,
415
+ "loss": 1.2633,
416
  "step": 23000
417
  },
418
  {
419
  "epoch": 8.0,
420
+ "eval_accuracy": 0.8812811686016833,
421
+ "eval_f1": 0.6527559389741191,
422
+ "eval_loss": 1.5806214809417725,
423
+ "eval_precision": 0.636446507002569,
424
+ "eval_recall": 0.6699232379623168,
425
+ "eval_runtime": 4.329,
426
+ "eval_samples_per_second": 672.206,
427
+ "eval_steps_per_second": 84.084,
428
  "step": 23288
429
  },
430
  {
431
  "epoch": 8.07282720714531,
432
+ "grad_norm": 0.5228517651557922,
433
+ "learning_rate": 2.30905759761823e-05,
434
+ "loss": 1.2579,
435
  "step": 23500
436
  },
437
  {
438
  "epoch": 8.244589488148403,
439
+ "grad_norm": 0.2985314726829529,
440
+ "learning_rate": 2.2518035039505325e-05,
441
+ "loss": 1.2527,
442
  "step": 24000
443
  },
444
  {
445
  "epoch": 8.416351769151495,
446
+ "grad_norm": 1.963086724281311,
447
+ "learning_rate": 2.1945494102828355e-05,
448
+ "loss": 1.2542,
449
  "step": 24500
450
  },
451
  {
452
  "epoch": 8.588114050154585,
453
+ "grad_norm": 0.8812742233276367,
454
+ "learning_rate": 2.137295316615138e-05,
455
+ "loss": 1.251,
456
  "step": 25000
457
  },
458
  {
459
  "epoch": 8.759876331157677,
460
+ "grad_norm": 0.7020455002784729,
461
+ "learning_rate": 2.080041222947441e-05,
462
+ "loss": 1.2516,
463
  "step": 25500
464
  },
465
  {
466
  "epoch": 8.93163861216077,
467
+ "grad_norm": 2.0791664123535156,
468
+ "learning_rate": 2.0227871292797437e-05,
469
+ "loss": 1.2542,
470
  "step": 26000
471
  },
472
  {
473
  "epoch": 9.0,
474
+ "eval_accuracy": 0.880783779892407,
475
+ "eval_f1": 0.6498043011657758,
476
+ "eval_loss": 1.594204068183899,
477
+ "eval_precision": 0.6277953972513621,
478
+ "eval_recall": 0.6734124214933705,
479
+ "eval_runtime": 4.3509,
480
+ "eval_samples_per_second": 668.824,
481
+ "eval_steps_per_second": 83.66,
482
  "step": 26199
483
  },
484
  {
485
  "epoch": 9.103400893163862,
486
+ "grad_norm": 2.5551717281341553,
487
+ "learning_rate": 1.9655330356120464e-05,
488
+ "loss": 1.2496,
489
  "step": 26500
490
  },
491
  {
492
  "epoch": 9.275163174166954,
493
+ "grad_norm": 3.963749647140503,
494
+ "learning_rate": 1.908278941944349e-05,
495
+ "loss": 1.2453,
496
  "step": 27000
497
  },
498
  {
499
  "epoch": 9.446925455170044,
500
+ "grad_norm": 0.5859785676002502,
501
+ "learning_rate": 1.851024848276652e-05,
502
+ "loss": 1.2463,
503
  "step": 27500
504
  },
505
  {
506
  "epoch": 9.618687736173136,
507
+ "grad_norm": 0.3447531759738922,
508
+ "learning_rate": 1.7937707546089546e-05,
509
+ "loss": 1.2477,
510
  "step": 28000
511
  },
512
  {
513
  "epoch": 9.790450017176228,
514
+ "grad_norm": 0.3794388175010681,
515
+ "learning_rate": 1.7365166609412573e-05,
516
+ "loss": 1.2468,
517
  "step": 28500
518
  },
519
  {
520
  "epoch": 9.96221229817932,
521
+ "grad_norm": 1.6076109409332275,
522
+ "learning_rate": 1.67926256727356e-05,
523
+ "loss": 1.2457,
524
  "step": 29000
525
  },
526
  {
527
  "epoch": 10.0,
528
+ "eval_accuracy": 0.8814251495438422,
529
+ "eval_f1": 0.6500277789649131,
530
+ "eval_loss": 1.607577919960022,
531
+ "eval_precision": 0.6372015081692501,
532
+ "eval_recall": 0.6633810188415911,
533
+ "eval_runtime": 4.3517,
534
+ "eval_samples_per_second": 668.699,
535
+ "eval_steps_per_second": 83.645,
536
  "step": 29110
537
  },
538
  {
539
  "epoch": 10.13397457918241,
540
+ "grad_norm": 1.843865990638733,
541
+ "learning_rate": 1.622008473605863e-05,
542
+ "loss": 1.2411,
543
  "step": 29500
544
  },
545
  {
546
  "epoch": 10.305736860185503,
547
+ "grad_norm": 2.924731731414795,
548
+ "learning_rate": 1.5647543799381655e-05,
549
+ "loss": 1.2402,
550
  "step": 30000
551
  },
552
  {
553
  "epoch": 10.477499141188595,
554
+ "grad_norm": 0.40096113085746765,
555
+ "learning_rate": 1.5075002862704684e-05,
556
+ "loss": 1.2416,
557
  "step": 30500
558
  },
559
  {
560
  "epoch": 10.649261422191687,
561
+ "grad_norm": 1.3067082166671753,
562
+ "learning_rate": 1.4502461926027711e-05,
563
+ "loss": 1.2422,
564
  "step": 31000
565
  },
566
  {
567
  "epoch": 10.82102370319478,
568
+ "grad_norm": 1.1540168523788452,
569
+ "learning_rate": 1.3929920989350739e-05,
570
+ "loss": 1.2406,
571
  "step": 31500
572
  },
573
  {
574
  "epoch": 10.99278598419787,
575
+ "grad_norm": 2.354355812072754,
576
+ "learning_rate": 1.3357380052673768e-05,
577
+ "loss": 1.2398,
578
  "step": 32000
579
  },
580
  {
581
  "epoch": 11.0,
582
+ "eval_accuracy": 0.8834932394403068,
583
+ "eval_f1": 0.6551724137931034,
584
+ "eval_loss": 1.6077239513397217,
585
+ "eval_precision": 0.6413770053475936,
586
+ "eval_recall": 0.6695743196092114,
587
+ "eval_runtime": 4.5309,
588
+ "eval_samples_per_second": 642.254,
589
+ "eval_steps_per_second": 80.337,
590
  "step": 32021
591
  },
592
  {
593
  "epoch": 11.164548265200962,
594
+ "grad_norm": 2.941948413848877,
595
+ "learning_rate": 1.2784839115996793e-05,
596
+ "loss": 1.2384,
597
  "step": 32500
598
  },
599
  {
600
  "epoch": 11.336310546204054,
601
+ "grad_norm": 0.06978488713502884,
602
+ "learning_rate": 1.2212298179319822e-05,
603
+ "loss": 1.238,
604
  "step": 33000
605
  },
606
  {
607
  "epoch": 11.508072827207146,
608
+ "grad_norm": 1.1202852725982666,
609
+ "learning_rate": 1.163975724264285e-05,
610
+ "loss": 1.2375,
611
  "step": 33500
612
  },
613
  {
614
  "epoch": 11.679835108210238,
615
+ "grad_norm": 3.7290749549865723,
616
+ "learning_rate": 1.1067216305965877e-05,
617
+ "loss": 1.2373,
618
  "step": 34000
619
  },
620
  {
621
  "epoch": 11.851597389213328,
622
+ "grad_norm": 0.5790780782699585,
623
+ "learning_rate": 1.0494675369288904e-05,
624
+ "loss": 1.2377,
625
  "step": 34500
626
  },
627
  {
628
  "epoch": 12.0,
629
+ "eval_accuracy": 0.8846974436838179,
630
+ "eval_f1": 0.6615437158469945,
631
+ "eval_loss": 1.6134886741638184,
632
+ "eval_precision": 0.6478260869565218,
633
+ "eval_recall": 0.6758548499651081,
634
+ "eval_runtime": 4.3276,
635
+ "eval_samples_per_second": 672.43,
636
+ "eval_steps_per_second": 84.112,
637
  "step": 34932
638
  },
639
  {
640
  "epoch": 12.02335967021642,
641
+ "grad_norm": 0.04744827747344971,
642
+ "learning_rate": 9.922134432611933e-06,
643
+ "loss": 1.2362,
644
  "step": 35000
645
  },
646
  {
647
  "epoch": 12.195121951219512,
648
+ "grad_norm": 0.07846707850694656,
649
+ "learning_rate": 9.34959349593496e-06,
650
+ "loss": 1.235,
651
  "step": 35500
652
  },
653
  {
654
  "epoch": 12.366884232222604,
655
+ "grad_norm": 3.9505062103271484,
656
+ "learning_rate": 8.777052559257987e-06,
657
+ "loss": 1.2345,
658
  "step": 36000
659
  },
660
  {
661
  "epoch": 12.538646513225697,
662
+ "grad_norm": 0.13419800996780396,
663
+ "learning_rate": 8.204511622581015e-06,
664
+ "loss": 1.2362,
665
  "step": 36500
666
  },
667
  {
668
  "epoch": 12.710408794228787,
669
+ "grad_norm": 0.205936998128891,
670
+ "learning_rate": 7.631970685904042e-06,
671
+ "loss": 1.2341,
672
  "step": 37000
673
  },
674
  {
675
  "epoch": 12.882171075231879,
676
+ "grad_norm": 1.917006254196167,
677
+ "learning_rate": 7.05942974922707e-06,
678
+ "loss": 1.2349,
679
  "step": 37500
680
  },
681
  {
682
  "epoch": 13.0,
683
+ "eval_accuracy": 0.883872825560544,
684
+ "eval_f1": 0.6590367597004765,
685
+ "eval_loss": 1.619519829750061,
686
+ "eval_precision": 0.6432724252491694,
687
+ "eval_recall": 0.6755931612002791,
688
+ "eval_runtime": 4.3511,
689
+ "eval_samples_per_second": 668.801,
690
+ "eval_steps_per_second": 83.658,
691
  "step": 37843
692
  },
693
  {
694
  "epoch": 13.053933356234971,
695
+ "grad_norm": 0.5815674662590027,
696
+ "learning_rate": 6.486888812550097e-06,
697
+ "loss": 1.2342,
698
  "step": 38000
699
  },
700
  {
701
  "epoch": 13.225695637238063,
702
+ "grad_norm": 0.48151713609695435,
703
+ "learning_rate": 5.914347875873125e-06,
704
+ "loss": 1.2335,
705
  "step": 38500
706
  },
707
  {
708
  "epoch": 13.397457918241154,
709
+ "grad_norm": 4.141974925994873,
710
+ "learning_rate": 5.341806939196153e-06,
711
+ "loss": 1.2335,
712
  "step": 39000
713
  },
714
  {
715
  "epoch": 13.569220199244246,
716
+ "grad_norm": 0.24046790599822998,
717
+ "learning_rate": 4.76926600251918e-06,
718
+ "loss": 1.2331,
719
  "step": 39500
720
  },
721
  {
722
  "epoch": 13.740982480247338,
723
+ "grad_norm": 0.08363146334886551,
724
+ "learning_rate": 4.196725065842208e-06,
725
+ "loss": 1.233,
726
  "step": 40000
727
  },
728
  {
729
  "epoch": 13.91274476125043,
730
+ "grad_norm": 0.5658828616142273,
731
+ "learning_rate": 3.6241841291652353e-06,
732
+ "loss": 1.2328,
733
  "step": 40500
734
  },
735
  {
736
  "epoch": 14.0,
737
+ "eval_accuracy": 0.8845272843885391,
738
+ "eval_f1": 0.6591725081210464,
739
+ "eval_loss": 1.6228290796279907,
740
+ "eval_precision": 0.6462453905464298,
741
+ "eval_recall": 0.6726273551988835,
742
+ "eval_runtime": 4.3411,
743
+ "eval_samples_per_second": 670.343,
744
+ "eval_steps_per_second": 83.85,
745
  "step": 40754
746
  },
747
  {
748
  "epoch": 14.084507042253522,
749
+ "grad_norm": 1.4916341304779053,
750
+ "learning_rate": 3.051643192488263e-06,
751
+ "loss": 1.2318,
752
  "step": 41000
753
  },
754
  {
755
  "epoch": 14.256269323256612,
756
+ "grad_norm": 0.1434667557477951,
757
+ "learning_rate": 2.4791022558112906e-06,
758
+ "loss": 1.2305,
759
  "step": 41500
760
  },
761
  {
762
  "epoch": 14.428031604259704,
763
+ "grad_norm": 0.10652283579111099,
764
+ "learning_rate": 1.906561319134318e-06,
765
+ "loss": 1.232,
766
  "step": 42000
767
  },
768
  {
769
  "epoch": 14.599793885262796,
770
+ "grad_norm": 0.8040905594825745,
771
+ "learning_rate": 1.3340203824573458e-06,
772
+ "loss": 1.2321,
773
  "step": 42500
774
  },
775
  {
776
  "epoch": 14.771556166265889,
777
+ "grad_norm": 0.040788378566503525,
778
+ "learning_rate": 7.614794457803733e-07,
779
+ "loss": 1.2319,
780
  "step": 43000
781
  },
782
  {
783
  "epoch": 14.943318447268979,
784
+ "grad_norm": 0.5179036259651184,
785
+ "learning_rate": 1.889385091034009e-07,
786
+ "loss": 1.231,
787
  "step": 43500
788
  },
789
  {
790
  "epoch": 15.0,
791
+ "eval_accuracy": 0.8847236220369377,
792
+ "eval_f1": 0.660140218878249,
793
+ "eval_loss": 1.6247130632400513,
794
+ "eval_precision": 0.6473004694835681,
795
+ "eval_recall": 0.6734996510816469,
796
+ "eval_runtime": 4.3445,
797
+ "eval_samples_per_second": 669.806,
798
+ "eval_steps_per_second": 83.783,
799
  "step": 43665
800
  },
801
  {
802
+ "epoch": 15.0,
803
+ "step": 43665,
804
+ "total_flos": 1.1901430945516224e+16,
805
+ "train_loss": 1.311661433704009,
806
+ "train_runtime": 2299.911,
807
+ "train_samples_per_second": 151.865,
808
+ "train_steps_per_second": 18.986
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
809
  }
810
  ],
811
  "logging_steps": 500,
812
+ "max_steps": 43665,
813
  "num_input_tokens_seen": 0,
814
+ "num_train_epochs": 15,
815
  "save_steps": 500,
816
  "stateful_callbacks": {
817
  "TrainerControl": {
 
825
  "attributes": {}
826
  }
827
  },
828
+ "total_flos": 1.1901430945516224e+16,
829
  "train_batch_size": 8,
830
  "trial_name": null,
831
  "trial_params": null