hts98 commited on
Commit
741cec3
·
verified ·
1 Parent(s): 7230933

End of training

Browse files
README.md CHANGED
@@ -18,8 +18,8 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  This model is a fine-tuned version of [vinai/phobert-large](https://huggingface.co/vinai/phobert-large) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.8918
22
- - F1: 0.6410
23
 
24
  ## Model description
25
 
 
18
 
19
  This model is a fine-tuned version of [vinai/phobert-large](https://huggingface.co/vinai/phobert-large) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.2752
22
+ - F1: 0.9361
23
 
24
  ## Model description
25
 
all_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 40.0,
3
+ "eval_f1": 0.9360692350642099,
4
+ "eval_loss": 0.2751626968383789,
5
+ "eval_runtime": 29.5871,
6
+ "eval_samples": 3582,
7
+ "eval_samples_per_second": 121.066,
8
+ "eval_steps_per_second": 5.07,
9
+ "total_flos": 2.3084949799922688e+17,
10
+ "train_loss": 0.688100525762774,
11
+ "train_runtime": 30496.5649,
12
+ "train_samples": 24771,
13
+ "train_samples_per_second": 32.49,
14
+ "train_steps_per_second": 1.355
15
+ }
eval_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 40.0,
3
+ "eval_f1": 0.9360692350642099,
4
+ "eval_loss": 0.2751626968383789,
5
+ "eval_runtime": 29.5871,
6
+ "eval_samples": 3582,
7
+ "eval_samples_per_second": 121.066,
8
+ "eval_steps_per_second": 5.07
9
+ }
predict_results.txt ADDED
The diff for this file is too large to render. See raw diff
 
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 40.0,
3
+ "total_flos": 2.3084949799922688e+17,
4
+ "train_loss": 0.688100525762774,
5
+ "train_runtime": 30496.5649,
6
+ "train_samples": 24771,
7
+ "train_samples_per_second": 32.49,
8
+ "train_steps_per_second": 1.355
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,976 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.9360692350642099,
3
+ "best_model_checkpoint": "/tmp/classification_phobertlarge/checkpoint-6198",
4
+ "epoch": 40.0,
5
+ "eval_steps": 500,
6
+ "global_step": 41320,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.484027105517909,
13
+ "grad_norm": 107.85002136230469,
14
+ "learning_rate": 2.9636979670861567e-05,
15
+ "loss": 0.4143,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.968054211035818,
20
+ "grad_norm": 15.951617240905762,
21
+ "learning_rate": 2.927395934172314e-05,
22
+ "loss": 0.3094,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 1.0,
27
+ "eval_f1": 0.9327191513121161,
28
+ "eval_loss": 0.22949737310409546,
29
+ "eval_runtime": 30.2358,
30
+ "eval_samples_per_second": 118.469,
31
+ "eval_steps_per_second": 4.961,
32
+ "step": 1033
33
+ },
34
+ {
35
+ "epoch": 1.452081316553727,
36
+ "grad_norm": 0.18993481993675232,
37
+ "learning_rate": 2.8910939012584705e-05,
38
+ "loss": 0.2335,
39
+ "step": 1500
40
+ },
41
+ {
42
+ "epoch": 1.936108422071636,
43
+ "grad_norm": 2.2120251655578613,
44
+ "learning_rate": 2.8547918683446275e-05,
45
+ "loss": 0.2346,
46
+ "step": 2000
47
+ },
48
+ {
49
+ "epoch": 2.0,
50
+ "eval_f1": 0.9279731993299832,
51
+ "eval_loss": 0.2650923430919647,
52
+ "eval_runtime": 30.2762,
53
+ "eval_samples_per_second": 118.311,
54
+ "eval_steps_per_second": 4.954,
55
+ "step": 2066
56
+ },
57
+ {
58
+ "epoch": 2.420135527589545,
59
+ "grad_norm": 1.64552640914917,
60
+ "learning_rate": 2.818489835430784e-05,
61
+ "loss": 0.1957,
62
+ "step": 2500
63
+ },
64
+ {
65
+ "epoch": 2.904162633107454,
66
+ "grad_norm": 1.3047701120376587,
67
+ "learning_rate": 2.7821878025169413e-05,
68
+ "loss": 0.2359,
69
+ "step": 3000
70
+ },
71
+ {
72
+ "epoch": 3.0,
73
+ "eval_f1": 0.9338358458961474,
74
+ "eval_loss": 0.2357342690229416,
75
+ "eval_runtime": 30.2251,
76
+ "eval_samples_per_second": 118.511,
77
+ "eval_steps_per_second": 4.963,
78
+ "step": 3099
79
+ },
80
+ {
81
+ "epoch": 3.388189738625363,
82
+ "grad_norm": 21.052520751953125,
83
+ "learning_rate": 2.745885769603098e-05,
84
+ "loss": 0.179,
85
+ "step": 3500
86
+ },
87
+ {
88
+ "epoch": 3.872216844143272,
89
+ "grad_norm": 0.44238439202308655,
90
+ "learning_rate": 2.7095837366892545e-05,
91
+ "loss": 0.1734,
92
+ "step": 4000
93
+ },
94
+ {
95
+ "epoch": 4.0,
96
+ "eval_f1": 0.9276940256839754,
97
+ "eval_loss": 0.2720405161380768,
98
+ "eval_runtime": 30.1857,
99
+ "eval_samples_per_second": 118.666,
100
+ "eval_steps_per_second": 4.969,
101
+ "step": 4132
102
+ },
103
+ {
104
+ "epoch": 4.356243949661181,
105
+ "grad_norm": 0.10865873098373413,
106
+ "learning_rate": 2.6732817037754114e-05,
107
+ "loss": 0.1593,
108
+ "step": 4500
109
+ },
110
+ {
111
+ "epoch": 4.84027105517909,
112
+ "grad_norm": 35.33164978027344,
113
+ "learning_rate": 2.6369796708615684e-05,
114
+ "loss": 0.1551,
115
+ "step": 5000
116
+ },
117
+ {
118
+ "epoch": 5.0,
119
+ "eval_f1": 0.9349525404801786,
120
+ "eval_loss": 0.29397937655448914,
121
+ "eval_runtime": 30.2085,
122
+ "eval_samples_per_second": 118.576,
123
+ "eval_steps_per_second": 4.965,
124
+ "step": 5165
125
+ },
126
+ {
127
+ "epoch": 5.3242981606969995,
128
+ "grad_norm": 0.6579780578613281,
129
+ "learning_rate": 2.6006776379477253e-05,
130
+ "loss": 0.1434,
131
+ "step": 5500
132
+ },
133
+ {
134
+ "epoch": 5.808325266214908,
135
+ "grad_norm": 0.2094825804233551,
136
+ "learning_rate": 2.564375605033882e-05,
137
+ "loss": 0.1282,
138
+ "step": 6000
139
+ },
140
+ {
141
+ "epoch": 6.0,
142
+ "eval_f1": 0.9360692350642099,
143
+ "eval_loss": 0.2751626968383789,
144
+ "eval_runtime": 30.1054,
145
+ "eval_samples_per_second": 118.982,
146
+ "eval_steps_per_second": 4.983,
147
+ "step": 6198
148
+ },
149
+ {
150
+ "epoch": 6.292352371732817,
151
+ "grad_norm": 0.12414942681789398,
152
+ "learning_rate": 2.5280735721200388e-05,
153
+ "loss": 0.1158,
154
+ "step": 6500
155
+ },
156
+ {
157
+ "epoch": 6.776379477250726,
158
+ "grad_norm": 4.030830383300781,
159
+ "learning_rate": 2.4917715392061957e-05,
160
+ "loss": 0.1208,
161
+ "step": 7000
162
+ },
163
+ {
164
+ "epoch": 7.0,
165
+ "eval_f1": 0.9346733668341709,
166
+ "eval_loss": 0.3136898875236511,
167
+ "eval_runtime": 30.2127,
168
+ "eval_samples_per_second": 118.559,
169
+ "eval_steps_per_second": 4.965,
170
+ "step": 7231
171
+ },
172
+ {
173
+ "epoch": 7.260406582768635,
174
+ "grad_norm": 4.0914530754089355,
175
+ "learning_rate": 2.4554695062923523e-05,
176
+ "loss": 0.1127,
177
+ "step": 7500
178
+ },
179
+ {
180
+ "epoch": 7.744433688286544,
181
+ "grad_norm": 0.10396777093410492,
182
+ "learning_rate": 2.4191674733785093e-05,
183
+ "loss": 0.1074,
184
+ "step": 8000
185
+ },
186
+ {
187
+ "epoch": 8.0,
188
+ "eval_f1": 0.9355108877721943,
189
+ "eval_loss": 0.2966548800468445,
190
+ "eval_runtime": 30.2555,
191
+ "eval_samples_per_second": 118.392,
192
+ "eval_steps_per_second": 4.958,
193
+ "step": 8264
194
+ },
195
+ {
196
+ "epoch": 8.228460793804453,
197
+ "grad_norm": 0.22765442728996277,
198
+ "learning_rate": 2.382865440464666e-05,
199
+ "loss": 0.0993,
200
+ "step": 8500
201
+ },
202
+ {
203
+ "epoch": 8.712487899322362,
204
+ "grad_norm": 0.5725598931312561,
205
+ "learning_rate": 2.346563407550823e-05,
206
+ "loss": 0.2719,
207
+ "step": 9000
208
+ },
209
+ {
210
+ "epoch": 9.0,
211
+ "eval_f1": 0.6409826912339475,
212
+ "eval_loss": 0.8269146084785461,
213
+ "eval_runtime": 30.1812,
214
+ "eval_samples_per_second": 118.683,
215
+ "eval_steps_per_second": 4.97,
216
+ "step": 9297
217
+ },
218
+ {
219
+ "epoch": 9.196515004840272,
220
+ "grad_norm": 1.2485491037368774,
221
+ "learning_rate": 2.3102613746369797e-05,
222
+ "loss": 0.8488,
223
+ "step": 9500
224
+ },
225
+ {
226
+ "epoch": 9.68054211035818,
227
+ "grad_norm": 0.6287474036216736,
228
+ "learning_rate": 2.2739593417231366e-05,
229
+ "loss": 0.8456,
230
+ "step": 10000
231
+ },
232
+ {
233
+ "epoch": 10.0,
234
+ "eval_f1": 0.6409826912339475,
235
+ "eval_loss": 0.8219632506370544,
236
+ "eval_runtime": 30.1694,
237
+ "eval_samples_per_second": 118.729,
238
+ "eval_steps_per_second": 4.972,
239
+ "step": 10330
240
+ },
241
+ {
242
+ "epoch": 10.164569215876089,
243
+ "grad_norm": 1.4201635122299194,
244
+ "learning_rate": 2.2376573088092936e-05,
245
+ "loss": 0.8446,
246
+ "step": 10500
247
+ },
248
+ {
249
+ "epoch": 10.648596321393999,
250
+ "grad_norm": 1.115445613861084,
251
+ "learning_rate": 2.20135527589545e-05,
252
+ "loss": 0.84,
253
+ "step": 11000
254
+ },
255
+ {
256
+ "epoch": 11.0,
257
+ "eval_f1": 0.6409826912339475,
258
+ "eval_loss": 0.8211880922317505,
259
+ "eval_runtime": 30.1617,
260
+ "eval_samples_per_second": 118.76,
261
+ "eval_steps_per_second": 4.973,
262
+ "step": 11363
263
+ },
264
+ {
265
+ "epoch": 11.132623426911907,
266
+ "grad_norm": 1.784400463104248,
267
+ "learning_rate": 2.165053242981607e-05,
268
+ "loss": 0.8435,
269
+ "step": 11500
270
+ },
271
+ {
272
+ "epoch": 11.616650532429816,
273
+ "grad_norm": 1.7544752359390259,
274
+ "learning_rate": 2.1287512100677637e-05,
275
+ "loss": 0.841,
276
+ "step": 12000
277
+ },
278
+ {
279
+ "epoch": 12.0,
280
+ "eval_f1": 0.6409826912339475,
281
+ "eval_loss": 0.8207370042800903,
282
+ "eval_runtime": 30.1494,
283
+ "eval_samples_per_second": 118.808,
284
+ "eval_steps_per_second": 4.975,
285
+ "step": 12396
286
+ },
287
+ {
288
+ "epoch": 12.100677637947726,
289
+ "grad_norm": 0.660882830619812,
290
+ "learning_rate": 2.092449177153921e-05,
291
+ "loss": 0.8421,
292
+ "step": 12500
293
+ },
294
+ {
295
+ "epoch": 12.584704743465634,
296
+ "grad_norm": 0.5564029812812805,
297
+ "learning_rate": 2.0561471442400775e-05,
298
+ "loss": 0.8383,
299
+ "step": 13000
300
+ },
301
+ {
302
+ "epoch": 13.0,
303
+ "eval_f1": 0.6409826912339475,
304
+ "eval_loss": 0.8198010325431824,
305
+ "eval_runtime": 30.2024,
306
+ "eval_samples_per_second": 118.6,
307
+ "eval_steps_per_second": 4.966,
308
+ "step": 13429
309
+ },
310
+ {
311
+ "epoch": 13.068731848983543,
312
+ "grad_norm": 0.45510706305503845,
313
+ "learning_rate": 2.019845111326234e-05,
314
+ "loss": 0.8494,
315
+ "step": 13500
316
+ },
317
+ {
318
+ "epoch": 13.552758954501453,
319
+ "grad_norm": 0.9993030428886414,
320
+ "learning_rate": 1.983543078412391e-05,
321
+ "loss": 0.8371,
322
+ "step": 14000
323
+ },
324
+ {
325
+ "epoch": 14.0,
326
+ "eval_f1": 0.6409826912339475,
327
+ "eval_loss": 0.8276057839393616,
328
+ "eval_runtime": 30.1498,
329
+ "eval_samples_per_second": 118.807,
330
+ "eval_steps_per_second": 4.975,
331
+ "step": 14462
332
+ },
333
+ {
334
+ "epoch": 14.036786060019361,
335
+ "grad_norm": 0.8151612281799316,
336
+ "learning_rate": 1.947241045498548e-05,
337
+ "loss": 0.8465,
338
+ "step": 14500
339
+ },
340
+ {
341
+ "epoch": 14.52081316553727,
342
+ "grad_norm": 1.312504768371582,
343
+ "learning_rate": 1.910939012584705e-05,
344
+ "loss": 0.8486,
345
+ "step": 15000
346
+ },
347
+ {
348
+ "epoch": 15.0,
349
+ "eval_f1": 0.6409826912339475,
350
+ "eval_loss": 0.8243775367736816,
351
+ "eval_runtime": 30.177,
352
+ "eval_samples_per_second": 118.7,
353
+ "eval_steps_per_second": 4.971,
354
+ "step": 15495
355
+ },
356
+ {
357
+ "epoch": 15.00484027105518,
358
+ "grad_norm": 1.09600031375885,
359
+ "learning_rate": 1.8746369796708615e-05,
360
+ "loss": 0.8345,
361
+ "step": 15500
362
+ },
363
+ {
364
+ "epoch": 15.488867376573088,
365
+ "grad_norm": 1.3698476552963257,
366
+ "learning_rate": 1.8383349467570184e-05,
367
+ "loss": 0.8365,
368
+ "step": 16000
369
+ },
370
+ {
371
+ "epoch": 15.972894482090997,
372
+ "grad_norm": 0.5735417008399963,
373
+ "learning_rate": 1.8020329138431754e-05,
374
+ "loss": 0.844,
375
+ "step": 16500
376
+ },
377
+ {
378
+ "epoch": 16.0,
379
+ "eval_f1": 0.6409826912339475,
380
+ "eval_loss": 0.8390949964523315,
381
+ "eval_runtime": 30.2033,
382
+ "eval_samples_per_second": 118.596,
383
+ "eval_steps_per_second": 4.966,
384
+ "step": 16528
385
+ },
386
+ {
387
+ "epoch": 16.456921587608907,
388
+ "grad_norm": 1.3362003564834595,
389
+ "learning_rate": 1.765730880929332e-05,
390
+ "loss": 0.8349,
391
+ "step": 17000
392
+ },
393
+ {
394
+ "epoch": 16.940948693126813,
395
+ "grad_norm": 0.4842800796031952,
396
+ "learning_rate": 1.729428848015489e-05,
397
+ "loss": 0.837,
398
+ "step": 17500
399
+ },
400
+ {
401
+ "epoch": 17.0,
402
+ "eval_f1": 0.6409826912339475,
403
+ "eval_loss": 0.8234853744506836,
404
+ "eval_runtime": 30.2033,
405
+ "eval_samples_per_second": 118.596,
406
+ "eval_steps_per_second": 4.966,
407
+ "step": 17561
408
+ },
409
+ {
410
+ "epoch": 17.424975798644724,
411
+ "grad_norm": 0.564506471157074,
412
+ "learning_rate": 1.6931268151016455e-05,
413
+ "loss": 0.8368,
414
+ "step": 18000
415
+ },
416
+ {
417
+ "epoch": 17.909002904162634,
418
+ "grad_norm": 1.0698164701461792,
419
+ "learning_rate": 1.6568247821878027e-05,
420
+ "loss": 0.8438,
421
+ "step": 18500
422
+ },
423
+ {
424
+ "epoch": 18.0,
425
+ "eval_f1": 0.6409826912339475,
426
+ "eval_loss": 0.8246738314628601,
427
+ "eval_runtime": 30.1891,
428
+ "eval_samples_per_second": 118.652,
429
+ "eval_steps_per_second": 4.969,
430
+ "step": 18594
431
+ },
432
+ {
433
+ "epoch": 18.393030009680544,
434
+ "grad_norm": 0.8105427026748657,
435
+ "learning_rate": 1.6205227492739593e-05,
436
+ "loss": 0.8367,
437
+ "step": 19000
438
+ },
439
+ {
440
+ "epoch": 18.87705711519845,
441
+ "grad_norm": 0.630368173122406,
442
+ "learning_rate": 1.5842207163601163e-05,
443
+ "loss": 0.8418,
444
+ "step": 19500
445
+ },
446
+ {
447
+ "epoch": 19.0,
448
+ "eval_f1": 0.6409826912339475,
449
+ "eval_loss": 0.8236768841743469,
450
+ "eval_runtime": 30.1711,
451
+ "eval_samples_per_second": 118.723,
452
+ "eval_steps_per_second": 4.972,
453
+ "step": 19627
454
+ },
455
+ {
456
+ "epoch": 19.36108422071636,
457
+ "grad_norm": 0.884172797203064,
458
+ "learning_rate": 1.5479186834462732e-05,
459
+ "loss": 0.8503,
460
+ "step": 20000
461
+ },
462
+ {
463
+ "epoch": 19.845111326234267,
464
+ "grad_norm": 0.585003674030304,
465
+ "learning_rate": 1.51161665053243e-05,
466
+ "loss": 0.8384,
467
+ "step": 20500
468
+ },
469
+ {
470
+ "epoch": 20.0,
471
+ "eval_f1": 0.6409826912339475,
472
+ "eval_loss": 0.8199198246002197,
473
+ "eval_runtime": 30.1335,
474
+ "eval_samples_per_second": 118.871,
475
+ "eval_steps_per_second": 4.978,
476
+ "step": 20660
477
+ },
478
+ {
479
+ "epoch": 20.329138431752177,
480
+ "grad_norm": 1.194004774093628,
481
+ "learning_rate": 1.4753146176185867e-05,
482
+ "loss": 0.845,
483
+ "step": 21000
484
+ },
485
+ {
486
+ "epoch": 20.813165537270088,
487
+ "grad_norm": 0.378979355096817,
488
+ "learning_rate": 1.4390125847047436e-05,
489
+ "loss": 0.8387,
490
+ "step": 21500
491
+ },
492
+ {
493
+ "epoch": 21.0,
494
+ "eval_f1": 0.6409826912339475,
495
+ "eval_loss": 0.8225679993629456,
496
+ "eval_runtime": 29.9919,
497
+ "eval_samples_per_second": 119.432,
498
+ "eval_steps_per_second": 5.001,
499
+ "step": 21693
500
+ },
501
+ {
502
+ "epoch": 21.297192642787998,
503
+ "grad_norm": 0.6615312099456787,
504
+ "learning_rate": 1.4027105517909002e-05,
505
+ "loss": 0.833,
506
+ "step": 22000
507
+ },
508
+ {
509
+ "epoch": 21.781219748305904,
510
+ "grad_norm": 0.4334174394607544,
511
+ "learning_rate": 1.3664085188770572e-05,
512
+ "loss": 0.8478,
513
+ "step": 22500
514
+ },
515
+ {
516
+ "epoch": 22.0,
517
+ "eval_f1": 0.6409826912339475,
518
+ "eval_loss": 0.8205086588859558,
519
+ "eval_runtime": 29.9624,
520
+ "eval_samples_per_second": 119.55,
521
+ "eval_steps_per_second": 5.006,
522
+ "step": 22726
523
+ },
524
+ {
525
+ "epoch": 22.265246853823815,
526
+ "grad_norm": 0.8907693028450012,
527
+ "learning_rate": 1.3301064859632139e-05,
528
+ "loss": 0.8442,
529
+ "step": 23000
530
+ },
531
+ {
532
+ "epoch": 22.749273959341725,
533
+ "grad_norm": 0.5629915595054626,
534
+ "learning_rate": 1.2938044530493708e-05,
535
+ "loss": 0.8364,
536
+ "step": 23500
537
+ },
538
+ {
539
+ "epoch": 23.0,
540
+ "eval_f1": 0.6409826912339475,
541
+ "eval_loss": 0.8259473443031311,
542
+ "eval_runtime": 29.9524,
543
+ "eval_samples_per_second": 119.59,
544
+ "eval_steps_per_second": 5.008,
545
+ "step": 23759
546
+ },
547
+ {
548
+ "epoch": 23.23330106485963,
549
+ "grad_norm": 0.47178810834884644,
550
+ "learning_rate": 1.2575024201355276e-05,
551
+ "loss": 0.83,
552
+ "step": 24000
553
+ },
554
+ {
555
+ "epoch": 23.71732817037754,
556
+ "grad_norm": 0.5889131426811218,
557
+ "learning_rate": 1.2212003872216845e-05,
558
+ "loss": 0.8325,
559
+ "step": 24500
560
+ },
561
+ {
562
+ "epoch": 24.0,
563
+ "eval_f1": 0.6409826912339475,
564
+ "eval_loss": 0.8245280385017395,
565
+ "eval_runtime": 29.9556,
566
+ "eval_samples_per_second": 119.577,
567
+ "eval_steps_per_second": 5.007,
568
+ "step": 24792
569
+ },
570
+ {
571
+ "epoch": 24.20135527589545,
572
+ "grad_norm": 0.8042486310005188,
573
+ "learning_rate": 1.1848983543078413e-05,
574
+ "loss": 0.8357,
575
+ "step": 25000
576
+ },
577
+ {
578
+ "epoch": 24.68538238141336,
579
+ "grad_norm": 0.7427254319190979,
580
+ "learning_rate": 1.148596321393998e-05,
581
+ "loss": 0.8289,
582
+ "step": 25500
583
+ },
584
+ {
585
+ "epoch": 25.0,
586
+ "eval_f1": 0.6409826912339475,
587
+ "eval_loss": 0.8248125314712524,
588
+ "eval_runtime": 30.0337,
589
+ "eval_samples_per_second": 119.266,
590
+ "eval_steps_per_second": 4.994,
591
+ "step": 25825
592
+ },
593
+ {
594
+ "epoch": 25.16940948693127,
595
+ "grad_norm": 1.4444160461425781,
596
+ "learning_rate": 1.1122942884801548e-05,
597
+ "loss": 0.8344,
598
+ "step": 26000
599
+ },
600
+ {
601
+ "epoch": 25.65343659244918,
602
+ "grad_norm": 1.6647661924362183,
603
+ "learning_rate": 1.0759922555663117e-05,
604
+ "loss": 0.8251,
605
+ "step": 26500
606
+ },
607
+ {
608
+ "epoch": 26.0,
609
+ "eval_f1": 0.6409826912339475,
610
+ "eval_loss": 0.8246968984603882,
611
+ "eval_runtime": 30.0343,
612
+ "eval_samples_per_second": 119.263,
613
+ "eval_steps_per_second": 4.994,
614
+ "step": 26858
615
+ },
616
+ {
617
+ "epoch": 26.137463697967085,
618
+ "grad_norm": 0.560504138469696,
619
+ "learning_rate": 1.0396902226524685e-05,
620
+ "loss": 0.8265,
621
+ "step": 27000
622
+ },
623
+ {
624
+ "epoch": 26.621490803484996,
625
+ "grad_norm": 2.6784770488739014,
626
+ "learning_rate": 1.0033881897386254e-05,
627
+ "loss": 0.824,
628
+ "step": 27500
629
+ },
630
+ {
631
+ "epoch": 27.0,
632
+ "eval_f1": 0.6409826912339475,
633
+ "eval_loss": 0.821439802646637,
634
+ "eval_runtime": 29.9327,
635
+ "eval_samples_per_second": 119.668,
636
+ "eval_steps_per_second": 5.011,
637
+ "step": 27891
638
+ },
639
+ {
640
+ "epoch": 27.105517909002906,
641
+ "grad_norm": 0.9694743156433105,
642
+ "learning_rate": 9.670861568247822e-06,
643
+ "loss": 0.8304,
644
+ "step": 28000
645
+ },
646
+ {
647
+ "epoch": 27.589545014520812,
648
+ "grad_norm": 0.8298953771591187,
649
+ "learning_rate": 9.307841239109391e-06,
650
+ "loss": 0.8197,
651
+ "step": 28500
652
+ },
653
+ {
654
+ "epoch": 28.0,
655
+ "eval_f1": 0.6409826912339475,
656
+ "eval_loss": 0.8281795382499695,
657
+ "eval_runtime": 29.9052,
658
+ "eval_samples_per_second": 119.778,
659
+ "eval_steps_per_second": 5.016,
660
+ "step": 28924
661
+ },
662
+ {
663
+ "epoch": 28.073572120038722,
664
+ "grad_norm": 1.530045986175537,
665
+ "learning_rate": 8.944820909970959e-06,
666
+ "loss": 0.8214,
667
+ "step": 29000
668
+ },
669
+ {
670
+ "epoch": 28.557599225556633,
671
+ "grad_norm": 1.6190696954727173,
672
+ "learning_rate": 8.581800580832526e-06,
673
+ "loss": 0.8241,
674
+ "step": 29500
675
+ },
676
+ {
677
+ "epoch": 29.0,
678
+ "eval_f1": 0.6409826912339475,
679
+ "eval_loss": 0.8340330123901367,
680
+ "eval_runtime": 29.9993,
681
+ "eval_samples_per_second": 119.403,
682
+ "eval_steps_per_second": 5.0,
683
+ "step": 29957
684
+ },
685
+ {
686
+ "epoch": 29.04162633107454,
687
+ "grad_norm": 0.6487633585929871,
688
+ "learning_rate": 8.218780251694096e-06,
689
+ "loss": 0.8209,
690
+ "step": 30000
691
+ },
692
+ {
693
+ "epoch": 29.52565343659245,
694
+ "grad_norm": 1.1463664770126343,
695
+ "learning_rate": 7.855759922555663e-06,
696
+ "loss": 0.8285,
697
+ "step": 30500
698
+ },
699
+ {
700
+ "epoch": 30.0,
701
+ "eval_f1": 0.6409826912339475,
702
+ "eval_loss": 0.8359894156455994,
703
+ "eval_runtime": 29.9709,
704
+ "eval_samples_per_second": 119.516,
705
+ "eval_steps_per_second": 5.005,
706
+ "step": 30990
707
+ },
708
+ {
709
+ "epoch": 30.00968054211036,
710
+ "grad_norm": 1.2427114248275757,
711
+ "learning_rate": 7.492739593417232e-06,
712
+ "loss": 0.8103,
713
+ "step": 31000
714
+ },
715
+ {
716
+ "epoch": 30.493707647628266,
717
+ "grad_norm": 1.2282146215438843,
718
+ "learning_rate": 7.129719264278799e-06,
719
+ "loss": 0.814,
720
+ "step": 31500
721
+ },
722
+ {
723
+ "epoch": 30.977734753146176,
724
+ "grad_norm": 1.6168450117111206,
725
+ "learning_rate": 6.766698935140368e-06,
726
+ "loss": 0.8169,
727
+ "step": 32000
728
+ },
729
+ {
730
+ "epoch": 31.0,
731
+ "eval_f1": 0.6409826912339475,
732
+ "eval_loss": 0.8400696516036987,
733
+ "eval_runtime": 29.9824,
734
+ "eval_samples_per_second": 119.47,
735
+ "eval_steps_per_second": 5.003,
736
+ "step": 32023
737
+ },
738
+ {
739
+ "epoch": 31.461761858664087,
740
+ "grad_norm": 2.838101863861084,
741
+ "learning_rate": 6.403678606001936e-06,
742
+ "loss": 0.8113,
743
+ "step": 32500
744
+ },
745
+ {
746
+ "epoch": 31.945788964181993,
747
+ "grad_norm": 1.7709537744522095,
748
+ "learning_rate": 6.040658276863505e-06,
749
+ "loss": 0.811,
750
+ "step": 33000
751
+ },
752
+ {
753
+ "epoch": 32.0,
754
+ "eval_f1": 0.6409826912339475,
755
+ "eval_loss": 0.853431224822998,
756
+ "eval_runtime": 30.0224,
757
+ "eval_samples_per_second": 119.311,
758
+ "eval_steps_per_second": 4.996,
759
+ "step": 33056
760
+ },
761
+ {
762
+ "epoch": 32.4298160696999,
763
+ "grad_norm": 1.901810646057129,
764
+ "learning_rate": 5.677637947725073e-06,
765
+ "loss": 0.8153,
766
+ "step": 33500
767
+ },
768
+ {
769
+ "epoch": 32.91384317521781,
770
+ "grad_norm": 3.1245410442352295,
771
+ "learning_rate": 5.3146176185866415e-06,
772
+ "loss": 0.8056,
773
+ "step": 34000
774
+ },
775
+ {
776
+ "epoch": 33.0,
777
+ "eval_f1": 0.6409826912339475,
778
+ "eval_loss": 0.869020938873291,
779
+ "eval_runtime": 29.9807,
780
+ "eval_samples_per_second": 119.477,
781
+ "eval_steps_per_second": 5.003,
782
+ "step": 34089
783
+ },
784
+ {
785
+ "epoch": 33.397870280735724,
786
+ "grad_norm": 2.18399977684021,
787
+ "learning_rate": 4.951597289448209e-06,
788
+ "loss": 0.8193,
789
+ "step": 34500
790
+ },
791
+ {
792
+ "epoch": 33.88189738625363,
793
+ "grad_norm": 0.905960202217102,
794
+ "learning_rate": 4.588576960309778e-06,
795
+ "loss": 0.8023,
796
+ "step": 35000
797
+ },
798
+ {
799
+ "epoch": 34.0,
800
+ "eval_f1": 0.6409826912339475,
801
+ "eval_loss": 0.863983154296875,
802
+ "eval_runtime": 29.8457,
803
+ "eval_samples_per_second": 120.017,
804
+ "eval_steps_per_second": 5.026,
805
+ "step": 35122
806
+ },
807
+ {
808
+ "epoch": 34.36592449177154,
809
+ "grad_norm": 2.841273069381714,
810
+ "learning_rate": 4.225556631171346e-06,
811
+ "loss": 0.8067,
812
+ "step": 35500
813
+ },
814
+ {
815
+ "epoch": 34.84995159728945,
816
+ "grad_norm": 0.9258007407188416,
817
+ "learning_rate": 3.8625363020329145e-06,
818
+ "loss": 0.8146,
819
+ "step": 36000
820
+ },
821
+ {
822
+ "epoch": 35.0,
823
+ "eval_f1": 0.6409826912339475,
824
+ "eval_loss": 0.870430052280426,
825
+ "eval_runtime": 29.8816,
826
+ "eval_samples_per_second": 119.873,
827
+ "eval_steps_per_second": 5.02,
828
+ "step": 36155
829
+ },
830
+ {
831
+ "epoch": 35.33397870280736,
832
+ "grad_norm": 0.954750120639801,
833
+ "learning_rate": 3.499515972894482e-06,
834
+ "loss": 0.7988,
835
+ "step": 36500
836
+ },
837
+ {
838
+ "epoch": 35.81800580832527,
839
+ "grad_norm": 1.9225430488586426,
840
+ "learning_rate": 3.1364956437560505e-06,
841
+ "loss": 0.8079,
842
+ "step": 37000
843
+ },
844
+ {
845
+ "epoch": 36.0,
846
+ "eval_f1": 0.6409826912339475,
847
+ "eval_loss": 0.8958960175514221,
848
+ "eval_runtime": 29.9806,
849
+ "eval_samples_per_second": 119.477,
850
+ "eval_steps_per_second": 5.003,
851
+ "step": 37188
852
+ },
853
+ {
854
+ "epoch": 36.30203291384318,
855
+ "grad_norm": 1.3240045309066772,
856
+ "learning_rate": 2.7734753146176185e-06,
857
+ "loss": 0.8054,
858
+ "step": 37500
859
+ },
860
+ {
861
+ "epoch": 36.78606001936109,
862
+ "grad_norm": 5.058154106140137,
863
+ "learning_rate": 2.410454985479187e-06,
864
+ "loss": 0.8081,
865
+ "step": 38000
866
+ },
867
+ {
868
+ "epoch": 37.0,
869
+ "eval_f1": 0.6409826912339475,
870
+ "eval_loss": 0.8801546692848206,
871
+ "eval_runtime": 29.8881,
872
+ "eval_samples_per_second": 119.847,
873
+ "eval_steps_per_second": 5.019,
874
+ "step": 38221
875
+ },
876
+ {
877
+ "epoch": 37.27008712487899,
878
+ "grad_norm": 2.5231587886810303,
879
+ "learning_rate": 2.047434656340755e-06,
880
+ "loss": 0.8066,
881
+ "step": 38500
882
+ },
883
+ {
884
+ "epoch": 37.7541142303969,
885
+ "grad_norm": 1.007765769958496,
886
+ "learning_rate": 1.6844143272023235e-06,
887
+ "loss": 0.8059,
888
+ "step": 39000
889
+ },
890
+ {
891
+ "epoch": 38.0,
892
+ "eval_f1": 0.6409826912339475,
893
+ "eval_loss": 0.890051543712616,
894
+ "eval_runtime": 29.9175,
895
+ "eval_samples_per_second": 119.729,
896
+ "eval_steps_per_second": 5.014,
897
+ "step": 39254
898
+ },
899
+ {
900
+ "epoch": 38.23814133591481,
901
+ "grad_norm": 1.9777470827102661,
902
+ "learning_rate": 1.3213939980638917e-06,
903
+ "loss": 0.8077,
904
+ "step": 39500
905
+ },
906
+ {
907
+ "epoch": 38.72216844143272,
908
+ "grad_norm": 2.1567304134368896,
909
+ "learning_rate": 9.5837366892546e-07,
910
+ "loss": 0.8045,
911
+ "step": 40000
912
+ },
913
+ {
914
+ "epoch": 39.0,
915
+ "eval_f1": 0.6409826912339475,
916
+ "eval_loss": 0.888158917427063,
917
+ "eval_runtime": 29.9571,
918
+ "eval_samples_per_second": 119.571,
919
+ "eval_steps_per_second": 5.007,
920
+ "step": 40287
921
+ },
922
+ {
923
+ "epoch": 39.20619554695063,
924
+ "grad_norm": 1.4379490613937378,
925
+ "learning_rate": 5.953533397870282e-07,
926
+ "loss": 0.7972,
927
+ "step": 40500
928
+ },
929
+ {
930
+ "epoch": 39.690222652468535,
931
+ "grad_norm": 1.5997909307479858,
932
+ "learning_rate": 2.3233301064859634e-07,
933
+ "loss": 0.8024,
934
+ "step": 41000
935
+ },
936
+ {
937
+ "epoch": 40.0,
938
+ "eval_f1": 0.6409826912339475,
939
+ "eval_loss": 0.8918312788009644,
940
+ "eval_runtime": 29.6532,
941
+ "eval_samples_per_second": 120.796,
942
+ "eval_steps_per_second": 5.058,
943
+ "step": 41320
944
+ },
945
+ {
946
+ "epoch": 40.0,
947
+ "step": 41320,
948
+ "total_flos": 2.3084949799922688e+17,
949
+ "train_loss": 0.688100525762774,
950
+ "train_runtime": 30496.5649,
951
+ "train_samples_per_second": 32.49,
952
+ "train_steps_per_second": 1.355
953
+ }
954
+ ],
955
+ "logging_steps": 500,
956
+ "max_steps": 41320,
957
+ "num_input_tokens_seen": 0,
958
+ "num_train_epochs": 40,
959
+ "save_steps": 500,
960
+ "stateful_callbacks": {
961
+ "TrainerControl": {
962
+ "args": {
963
+ "should_epoch_stop": false,
964
+ "should_evaluate": false,
965
+ "should_log": false,
966
+ "should_save": true,
967
+ "should_training_stop": true
968
+ },
969
+ "attributes": {}
970
+ }
971
+ },
972
+ "total_flos": 2.3084949799922688e+17,
973
+ "train_batch_size": 24,
974
+ "trial_name": null,
975
+ "trial_params": null
976
+ }