carted-ml commited on
Commit
646b8e7
·
1 Parent(s): ff8e97b

carted-nlp/categorization-finetuned-20220721-164940-distilled-20220811-013354

Browse files
all_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 30.0,
3
+ "eval_accuracy": 0.8775628491620112,
4
+ "eval_f1": 0.8768335428087971,
5
+ "eval_loss": 0.06446786969900131,
6
+ "eval_runtime": 36.0103,
7
+ "eval_samples_per_second": 3976.641,
8
+ "eval_steps_per_second": 41.433,
9
+ "test_samples": 143200,
10
+ "train_loss": 0.06639543622843883,
11
+ "train_runtime": 19911.3581,
12
+ "train_samples": 1138117,
13
+ "train_samples_per_second": 1714.776,
14
+ "train_steps_per_second": 6.699
15
+ }
runs/Aug11_01-34-27_product-categorization/events.out.tfevents.1660201727.product-categorization.2923.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70a1274eb11cc975ac731a8bc855ff76898e266004b91e8620ed9413333c410f
3
+ size 416
test_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 30.0,
3
+ "eval_accuracy": 0.8775628491620112,
4
+ "eval_f1": 0.8768335428087971,
5
+ "eval_loss": 0.06446786969900131,
6
+ "eval_runtime": 36.0103,
7
+ "eval_samples_per_second": 3976.641,
8
+ "eval_steps_per_second": 41.433,
9
+ "test_samples": 143200
10
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 30.0,
3
+ "train_loss": 0.06639543622843883,
4
+ "train_runtime": 19911.3581,
5
+ "train_samples": 1138117,
6
+ "train_samples_per_second": 1714.776,
7
+ "train_steps_per_second": 6.699
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,873 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.06449923664331436,
3
+ "best_model_checkpoint": "./categorization-finetuned-20220721-164940-distilled-20220811-013354/checkpoint-127500",
4
+ "epoch": 30.0,
5
+ "global_step": 133380,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.56,
12
+ "learning_rate": 1.999716279095759e-05,
13
+ "loss": 0.2702,
14
+ "step": 2500
15
+ },
16
+ {
17
+ "epoch": 0.56,
18
+ "eval_accuracy": 0.7832328973737684,
19
+ "eval_f1": 0.7782592459057264,
20
+ "eval_loss": 0.12900541722774506,
21
+ "eval_runtime": 33.288,
22
+ "eval_samples_per_second": 4302.123,
23
+ "eval_steps_per_second": 44.821,
24
+ "step": 2500
25
+ },
26
+ {
27
+ "epoch": 1.12,
28
+ "learning_rate": 1.9965262675626726e-05,
29
+ "loss": 0.1246,
30
+ "step": 5000
31
+ },
32
+ {
33
+ "epoch": 1.12,
34
+ "eval_accuracy": 0.8168830171288117,
35
+ "eval_f1": 0.8137001385896282,
36
+ "eval_loss": 0.10469090193510056,
37
+ "eval_runtime": 35.1587,
38
+ "eval_samples_per_second": 4073.218,
39
+ "eval_steps_per_second": 42.436,
40
+ "step": 5000
41
+ },
42
+ {
43
+ "epoch": 1.69,
44
+ "learning_rate": 1.9898029417041328e-05,
45
+ "loss": 0.1066,
46
+ "step": 7500
47
+ },
48
+ {
49
+ "epoch": 1.69,
50
+ "eval_accuracy": 0.8300665460969632,
51
+ "eval_f1": 0.8276076020101577,
52
+ "eval_loss": 0.09453058242797852,
53
+ "eval_runtime": 34.7428,
54
+ "eval_samples_per_second": 4121.976,
55
+ "eval_steps_per_second": 42.944,
56
+ "step": 7500
57
+ },
58
+ {
59
+ "epoch": 2.25,
60
+ "learning_rate": 1.979570139952044e-05,
61
+ "loss": 0.0975,
62
+ "step": 10000
63
+ },
64
+ {
65
+ "epoch": 2.25,
66
+ "eval_accuracy": 0.838564615352387,
67
+ "eval_f1": 0.8366911235307668,
68
+ "eval_loss": 0.0888088271021843,
69
+ "eval_runtime": 33.4588,
70
+ "eval_samples_per_second": 4280.16,
71
+ "eval_steps_per_second": 44.592,
72
+ "step": 10000
73
+ },
74
+ {
75
+ "epoch": 2.81,
76
+ "learning_rate": 1.965864144044478e-05,
77
+ "loss": 0.0917,
78
+ "step": 12500
79
+ },
80
+ {
81
+ "epoch": 2.81,
82
+ "eval_accuracy": 0.844548876118121,
83
+ "eval_f1": 0.8428155480829799,
84
+ "eval_loss": 0.08494799584150314,
85
+ "eval_runtime": 33.2026,
86
+ "eval_samples_per_second": 4313.187,
87
+ "eval_steps_per_second": 44.936,
88
+ "step": 12500
89
+ },
90
+ {
91
+ "epoch": 3.37,
92
+ "learning_rate": 1.9487335503840186e-05,
93
+ "loss": 0.0865,
94
+ "step": 15000
95
+ },
96
+ {
97
+ "epoch": 3.37,
98
+ "eval_accuracy": 0.8495625274947803,
99
+ "eval_f1": 0.8483980426142682,
100
+ "eval_loss": 0.08176358044147491,
101
+ "eval_runtime": 34.1293,
102
+ "eval_samples_per_second": 4196.068,
103
+ "eval_steps_per_second": 43.716,
104
+ "step": 15000
105
+ },
106
+ {
107
+ "epoch": 3.94,
108
+ "learning_rate": 1.9282390977328586e-05,
109
+ "loss": 0.0835,
110
+ "step": 17500
111
+ },
112
+ {
113
+ "epoch": 3.94,
114
+ "eval_accuracy": 0.852572114881048,
115
+ "eval_f1": 0.8508615234241726,
116
+ "eval_loss": 0.07959919422864914,
117
+ "eval_runtime": 33.0925,
118
+ "eval_samples_per_second": 4327.54,
119
+ "eval_steps_per_second": 45.086,
120
+ "step": 17500
121
+ },
122
+ {
123
+ "epoch": 4.5,
124
+ "learning_rate": 1.904453451855566e-05,
125
+ "loss": 0.08,
126
+ "step": 20000
127
+ },
128
+ {
129
+ "epoch": 4.5,
130
+ "eval_accuracy": 0.8552255793979429,
131
+ "eval_f1": 0.8541677666238712,
132
+ "eval_loss": 0.07768898457288742,
133
+ "eval_runtime": 35.7876,
134
+ "eval_samples_per_second": 4001.64,
135
+ "eval_steps_per_second": 41.69,
136
+ "step": 20000
137
+ },
138
+ {
139
+ "epoch": 5.06,
140
+ "learning_rate": 1.8774609478731048e-05,
141
+ "loss": 0.0778,
142
+ "step": 22500
143
+ },
144
+ {
145
+ "epoch": 5.06,
146
+ "eval_accuracy": 0.8580117171406825,
147
+ "eval_f1": 0.856711042073328,
148
+ "eval_loss": 0.0763072520494461,
149
+ "eval_runtime": 35.4902,
150
+ "eval_samples_per_second": 4035.173,
151
+ "eval_steps_per_second": 42.04,
152
+ "step": 22500
153
+ },
154
+ {
155
+ "epoch": 5.62,
156
+ "learning_rate": 1.8473572912416232e-05,
157
+ "loss": 0.0753,
158
+ "step": 25000
159
+ },
160
+ {
161
+ "epoch": 5.62,
162
+ "eval_accuracy": 0.8603509555963661,
163
+ "eval_f1": 0.859160334506478,
164
+ "eval_loss": 0.07438770681619644,
165
+ "eval_runtime": 35.3837,
166
+ "eval_samples_per_second": 4047.317,
167
+ "eval_steps_per_second": 42.166,
168
+ "step": 25000
169
+ },
170
+ {
171
+ "epoch": 6.19,
172
+ "learning_rate": 1.8142492184162323e-05,
173
+ "loss": 0.0739,
174
+ "step": 27500
175
+ },
176
+ {
177
+ "epoch": 6.19,
178
+ "eval_accuracy": 0.8613844101976831,
179
+ "eval_f1": 0.8603226866244833,
180
+ "eval_loss": 0.07380488514900208,
181
+ "eval_runtime": 33.7385,
182
+ "eval_samples_per_second": 4244.676,
183
+ "eval_steps_per_second": 44.222,
184
+ "step": 27500
185
+ },
186
+ {
187
+ "epoch": 6.75,
188
+ "learning_rate": 1.7782541184029316e-05,
189
+ "loss": 0.0716,
190
+ "step": 30000
191
+ },
192
+ {
193
+ "epoch": 6.75,
194
+ "eval_accuracy": 0.863032351318702,
195
+ "eval_f1": 0.8620277221500123,
196
+ "eval_loss": 0.0729290321469307,
197
+ "eval_runtime": 33.5267,
198
+ "eval_samples_per_second": 4271.486,
199
+ "eval_steps_per_second": 44.502,
200
+ "step": 30000
201
+ },
202
+ {
203
+ "epoch": 7.31,
204
+ "learning_rate": 1.7394996165405244e-05,
205
+ "loss": 0.0701,
206
+ "step": 32500
207
+ },
208
+ {
209
+ "epoch": 7.31,
210
+ "eval_accuracy": 0.8645196880084353,
211
+ "eval_f1": 0.8638326806071638,
212
+ "eval_loss": 0.07191809266805649,
213
+ "eval_runtime": 35.0686,
214
+ "eval_samples_per_second": 4083.685,
215
+ "eval_steps_per_second": 42.545,
216
+ "step": 32500
217
+ },
218
+ {
219
+ "epoch": 7.87,
220
+ "learning_rate": 1.69812312198827e-05,
221
+ "loss": 0.0689,
222
+ "step": 35000
223
+ },
224
+ {
225
+ "epoch": 7.87,
226
+ "eval_accuracy": 0.8656578846301559,
227
+ "eval_f1": 0.8646806211498433,
228
+ "eval_loss": 0.07080969214439392,
229
+ "eval_runtime": 33.9508,
230
+ "eval_samples_per_second": 4218.132,
231
+ "eval_steps_per_second": 43.946,
232
+ "step": 35000
233
+ },
234
+ {
235
+ "epoch": 8.43,
236
+ "learning_rate": 1.6542713405237254e-05,
237
+ "loss": 0.067,
238
+ "step": 37500
239
+ },
240
+ {
241
+ "epoch": 8.43,
242
+ "eval_accuracy": 0.8670823761076468,
243
+ "eval_f1": 0.8660045143213903,
244
+ "eval_loss": 0.07053036987781525,
245
+ "eval_runtime": 33.5221,
246
+ "eval_samples_per_second": 4272.077,
247
+ "eval_steps_per_second": 44.508,
248
+ "step": 37500
249
+ },
250
+ {
251
+ "epoch": 9.0,
252
+ "learning_rate": 1.6080997543782063e-05,
253
+ "loss": 0.0669,
254
+ "step": 40000
255
+ },
256
+ {
257
+ "epoch": 9.0,
258
+ "eval_accuracy": 0.8681158307089638,
259
+ "eval_f1": 0.8673702293109924,
260
+ "eval_loss": 0.06987718492746353,
261
+ "eval_runtime": 33.3948,
262
+ "eval_samples_per_second": 4288.361,
263
+ "eval_steps_per_second": 44.678,
264
+ "step": 40000
265
+ },
266
+ {
267
+ "epoch": 9.56,
268
+ "learning_rate": 1.5597720709541834e-05,
269
+ "loss": 0.0647,
270
+ "step": 42500
271
+ },
272
+ {
273
+ "epoch": 9.56,
274
+ "eval_accuracy": 0.8683392803524919,
275
+ "eval_f1": 0.8673043954616042,
276
+ "eval_loss": 0.06969144195318222,
277
+ "eval_runtime": 33.3931,
278
+ "eval_samples_per_second": 4288.583,
279
+ "eval_steps_per_second": 44.68,
280
+ "step": 42500
281
+ },
282
+ {
283
+ "epoch": 10.12,
284
+ "learning_rate": 1.509459642379259e-05,
285
+ "loss": 0.0641,
286
+ "step": 45000
287
+ },
288
+ {
289
+ "epoch": 10.12,
290
+ "eval_accuracy": 0.8690515260912373,
291
+ "eval_f1": 0.8680669250266475,
292
+ "eval_loss": 0.06932760030031204,
293
+ "eval_runtime": 33.6285,
294
+ "eval_samples_per_second": 4258.561,
295
+ "eval_steps_per_second": 44.367,
296
+ "step": 45000
297
+ },
298
+ {
299
+ "epoch": 10.68,
300
+ "learning_rate": 1.4573408579547676e-05,
301
+ "loss": 0.063,
302
+ "step": 47500
303
+ },
304
+ {
305
+ "epoch": 10.68,
306
+ "eval_accuracy": 0.8701827399115978,
307
+ "eval_f1": 0.8693810704654683,
308
+ "eval_loss": 0.06850136071443558,
309
+ "eval_runtime": 33.1134,
310
+ "eval_samples_per_second": 4324.805,
311
+ "eval_steps_per_second": 45.057,
312
+ "step": 47500
313
+ },
314
+ {
315
+ "epoch": 11.25,
316
+ "learning_rate": 1.4036005116531579e-05,
317
+ "loss": 0.0618,
318
+ "step": 50000
319
+ },
320
+ {
321
+ "epoch": 11.25,
322
+ "eval_accuracy": 0.8709089512530637,
323
+ "eval_f1": 0.8700675299699858,
324
+ "eval_loss": 0.06813304871320724,
325
+ "eval_runtime": 33.5088,
326
+ "eval_samples_per_second": 4273.778,
327
+ "eval_steps_per_second": 44.526,
328
+ "step": 50000
329
+ },
330
+ {
331
+ "epoch": 11.81,
332
+ "learning_rate": 1.3484291469067736e-05,
333
+ "loss": 0.0614,
334
+ "step": 52500
335
+ },
336
+ {
337
+ "epoch": 11.81,
338
+ "eval_accuracy": 0.8719843026625421,
339
+ "eval_f1": 0.8711736208762333,
340
+ "eval_loss": 0.0674930214881897,
341
+ "eval_runtime": 33.3571,
342
+ "eval_samples_per_second": 4293.204,
343
+ "eval_steps_per_second": 44.728,
344
+ "step": 52500
345
+ },
346
+ {
347
+ "epoch": 12.37,
348
+ "learning_rate": 1.2920223810111731e-05,
349
+ "loss": 0.0601,
350
+ "step": 55000
351
+ },
352
+ {
353
+ "epoch": 12.37,
354
+ "eval_accuracy": 0.8723543911346354,
355
+ "eval_f1": 0.8713242180501024,
356
+ "eval_loss": 0.06780188530683517,
357
+ "eval_runtime": 33.8602,
358
+ "eval_samples_per_second": 4229.414,
359
+ "eval_steps_per_second": 44.063,
360
+ "step": 55000
361
+ },
362
+ {
363
+ "epoch": 12.93,
364
+ "learning_rate": 1.2345802115384014e-05,
365
+ "loss": 0.0598,
366
+ "step": 57500
367
+ },
368
+ {
369
+ "epoch": 12.93,
370
+ "eval_accuracy": 0.8731853444965051,
371
+ "eval_f1": 0.8725266467114048,
372
+ "eval_loss": 0.06704463809728622,
373
+ "eval_runtime": 34.9394,
374
+ "eval_samples_per_second": 4098.78,
375
+ "eval_steps_per_second": 42.702,
376
+ "step": 57500
377
+ },
378
+ {
379
+ "epoch": 13.5,
380
+ "learning_rate": 1.1763063072194181e-05,
381
+ "loss": 0.0584,
382
+ "step": 60000
383
+ },
384
+ {
385
+ "epoch": 13.5,
386
+ "eval_accuracy": 0.8731713788937846,
387
+ "eval_f1": 0.8723142764159807,
388
+ "eval_loss": 0.06695400178432465,
389
+ "eval_runtime": 33.3284,
390
+ "eval_samples_per_second": 4296.9,
391
+ "eval_steps_per_second": 44.767,
392
+ "step": 60000
393
+ },
394
+ {
395
+ "epoch": 14.06,
396
+ "learning_rate": 1.1174072858099545e-05,
397
+ "loss": 0.0584,
398
+ "step": 62500
399
+ },
400
+ {
401
+ "epoch": 14.06,
402
+ "eval_accuracy": 0.8740023322556543,
403
+ "eval_f1": 0.8732025404369645,
404
+ "eval_loss": 0.0664532408118248,
405
+ "eval_runtime": 36.3142,
406
+ "eval_samples_per_second": 3943.604,
407
+ "eval_steps_per_second": 41.086,
408
+ "step": 62500
409
+ },
410
+ {
411
+ "epoch": 14.62,
412
+ "learning_rate": 1.0580919815002126e-05,
413
+ "loss": 0.0572,
414
+ "step": 65000
415
+ },
416
+ {
417
+ "epoch": 14.62,
418
+ "eval_accuracy": 0.8744282831386295,
419
+ "eval_f1": 0.8734337495737768,
420
+ "eval_loss": 0.06649637967348099,
421
+ "eval_runtime": 33.7341,
422
+ "eval_samples_per_second": 4245.227,
423
+ "eval_steps_per_second": 44.228,
424
+ "step": 65000
425
+ },
426
+ {
427
+ "epoch": 15.18,
428
+ "learning_rate": 9.98570704465907e-06,
429
+ "loss": 0.0567,
430
+ "step": 67500
431
+ },
432
+ {
433
+ "epoch": 15.18,
434
+ "eval_accuracy": 0.8752732021032198,
435
+ "eval_f1": 0.8744599630053497,
436
+ "eval_loss": 0.06611284613609314,
437
+ "eval_runtime": 33.3726,
438
+ "eval_samples_per_second": 4291.211,
439
+ "eval_steps_per_second": 44.707,
440
+ "step": 67500
441
+ },
442
+ {
443
+ "epoch": 15.74,
444
+ "learning_rate": 9.390544951860105e-06,
445
+ "loss": 0.0561,
446
+ "step": 70000
447
+ },
448
+ {
449
+ "epoch": 15.74,
450
+ "eval_accuracy": 0.8756293249725925,
451
+ "eval_f1": 0.8749748466805782,
452
+ "eval_loss": 0.06604801118373871,
453
+ "eval_runtime": 33.5726,
454
+ "eval_samples_per_second": 4265.656,
455
+ "eval_steps_per_second": 44.441,
456
+ "step": 70000
457
+ },
458
+ {
459
+ "epoch": 16.31,
460
+ "learning_rate": 8.797543761711079e-06,
461
+ "loss": 0.0554,
462
+ "step": 72500
463
+ },
464
+ {
465
+ "epoch": 16.31,
466
+ "eval_accuracy": 0.8758876886229218,
467
+ "eval_f1": 0.8750835415020012,
468
+ "eval_loss": 0.06606367230415344,
469
+ "eval_runtime": 35.9562,
470
+ "eval_samples_per_second": 3982.87,
471
+ "eval_steps_per_second": 41.495,
472
+ "step": 72500
473
+ },
474
+ {
475
+ "epoch": 16.87,
476
+ "learning_rate": 8.208806037554645e-06,
477
+ "loss": 0.0552,
478
+ "step": 75000
479
+ },
480
+ {
481
+ "epoch": 16.87,
482
+ "eval_accuracy": 0.875503634548108,
483
+ "eval_f1": 0.8748588561400703,
484
+ "eval_loss": 0.06561503559350967,
485
+ "eval_runtime": 33.3706,
486
+ "eval_samples_per_second": 4291.466,
487
+ "eval_steps_per_second": 44.71,
488
+ "step": 75000
489
+ },
490
+ {
491
+ "epoch": 17.43,
492
+ "learning_rate": 7.6264192260566915e-06,
493
+ "loss": 0.0544,
494
+ "step": 77500
495
+ },
496
+ {
497
+ "epoch": 17.43,
498
+ "eval_accuracy": 0.8761670006773318,
499
+ "eval_f1": 0.875430169191633,
500
+ "eval_loss": 0.06571561098098755,
501
+ "eval_runtime": 33.8002,
502
+ "eval_samples_per_second": 4236.932,
503
+ "eval_steps_per_second": 44.142,
504
+ "step": 77500
505
+ },
506
+ {
507
+ "epoch": 17.99,
508
+ "learning_rate": 7.052448255890958e-06,
509
+ "loss": 0.0544,
510
+ "step": 80000
511
+ },
512
+ {
513
+ "epoch": 17.99,
514
+ "eval_accuracy": 0.8766627795739095,
515
+ "eval_f1": 0.8759974000823453,
516
+ "eval_loss": 0.06542336195707321,
517
+ "eval_runtime": 34.4239,
518
+ "eval_samples_per_second": 4160.157,
519
+ "eval_steps_per_second": 43.342,
520
+ "step": 80000
521
+ },
522
+ {
523
+ "epoch": 18.56,
524
+ "learning_rate": 6.488928216264112e-06,
525
+ "loss": 0.0534,
526
+ "step": 82500
527
+ },
528
+ {
529
+ "epoch": 18.56,
530
+ "eval_accuracy": 0.8767116591834312,
531
+ "eval_f1": 0.8759135326151816,
532
+ "eval_loss": 0.06542443484067917,
533
+ "eval_runtime": 33.2655,
534
+ "eval_samples_per_second": 4305.035,
535
+ "eval_steps_per_second": 44.851,
536
+ "step": 82500
537
+ },
538
+ {
539
+ "epoch": 19.12,
540
+ "learning_rate": 5.93785714124059e-06,
541
+ "loss": 0.0534,
542
+ "step": 85000
543
+ },
544
+ {
545
+ "epoch": 19.12,
546
+ "eval_accuracy": 0.8773470941072139,
547
+ "eval_f1": 0.8766532567238048,
548
+ "eval_loss": 0.06530272215604782,
549
+ "eval_runtime": 35.0124,
550
+ "eval_samples_per_second": 4090.241,
551
+ "eval_steps_per_second": 42.614,
552
+ "step": 85000
553
+ },
554
+ {
555
+ "epoch": 19.68,
556
+ "learning_rate": 5.401188925451274e-06,
557
+ "loss": 0.0528,
558
+ "step": 87500
559
+ },
560
+ {
561
+ "epoch": 19.68,
562
+ "eval_accuracy": 0.8775146813398599,
563
+ "eval_f1": 0.8768256135826535,
564
+ "eval_loss": 0.06490106880664825,
565
+ "eval_runtime": 33.5689,
566
+ "eval_samples_per_second": 4266.118,
567
+ "eval_steps_per_second": 44.446,
568
+ "step": 87500
569
+ },
570
+ {
571
+ "epoch": 20.24,
572
+ "learning_rate": 4.880826396304312e-06,
573
+ "loss": 0.0525,
574
+ "step": 90000
575
+ },
576
+ {
577
+ "epoch": 20.24,
578
+ "eval_accuracy": 0.8776264061616239,
579
+ "eval_f1": 0.8768652778836793,
580
+ "eval_loss": 0.06507841497659683,
581
+ "eval_runtime": 33.3247,
582
+ "eval_samples_per_second": 4297.383,
583
+ "eval_steps_per_second": 44.772,
584
+ "step": 90000
585
+ },
586
+ {
587
+ "epoch": 20.81,
588
+ "learning_rate": 4.378614567261487e-06,
589
+ "loss": 0.0523,
590
+ "step": 92500
591
+ },
592
+ {
593
+ "epoch": 20.81,
594
+ "eval_accuracy": 0.8775007157371394,
595
+ "eval_f1": 0.8767952508584296,
596
+ "eval_loss": 0.06489618122577667,
597
+ "eval_runtime": 36.2651,
598
+ "eval_samples_per_second": 3948.949,
599
+ "eval_steps_per_second": 41.141,
600
+ "step": 92500
601
+ },
602
+ {
603
+ "epoch": 21.37,
604
+ "learning_rate": 3.896334096101447e-06,
605
+ "loss": 0.0517,
606
+ "step": 95000
607
+ },
608
+ {
609
+ "epoch": 21.37,
610
+ "eval_accuracy": 0.8782339098799656,
611
+ "eval_f1": 0.877492038187804,
612
+ "eval_loss": 0.06479762494564056,
613
+ "eval_runtime": 33.4015,
614
+ "eval_samples_per_second": 4287.5,
615
+ "eval_steps_per_second": 44.669,
616
+ "step": 95000
617
+ },
618
+ {
619
+ "epoch": 21.93,
620
+ "learning_rate": 3.4356949713644915e-06,
621
+ "loss": 0.0516,
622
+ "step": 97500
623
+ },
624
+ {
625
+ "epoch": 21.93,
626
+ "eval_accuracy": 0.8782897722908476,
627
+ "eval_f1": 0.8775965653086075,
628
+ "eval_loss": 0.06478870660066605,
629
+ "eval_runtime": 33.7278,
630
+ "eval_samples_per_second": 4246.021,
631
+ "eval_steps_per_second": 44.236,
632
+ "step": 97500
633
+ },
634
+ {
635
+ "epoch": 22.49,
636
+ "learning_rate": 2.9983304493643495e-06,
637
+ "loss": 0.0511,
638
+ "step": 100000
639
+ },
640
+ {
641
+ "epoch": 22.49,
642
+ "eval_accuracy": 0.8780523570445992,
643
+ "eval_f1": 0.8773587469443522,
644
+ "eval_loss": 0.06483691185712814,
645
+ "eval_runtime": 33.3746,
646
+ "eval_samples_per_second": 4290.959,
647
+ "eval_steps_per_second": 44.705,
648
+ "step": 100000
649
+ },
650
+ {
651
+ "epoch": 23.05,
652
+ "learning_rate": 2.5857912632641447e-06,
653
+ "loss": 0.0511,
654
+ "step": 102500
655
+ },
656
+ {
657
+ "epoch": 23.05,
658
+ "eval_accuracy": 0.8783246862976489,
659
+ "eval_f1": 0.8776168302082048,
660
+ "eval_loss": 0.06472069770097733,
661
+ "eval_runtime": 33.1429,
662
+ "eval_samples_per_second": 4320.953,
663
+ "eval_steps_per_second": 45.017,
664
+ "step": 102500
665
+ },
666
+ {
667
+ "epoch": 23.62,
668
+ "learning_rate": 2.199540124748957e-06,
669
+ "loss": 0.0508,
670
+ "step": 105000
671
+ },
672
+ {
673
+ "epoch": 23.62,
674
+ "eval_accuracy": 0.8785062391330154,
675
+ "eval_f1": 0.8777938106114025,
676
+ "eval_loss": 0.06474106758832932,
677
+ "eval_runtime": 33.3508,
678
+ "eval_samples_per_second": 4294.023,
679
+ "eval_steps_per_second": 44.737,
680
+ "step": 105000
681
+ },
682
+ {
683
+ "epoch": 24.18,
684
+ "learning_rate": 1.8409465377900981e-06,
685
+ "loss": 0.0505,
686
+ "step": 107500
687
+ },
688
+ {
689
+ "epoch": 24.18,
690
+ "eval_accuracy": 0.8785341703384564,
691
+ "eval_f1": 0.8777444031001141,
692
+ "eval_loss": 0.0646664947271347,
693
+ "eval_runtime": 33.8974,
694
+ "eval_samples_per_second": 4224.776,
695
+ "eval_steps_per_second": 44.015,
696
+ "step": 107500
697
+ },
698
+ {
699
+ "epoch": 24.74,
700
+ "learning_rate": 1.5112819428894976e-06,
701
+ "loss": 0.0505,
702
+ "step": 110000
703
+ },
704
+ {
705
+ "epoch": 24.74,
706
+ "eval_accuracy": 0.8787855511874254,
707
+ "eval_f1": 0.8780881956947786,
708
+ "eval_loss": 0.06458932906389236,
709
+ "eval_runtime": 33.1686,
710
+ "eval_samples_per_second": 4317.613,
711
+ "eval_steps_per_second": 44.982,
712
+ "step": 110000
713
+ },
714
+ {
715
+ "epoch": 25.3,
716
+ "learning_rate": 1.2117152090209806e-06,
717
+ "loss": 0.0503,
718
+ "step": 112500
719
+ },
720
+ {
721
+ "epoch": 25.3,
722
+ "eval_accuracy": 0.8786389123588602,
723
+ "eval_f1": 0.8779011228018737,
724
+ "eval_loss": 0.06458309292793274,
725
+ "eval_runtime": 33.1934,
726
+ "eval_samples_per_second": 4314.381,
727
+ "eval_steps_per_second": 44.949,
728
+ "step": 112500
729
+ },
730
+ {
731
+ "epoch": 25.87,
732
+ "learning_rate": 9.433084892523181e-07,
733
+ "loss": 0.0502,
734
+ "step": 115000
735
+ },
736
+ {
737
+ "epoch": 25.87,
738
+ "eval_accuracy": 0.8788623620023881,
739
+ "eval_f1": 0.878161188935205,
740
+ "eval_loss": 0.06459838151931763,
741
+ "eval_runtime": 35.5543,
742
+ "eval_samples_per_second": 4027.899,
743
+ "eval_steps_per_second": 41.964,
744
+ "step": 115000
745
+ },
746
+ {
747
+ "epoch": 26.43,
748
+ "learning_rate": 7.070134547424945e-07,
749
+ "loss": 0.0501,
750
+ "step": 117500
751
+ },
752
+ {
753
+ "epoch": 26.43,
754
+ "eval_accuracy": 0.8787995167901459,
755
+ "eval_f1": 0.8780747579631859,
756
+ "eval_loss": 0.06461162865161896,
757
+ "eval_runtime": 33.4871,
758
+ "eval_samples_per_second": 4276.544,
759
+ "eval_steps_per_second": 44.554,
760
+ "step": 117500
761
+ },
762
+ {
763
+ "epoch": 26.99,
764
+ "learning_rate": 5.036679204670313e-07,
765
+ "loss": 0.0501,
766
+ "step": 120000
767
+ },
768
+ {
769
+ "epoch": 26.99,
770
+ "eval_accuracy": 0.8791067600499969,
771
+ "eval_f1": 0.8784083866618659,
772
+ "eval_loss": 0.06452779471874237,
773
+ "eval_runtime": 33.2577,
774
+ "eval_samples_per_second": 4306.037,
775
+ "eval_steps_per_second": 44.862,
776
+ "step": 120000
777
+ },
778
+ {
779
+ "epoch": 27.55,
780
+ "learning_rate": 3.339928746353327e-07,
781
+ "loss": 0.05,
782
+ "step": 122500
783
+ },
784
+ {
785
+ "epoch": 27.55,
786
+ "eval_accuracy": 0.8790439148377546,
787
+ "eval_f1": 0.8783328357475545,
788
+ "eval_loss": 0.06455225497484207,
789
+ "eval_runtime": 33.5082,
790
+ "eval_samples_per_second": 4273.851,
791
+ "eval_steps_per_second": 44.526,
792
+ "step": 122500
793
+ },
794
+ {
795
+ "epoch": 28.12,
796
+ "learning_rate": 1.9858992233260598e-07,
797
+ "loss": 0.0497,
798
+ "step": 125000
799
+ },
800
+ {
801
+ "epoch": 28.12,
802
+ "eval_accuracy": 0.8791626224608788,
803
+ "eval_f1": 0.8784598679092687,
804
+ "eval_loss": 0.06452032178640366,
805
+ "eval_runtime": 33.4248,
806
+ "eval_samples_per_second": 4284.51,
807
+ "eval_steps_per_second": 44.637,
808
+ "step": 125000
809
+ },
810
+ {
811
+ "epoch": 28.68,
812
+ "learning_rate": 9.793915245028595e-08,
813
+ "loss": 0.0499,
814
+ "step": 127500
815
+ },
816
+ {
817
+ "epoch": 28.68,
818
+ "eval_accuracy": 0.8791277084540776,
819
+ "eval_f1": 0.8784333454818922,
820
+ "eval_loss": 0.06449923664331436,
821
+ "eval_runtime": 35.6969,
822
+ "eval_samples_per_second": 4011.8,
823
+ "eval_steps_per_second": 41.796,
824
+ "step": 127500
825
+ },
826
+ {
827
+ "epoch": 29.24,
828
+ "learning_rate": 3.239743546802565e-08,
829
+ "loss": 0.0499,
830
+ "step": 130000
831
+ },
832
+ {
833
+ "epoch": 29.24,
834
+ "eval_accuracy": 0.8791556396595186,
835
+ "eval_f1": 0.8784619602482451,
836
+ "eval_loss": 0.06451133638620377,
837
+ "eval_runtime": 33.3523,
838
+ "eval_samples_per_second": 4293.833,
839
+ "eval_steps_per_second": 44.735,
840
+ "step": 130000
841
+ },
842
+ {
843
+ "epoch": 29.8,
844
+ "learning_rate": 2.197158122699827e-09,
845
+ "loss": 0.0497,
846
+ "step": 132500
847
+ },
848
+ {
849
+ "epoch": 29.8,
850
+ "eval_accuracy": 0.8791137428513571,
851
+ "eval_f1": 0.8784195035720077,
852
+ "eval_loss": 0.06450776755809784,
853
+ "eval_runtime": 33.5967,
854
+ "eval_samples_per_second": 4262.595,
855
+ "eval_steps_per_second": 44.409,
856
+ "step": 132500
857
+ },
858
+ {
859
+ "epoch": 30.0,
860
+ "step": 133380,
861
+ "total_flos": 1.6627855289686797e+17,
862
+ "train_loss": 0.06639543622843883,
863
+ "train_runtime": 19911.3581,
864
+ "train_samples_per_second": 1714.776,
865
+ "train_steps_per_second": 6.699
866
+ }
867
+ ],
868
+ "max_steps": 133380,
869
+ "num_train_epochs": 30,
870
+ "total_flos": 1.6627855289686797e+17,
871
+ "trial_name": null,
872
+ "trial_params": null
873
+ }