mgh6 commited on
Commit
e4d2474
·
verified ·
1 Parent(s): 8c818e1

Training in progress, step 100, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:182ee968b6fdeec8216ae2242608aac4cf00a82309a22f2bc546f245f6a30f5b
3
  size 136000488
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a6e94e6f9602b019dfddf88bbd7631188723c74100c86f84c8aa54a9c83d963
3
  size 136000488
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dfc3984044cbf3ce86e00b28e3e8d7a9ea91edc27a0b44779f188f12efd55185
3
  size 268176506
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f2395a5137dda5eb97e54395eb65f6e40d010266167d521b6e12b7cb1436435
3
  size 268176506
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:56f87b775049fb0adab4e0d540aff9b9f075c23a8d207a780cdfad0536093ab3
3
  size 15006
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c208ccb16ca4d1a71397af65ad74bbc2d33a691a42a2d223ddec56de42a775e
3
  size 15006
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a16798b06a013ad4b7ec3ca11219408d900e5c425fe7c3d917c437397043544f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea9b67c97ec0b0a1b79a6330badd5da865b550616c82ba334622fd4f95186829
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,1422 +1,27 @@
1
  {
2
- "best_metric": 1.0438764095306396,
3
- "best_model_checkpoint": "mgh6/TCS_MLM_50/checkpoint-8900",
4
- "epoch": 2.7285921625544267,
5
  "eval_steps": 100,
6
- "global_step": 9400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.02902757619738752,
13
- "grad_norm": 1.131932258605957,
14
  "learning_rate": 9.970972423802612e-05,
15
- "loss": 2.8244,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.02902757619738752,
20
- "eval_loss": 1.2662084102630615,
21
- "eval_runtime": 213.5614,
22
- "eval_samples_per_second": 213.049,
23
- "eval_steps_per_second": 3.329,
24
- "step": 100
25
- },
26
- {
27
- "epoch": 0.05805515239477504,
28
- "grad_norm": 1.0239707231521606,
29
- "learning_rate": 9.941944847605225e-05,
30
- "loss": 2.7081,
31
- "step": 200
32
- },
33
- {
34
- "epoch": 0.05805515239477504,
35
- "eval_loss": 1.2453378438949585,
36
- "eval_runtime": 212.9056,
37
- "eval_samples_per_second": 213.705,
38
- "eval_steps_per_second": 3.34,
39
- "step": 200
40
- },
41
- {
42
- "epoch": 0.08708272859216255,
43
- "grad_norm": 1.1205116510391235,
44
- "learning_rate": 9.912917271407838e-05,
45
- "loss": 2.642,
46
- "step": 300
47
- },
48
- {
49
- "epoch": 0.08708272859216255,
50
- "eval_loss": 1.2237757444381714,
51
- "eval_runtime": 214.4447,
52
- "eval_samples_per_second": 212.171,
53
- "eval_steps_per_second": 3.316,
54
- "step": 300
55
- },
56
- {
57
- "epoch": 0.11611030478955008,
58
- "grad_norm": 1.0193355083465576,
59
- "learning_rate": 9.883889695210451e-05,
60
- "loss": 2.6037,
61
- "step": 400
62
- },
63
- {
64
- "epoch": 0.11611030478955008,
65
- "eval_loss": 1.2148627042770386,
66
- "eval_runtime": 213.5123,
67
- "eval_samples_per_second": 213.098,
68
- "eval_steps_per_second": 3.33,
69
- "step": 400
70
- },
71
- {
72
- "epoch": 0.14513788098693758,
73
- "grad_norm": 1.05299711227417,
74
- "learning_rate": 9.854862119013063e-05,
75
- "loss": 2.5791,
76
- "step": 500
77
- },
78
- {
79
- "epoch": 0.14513788098693758,
80
- "eval_loss": 1.2020208835601807,
81
- "eval_runtime": 213.769,
82
- "eval_samples_per_second": 212.842,
83
- "eval_steps_per_second": 3.326,
84
- "step": 500
85
- },
86
- {
87
- "epoch": 0.1741654571843251,
88
- "grad_norm": 1.0508314371109009,
89
- "learning_rate": 9.825834542815675e-05,
90
- "loss": 2.5464,
91
- "step": 600
92
- },
93
- {
94
- "epoch": 0.1741654571843251,
95
- "eval_loss": 1.1960116624832153,
96
- "eval_runtime": 214.1083,
97
- "eval_samples_per_second": 212.505,
98
- "eval_steps_per_second": 3.321,
99
- "step": 600
100
- },
101
- {
102
- "epoch": 0.20319303338171263,
103
- "grad_norm": 1.158460021018982,
104
- "learning_rate": 9.796806966618288e-05,
105
- "loss": 2.5391,
106
- "step": 700
107
- },
108
- {
109
- "epoch": 0.20319303338171263,
110
- "eval_loss": 1.186664342880249,
111
- "eval_runtime": 213.4364,
112
- "eval_samples_per_second": 213.174,
113
- "eval_steps_per_second": 3.331,
114
- "step": 700
115
- },
116
- {
117
- "epoch": 0.23222060957910015,
118
- "grad_norm": 1.0704821348190308,
119
- "learning_rate": 9.767779390420901e-05,
120
- "loss": 2.4944,
121
- "step": 800
122
- },
123
- {
124
- "epoch": 0.23222060957910015,
125
- "eval_loss": 1.1850290298461914,
126
- "eval_runtime": 213.63,
127
- "eval_samples_per_second": 212.98,
128
- "eval_steps_per_second": 3.328,
129
- "step": 800
130
- },
131
- {
132
- "epoch": 0.2612481857764877,
133
- "grad_norm": 1.0562227964401245,
134
- "learning_rate": 9.738751814223513e-05,
135
- "loss": 2.4879,
136
- "step": 900
137
- },
138
- {
139
- "epoch": 0.2612481857764877,
140
- "eval_loss": 1.1725127696990967,
141
- "eval_runtime": 213.7307,
142
- "eval_samples_per_second": 212.88,
143
- "eval_steps_per_second": 3.327,
144
- "step": 900
145
- },
146
- {
147
- "epoch": 0.29027576197387517,
148
- "grad_norm": 1.136777639389038,
149
- "learning_rate": 9.709724238026126e-05,
150
- "loss": 2.4647,
151
- "step": 1000
152
- },
153
- {
154
- "epoch": 0.29027576197387517,
155
- "eval_loss": 1.1709253787994385,
156
- "eval_runtime": 213.2147,
157
- "eval_samples_per_second": 213.395,
158
- "eval_steps_per_second": 3.335,
159
- "step": 1000
160
- },
161
- {
162
- "epoch": 0.3193033381712627,
163
- "grad_norm": 1.0949931144714355,
164
- "learning_rate": 9.680696661828737e-05,
165
- "loss": 2.4441,
166
- "step": 1100
167
- },
168
- {
169
- "epoch": 0.3193033381712627,
170
- "eval_loss": 1.1647560596466064,
171
- "eval_runtime": 213.5056,
172
- "eval_samples_per_second": 213.104,
173
- "eval_steps_per_second": 3.33,
174
- "step": 1100
175
- },
176
- {
177
- "epoch": 0.3483309143686502,
178
- "grad_norm": 1.2719751596450806,
179
- "learning_rate": 9.65166908563135e-05,
180
- "loss": 2.432,
181
- "step": 1200
182
- },
183
- {
184
- "epoch": 0.3483309143686502,
185
- "eval_loss": 1.1668621301651,
186
- "eval_runtime": 213.8017,
187
- "eval_samples_per_second": 212.809,
188
- "eval_steps_per_second": 3.326,
189
- "step": 1200
190
- },
191
- {
192
- "epoch": 0.37735849056603776,
193
- "grad_norm": 1.1357173919677734,
194
- "learning_rate": 9.622641509433963e-05,
195
- "loss": 2.4173,
196
- "step": 1300
197
- },
198
- {
199
- "epoch": 0.37735849056603776,
200
- "eval_loss": 1.1585583686828613,
201
- "eval_runtime": 212.8448,
202
- "eval_samples_per_second": 213.766,
203
- "eval_steps_per_second": 3.34,
204
- "step": 1300
205
- },
206
- {
207
- "epoch": 0.40638606676342526,
208
- "grad_norm": 1.1240577697753906,
209
- "learning_rate": 9.593613933236575e-05,
210
- "loss": 2.4029,
211
- "step": 1400
212
- },
213
- {
214
- "epoch": 0.40638606676342526,
215
- "eval_loss": 1.1513617038726807,
216
- "eval_runtime": 214.5547,
217
- "eval_samples_per_second": 212.063,
218
- "eval_steps_per_second": 3.314,
219
- "step": 1400
220
- },
221
- {
222
- "epoch": 0.43541364296081275,
223
- "grad_norm": 1.074048399925232,
224
- "learning_rate": 9.564586357039188e-05,
225
- "loss": 2.3964,
226
- "step": 1500
227
- },
228
- {
229
- "epoch": 0.43541364296081275,
230
- "eval_loss": 1.1514214277267456,
231
- "eval_runtime": 213.8115,
232
- "eval_samples_per_second": 212.8,
233
- "eval_steps_per_second": 3.325,
234
- "step": 1500
235
- },
236
- {
237
- "epoch": 0.4644412191582003,
238
- "grad_norm": 1.2565686702728271,
239
- "learning_rate": 9.5355587808418e-05,
240
- "loss": 2.3548,
241
- "step": 1600
242
- },
243
- {
244
- "epoch": 0.4644412191582003,
245
- "eval_loss": 1.1476994752883911,
246
- "eval_runtime": 214.3759,
247
- "eval_samples_per_second": 212.239,
248
- "eval_steps_per_second": 3.317,
249
- "step": 1600
250
- },
251
- {
252
- "epoch": 0.4934687953555878,
253
- "grad_norm": 1.1474090814590454,
254
- "learning_rate": 9.506531204644412e-05,
255
- "loss": 2.36,
256
- "step": 1700
257
- },
258
- {
259
- "epoch": 0.4934687953555878,
260
- "eval_loss": 1.1446571350097656,
261
- "eval_runtime": 213.458,
262
- "eval_samples_per_second": 213.152,
263
- "eval_steps_per_second": 3.331,
264
- "step": 1700
265
- },
266
- {
267
- "epoch": 0.5224963715529753,
268
- "grad_norm": 1.2290916442871094,
269
- "learning_rate": 9.477503628447025e-05,
270
- "loss": 2.3438,
271
- "step": 1800
272
- },
273
- {
274
- "epoch": 0.5224963715529753,
275
- "eval_loss": 1.1393438577651978,
276
- "eval_runtime": 213.014,
277
- "eval_samples_per_second": 213.596,
278
- "eval_steps_per_second": 3.338,
279
- "step": 1800
280
- },
281
- {
282
- "epoch": 0.5515239477503628,
283
- "grad_norm": 1.1700950860977173,
284
- "learning_rate": 9.448476052249638e-05,
285
- "loss": 2.3416,
286
- "step": 1900
287
- },
288
- {
289
- "epoch": 0.5515239477503628,
290
- "eval_loss": 1.1348192691802979,
291
- "eval_runtime": 213.2252,
292
- "eval_samples_per_second": 213.385,
293
- "eval_steps_per_second": 3.335,
294
- "step": 1900
295
- },
296
- {
297
- "epoch": 0.5805515239477503,
298
- "grad_norm": 1.1090705394744873,
299
- "learning_rate": 9.419448476052251e-05,
300
- "loss": 2.3289,
301
- "step": 2000
302
- },
303
- {
304
- "epoch": 0.5805515239477503,
305
- "eval_loss": 1.130873203277588,
306
- "eval_runtime": 212.7564,
307
- "eval_samples_per_second": 213.855,
308
- "eval_steps_per_second": 3.342,
309
- "step": 2000
310
- },
311
- {
312
- "epoch": 0.6095791001451378,
313
- "grad_norm": 1.17753267288208,
314
- "learning_rate": 9.390420899854863e-05,
315
- "loss": 2.3218,
316
- "step": 2100
317
- },
318
- {
319
- "epoch": 0.6095791001451378,
320
- "eval_loss": 1.1335190534591675,
321
- "eval_runtime": 212.7619,
322
- "eval_samples_per_second": 213.849,
323
- "eval_steps_per_second": 3.342,
324
- "step": 2100
325
- },
326
- {
327
- "epoch": 0.6386066763425254,
328
- "grad_norm": 1.087358832359314,
329
- "learning_rate": 9.361393323657474e-05,
330
- "loss": 2.3072,
331
- "step": 2200
332
- },
333
- {
334
- "epoch": 0.6386066763425254,
335
- "eval_loss": 1.1303313970565796,
336
- "eval_runtime": 213.3449,
337
- "eval_samples_per_second": 213.265,
338
- "eval_steps_per_second": 3.333,
339
- "step": 2200
340
- },
341
- {
342
- "epoch": 0.6676342525399129,
343
- "grad_norm": 1.1286981105804443,
344
- "learning_rate": 9.332365747460087e-05,
345
- "loss": 2.2881,
346
- "step": 2300
347
- },
348
- {
349
- "epoch": 0.6676342525399129,
350
- "eval_loss": 1.1234804391860962,
351
- "eval_runtime": 213.3465,
352
- "eval_samples_per_second": 213.263,
353
- "eval_steps_per_second": 3.333,
354
- "step": 2300
355
- },
356
- {
357
- "epoch": 0.6966618287373004,
358
- "grad_norm": 1.1590163707733154,
359
- "learning_rate": 9.3033381712627e-05,
360
- "loss": 2.2751,
361
- "step": 2400
362
- },
363
- {
364
- "epoch": 0.6966618287373004,
365
- "eval_loss": 1.120328664779663,
366
- "eval_runtime": 213.9246,
367
- "eval_samples_per_second": 212.687,
368
- "eval_steps_per_second": 3.324,
369
- "step": 2400
370
- },
371
- {
372
- "epoch": 0.7256894049346879,
373
- "grad_norm": 1.3988169431686401,
374
- "learning_rate": 9.274310595065312e-05,
375
- "loss": 2.2666,
376
- "step": 2500
377
- },
378
- {
379
- "epoch": 0.7256894049346879,
380
- "eval_loss": 1.1266223192214966,
381
- "eval_runtime": 214.3634,
382
- "eval_samples_per_second": 212.252,
383
- "eval_steps_per_second": 3.317,
384
- "step": 2500
385
- },
386
- {
387
- "epoch": 0.7547169811320755,
388
- "grad_norm": 1.239560842514038,
389
- "learning_rate": 9.245283018867925e-05,
390
- "loss": 2.2702,
391
- "step": 2600
392
- },
393
- {
394
- "epoch": 0.7547169811320755,
395
- "eval_loss": 1.1224210262298584,
396
- "eval_runtime": 213.2424,
397
- "eval_samples_per_second": 213.367,
398
- "eval_steps_per_second": 3.334,
399
- "step": 2600
400
- },
401
- {
402
- "epoch": 0.783744557329463,
403
- "grad_norm": 1.1289948225021362,
404
- "learning_rate": 9.216255442670537e-05,
405
- "loss": 2.256,
406
- "step": 2700
407
- },
408
- {
409
- "epoch": 0.783744557329463,
410
- "eval_loss": 1.1150513887405396,
411
- "eval_runtime": 213.4486,
412
- "eval_samples_per_second": 213.161,
413
- "eval_steps_per_second": 3.331,
414
- "step": 2700
415
- },
416
- {
417
- "epoch": 0.8127721335268505,
418
- "grad_norm": 1.1463016271591187,
419
- "learning_rate": 9.18722786647315e-05,
420
- "loss": 2.2483,
421
- "step": 2800
422
- },
423
- {
424
- "epoch": 0.8127721335268505,
425
- "eval_loss": 1.1185483932495117,
426
- "eval_runtime": 212.704,
427
- "eval_samples_per_second": 213.908,
428
- "eval_steps_per_second": 3.343,
429
- "step": 2800
430
- },
431
- {
432
- "epoch": 0.841799709724238,
433
- "grad_norm": 1.1233168840408325,
434
- "learning_rate": 9.158200290275763e-05,
435
- "loss": 2.2328,
436
- "step": 2900
437
- },
438
- {
439
- "epoch": 0.841799709724238,
440
- "eval_loss": 1.1085420846939087,
441
- "eval_runtime": 213.7255,
442
- "eval_samples_per_second": 212.885,
443
- "eval_steps_per_second": 3.327,
444
- "step": 2900
445
- },
446
- {
447
- "epoch": 0.8708272859216255,
448
- "grad_norm": 1.1887527704238892,
449
- "learning_rate": 9.129172714078375e-05,
450
- "loss": 2.235,
451
- "step": 3000
452
- },
453
- {
454
- "epoch": 0.8708272859216255,
455
- "eval_loss": 1.1104073524475098,
456
- "eval_runtime": 213.9252,
457
- "eval_samples_per_second": 212.687,
458
- "eval_steps_per_second": 3.324,
459
- "step": 3000
460
- },
461
- {
462
- "epoch": 0.8998548621190131,
463
- "grad_norm": 1.2834577560424805,
464
- "learning_rate": 9.100145137880988e-05,
465
- "loss": 2.2209,
466
- "step": 3100
467
- },
468
- {
469
- "epoch": 0.8998548621190131,
470
- "eval_loss": 1.1137757301330566,
471
- "eval_runtime": 213.6201,
472
- "eval_samples_per_second": 212.99,
473
- "eval_steps_per_second": 3.328,
474
- "step": 3100
475
- },
476
- {
477
- "epoch": 0.9288824383164006,
478
- "grad_norm": 1.3034873008728027,
479
- "learning_rate": 9.0711175616836e-05,
480
- "loss": 2.2185,
481
- "step": 3200
482
- },
483
- {
484
- "epoch": 0.9288824383164006,
485
- "eval_loss": 1.107863187789917,
486
- "eval_runtime": 213.1098,
487
- "eval_samples_per_second": 213.5,
488
- "eval_steps_per_second": 3.336,
489
- "step": 3200
490
- },
491
- {
492
- "epoch": 0.9579100145137881,
493
- "grad_norm": 1.1802492141723633,
494
- "learning_rate": 9.042089985486212e-05,
495
- "loss": 2.2147,
496
- "step": 3300
497
- },
498
- {
499
- "epoch": 0.9579100145137881,
500
- "eval_loss": 1.1041762828826904,
501
- "eval_runtime": 213.2962,
502
- "eval_samples_per_second": 213.314,
503
- "eval_steps_per_second": 3.333,
504
- "step": 3300
505
- },
506
- {
507
- "epoch": 0.9869375907111756,
508
- "grad_norm": 1.2992894649505615,
509
- "learning_rate": 9.013062409288826e-05,
510
- "loss": 2.216,
511
- "step": 3400
512
- },
513
- {
514
- "epoch": 0.9869375907111756,
515
- "eval_loss": 1.1009138822555542,
516
- "eval_runtime": 213.7998,
517
- "eval_samples_per_second": 212.811,
518
- "eval_steps_per_second": 3.326,
519
- "step": 3400
520
- },
521
- {
522
- "epoch": 1.0159651669085632,
523
- "grad_norm": 1.1432065963745117,
524
- "learning_rate": 8.984034833091437e-05,
525
- "loss": 2.1952,
526
- "step": 3500
527
- },
528
- {
529
- "epoch": 1.0159651669085632,
530
- "eval_loss": 1.106726884841919,
531
- "eval_runtime": 213.7054,
532
- "eval_samples_per_second": 212.905,
533
- "eval_steps_per_second": 3.327,
534
- "step": 3500
535
- },
536
- {
537
- "epoch": 1.0449927431059507,
538
- "grad_norm": 1.1603158712387085,
539
- "learning_rate": 8.95500725689405e-05,
540
- "loss": 2.2019,
541
- "step": 3600
542
- },
543
- {
544
- "epoch": 1.0449927431059507,
545
- "eval_loss": 1.1014330387115479,
546
- "eval_runtime": 213.1977,
547
- "eval_samples_per_second": 213.412,
548
- "eval_steps_per_second": 3.335,
549
- "step": 3600
550
- },
551
- {
552
- "epoch": 1.0740203193033382,
553
- "grad_norm": 1.2428488731384277,
554
- "learning_rate": 8.925979680696662e-05,
555
- "loss": 2.1959,
556
- "step": 3700
557
- },
558
- {
559
- "epoch": 1.0740203193033382,
560
- "eval_loss": 1.1004406213760376,
561
- "eval_runtime": 213.3658,
562
- "eval_samples_per_second": 213.244,
563
- "eval_steps_per_second": 3.332,
564
- "step": 3700
565
- },
566
- {
567
- "epoch": 1.1030478955007257,
568
- "grad_norm": 1.1615545749664307,
569
- "learning_rate": 8.896952104499274e-05,
570
- "loss": 2.1776,
571
- "step": 3800
572
- },
573
- {
574
- "epoch": 1.1030478955007257,
575
- "eval_loss": 1.0938160419464111,
576
- "eval_runtime": 213.3987,
577
- "eval_samples_per_second": 213.211,
578
- "eval_steps_per_second": 3.332,
579
- "step": 3800
580
- },
581
- {
582
- "epoch": 1.1320754716981132,
583
- "grad_norm": 1.1921610832214355,
584
- "learning_rate": 8.867924528301888e-05,
585
- "loss": 2.1762,
586
- "step": 3900
587
- },
588
- {
589
- "epoch": 1.1320754716981132,
590
- "eval_loss": 1.0960694551467896,
591
- "eval_runtime": 213.1832,
592
- "eval_samples_per_second": 213.427,
593
- "eval_steps_per_second": 3.335,
594
- "step": 3900
595
- },
596
- {
597
- "epoch": 1.1611030478955007,
598
- "grad_norm": 1.1980363130569458,
599
- "learning_rate": 8.8388969521045e-05,
600
- "loss": 2.1717,
601
- "step": 4000
602
- },
603
- {
604
- "epoch": 1.1611030478955007,
605
- "eval_loss": 1.0951919555664062,
606
- "eval_runtime": 213.4024,
607
- "eval_samples_per_second": 213.207,
608
- "eval_steps_per_second": 3.332,
609
- "step": 4000
610
- },
611
- {
612
- "epoch": 1.1901306240928882,
613
- "grad_norm": 1.217236042022705,
614
- "learning_rate": 8.809869375907113e-05,
615
- "loss": 2.1534,
616
- "step": 4100
617
- },
618
- {
619
- "epoch": 1.1901306240928882,
620
- "eval_loss": 1.0937577486038208,
621
- "eval_runtime": 213.8113,
622
- "eval_samples_per_second": 212.8,
623
- "eval_steps_per_second": 3.325,
624
- "step": 4100
625
- },
626
- {
627
- "epoch": 1.2191582002902757,
628
- "grad_norm": 1.2121118307113647,
629
- "learning_rate": 8.780841799709725e-05,
630
- "loss": 2.1639,
631
- "step": 4200
632
- },
633
- {
634
- "epoch": 1.2191582002902757,
635
- "eval_loss": 1.0909945964813232,
636
- "eval_runtime": 212.8308,
637
- "eval_samples_per_second": 213.78,
638
- "eval_steps_per_second": 3.341,
639
- "step": 4200
640
- },
641
- {
642
- "epoch": 1.2481857764876634,
643
- "grad_norm": 1.17587411403656,
644
- "learning_rate": 8.751814223512336e-05,
645
- "loss": 2.146,
646
- "step": 4300
647
- },
648
- {
649
- "epoch": 1.2481857764876634,
650
- "eval_loss": 1.0888868570327759,
651
- "eval_runtime": 213.8752,
652
- "eval_samples_per_second": 212.736,
653
- "eval_steps_per_second": 3.324,
654
- "step": 4300
655
- },
656
- {
657
- "epoch": 1.2772133526850509,
658
- "grad_norm": 1.2848412990570068,
659
- "learning_rate": 8.722786647314949e-05,
660
- "loss": 2.1357,
661
- "step": 4400
662
- },
663
- {
664
- "epoch": 1.2772133526850509,
665
- "eval_loss": 1.091068983078003,
666
- "eval_runtime": 213.4081,
667
- "eval_samples_per_second": 213.202,
668
- "eval_steps_per_second": 3.332,
669
- "step": 4400
670
- },
671
- {
672
- "epoch": 1.3062409288824384,
673
- "grad_norm": 1.2059731483459473,
674
- "learning_rate": 8.693759071117562e-05,
675
- "loss": 2.1456,
676
- "step": 4500
677
- },
678
- {
679
- "epoch": 1.3062409288824384,
680
- "eval_loss": 1.0857021808624268,
681
- "eval_runtime": 213.7314,
682
- "eval_samples_per_second": 212.879,
683
- "eval_steps_per_second": 3.327,
684
- "step": 4500
685
- },
686
- {
687
- "epoch": 1.3352685050798259,
688
- "grad_norm": 1.226241946220398,
689
- "learning_rate": 8.664731494920174e-05,
690
- "loss": 2.1453,
691
- "step": 4600
692
- },
693
- {
694
- "epoch": 1.3352685050798259,
695
- "eval_loss": 1.0845140218734741,
696
- "eval_runtime": 213.4698,
697
- "eval_samples_per_second": 213.14,
698
- "eval_steps_per_second": 3.331,
699
- "step": 4600
700
- },
701
- {
702
- "epoch": 1.3642960812772134,
703
- "grad_norm": 1.1810499429702759,
704
- "learning_rate": 8.635703918722787e-05,
705
- "loss": 2.1425,
706
- "step": 4700
707
- },
708
- {
709
- "epoch": 1.3642960812772134,
710
- "eval_loss": 1.0831544399261475,
711
- "eval_runtime": 214.2077,
712
- "eval_samples_per_second": 212.406,
713
- "eval_steps_per_second": 3.319,
714
- "step": 4700
715
- },
716
- {
717
- "epoch": 1.3933236574746009,
718
- "grad_norm": 1.155281662940979,
719
- "learning_rate": 8.606676342525399e-05,
720
- "loss": 2.1173,
721
- "step": 4800
722
- },
723
- {
724
- "epoch": 1.3933236574746009,
725
- "eval_loss": 1.0785441398620605,
726
- "eval_runtime": 213.6973,
727
- "eval_samples_per_second": 212.913,
728
- "eval_steps_per_second": 3.327,
729
- "step": 4800
730
- },
731
- {
732
- "epoch": 1.4223512336719883,
733
- "grad_norm": 1.2070744037628174,
734
- "learning_rate": 8.577648766328012e-05,
735
- "loss": 2.1183,
736
- "step": 4900
737
- },
738
- {
739
- "epoch": 1.4223512336719883,
740
- "eval_loss": 1.0808286666870117,
741
- "eval_runtime": 213.4564,
742
- "eval_samples_per_second": 213.154,
743
- "eval_steps_per_second": 3.331,
744
- "step": 4900
745
- },
746
- {
747
- "epoch": 1.4513788098693758,
748
- "grad_norm": 1.1901525259017944,
749
- "learning_rate": 8.548621190130625e-05,
750
- "loss": 2.1274,
751
- "step": 5000
752
- },
753
- {
754
- "epoch": 1.4513788098693758,
755
- "eval_loss": 1.0827044248580933,
756
- "eval_runtime": 212.5592,
757
- "eval_samples_per_second": 214.053,
758
- "eval_steps_per_second": 3.345,
759
- "step": 5000
760
- },
761
- {
762
- "epoch": 1.4804063860667633,
763
- "grad_norm": 1.1999766826629639,
764
- "learning_rate": 8.519593613933237e-05,
765
- "loss": 2.1145,
766
- "step": 5100
767
- },
768
- {
769
- "epoch": 1.4804063860667633,
770
- "eval_loss": 1.078644037246704,
771
- "eval_runtime": 213.0532,
772
- "eval_samples_per_second": 213.557,
773
- "eval_steps_per_second": 3.337,
774
- "step": 5100
775
- },
776
- {
777
- "epoch": 1.509433962264151,
778
- "grad_norm": 1.2294871807098389,
779
- "learning_rate": 8.49056603773585e-05,
780
- "loss": 2.1067,
781
- "step": 5200
782
- },
783
- {
784
- "epoch": 1.509433962264151,
785
- "eval_loss": 1.0794402360916138,
786
- "eval_runtime": 212.9617,
787
- "eval_samples_per_second": 213.649,
788
- "eval_steps_per_second": 3.339,
789
- "step": 5200
790
- },
791
- {
792
- "epoch": 1.5384615384615383,
793
- "grad_norm": 1.2571580410003662,
794
- "learning_rate": 8.461538461538461e-05,
795
- "loss": 2.1032,
796
- "step": 5300
797
- },
798
- {
799
- "epoch": 1.5384615384615383,
800
- "eval_loss": 1.0783346891403198,
801
- "eval_runtime": 213.4656,
802
- "eval_samples_per_second": 213.144,
803
- "eval_steps_per_second": 3.331,
804
- "step": 5300
805
- },
806
- {
807
- "epoch": 1.567489114658926,
808
- "grad_norm": 1.2078722715377808,
809
- "learning_rate": 8.432510885341074e-05,
810
- "loss": 2.0912,
811
- "step": 5400
812
- },
813
- {
814
- "epoch": 1.567489114658926,
815
- "eval_loss": 1.0764219760894775,
816
- "eval_runtime": 213.826,
817
- "eval_samples_per_second": 212.785,
818
- "eval_steps_per_second": 3.325,
819
- "step": 5400
820
- },
821
- {
822
- "epoch": 1.5965166908563135,
823
- "grad_norm": 1.272294521331787,
824
- "learning_rate": 8.403483309143688e-05,
825
- "loss": 2.0784,
826
- "step": 5500
827
- },
828
- {
829
- "epoch": 1.5965166908563135,
830
- "eval_loss": 1.0817687511444092,
831
- "eval_runtime": 213.443,
832
- "eval_samples_per_second": 213.167,
833
- "eval_steps_per_second": 3.331,
834
- "step": 5500
835
- },
836
- {
837
- "epoch": 1.625544267053701,
838
- "grad_norm": 1.2367442846298218,
839
- "learning_rate": 8.374455732946299e-05,
840
- "loss": 2.0997,
841
- "step": 5600
842
- },
843
- {
844
- "epoch": 1.625544267053701,
845
- "eval_loss": 1.079858660697937,
846
- "eval_runtime": 213.7339,
847
- "eval_samples_per_second": 212.877,
848
- "eval_steps_per_second": 3.327,
849
- "step": 5600
850
- },
851
- {
852
- "epoch": 1.6545718432510885,
853
- "grad_norm": 1.2720229625701904,
854
- "learning_rate": 8.345428156748912e-05,
855
- "loss": 2.093,
856
- "step": 5700
857
- },
858
- {
859
- "epoch": 1.6545718432510885,
860
- "eval_loss": 1.0779507160186768,
861
- "eval_runtime": 213.2034,
862
- "eval_samples_per_second": 213.407,
863
- "eval_steps_per_second": 3.335,
864
- "step": 5700
865
- },
866
- {
867
- "epoch": 1.683599419448476,
868
- "grad_norm": 1.1694726943969727,
869
- "learning_rate": 8.316400580551524e-05,
870
- "loss": 2.0822,
871
- "step": 5800
872
- },
873
- {
874
- "epoch": 1.683599419448476,
875
- "eval_loss": 1.068250060081482,
876
- "eval_runtime": 213.1022,
877
- "eval_samples_per_second": 213.508,
878
- "eval_steps_per_second": 3.336,
879
- "step": 5800
880
- },
881
- {
882
- "epoch": 1.7126269956458637,
883
- "grad_norm": 1.2155323028564453,
884
- "learning_rate": 8.287373004354137e-05,
885
- "loss": 2.0792,
886
- "step": 5900
887
- },
888
- {
889
- "epoch": 1.7126269956458637,
890
- "eval_loss": 1.0666776895523071,
891
- "eval_runtime": 213.4935,
892
- "eval_samples_per_second": 213.117,
893
- "eval_steps_per_second": 3.33,
894
- "step": 5900
895
- },
896
- {
897
- "epoch": 1.741654571843251,
898
- "grad_norm": 1.3163602352142334,
899
- "learning_rate": 8.25834542815675e-05,
900
- "loss": 2.0712,
901
- "step": 6000
902
- },
903
- {
904
- "epoch": 1.741654571843251,
905
- "eval_loss": 1.0677340030670166,
906
- "eval_runtime": 213.751,
907
- "eval_samples_per_second": 212.86,
908
- "eval_steps_per_second": 3.326,
909
- "step": 6000
910
- },
911
- {
912
- "epoch": 1.7706821480406387,
913
- "grad_norm": 1.1972286701202393,
914
- "learning_rate": 8.229317851959362e-05,
915
- "loss": 2.0679,
916
- "step": 6100
917
- },
918
- {
919
- "epoch": 1.7706821480406387,
920
- "eval_loss": 1.0662775039672852,
921
- "eval_runtime": 213.7705,
922
- "eval_samples_per_second": 212.84,
923
- "eval_steps_per_second": 3.326,
924
- "step": 6100
925
- },
926
- {
927
- "epoch": 1.799709724238026,
928
- "grad_norm": 1.189395546913147,
929
- "learning_rate": 8.200290275761974e-05,
930
- "loss": 2.0753,
931
- "step": 6200
932
- },
933
- {
934
- "epoch": 1.799709724238026,
935
- "eval_loss": 1.0646038055419922,
936
- "eval_runtime": 213.3945,
937
- "eval_samples_per_second": 213.215,
938
- "eval_steps_per_second": 3.332,
939
- "step": 6200
940
- },
941
- {
942
- "epoch": 1.8287373004354137,
943
- "grad_norm": 1.2696415185928345,
944
- "learning_rate": 8.171262699564587e-05,
945
- "loss": 2.063,
946
- "step": 6300
947
- },
948
- {
949
- "epoch": 1.8287373004354137,
950
- "eval_loss": 1.0669814348220825,
951
- "eval_runtime": 213.7587,
952
- "eval_samples_per_second": 212.852,
953
- "eval_steps_per_second": 3.326,
954
- "step": 6300
955
- },
956
- {
957
- "epoch": 1.8577648766328012,
958
- "grad_norm": 1.241452693939209,
959
- "learning_rate": 8.142235123367198e-05,
960
- "loss": 2.0508,
961
- "step": 6400
962
- },
963
- {
964
- "epoch": 1.8577648766328012,
965
- "eval_loss": 1.072275996208191,
966
- "eval_runtime": 213.3197,
967
- "eval_samples_per_second": 213.29,
968
- "eval_steps_per_second": 3.333,
969
- "step": 6400
970
- },
971
- {
972
- "epoch": 1.8867924528301887,
973
- "grad_norm": 1.22267484664917,
974
- "learning_rate": 8.113207547169813e-05,
975
- "loss": 2.07,
976
- "step": 6500
977
- },
978
- {
979
- "epoch": 1.8867924528301887,
980
- "eval_loss": 1.0654535293579102,
981
- "eval_runtime": 214.0386,
982
- "eval_samples_per_second": 212.574,
983
- "eval_steps_per_second": 3.322,
984
- "step": 6500
985
- },
986
- {
987
- "epoch": 1.9158200290275762,
988
- "grad_norm": 1.2704839706420898,
989
- "learning_rate": 8.084179970972424e-05,
990
- "loss": 2.0646,
991
- "step": 6600
992
- },
993
- {
994
- "epoch": 1.9158200290275762,
995
- "eval_loss": 1.0614382028579712,
996
- "eval_runtime": 213.4971,
997
- "eval_samples_per_second": 213.113,
998
- "eval_steps_per_second": 3.33,
999
- "step": 6600
1000
- },
1001
- {
1002
- "epoch": 1.9448476052249637,
1003
- "grad_norm": 1.3870867490768433,
1004
- "learning_rate": 8.055152394775036e-05,
1005
- "loss": 2.0598,
1006
- "step": 6700
1007
- },
1008
- {
1009
- "epoch": 1.9448476052249637,
1010
- "eval_loss": 1.067047357559204,
1011
- "eval_runtime": 214.0952,
1012
- "eval_samples_per_second": 212.518,
1013
- "eval_steps_per_second": 3.321,
1014
- "step": 6700
1015
- },
1016
- {
1017
- "epoch": 1.9738751814223512,
1018
- "grad_norm": 1.3581643104553223,
1019
- "learning_rate": 8.026124818577649e-05,
1020
- "loss": 2.0501,
1021
- "step": 6800
1022
- },
1023
- {
1024
- "epoch": 1.9738751814223512,
1025
- "eval_loss": 1.0663081407546997,
1026
- "eval_runtime": 213.8995,
1027
- "eval_samples_per_second": 212.712,
1028
- "eval_steps_per_second": 3.324,
1029
- "step": 6800
1030
- },
1031
- {
1032
- "epoch": 2.0029027576197387,
1033
- "grad_norm": 1.3438752889633179,
1034
- "learning_rate": 7.997097242380261e-05,
1035
- "loss": 2.0332,
1036
- "step": 6900
1037
- },
1038
- {
1039
- "epoch": 2.0029027576197387,
1040
- "eval_loss": 1.059921383857727,
1041
- "eval_runtime": 213.0183,
1042
- "eval_samples_per_second": 213.592,
1043
- "eval_steps_per_second": 3.338,
1044
- "step": 6900
1045
- },
1046
- {
1047
- "epoch": 2.0319303338171264,
1048
- "grad_norm": 1.3646849393844604,
1049
- "learning_rate": 7.968069666182875e-05,
1050
- "loss": 2.0463,
1051
- "step": 7000
1052
- },
1053
- {
1054
- "epoch": 2.0319303338171264,
1055
- "eval_loss": 1.0679893493652344,
1056
- "eval_runtime": 213.3912,
1057
- "eval_samples_per_second": 213.219,
1058
- "eval_steps_per_second": 3.332,
1059
- "step": 7000
1060
- },
1061
- {
1062
- "epoch": 2.0609579100145137,
1063
- "grad_norm": 1.2047359943389893,
1064
- "learning_rate": 7.939042089985487e-05,
1065
- "loss": 2.0376,
1066
- "step": 7100
1067
- },
1068
- {
1069
- "epoch": 2.0609579100145137,
1070
- "eval_loss": 1.0566322803497314,
1071
- "eval_runtime": 213.6266,
1072
- "eval_samples_per_second": 212.984,
1073
- "eval_steps_per_second": 3.328,
1074
- "step": 7100
1075
- },
1076
- {
1077
- "epoch": 2.0899854862119014,
1078
- "grad_norm": 1.2285219430923462,
1079
- "learning_rate": 7.910014513788099e-05,
1080
- "loss": 2.0327,
1081
- "step": 7200
1082
- },
1083
- {
1084
- "epoch": 2.0899854862119014,
1085
- "eval_loss": 1.058618426322937,
1086
- "eval_runtime": 213.6922,
1087
- "eval_samples_per_second": 212.918,
1088
- "eval_steps_per_second": 3.327,
1089
- "step": 7200
1090
- },
1091
- {
1092
- "epoch": 2.1190130624092887,
1093
- "grad_norm": 1.2674715518951416,
1094
- "learning_rate": 7.880986937590712e-05,
1095
- "loss": 2.0347,
1096
- "step": 7300
1097
- },
1098
- {
1099
- "epoch": 2.1190130624092887,
1100
- "eval_loss": 1.0599507093429565,
1101
- "eval_runtime": 213.5256,
1102
- "eval_samples_per_second": 213.085,
1103
- "eval_steps_per_second": 3.33,
1104
- "step": 7300
1105
- },
1106
- {
1107
- "epoch": 2.1480406386066764,
1108
- "grad_norm": 1.3713229894638062,
1109
- "learning_rate": 7.851959361393323e-05,
1110
- "loss": 2.0321,
1111
- "step": 7400
1112
- },
1113
- {
1114
- "epoch": 2.1480406386066764,
1115
- "eval_loss": 1.0617178678512573,
1116
- "eval_runtime": 213.0273,
1117
- "eval_samples_per_second": 213.583,
1118
- "eval_steps_per_second": 3.338,
1119
- "step": 7400
1120
- },
1121
- {
1122
- "epoch": 2.1770682148040637,
1123
- "grad_norm": 1.292090654373169,
1124
- "learning_rate": 7.822931785195937e-05,
1125
- "loss": 2.01,
1126
- "step": 7500
1127
- },
1128
- {
1129
- "epoch": 2.1770682148040637,
1130
- "eval_loss": 1.0593364238739014,
1131
- "eval_runtime": 213.421,
1132
- "eval_samples_per_second": 213.189,
1133
- "eval_steps_per_second": 3.331,
1134
- "step": 7500
1135
- },
1136
- {
1137
- "epoch": 2.2060957910014514,
1138
- "grad_norm": 1.1819452047348022,
1139
- "learning_rate": 7.79390420899855e-05,
1140
- "loss": 2.0209,
1141
- "step": 7600
1142
- },
1143
- {
1144
- "epoch": 2.2060957910014514,
1145
- "eval_loss": 1.0524711608886719,
1146
- "eval_runtime": 214.0149,
1147
- "eval_samples_per_second": 212.597,
1148
- "eval_steps_per_second": 3.322,
1149
- "step": 7600
1150
- },
1151
- {
1152
- "epoch": 2.235123367198839,
1153
- "grad_norm": 1.2881128787994385,
1154
- "learning_rate": 7.764876632801161e-05,
1155
- "loss": 2.0085,
1156
- "step": 7700
1157
- },
1158
- {
1159
- "epoch": 2.235123367198839,
1160
- "eval_loss": 1.0567752122879028,
1161
- "eval_runtime": 213.6228,
1162
- "eval_samples_per_second": 212.988,
1163
- "eval_steps_per_second": 3.328,
1164
- "step": 7700
1165
- },
1166
- {
1167
- "epoch": 2.2641509433962264,
1168
- "grad_norm": 1.2962584495544434,
1169
- "learning_rate": 7.735849056603774e-05,
1170
- "loss": 2.0204,
1171
- "step": 7800
1172
- },
1173
- {
1174
- "epoch": 2.2641509433962264,
1175
- "eval_loss": 1.0586293935775757,
1176
- "eval_runtime": 213.3516,
1177
- "eval_samples_per_second": 213.258,
1178
- "eval_steps_per_second": 3.333,
1179
- "step": 7800
1180
- },
1181
- {
1182
- "epoch": 2.293178519593614,
1183
- "grad_norm": 1.2214884757995605,
1184
- "learning_rate": 7.706821480406386e-05,
1185
- "loss": 2.0184,
1186
- "step": 7900
1187
- },
1188
- {
1189
- "epoch": 2.293178519593614,
1190
- "eval_loss": 1.0525050163269043,
1191
- "eval_runtime": 212.5483,
1192
- "eval_samples_per_second": 214.064,
1193
- "eval_steps_per_second": 3.345,
1194
- "step": 7900
1195
- },
1196
- {
1197
- "epoch": 2.3222060957910013,
1198
- "grad_norm": 1.2622853517532349,
1199
- "learning_rate": 7.677793904208999e-05,
1200
- "loss": 2.0162,
1201
- "step": 8000
1202
- },
1203
- {
1204
- "epoch": 2.3222060957910013,
1205
- "eval_loss": 1.0512940883636475,
1206
- "eval_runtime": 212.6462,
1207
- "eval_samples_per_second": 213.966,
1208
- "eval_steps_per_second": 3.344,
1209
- "step": 8000
1210
- },
1211
- {
1212
- "epoch": 2.351233671988389,
1213
- "grad_norm": 1.2338088750839233,
1214
- "learning_rate": 7.648766328011612e-05,
1215
- "loss": 2.0029,
1216
- "step": 8100
1217
- },
1218
- {
1219
- "epoch": 2.351233671988389,
1220
- "eval_loss": 1.0521414279937744,
1221
- "eval_runtime": 213.5358,
1222
- "eval_samples_per_second": 213.074,
1223
- "eval_steps_per_second": 3.33,
1224
- "step": 8100
1225
- },
1226
- {
1227
- "epoch": 2.3802612481857763,
1228
- "grad_norm": 1.2111109495162964,
1229
- "learning_rate": 7.619738751814224e-05,
1230
- "loss": 2.0101,
1231
- "step": 8200
1232
- },
1233
- {
1234
- "epoch": 2.3802612481857763,
1235
- "eval_loss": 1.0501890182495117,
1236
- "eval_runtime": 213.0351,
1237
- "eval_samples_per_second": 213.575,
1238
- "eval_steps_per_second": 3.337,
1239
- "step": 8200
1240
- },
1241
- {
1242
- "epoch": 2.409288824383164,
1243
- "grad_norm": 1.2333025932312012,
1244
- "learning_rate": 7.590711175616836e-05,
1245
- "loss": 2.0,
1246
- "step": 8300
1247
- },
1248
- {
1249
- "epoch": 2.409288824383164,
1250
- "eval_loss": 1.051579236984253,
1251
- "eval_runtime": 213.5529,
1252
- "eval_samples_per_second": 213.057,
1253
- "eval_steps_per_second": 3.329,
1254
- "step": 8300
1255
- },
1256
- {
1257
- "epoch": 2.4383164005805513,
1258
- "grad_norm": 1.3394699096679688,
1259
- "learning_rate": 7.561683599419449e-05,
1260
- "loss": 1.9986,
1261
- "step": 8400
1262
- },
1263
- {
1264
- "epoch": 2.4383164005805513,
1265
- "eval_loss": 1.0520364046096802,
1266
- "eval_runtime": 212.3818,
1267
- "eval_samples_per_second": 214.232,
1268
- "eval_steps_per_second": 3.348,
1269
- "step": 8400
1270
- },
1271
- {
1272
- "epoch": 2.467343976777939,
1273
- "grad_norm": 1.334936261177063,
1274
- "learning_rate": 7.532656023222062e-05,
1275
- "loss": 1.993,
1276
- "step": 8500
1277
- },
1278
- {
1279
- "epoch": 2.467343976777939,
1280
- "eval_loss": 1.0490361452102661,
1281
- "eval_runtime": 213.9415,
1282
- "eval_samples_per_second": 212.67,
1283
  "eval_steps_per_second": 3.323,
1284
- "step": 8500
1285
- },
1286
- {
1287
- "epoch": 2.4963715529753268,
1288
- "grad_norm": 1.3085263967514038,
1289
- "learning_rate": 7.503628447024675e-05,
1290
- "loss": 1.9771,
1291
- "step": 8600
1292
- },
1293
- {
1294
- "epoch": 2.4963715529753268,
1295
- "eval_loss": 1.0522186756134033,
1296
- "eval_runtime": 212.3302,
1297
- "eval_samples_per_second": 214.284,
1298
- "eval_steps_per_second": 3.349,
1299
- "step": 8600
1300
- },
1301
- {
1302
- "epoch": 2.525399129172714,
1303
- "grad_norm": 1.4204107522964478,
1304
- "learning_rate": 7.474600870827286e-05,
1305
- "loss": 1.9848,
1306
- "step": 8700
1307
- },
1308
- {
1309
- "epoch": 2.525399129172714,
1310
- "eval_loss": 1.0486035346984863,
1311
- "eval_runtime": 213.5477,
1312
- "eval_samples_per_second": 213.062,
1313
- "eval_steps_per_second": 3.329,
1314
- "step": 8700
1315
- },
1316
- {
1317
- "epoch": 2.5544267053701017,
1318
- "grad_norm": 1.2411503791809082,
1319
- "learning_rate": 7.445573294629898e-05,
1320
- "loss": 2.0016,
1321
- "step": 8800
1322
- },
1323
- {
1324
- "epoch": 2.5544267053701017,
1325
- "eval_loss": 1.0516774654388428,
1326
- "eval_runtime": 213.2425,
1327
- "eval_samples_per_second": 213.367,
1328
- "eval_steps_per_second": 3.334,
1329
- "step": 8800
1330
- },
1331
- {
1332
- "epoch": 2.583454281567489,
1333
- "grad_norm": 1.2166720628738403,
1334
- "learning_rate": 7.416545718432511e-05,
1335
- "loss": 1.9761,
1336
- "step": 8900
1337
- },
1338
- {
1339
- "epoch": 2.583454281567489,
1340
- "eval_loss": 1.0438764095306396,
1341
- "eval_runtime": 213.2447,
1342
- "eval_samples_per_second": 213.365,
1343
- "eval_steps_per_second": 3.334,
1344
- "step": 8900
1345
- },
1346
- {
1347
- "epoch": 2.6124818577648767,
1348
- "grad_norm": 1.307707667350769,
1349
- "learning_rate": 7.387518142235124e-05,
1350
- "loss": 1.9753,
1351
- "step": 9000
1352
- },
1353
- {
1354
- "epoch": 2.6124818577648767,
1355
- "eval_loss": 1.0445740222930908,
1356
- "eval_runtime": 212.5813,
1357
- "eval_samples_per_second": 214.031,
1358
- "eval_steps_per_second": 3.345,
1359
- "step": 9000
1360
- },
1361
- {
1362
- "epoch": 2.641509433962264,
1363
- "grad_norm": 1.3446862697601318,
1364
- "learning_rate": 7.358490566037736e-05,
1365
- "loss": 1.9795,
1366
- "step": 9100
1367
- },
1368
- {
1369
- "epoch": 2.641509433962264,
1370
- "eval_loss": 1.0461750030517578,
1371
- "eval_runtime": 213.2022,
1372
- "eval_samples_per_second": 213.408,
1373
- "eval_steps_per_second": 3.335,
1374
- "step": 9100
1375
- },
1376
- {
1377
- "epoch": 2.6705370101596517,
1378
- "grad_norm": 1.25364351272583,
1379
- "learning_rate": 7.329462989840349e-05,
1380
- "loss": 1.966,
1381
- "step": 9200
1382
- },
1383
- {
1384
- "epoch": 2.6705370101596517,
1385
- "eval_loss": 1.0489540100097656,
1386
- "eval_runtime": 213.3373,
1387
- "eval_samples_per_second": 213.273,
1388
- "eval_steps_per_second": 3.333,
1389
- "step": 9200
1390
- },
1391
- {
1392
- "epoch": 2.699564586357039,
1393
- "grad_norm": 1.317325472831726,
1394
- "learning_rate": 7.300435413642961e-05,
1395
- "loss": 1.9853,
1396
- "step": 9300
1397
- },
1398
- {
1399
- "epoch": 2.699564586357039,
1400
- "eval_loss": 1.04426109790802,
1401
- "eval_runtime": 212.5953,
1402
- "eval_samples_per_second": 214.017,
1403
- "eval_steps_per_second": 3.344,
1404
- "step": 9300
1405
- },
1406
- {
1407
- "epoch": 2.7285921625544267,
1408
- "grad_norm": 1.2580476999282837,
1409
- "learning_rate": 7.271407837445574e-05,
1410
- "loss": 1.9873,
1411
- "step": 9400
1412
- },
1413
- {
1414
- "epoch": 2.7285921625544267,
1415
- "eval_loss": 1.0441796779632568,
1416
- "eval_runtime": 213.1744,
1417
- "eval_samples_per_second": 213.436,
1418
- "eval_steps_per_second": 3.335,
1419
- "step": 9400
1420
  }
1421
  ],
1422
  "logging_steps": 100,
@@ -1431,7 +36,7 @@
1431
  "early_stopping_threshold": 0.0
1432
  },
1433
  "attributes": {
1434
- "early_stopping_patience_counter": 5
1435
  }
1436
  },
1437
  "TrainerControl": {
@@ -1440,12 +45,12 @@
1440
  "should_evaluate": false,
1441
  "should_log": false,
1442
  "should_save": true,
1443
- "should_training_stop": true
1444
  },
1445
  "attributes": {}
1446
  }
1447
  },
1448
- "total_flos": 9.403409048272896e+16,
1449
  "train_batch_size": 64,
1450
  "trial_name": null,
1451
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.2625532150268555,
3
+ "best_model_checkpoint": "mgh6/TCS_MLM_50/checkpoint-100",
4
+ "epoch": 0.02902757619738752,
5
  "eval_steps": 100,
6
+ "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.02902757619738752,
13
+ "grad_norm": 1.1695395708084106,
14
  "learning_rate": 9.970972423802612e-05,
15
+ "loss": 2.8263,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.02902757619738752,
20
+ "eval_loss": 1.2625532150268555,
21
+ "eval_runtime": 213.9369,
22
+ "eval_samples_per_second": 212.651,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  "eval_steps_per_second": 3.323,
24
+ "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  }
26
  ],
27
  "logging_steps": 100,
 
36
  "early_stopping_threshold": 0.0
37
  },
38
  "attributes": {
39
+ "early_stopping_patience_counter": 0
40
  }
41
  },
42
  "TrainerControl": {
 
45
  "should_evaluate": false,
46
  "should_log": false,
47
  "should_save": true,
48
+ "should_training_stop": false
49
  },
50
  "attributes": {}
51
  }
52
  },
53
+ "total_flos": 1000387607789568.0,
54
  "train_batch_size": 64,
55
  "trial_name": null,
56
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:106cd64593a78067217b619a1bb4288f6aff3cb8411c9fafd726f7129f1b9be1
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98f4319603876b63ddfc21d2259b95fa764f31a2204f8d89c6fce365f7769879
3
  size 5368