MHGanainy commited on
Commit
13a5c87
1 Parent(s): 8c689b4

MHGanainy/8-clusters-balanced-lex-best-v2-3

Browse files
Files changed (4) hide show
  1. all_results.json +9 -9
  2. eval_results.json +5 -5
  3. train_results.json +4 -4
  4. trainer_state.json +157 -157
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 1.9266319274902344,
4
- "eval_runtime": 63.6699,
5
- "eval_samples_per_second": 15.549,
6
- "eval_steps_per_second": 1.948,
7
- "perplexity": 6.866344904696836,
8
  "total_flos": 1.5640093507584e+17,
9
- "train_loss": 2.1807824082961527,
10
- "train_runtime": 2484.8584,
11
- "train_samples_per_second": 6.911,
12
- "train_steps_per_second": 3.456
13
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_loss": 1.9266563653945923,
4
+ "eval_runtime": 63.5112,
5
+ "eval_samples_per_second": 15.588,
6
+ "eval_steps_per_second": 1.952,
7
+ "perplexity": 6.866512705827251,
8
  "total_flos": 1.5640093507584e+17,
9
+ "train_loss": 2.1807946249230077,
10
+ "train_runtime": 2485.1722,
11
+ "train_samples_per_second": 6.91,
12
+ "train_steps_per_second": 3.455
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 1.9266319274902344,
4
- "eval_runtime": 63.6699,
5
- "eval_samples_per_second": 15.549,
6
- "eval_steps_per_second": 1.948,
7
- "perplexity": 6.866344904696836
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_loss": 1.9266563653945923,
4
+ "eval_runtime": 63.5112,
5
+ "eval_samples_per_second": 15.588,
6
+ "eval_steps_per_second": 1.952,
7
+ "perplexity": 6.866512705827251
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 1.5640093507584e+17,
4
- "train_loss": 2.1807824082961527,
5
- "train_runtime": 2484.8584,
6
- "train_samples_per_second": 6.911,
7
- "train_steps_per_second": 3.456
8
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 1.5640093507584e+17,
4
+ "train_loss": 2.1807946249230077,
5
+ "train_runtime": 2485.1722,
6
+ "train_samples_per_second": 6.91,
7
+ "train_steps_per_second": 3.455
8
  }
trainer_state.json CHANGED
@@ -10,615 +10,615 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.01164551065564225,
13
- "grad_norm": 0.14791734516620636,
14
  "learning_rate": 2.331002331002331e-06,
15
  "loss": 2.4953,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.0232910213112845,
20
- "grad_norm": 0.2441452145576477,
21
  "learning_rate": 4.662004662004662e-06,
22
- "loss": 2.4575,
23
  "step": 200
24
  },
25
  {
26
  "epoch": 0.03493653196692675,
27
- "grad_norm": 0.22649115324020386,
28
  "learning_rate": 6.993006993006993e-06,
29
  "loss": 2.4409,
30
  "step": 300
31
  },
32
  {
33
  "epoch": 0.046582042622569,
34
- "grad_norm": 0.3321034610271454,
35
  "learning_rate": 9.324009324009324e-06,
36
  "loss": 2.4497,
37
  "step": 400
38
  },
39
  {
40
  "epoch": 0.05822755327821125,
41
- "grad_norm": 0.36618897318840027,
42
  "learning_rate": 1.1655011655011657e-05,
43
- "loss": 2.3973,
44
  "step": 500
45
  },
46
  {
47
  "epoch": 0.0698730639338535,
48
- "grad_norm": 0.42568501830101013,
49
  "learning_rate": 1.3986013986013986e-05,
50
- "loss": 2.356,
51
  "step": 600
52
  },
53
  {
54
  "epoch": 0.08151857458949575,
55
- "grad_norm": 0.4560384452342987,
56
  "learning_rate": 1.6317016317016318e-05,
57
- "loss": 2.317,
58
  "step": 700
59
  },
60
  {
61
  "epoch": 0.093164085245138,
62
- "grad_norm": 0.5321765542030334,
63
  "learning_rate": 1.8648018648018647e-05,
64
- "loss": 2.3544,
65
  "step": 800
66
  },
67
  {
68
  "epoch": 0.10480959590078025,
69
- "grad_norm": 0.5255011916160583,
70
  "learning_rate": 1.999854282682324e-05,
71
- "loss": 2.3049,
72
  "step": 900
73
  },
74
  {
75
  "epoch": 0.1164551065564225,
76
- "grad_norm": 0.563510537147522,
77
  "learning_rate": 1.9983347507285766e-05,
78
- "loss": 2.2863,
79
  "step": 1000
80
  },
81
  {
82
  "epoch": 0.12810061721206475,
83
- "grad_norm": 0.5928713083267212,
84
  "learning_rate": 1.9951660332101616e-05,
85
- "loss": 2.2718,
86
  "step": 1100
87
  },
88
  {
89
  "epoch": 0.139746127867707,
90
- "grad_norm": 0.7078022956848145,
91
  "learning_rate": 1.9903533646470504e-05,
92
- "loss": 2.2645,
93
  "step": 1200
94
  },
95
  {
96
  "epoch": 0.15139163852334925,
97
- "grad_norm": 0.612659752368927,
98
  "learning_rate": 1.9839046952618667e-05,
99
- "loss": 2.2507,
100
  "step": 1300
101
  },
102
  {
103
  "epoch": 0.1630371491789915,
104
- "grad_norm": 0.6351534724235535,
105
  "learning_rate": 1.9758306778466264e-05,
106
- "loss": 2.2474,
107
  "step": 1400
108
  },
109
  {
110
  "epoch": 0.17468265983463374,
111
- "grad_norm": 0.616267740726471,
112
  "learning_rate": 1.966144650165e-05,
113
- "loss": 2.2574,
114
  "step": 1500
115
  },
116
  {
117
  "epoch": 0.186328170490276,
118
- "grad_norm": 0.6416229009628296,
119
  "learning_rate": 1.9548626129191778e-05,
120
- "loss": 2.249,
121
  "step": 1600
122
  },
123
  {
124
  "epoch": 0.19797368114591826,
125
- "grad_norm": 0.6149903535842896,
126
  "learning_rate": 1.9420032033177225e-05,
127
- "loss": 2.2497,
128
  "step": 1700
129
  },
130
  {
131
  "epoch": 0.2096191918015605,
132
- "grad_norm": 0.9221161007881165,
133
  "learning_rate": 1.927587664288089e-05,
134
- "loss": 2.223,
135
  "step": 1800
136
  },
137
  {
138
  "epoch": 0.22126470245720276,
139
- "grad_norm": 0.6987215876579285,
140
  "learning_rate": 1.911639809384656e-05,
141
- "loss": 2.1861,
142
  "step": 1900
143
  },
144
  {
145
  "epoch": 0.232910213112845,
146
- "grad_norm": 1.0042574405670166,
147
  "learning_rate": 1.8941859834502484e-05,
148
- "loss": 2.1883,
149
  "step": 2000
150
  },
151
  {
152
  "epoch": 0.24455572376848725,
153
- "grad_norm": 0.975431501865387,
154
  "learning_rate": 1.8752550190961288e-05,
155
- "loss": 2.2078,
156
  "step": 2100
157
  },
158
  {
159
  "epoch": 0.2562012344241295,
160
- "grad_norm": 0.8907233476638794,
161
  "learning_rate": 1.8548781890723614e-05,
162
- "loss": 2.2046,
163
  "step": 2200
164
  },
165
  {
166
  "epoch": 0.26784674507977174,
167
- "grad_norm": 0.9513066411018372,
168
  "learning_rate": 1.8330891546072095e-05,
169
- "loss": 2.2023,
170
  "step": 2300
171
  },
172
  {
173
  "epoch": 0.279492255735414,
174
- "grad_norm": 0.6928815841674805,
175
  "learning_rate": 1.809923909800931e-05,
176
  "loss": 2.1932,
177
  "step": 2400
178
  },
179
  {
180
  "epoch": 0.29113776639105626,
181
- "grad_norm": 0.7470653057098389,
182
  "learning_rate": 1.7854207221658092e-05,
183
- "loss": 2.2031,
184
  "step": 2500
185
  },
186
  {
187
  "epoch": 0.3027832770466985,
188
- "grad_norm": 0.7841106653213501,
189
  "learning_rate": 1.7596200694106552e-05,
190
- "loss": 2.1867,
191
  "step": 2600
192
  },
193
  {
194
  "epoch": 0.31442878770234073,
195
- "grad_norm": 0.9168167114257812,
196
  "learning_rate": 1.7325645725742056e-05,
197
- "loss": 2.1939,
198
  "step": 2700
199
  },
200
  {
201
  "epoch": 0.326074298357983,
202
- "grad_norm": 0.9133324027061462,
203
  "learning_rate": 1.7042989256178744e-05,
204
- "loss": 2.1522,
205
  "step": 2800
206
  },
207
  {
208
  "epoch": 0.33771980901362525,
209
- "grad_norm": 0.985498309135437,
210
  "learning_rate": 1.6748698215941704e-05,
211
- "loss": 2.1735,
212
  "step": 2900
213
  },
214
  {
215
  "epoch": 0.3493653196692675,
216
- "grad_norm": 0.7974863052368164,
217
  "learning_rate": 1.6443258755127393e-05,
218
- "loss": 2.1939,
219
  "step": 3000
220
  },
221
  {
222
  "epoch": 0.36101083032490977,
223
- "grad_norm": 0.6346118450164795,
224
  "learning_rate": 1.6127175440314596e-05,
225
  "loss": 2.1204,
226
  "step": 3100
227
  },
228
  {
229
  "epoch": 0.372656340980552,
230
- "grad_norm": 0.8495335578918457,
231
  "learning_rate": 1.5800970421052487e-05,
232
- "loss": 2.1964,
233
  "step": 3200
234
  },
235
  {
236
  "epoch": 0.38430185163619424,
237
- "grad_norm": 1.029366135597229,
238
  "learning_rate": 1.546518256730277e-05,
239
- "loss": 2.1838,
240
  "step": 3300
241
  },
242
  {
243
  "epoch": 0.3959473622918365,
244
- "grad_norm": 1.0092447996139526,
245
  "learning_rate": 1.5120366579260734e-05,
246
  "loss": 2.1607,
247
  "step": 3400
248
  },
249
  {
250
  "epoch": 0.40759287294747876,
251
- "grad_norm": 0.8872857689857483,
252
  "learning_rate": 1.4767092071025792e-05,
253
- "loss": 2.1836,
254
  "step": 3500
255
  },
256
  {
257
  "epoch": 0.419238383603121,
258
- "grad_norm": 0.8824167251586914,
259
  "learning_rate": 1.4405942629635174e-05,
260
- "loss": 2.1207,
261
  "step": 3600
262
  },
263
  {
264
  "epoch": 0.4308838942587632,
265
- "grad_norm": 0.7451480627059937,
266
  "learning_rate": 1.4037514851015241e-05,
267
  "loss": 2.215,
268
  "step": 3700
269
  },
270
  {
271
  "epoch": 0.4425294049144055,
272
- "grad_norm": 0.9333692193031311,
273
  "learning_rate": 1.3662417354442924e-05,
274
  "loss": 2.1836,
275
  "step": 3800
276
  },
277
  {
278
  "epoch": 0.45417491557004774,
279
- "grad_norm": 0.9870197176933289,
280
  "learning_rate": 1.3281269777145354e-05,
281
- "loss": 2.1935,
282
  "step": 3900
283
  },
284
  {
285
  "epoch": 0.46582042622569,
286
- "grad_norm": 1.1925898790359497,
287
  "learning_rate": 1.2894701750698541e-05,
288
- "loss": 2.1383,
289
  "step": 4000
290
  },
291
  {
292
  "epoch": 0.47746593688133226,
293
- "grad_norm": 0.8721115589141846,
294
  "learning_rate": 1.2503351860916024e-05,
295
- "loss": 2.1506,
296
  "step": 4100
297
  },
298
  {
299
  "epoch": 0.4891114475369745,
300
- "grad_norm": 1.3675851821899414,
301
  "learning_rate": 1.2107866592945686e-05,
302
- "loss": 2.1562,
303
  "step": 4200
304
  },
305
  {
306
  "epoch": 0.5007569581926168,
307
- "grad_norm": 0.9814177751541138,
308
  "learning_rate": 1.1708899263317381e-05,
309
- "loss": 2.1394,
310
  "step": 4300
311
  },
312
  {
313
  "epoch": 0.512402468848259,
314
- "grad_norm": 0.8111391067504883,
315
  "learning_rate": 1.1307108940705536e-05,
316
  "loss": 2.1605,
317
  "step": 4400
318
  },
319
  {
320
  "epoch": 0.5240479795039013,
321
- "grad_norm": 0.9435734152793884,
322
  "learning_rate": 1.090315935718958e-05,
323
  "loss": 2.1108,
324
  "step": 4500
325
  },
326
  {
327
  "epoch": 0.5356934901595435,
328
- "grad_norm": 0.8855921626091003,
329
  "learning_rate": 1.0497717811810748e-05,
330
- "loss": 2.1364,
331
  "step": 4600
332
  },
333
  {
334
  "epoch": 0.5473390008151857,
335
- "grad_norm": 0.7305698990821838,
336
  "learning_rate": 1.0091454068236455e-05,
337
  "loss": 2.1498,
338
  "step": 4700
339
  },
340
  {
341
  "epoch": 0.558984511470828,
342
- "grad_norm": 1.123586654663086,
343
  "learning_rate": 9.685039248353284e-06,
344
- "loss": 2.1634,
345
  "step": 4800
346
  },
347
  {
348
  "epoch": 0.5706300221264703,
349
- "grad_norm": 0.857729434967041,
350
  "learning_rate": 9.279144723616279e-06,
351
- "loss": 2.0721,
352
  "step": 4900
353
  },
354
  {
355
  "epoch": 0.5822755327821125,
356
- "grad_norm": 0.7783474326133728,
357
  "learning_rate": 8.874441005985965e-06,
358
- "loss": 2.1569,
359
  "step": 5000
360
  },
361
  {
362
  "epoch": 0.5939210434377548,
363
- "grad_norm": 1.0050169229507446,
364
  "learning_rate": 8.47159664028521e-06,
365
  "loss": 2.1631,
366
  "step": 5100
367
  },
368
  {
369
  "epoch": 0.605566554093397,
370
- "grad_norm": 0.7871978282928467,
371
  "learning_rate": 8.07127709980564e-06,
372
- "loss": 2.1207,
373
  "step": 5200
374
  },
375
  {
376
  "epoch": 0.6172120647490392,
377
- "grad_norm": 1.3452588319778442,
378
  "learning_rate": 7.674143686988085e-06,
379
  "loss": 2.0871,
380
  "step": 5300
381
  },
382
  {
383
  "epoch": 0.6288575754046815,
384
- "grad_norm": 0.7304858565330505,
385
  "learning_rate": 7.280852440992941e-06,
386
  "loss": 2.1289,
387
  "step": 5400
388
  },
389
  {
390
  "epoch": 0.6405030860603237,
391
- "grad_norm": 0.9213933944702148,
392
  "learning_rate": 6.89205305396518e-06,
393
- "loss": 2.1067,
394
  "step": 5500
395
  },
396
  {
397
  "epoch": 0.652148596715966,
398
- "grad_norm": 0.9295869469642639,
399
  "learning_rate": 6.508387797784227e-06,
400
- "loss": 2.0959,
401
  "step": 5600
402
  },
403
  {
404
  "epoch": 0.6637941073716083,
405
- "grad_norm": 0.9920222759246826,
406
  "learning_rate": 6.130490463071604e-06,
407
- "loss": 2.0919,
408
  "step": 5700
409
  },
410
  {
411
  "epoch": 0.6754396180272505,
412
- "grad_norm": 0.8308998942375183,
413
  "learning_rate": 5.758985312209124e-06,
414
- "loss": 2.1206,
415
  "step": 5800
416
  },
417
  {
418
  "epoch": 0.6870851286828927,
419
- "grad_norm": 1.1122294664382935,
420
  "learning_rate": 5.394486048097099e-06,
421
- "loss": 2.116,
422
  "step": 5900
423
  },
424
  {
425
  "epoch": 0.698730639338535,
426
- "grad_norm": 0.7995828986167908,
427
  "learning_rate": 5.037594800356142e-06,
428
- "loss": 2.1159,
429
  "step": 6000
430
  },
431
  {
432
  "epoch": 0.7103761499941772,
433
- "grad_norm": 1.1691644191741943,
434
  "learning_rate": 4.688901130647314e-06,
435
  "loss": 2.1396,
436
  "step": 6100
437
  },
438
  {
439
  "epoch": 0.7220216606498195,
440
- "grad_norm": 0.8228808045387268,
441
  "learning_rate": 4.348981058753708e-06,
442
  "loss": 2.0819,
443
  "step": 6200
444
  },
445
  {
446
  "epoch": 0.7336671713054618,
447
- "grad_norm": 1.0278650522232056,
448
  "learning_rate": 4.018396111032394e-06,
449
- "loss": 2.1227,
450
  "step": 6300
451
  },
452
  {
453
  "epoch": 0.745312681961104,
454
- "grad_norm": 0.9175333976745605,
455
  "learning_rate": 3.697692392808545e-06,
456
- "loss": 2.1037,
457
  "step": 6400
458
  },
459
  {
460
  "epoch": 0.7569581926167462,
461
- "grad_norm": 0.7555139064788818,
462
  "learning_rate": 3.387399686244144e-06,
463
- "loss": 2.0953,
464
  "step": 6500
465
  },
466
  {
467
  "epoch": 0.7686037032723885,
468
- "grad_norm": 0.8671744465827942,
469
  "learning_rate": 3.0880305751715402e-06,
470
- "loss": 2.1201,
471
  "step": 6600
472
  },
473
  {
474
  "epoch": 0.7802492139280307,
475
- "grad_norm": 0.8459142446517944,
476
  "learning_rate": 2.800079598337505e-06,
477
- "loss": 2.1136,
478
  "step": 6700
479
  },
480
  {
481
  "epoch": 0.791894724583673,
482
- "grad_norm": 0.763583242893219,
483
  "learning_rate": 2.524022432456664e-06,
484
- "loss": 2.1386,
485
  "step": 6800
486
  },
487
  {
488
  "epoch": 0.8035402352393153,
489
- "grad_norm": 0.8807295560836792,
490
  "learning_rate": 2.260315106423807e-06,
491
- "loss": 2.0779,
492
  "step": 6900
493
  },
494
  {
495
  "epoch": 0.8151857458949575,
496
- "grad_norm": 0.8740783333778381,
497
  "learning_rate": 2.0093932479830935e-06,
498
  "loss": 2.1367,
499
  "step": 7000
500
  },
501
  {
502
  "epoch": 0.8268312565505997,
503
- "grad_norm": 1.039287805557251,
504
  "learning_rate": 1.7716713640987526e-06,
505
- "loss": 2.1166,
506
  "step": 7100
507
  },
508
  {
509
  "epoch": 0.838476767206242,
510
- "grad_norm": 0.8560532331466675,
511
  "learning_rate": 1.5475421562158854e-06,
512
  "loss": 2.1459,
513
  "step": 7200
514
  },
515
  {
516
  "epoch": 0.8501222778618842,
517
- "grad_norm": 0.9102568030357361,
518
  "learning_rate": 1.3373758715426444e-06,
519
- "loss": 2.1095,
520
  "step": 7300
521
  },
522
  {
523
  "epoch": 0.8617677885175264,
524
- "grad_norm": 0.8189859390258789,
525
  "learning_rate": 1.141519691425379e-06,
526
- "loss": 2.0852,
527
  "step": 7400
528
  },
529
  {
530
  "epoch": 0.8734132991731688,
531
- "grad_norm": 0.9001794457435608,
532
  "learning_rate": 9.60297157827106e-07,
533
- "loss": 2.1364,
534
  "step": 7500
535
  },
536
  {
537
  "epoch": 0.885058809828811,
538
- "grad_norm": 0.8623350262641907,
539
  "learning_rate": 7.94007638856753e-07,
540
- "loss": 2.1407,
541
  "step": 7600
542
  },
543
  {
544
  "epoch": 0.8967043204844533,
545
- "grad_norm": 0.8134225010871887,
546
  "learning_rate": 6.429258342320677e-07,
547
- "loss": 2.1512,
548
  "step": 7700
549
  },
550
  {
551
  "epoch": 0.9083498311400955,
552
- "grad_norm": 0.8605279326438904,
553
  "learning_rate": 5.073013214931377e-07,
554
- "loss": 2.1837,
555
  "step": 7800
556
  },
557
  {
558
  "epoch": 0.9199953417957377,
559
- "grad_norm": 0.8916401267051697,
560
  "learning_rate": 3.8735814371615554e-07,
561
- "loss": 2.1317,
562
  "step": 7900
563
  },
564
  {
565
  "epoch": 0.93164085245138,
566
- "grad_norm": 0.8076800107955933,
567
  "learning_rate": 2.8329443940849577e-07,
568
  "loss": 2.1349,
569
  "step": 8000
570
  },
571
  {
572
  "epoch": 0.9432863631070223,
573
- "grad_norm": 0.7975759506225586,
574
  "learning_rate": 1.9528211519649341e-07,
575
  "loss": 2.094,
576
  "step": 8100
577
  },
578
  {
579
  "epoch": 0.9549318737626645,
580
- "grad_norm": 1.0410187244415283,
581
  "learning_rate": 1.234665618466202e-07,
582
- "loss": 2.1501,
583
  "step": 8200
584
  },
585
  {
586
  "epoch": 0.9665773844183068,
587
- "grad_norm": 0.8723394274711609,
588
  "learning_rate": 6.796641408917671e-08,
589
  "loss": 2.0692,
590
  "step": 8300
591
  },
592
  {
593
  "epoch": 0.978222895073949,
594
- "grad_norm": 0.7656298279762268,
595
  "learning_rate": 2.8873354641258955e-08,
596
- "loss": 2.0986,
597
  "step": 8400
598
  },
599
  {
600
  "epoch": 0.9898684057295912,
601
- "grad_norm": 1.4604876041412354,
602
- "learning_rate": 6.396496618182868e-09,
603
  "loss": 2.1445,
604
  "step": 8500
605
  },
606
  {
607
  "epoch": 1.0,
608
- "eval_loss": 1.9266319274902344,
609
- "eval_runtime": 63.649,
610
- "eval_samples_per_second": 15.554,
611
- "eval_steps_per_second": 1.948,
612
  "step": 8587
613
  },
614
  {
615
  "epoch": 1.0,
616
  "step": 8587,
617
  "total_flos": 1.5640093507584e+17,
618
- "train_loss": 2.1807824082961527,
619
- "train_runtime": 2484.8584,
620
- "train_samples_per_second": 6.911,
621
- "train_steps_per_second": 3.456
622
  }
623
  ],
624
  "logging_steps": 100,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.01164551065564225,
13
+ "grad_norm": 0.14460527896881104,
14
  "learning_rate": 2.331002331002331e-06,
15
  "loss": 2.4953,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.0232910213112845,
20
+ "grad_norm": 0.24249859154224396,
21
  "learning_rate": 4.662004662004662e-06,
22
+ "loss": 2.4574,
23
  "step": 200
24
  },
25
  {
26
  "epoch": 0.03493653196692675,
27
+ "grad_norm": 0.22802527248859406,
28
  "learning_rate": 6.993006993006993e-06,
29
  "loss": 2.4409,
30
  "step": 300
31
  },
32
  {
33
  "epoch": 0.046582042622569,
34
+ "grad_norm": 0.33553868532180786,
35
  "learning_rate": 9.324009324009324e-06,
36
  "loss": 2.4497,
37
  "step": 400
38
  },
39
  {
40
  "epoch": 0.05822755327821125,
41
+ "grad_norm": 0.36703944206237793,
42
  "learning_rate": 1.1655011655011657e-05,
43
+ "loss": 2.3974,
44
  "step": 500
45
  },
46
  {
47
  "epoch": 0.0698730639338535,
48
+ "grad_norm": 0.42922335863113403,
49
  "learning_rate": 1.3986013986013986e-05,
50
+ "loss": 2.3563,
51
  "step": 600
52
  },
53
  {
54
  "epoch": 0.08151857458949575,
55
+ "grad_norm": 0.4571892023086548,
56
  "learning_rate": 1.6317016317016318e-05,
57
+ "loss": 2.3174,
58
  "step": 700
59
  },
60
  {
61
  "epoch": 0.093164085245138,
62
+ "grad_norm": 0.533064067363739,
63
  "learning_rate": 1.8648018648018647e-05,
64
+ "loss": 2.3549,
65
  "step": 800
66
  },
67
  {
68
  "epoch": 0.10480959590078025,
69
+ "grad_norm": 0.5255058407783508,
70
  "learning_rate": 1.999854282682324e-05,
71
+ "loss": 2.3053,
72
  "step": 900
73
  },
74
  {
75
  "epoch": 0.1164551065564225,
76
+ "grad_norm": 0.5629482865333557,
77
  "learning_rate": 1.9983347507285766e-05,
78
+ "loss": 2.2866,
79
  "step": 1000
80
  },
81
  {
82
  "epoch": 0.12810061721206475,
83
+ "grad_norm": 0.5937526226043701,
84
  "learning_rate": 1.9951660332101616e-05,
85
+ "loss": 2.2721,
86
  "step": 1100
87
  },
88
  {
89
  "epoch": 0.139746127867707,
90
+ "grad_norm": 0.703377366065979,
91
  "learning_rate": 1.9903533646470504e-05,
92
+ "loss": 2.2648,
93
  "step": 1200
94
  },
95
  {
96
  "epoch": 0.15139163852334925,
97
+ "grad_norm": 0.6113013625144958,
98
  "learning_rate": 1.9839046952618667e-05,
99
+ "loss": 2.2509,
100
  "step": 1300
101
  },
102
  {
103
  "epoch": 0.1630371491789915,
104
+ "grad_norm": 0.6333953738212585,
105
  "learning_rate": 1.9758306778466264e-05,
106
+ "loss": 2.2475,
107
  "step": 1400
108
  },
109
  {
110
  "epoch": 0.17468265983463374,
111
+ "grad_norm": 0.6210408806800842,
112
  "learning_rate": 1.966144650165e-05,
113
+ "loss": 2.2575,
114
  "step": 1500
115
  },
116
  {
117
  "epoch": 0.186328170490276,
118
+ "grad_norm": 0.6377599239349365,
119
  "learning_rate": 1.9548626129191778e-05,
120
+ "loss": 2.2492,
121
  "step": 1600
122
  },
123
  {
124
  "epoch": 0.19797368114591826,
125
+ "grad_norm": 0.6116755604743958,
126
  "learning_rate": 1.9420032033177225e-05,
127
+ "loss": 2.2498,
128
  "step": 1700
129
  },
130
  {
131
  "epoch": 0.2096191918015605,
132
+ "grad_norm": 0.9220359921455383,
133
  "learning_rate": 1.927587664288089e-05,
134
+ "loss": 2.2231,
135
  "step": 1800
136
  },
137
  {
138
  "epoch": 0.22126470245720276,
139
+ "grad_norm": 0.6930053234100342,
140
  "learning_rate": 1.911639809384656e-05,
141
+ "loss": 2.1863,
142
  "step": 1900
143
  },
144
  {
145
  "epoch": 0.232910213112845,
146
+ "grad_norm": 0.991111695766449,
147
  "learning_rate": 1.8941859834502484e-05,
148
+ "loss": 2.1885,
149
  "step": 2000
150
  },
151
  {
152
  "epoch": 0.24455572376848725,
153
+ "grad_norm": 0.9630569815635681,
154
  "learning_rate": 1.8752550190961288e-05,
155
+ "loss": 2.208,
156
  "step": 2100
157
  },
158
  {
159
  "epoch": 0.2562012344241295,
160
+ "grad_norm": 0.8858104348182678,
161
  "learning_rate": 1.8548781890723614e-05,
162
+ "loss": 2.2048,
163
  "step": 2200
164
  },
165
  {
166
  "epoch": 0.26784674507977174,
167
+ "grad_norm": 0.9373981952667236,
168
  "learning_rate": 1.8330891546072095e-05,
169
+ "loss": 2.2026,
170
  "step": 2300
171
  },
172
  {
173
  "epoch": 0.279492255735414,
174
+ "grad_norm": 0.6907739639282227,
175
  "learning_rate": 1.809923909800931e-05,
176
  "loss": 2.1932,
177
  "step": 2400
178
  },
179
  {
180
  "epoch": 0.29113776639105626,
181
+ "grad_norm": 0.7434536814689636,
182
  "learning_rate": 1.7854207221658092e-05,
183
+ "loss": 2.2032,
184
  "step": 2500
185
  },
186
  {
187
  "epoch": 0.3027832770466985,
188
+ "grad_norm": 0.7883014678955078,
189
  "learning_rate": 1.7596200694106552e-05,
190
+ "loss": 2.1869,
191
  "step": 2600
192
  },
193
  {
194
  "epoch": 0.31442878770234073,
195
+ "grad_norm": 0.9146271347999573,
196
  "learning_rate": 1.7325645725742056e-05,
197
+ "loss": 2.1941,
198
  "step": 2700
199
  },
200
  {
201
  "epoch": 0.326074298357983,
202
+ "grad_norm": 0.9042865633964539,
203
  "learning_rate": 1.7042989256178744e-05,
204
+ "loss": 2.1523,
205
  "step": 2800
206
  },
207
  {
208
  "epoch": 0.33771980901362525,
209
+ "grad_norm": 0.9685861468315125,
210
  "learning_rate": 1.6748698215941704e-05,
211
+ "loss": 2.1736,
212
  "step": 2900
213
  },
214
  {
215
  "epoch": 0.3493653196692675,
216
+ "grad_norm": 0.7863497138023376,
217
  "learning_rate": 1.6443258755127393e-05,
218
+ "loss": 2.1938,
219
  "step": 3000
220
  },
221
  {
222
  "epoch": 0.36101083032490977,
223
+ "grad_norm": 0.631289005279541,
224
  "learning_rate": 1.6127175440314596e-05,
225
  "loss": 2.1204,
226
  "step": 3100
227
  },
228
  {
229
  "epoch": 0.372656340980552,
230
+ "grad_norm": 0.8423033952713013,
231
  "learning_rate": 1.5800970421052487e-05,
232
+ "loss": 2.1962,
233
  "step": 3200
234
  },
235
  {
236
  "epoch": 0.38430185163619424,
237
+ "grad_norm": 1.026877522468567,
238
  "learning_rate": 1.546518256730277e-05,
239
+ "loss": 2.1839,
240
  "step": 3300
241
  },
242
  {
243
  "epoch": 0.3959473622918365,
244
+ "grad_norm": 1.0023020505905151,
245
  "learning_rate": 1.5120366579260734e-05,
246
  "loss": 2.1607,
247
  "step": 3400
248
  },
249
  {
250
  "epoch": 0.40759287294747876,
251
+ "grad_norm": 0.8861306309700012,
252
  "learning_rate": 1.4767092071025792e-05,
253
+ "loss": 2.1838,
254
  "step": 3500
255
  },
256
  {
257
  "epoch": 0.419238383603121,
258
+ "grad_norm": 0.8740290403366089,
259
  "learning_rate": 1.4405942629635174e-05,
260
+ "loss": 2.1205,
261
  "step": 3600
262
  },
263
  {
264
  "epoch": 0.4308838942587632,
265
+ "grad_norm": 0.7423445582389832,
266
  "learning_rate": 1.4037514851015241e-05,
267
  "loss": 2.215,
268
  "step": 3700
269
  },
270
  {
271
  "epoch": 0.4425294049144055,
272
+ "grad_norm": 0.9221410155296326,
273
  "learning_rate": 1.3662417354442924e-05,
274
  "loss": 2.1836,
275
  "step": 3800
276
  },
277
  {
278
  "epoch": 0.45417491557004774,
279
+ "grad_norm": 0.9722244143486023,
280
  "learning_rate": 1.3281269777145354e-05,
281
+ "loss": 2.1934,
282
  "step": 3900
283
  },
284
  {
285
  "epoch": 0.46582042622569,
286
+ "grad_norm": 1.1752955913543701,
287
  "learning_rate": 1.2894701750698541e-05,
288
+ "loss": 2.1382,
289
  "step": 4000
290
  },
291
  {
292
  "epoch": 0.47746593688133226,
293
+ "grad_norm": 0.8646457195281982,
294
  "learning_rate": 1.2503351860916024e-05,
295
+ "loss": 2.1507,
296
  "step": 4100
297
  },
298
  {
299
  "epoch": 0.4891114475369745,
300
+ "grad_norm": 1.307706594467163,
301
  "learning_rate": 1.2107866592945686e-05,
302
+ "loss": 2.1564,
303
  "step": 4200
304
  },
305
  {
306
  "epoch": 0.5007569581926168,
307
+ "grad_norm": 0.9780316948890686,
308
  "learning_rate": 1.1708899263317381e-05,
309
+ "loss": 2.1395,
310
  "step": 4300
311
  },
312
  {
313
  "epoch": 0.512402468848259,
314
+ "grad_norm": 0.8423807621002197,
315
  "learning_rate": 1.1307108940705536e-05,
316
  "loss": 2.1605,
317
  "step": 4400
318
  },
319
  {
320
  "epoch": 0.5240479795039013,
321
+ "grad_norm": 0.9365679621696472,
322
  "learning_rate": 1.090315935718958e-05,
323
  "loss": 2.1108,
324
  "step": 4500
325
  },
326
  {
327
  "epoch": 0.5356934901595435,
328
+ "grad_norm": 0.8869751691818237,
329
  "learning_rate": 1.0497717811810748e-05,
330
+ "loss": 2.1365,
331
  "step": 4600
332
  },
333
  {
334
  "epoch": 0.5473390008151857,
335
+ "grad_norm": 0.7278423309326172,
336
  "learning_rate": 1.0091454068236455e-05,
337
  "loss": 2.1498,
338
  "step": 4700
339
  },
340
  {
341
  "epoch": 0.558984511470828,
342
+ "grad_norm": 1.1241728067398071,
343
  "learning_rate": 9.685039248353284e-06,
344
+ "loss": 2.1635,
345
  "step": 4800
346
  },
347
  {
348
  "epoch": 0.5706300221264703,
349
+ "grad_norm": 0.8585646748542786,
350
  "learning_rate": 9.279144723616279e-06,
351
+ "loss": 2.072,
352
  "step": 4900
353
  },
354
  {
355
  "epoch": 0.5822755327821125,
356
+ "grad_norm": 0.768528938293457,
357
  "learning_rate": 8.874441005985965e-06,
358
+ "loss": 2.1568,
359
  "step": 5000
360
  },
361
  {
362
  "epoch": 0.5939210434377548,
363
+ "grad_norm": 1.002091407775879,
364
  "learning_rate": 8.47159664028521e-06,
365
  "loss": 2.1631,
366
  "step": 5100
367
  },
368
  {
369
  "epoch": 0.605566554093397,
370
+ "grad_norm": 0.7817184329032898,
371
  "learning_rate": 8.07127709980564e-06,
372
+ "loss": 2.1203,
373
  "step": 5200
374
  },
375
  {
376
  "epoch": 0.6172120647490392,
377
+ "grad_norm": 1.3114322423934937,
378
  "learning_rate": 7.674143686988085e-06,
379
  "loss": 2.0871,
380
  "step": 5300
381
  },
382
  {
383
  "epoch": 0.6288575754046815,
384
+ "grad_norm": 0.7261312007904053,
385
  "learning_rate": 7.280852440992941e-06,
386
  "loss": 2.1289,
387
  "step": 5400
388
  },
389
  {
390
  "epoch": 0.6405030860603237,
391
+ "grad_norm": 0.9209310412406921,
392
  "learning_rate": 6.89205305396518e-06,
393
+ "loss": 2.1063,
394
  "step": 5500
395
  },
396
  {
397
  "epoch": 0.652148596715966,
398
+ "grad_norm": 0.9243987798690796,
399
  "learning_rate": 6.508387797784227e-06,
400
+ "loss": 2.0956,
401
  "step": 5600
402
  },
403
  {
404
  "epoch": 0.6637941073716083,
405
+ "grad_norm": 0.9812811017036438,
406
  "learning_rate": 6.130490463071604e-06,
407
+ "loss": 2.0918,
408
  "step": 5700
409
  },
410
  {
411
  "epoch": 0.6754396180272505,
412
+ "grad_norm": 0.8238828182220459,
413
  "learning_rate": 5.758985312209124e-06,
414
+ "loss": 2.1203,
415
  "step": 5800
416
  },
417
  {
418
  "epoch": 0.6870851286828927,
419
+ "grad_norm": 1.1043486595153809,
420
  "learning_rate": 5.394486048097099e-06,
421
+ "loss": 2.1159,
422
  "step": 5900
423
  },
424
  {
425
  "epoch": 0.698730639338535,
426
+ "grad_norm": 0.7944552898406982,
427
  "learning_rate": 5.037594800356142e-06,
428
+ "loss": 2.1157,
429
  "step": 6000
430
  },
431
  {
432
  "epoch": 0.7103761499941772,
433
+ "grad_norm": 1.1705522537231445,
434
  "learning_rate": 4.688901130647314e-06,
435
  "loss": 2.1396,
436
  "step": 6100
437
  },
438
  {
439
  "epoch": 0.7220216606498195,
440
+ "grad_norm": 0.8150696754455566,
441
  "learning_rate": 4.348981058753708e-06,
442
  "loss": 2.0819,
443
  "step": 6200
444
  },
445
  {
446
  "epoch": 0.7336671713054618,
447
+ "grad_norm": 1.0273561477661133,
448
  "learning_rate": 4.018396111032394e-06,
449
+ "loss": 2.1226,
450
  "step": 6300
451
  },
452
  {
453
  "epoch": 0.745312681961104,
454
+ "grad_norm": 0.9231303334236145,
455
  "learning_rate": 3.697692392808545e-06,
456
+ "loss": 2.1035,
457
  "step": 6400
458
  },
459
  {
460
  "epoch": 0.7569581926167462,
461
+ "grad_norm": 0.7557884454727173,
462
  "learning_rate": 3.387399686244144e-06,
463
+ "loss": 2.0954,
464
  "step": 6500
465
  },
466
  {
467
  "epoch": 0.7686037032723885,
468
+ "grad_norm": 0.8633888959884644,
469
  "learning_rate": 3.0880305751715402e-06,
470
+ "loss": 2.1199,
471
  "step": 6600
472
  },
473
  {
474
  "epoch": 0.7802492139280307,
475
+ "grad_norm": 0.8371317982673645,
476
  "learning_rate": 2.800079598337505e-06,
477
+ "loss": 2.1135,
478
  "step": 6700
479
  },
480
  {
481
  "epoch": 0.791894724583673,
482
+ "grad_norm": 0.7599378228187561,
483
  "learning_rate": 2.524022432456664e-06,
484
+ "loss": 2.1385,
485
  "step": 6800
486
  },
487
  {
488
  "epoch": 0.8035402352393153,
489
+ "grad_norm": 0.8745434880256653,
490
  "learning_rate": 2.260315106423807e-06,
491
+ "loss": 2.0778,
492
  "step": 6900
493
  },
494
  {
495
  "epoch": 0.8151857458949575,
496
+ "grad_norm": 0.871006429195404,
497
  "learning_rate": 2.0093932479830935e-06,
498
  "loss": 2.1367,
499
  "step": 7000
500
  },
501
  {
502
  "epoch": 0.8268312565505997,
503
+ "grad_norm": 1.034953236579895,
504
  "learning_rate": 1.7716713640987526e-06,
505
+ "loss": 2.1163,
506
  "step": 7100
507
  },
508
  {
509
  "epoch": 0.838476767206242,
510
+ "grad_norm": 0.8476694226264954,
511
  "learning_rate": 1.5475421562158854e-06,
512
  "loss": 2.1459,
513
  "step": 7200
514
  },
515
  {
516
  "epoch": 0.8501222778618842,
517
+ "grad_norm": 0.9011842012405396,
518
  "learning_rate": 1.3373758715426444e-06,
519
+ "loss": 2.1094,
520
  "step": 7300
521
  },
522
  {
523
  "epoch": 0.8617677885175264,
524
+ "grad_norm": 0.8178461790084839,
525
  "learning_rate": 1.141519691425379e-06,
526
+ "loss": 2.0853,
527
  "step": 7400
528
  },
529
  {
530
  "epoch": 0.8734132991731688,
531
+ "grad_norm": 0.898789644241333,
532
  "learning_rate": 9.60297157827106e-07,
533
+ "loss": 2.1359,
534
  "step": 7500
535
  },
536
  {
537
  "epoch": 0.885058809828811,
538
+ "grad_norm": 0.8626888990402222,
539
  "learning_rate": 7.94007638856753e-07,
540
+ "loss": 2.1405,
541
  "step": 7600
542
  },
543
  {
544
  "epoch": 0.8967043204844533,
545
+ "grad_norm": 0.8167033791542053,
546
  "learning_rate": 6.429258342320677e-07,
547
+ "loss": 2.151,
548
  "step": 7700
549
  },
550
  {
551
  "epoch": 0.9083498311400955,
552
+ "grad_norm": 0.855806827545166,
553
  "learning_rate": 5.073013214931377e-07,
554
+ "loss": 2.1835,
555
  "step": 7800
556
  },
557
  {
558
  "epoch": 0.9199953417957377,
559
+ "grad_norm": 0.8845811486244202,
560
  "learning_rate": 3.8735814371615554e-07,
561
+ "loss": 2.1314,
562
  "step": 7900
563
  },
564
  {
565
  "epoch": 0.93164085245138,
566
+ "grad_norm": 0.7994192838668823,
567
  "learning_rate": 2.8329443940849577e-07,
568
  "loss": 2.1349,
569
  "step": 8000
570
  },
571
  {
572
  "epoch": 0.9432863631070223,
573
+ "grad_norm": 0.792962372303009,
574
  "learning_rate": 1.9528211519649341e-07,
575
  "loss": 2.094,
576
  "step": 8100
577
  },
578
  {
579
  "epoch": 0.9549318737626645,
580
+ "grad_norm": 1.0241106748580933,
581
  "learning_rate": 1.234665618466202e-07,
582
+ "loss": 2.1502,
583
  "step": 8200
584
  },
585
  {
586
  "epoch": 0.9665773844183068,
587
+ "grad_norm": 0.8663437366485596,
588
  "learning_rate": 6.796641408917671e-08,
589
  "loss": 2.0692,
590
  "step": 8300
591
  },
592
  {
593
  "epoch": 0.978222895073949,
594
+ "grad_norm": 0.7561269402503967,
595
  "learning_rate": 2.8873354641258955e-08,
596
+ "loss": 2.0988,
597
  "step": 8400
598
  },
599
  {
600
  "epoch": 0.9898684057295912,
601
+ "grad_norm": 1.3960742950439453,
602
+ "learning_rate": 6.2519627527379836e-09,
603
  "loss": 2.1445,
604
  "step": 8500
605
  },
606
  {
607
  "epoch": 1.0,
608
+ "eval_loss": 1.9266563653945923,
609
+ "eval_runtime": 63.7186,
610
+ "eval_samples_per_second": 15.537,
611
+ "eval_steps_per_second": 1.946,
612
  "step": 8587
613
  },
614
  {
615
  "epoch": 1.0,
616
  "step": 8587,
617
  "total_flos": 1.5640093507584e+17,
618
+ "train_loss": 2.1807946249230077,
619
+ "train_runtime": 2485.1722,
620
+ "train_samples_per_second": 6.91,
621
+ "train_steps_per_second": 3.455
622
  }
623
  ],
624
  "logging_steps": 100,