hajeong67 commited on
Commit
6b16211
·
verified ·
1 Parent(s): 90877ee

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -20,8 +20,8 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "q_proj",
24
- "v_proj"
25
  ],
26
  "task_type": "CAUSAL_LM",
27
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "v_proj",
24
+ "q_proj"
25
  ],
26
  "task_type": "CAUSAL_LM",
27
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d94d9430a83a39f82048c46f2d8c46075d242e146a63e2d859932cb8d08913a9
3
  size 3416264
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0346bf3d1823e89b4797054ab8629cb0ecb7942156d4832bce2f4d073b71f201
3
  size 3416264
checkpoint-1050/adapter_config.json CHANGED
@@ -20,8 +20,8 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "q_proj",
24
- "v_proj"
25
  ],
26
  "task_type": "CAUSAL_LM",
27
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "v_proj",
24
+ "q_proj"
25
  ],
26
  "task_type": "CAUSAL_LM",
27
  "use_dora": false,
checkpoint-1050/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d94d9430a83a39f82048c46f2d8c46075d242e146a63e2d859932cb8d08913a9
3
  size 3416264
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0346bf3d1823e89b4797054ab8629cb0ecb7942156d4832bce2f4d073b71f201
3
  size 3416264
checkpoint-1050/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37f8b9babe4bcfbcab994fc22e50ede05b7d24f5911a4988b32409b98159d1bb
3
  size 6869818
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65c69ad74e25d4540bbfeb36ef99adc29f6a5f11187f375b6dcf335632f9bf77
3
  size 6869818
checkpoint-1050/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:45173af0ed6f30125c38af1acfce62d5161a517d2a1ddc5c148791675731ae3e
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7358d9c30f2639fd60afd346acbf92ecea749dafe0ff584e147ba759751f266
3
  size 14244
checkpoint-1050/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d5dd934cf554c131f164b64d8667367520239153e8e36f14d7c5f59c2d40a7c
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3dd067d46d8bf3a73453265c0d5998bca08a0fa082902b0c1a537936aedcddc6
3
  size 1064
checkpoint-1050/trainer_state.json CHANGED
@@ -10,761 +10,761 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.02857142857142857,
13
- "grad_norm": 9.284814834594727,
14
- "learning_rate": 9.904761904761905e-05,
15
- "loss": 2.0184,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.05714285714285714,
20
- "grad_norm": 12.879095077514648,
21
- "learning_rate": 9.80952380952381e-05,
22
- "loss": 1.7222,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.08571428571428572,
27
- "grad_norm": 17.29737663269043,
28
- "learning_rate": 9.714285714285715e-05,
29
- "loss": 1.7591,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.11428571428571428,
34
- "grad_norm": 5.025689125061035,
35
- "learning_rate": 9.61904761904762e-05,
36
- "loss": 1.7304,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.14285714285714285,
41
- "grad_norm": 10.627418518066406,
42
- "learning_rate": 9.523809523809524e-05,
43
- "loss": 1.8282,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.17142857142857143,
48
- "grad_norm": 5.618676662445068,
49
- "learning_rate": 9.428571428571429e-05,
50
- "loss": 1.6407,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.2,
55
- "grad_norm": 13.271527290344238,
56
- "learning_rate": 9.333333333333334e-05,
57
- "loss": 1.7611,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.22857142857142856,
62
- "grad_norm": 7.040605545043945,
63
- "learning_rate": 9.238095238095239e-05,
64
- "loss": 1.8149,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.2571428571428571,
69
- "grad_norm": 9.282252311706543,
70
- "learning_rate": 9.142857142857143e-05,
71
- "loss": 1.656,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.2857142857142857,
76
- "grad_norm": 7.04727029800415,
77
- "learning_rate": 9.047619047619048e-05,
78
- "loss": 1.6419,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.3142857142857143,
83
- "grad_norm": 5.5171122550964355,
84
- "learning_rate": 8.952380952380953e-05,
85
- "loss": 1.682,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.34285714285714286,
90
- "grad_norm": 9.009190559387207,
91
- "learning_rate": 8.857142857142857e-05,
92
- "loss": 1.4019,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.37142857142857144,
97
- "grad_norm": 11.160100936889648,
98
- "learning_rate": 8.761904761904762e-05,
99
- "loss": 1.5941,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.4,
104
- "grad_norm": 6.663743019104004,
105
- "learning_rate": 8.666666666666667e-05,
106
- "loss": 1.4869,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.42857142857142855,
111
- "grad_norm": 7.955056667327881,
112
- "learning_rate": 8.571428571428571e-05,
113
- "loss": 1.6137,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.45714285714285713,
118
- "grad_norm": 7.169972896575928,
119
- "learning_rate": 8.476190476190477e-05,
120
- "loss": 1.7435,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.4857142857142857,
125
- "grad_norm": 4.778314113616943,
126
- "learning_rate": 8.380952380952382e-05,
127
- "loss": 1.3997,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.5142857142857142,
132
- "grad_norm": 8.540094375610352,
133
- "learning_rate": 8.285714285714287e-05,
134
- "loss": 1.5345,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.5428571428571428,
139
- "grad_norm": 8.377891540527344,
140
- "learning_rate": 8.19047619047619e-05,
141
- "loss": 1.4992,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.5714285714285714,
146
- "grad_norm": 6.053997039794922,
147
- "learning_rate": 8.095238095238096e-05,
148
- "loss": 1.4046,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.6,
153
- "grad_norm": 11.835803985595703,
154
- "learning_rate": 8e-05,
155
- "loss": 1.3463,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.6285714285714286,
160
- "grad_norm": 7.073652744293213,
161
- "learning_rate": 7.904761904761905e-05,
162
- "loss": 1.3764,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.6571428571428571,
167
- "grad_norm": 10.55458927154541,
168
- "learning_rate": 7.80952380952381e-05,
169
- "loss": 1.5288,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.6857142857142857,
174
- "grad_norm": 6.555107593536377,
175
- "learning_rate": 7.714285714285715e-05,
176
- "loss": 1.3578,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.7142857142857143,
181
- "grad_norm": 5.762172222137451,
182
- "learning_rate": 7.619047619047618e-05,
183
- "loss": 1.4635,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.7428571428571429,
188
- "grad_norm": 5.80485200881958,
189
- "learning_rate": 7.523809523809524e-05,
190
- "loss": 1.3706,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.7714285714285715,
195
- "grad_norm": 9.550569534301758,
196
- "learning_rate": 7.428571428571429e-05,
197
- "loss": 1.4227,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.8,
202
- "grad_norm": 6.621246814727783,
203
- "learning_rate": 7.333333333333333e-05,
204
- "loss": 1.3751,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.8285714285714286,
209
- "grad_norm": 7.168375492095947,
210
- "learning_rate": 7.238095238095238e-05,
211
- "loss": 1.3472,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.8571428571428571,
216
- "grad_norm": 11.96556568145752,
217
- "learning_rate": 7.142857142857143e-05,
218
- "loss": 1.33,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.8857142857142857,
223
- "grad_norm": 10.02365779876709,
224
- "learning_rate": 7.047619047619048e-05,
225
- "loss": 1.3553,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.9142857142857143,
230
- "grad_norm": 9.006269454956055,
231
- "learning_rate": 6.952380952380952e-05,
232
- "loss": 1.5504,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.9428571428571428,
237
- "grad_norm": 7.228586673736572,
238
- "learning_rate": 6.857142857142858e-05,
239
- "loss": 1.3597,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.9714285714285714,
244
- "grad_norm": 18.761024475097656,
245
- "learning_rate": 6.761904761904763e-05,
246
- "loss": 1.332,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 1.0,
251
- "grad_norm": 9.89370346069336,
252
- "learning_rate": 6.666666666666667e-05,
253
- "loss": 1.4027,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 1.0,
258
- "eval_loss": 1.2965246438980103,
259
- "eval_runtime": 16.113,
260
- "eval_samples_per_second": 12.412,
261
- "eval_steps_per_second": 1.552,
262
  "step": 350
263
  },
264
  {
265
  "epoch": 1.0285714285714285,
266
- "grad_norm": 14.148599624633789,
267
- "learning_rate": 6.571428571428571e-05,
268
- "loss": 1.322,
269
  "step": 360
270
  },
271
  {
272
  "epoch": 1.0571428571428572,
273
- "grad_norm": 6.457850456237793,
274
- "learning_rate": 6.476190476190477e-05,
275
- "loss": 1.4209,
276
  "step": 370
277
  },
278
  {
279
  "epoch": 1.0857142857142856,
280
- "grad_norm": 7.997476100921631,
281
- "learning_rate": 6.38095238095238e-05,
282
- "loss": 1.3045,
283
  "step": 380
284
  },
285
  {
286
  "epoch": 1.1142857142857143,
287
- "grad_norm": 11.429621696472168,
288
- "learning_rate": 6.285714285714286e-05,
289
- "loss": 1.3239,
290
  "step": 390
291
  },
292
  {
293
  "epoch": 1.1428571428571428,
294
- "grad_norm": 8.54537582397461,
295
- "learning_rate": 6.19047619047619e-05,
296
- "loss": 1.3079,
297
  "step": 400
298
  },
299
  {
300
  "epoch": 1.1714285714285715,
301
- "grad_norm": 9.194470405578613,
302
- "learning_rate": 6.0952380952380964e-05,
303
- "loss": 1.3014,
304
  "step": 410
305
  },
306
  {
307
  "epoch": 1.2,
308
- "grad_norm": 6.942721366882324,
309
- "learning_rate": 6e-05,
310
- "loss": 1.3108,
311
  "step": 420
312
  },
313
  {
314
  "epoch": 1.2285714285714286,
315
- "grad_norm": 17.95640754699707,
316
- "learning_rate": 5.904761904761905e-05,
317
- "loss": 1.2473,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.2571428571428571,
322
- "grad_norm": 6.065295696258545,
323
- "learning_rate": 5.8095238095238104e-05,
324
- "loss": 1.1563,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.2857142857142856,
329
- "grad_norm": 18.216981887817383,
330
- "learning_rate": 5.714285714285714e-05,
331
- "loss": 1.5198,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.3142857142857143,
336
- "grad_norm": 7.645397186279297,
337
- "learning_rate": 5.619047619047619e-05,
338
- "loss": 1.294,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.342857142857143,
343
- "grad_norm": 8.216632843017578,
344
- "learning_rate": 5.5238095238095244e-05,
345
- "loss": 1.3319,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.3714285714285714,
350
- "grad_norm": 11.892958641052246,
351
- "learning_rate": 5.428571428571428e-05,
352
- "loss": 1.349,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.4,
357
- "grad_norm": 7.598433494567871,
358
- "learning_rate": 5.333333333333333e-05,
359
- "loss": 1.2459,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.4285714285714286,
364
- "grad_norm": 7.4001898765563965,
365
- "learning_rate": 5.2380952380952384e-05,
366
- "loss": 1.2531,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.457142857142857,
371
- "grad_norm": 9.964533805847168,
372
- "learning_rate": 5.142857142857143e-05,
373
- "loss": 1.2326,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.4857142857142858,
378
- "grad_norm": 10.863306045532227,
379
- "learning_rate": 5.047619047619048e-05,
380
- "loss": 1.1964,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.5142857142857142,
385
- "grad_norm": 7.798760414123535,
386
- "learning_rate": 4.9523809523809525e-05,
387
- "loss": 1.2377,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.5428571428571427,
392
- "grad_norm": 5.575809001922607,
393
- "learning_rate": 4.8571428571428576e-05,
394
- "loss": 1.2545,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.5714285714285714,
399
- "grad_norm": 7.334414958953857,
400
- "learning_rate": 4.761904761904762e-05,
401
- "loss": 1.2441,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.6,
406
- "grad_norm": 7.7508745193481445,
407
- "learning_rate": 4.666666666666667e-05,
408
- "loss": 1.3679,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.6285714285714286,
413
- "grad_norm": 8.727417945861816,
414
- "learning_rate": 4.5714285714285716e-05,
415
- "loss": 1.366,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.657142857142857,
420
- "grad_norm": 11.305671691894531,
421
- "learning_rate": 4.476190476190477e-05,
422
- "loss": 1.2059,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.6857142857142857,
427
- "grad_norm": 12.226043701171875,
428
- "learning_rate": 4.380952380952381e-05,
429
- "loss": 1.3986,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.7142857142857144,
434
- "grad_norm": 9.918877601623535,
435
- "learning_rate": 4.2857142857142856e-05,
436
- "loss": 1.2768,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.7428571428571429,
441
- "grad_norm": 7.957796096801758,
442
- "learning_rate": 4.190476190476191e-05,
443
- "loss": 1.2662,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.7714285714285714,
448
- "grad_norm": 10.324079513549805,
449
- "learning_rate": 4.095238095238095e-05,
450
- "loss": 1.2392,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 1.8,
455
- "grad_norm": 8.535161972045898,
456
- "learning_rate": 4e-05,
457
- "loss": 1.2802,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 1.8285714285714287,
462
- "grad_norm": 6.386439800262451,
463
- "learning_rate": 3.904761904761905e-05,
464
- "loss": 1.2973,
465
  "step": 640
466
  },
467
  {
468
  "epoch": 1.8571428571428572,
469
- "grad_norm": 16.93861961364746,
470
- "learning_rate": 3.809523809523809e-05,
471
- "loss": 1.1805,
472
  "step": 650
473
  },
474
  {
475
  "epoch": 1.8857142857142857,
476
- "grad_norm": 17.099620819091797,
477
- "learning_rate": 3.7142857142857143e-05,
478
- "loss": 1.2095,
479
  "step": 660
480
  },
481
  {
482
  "epoch": 1.9142857142857141,
483
- "grad_norm": 9.49264144897461,
484
- "learning_rate": 3.619047619047619e-05,
485
- "loss": 1.2924,
486
  "step": 670
487
  },
488
  {
489
  "epoch": 1.9428571428571428,
490
- "grad_norm": 5.8994364738464355,
491
- "learning_rate": 3.523809523809524e-05,
492
- "loss": 1.2577,
493
  "step": 680
494
  },
495
  {
496
  "epoch": 1.9714285714285715,
497
- "grad_norm": 7.082160472869873,
498
- "learning_rate": 3.428571428571429e-05,
499
- "loss": 1.2792,
500
  "step": 690
501
  },
502
  {
503
  "epoch": 2.0,
504
- "grad_norm": 9.511951446533203,
505
- "learning_rate": 3.3333333333333335e-05,
506
- "loss": 1.2854,
507
  "step": 700
508
  },
509
  {
510
  "epoch": 2.0,
511
- "eval_loss": 1.2146018743515015,
512
- "eval_runtime": 16.0983,
513
- "eval_samples_per_second": 12.424,
514
- "eval_steps_per_second": 1.553,
515
  "step": 700
516
  },
517
  {
518
  "epoch": 2.0285714285714285,
519
- "grad_norm": 6.5345964431762695,
520
- "learning_rate": 3.2380952380952386e-05,
521
- "loss": 1.1684,
522
  "step": 710
523
  },
524
  {
525
  "epoch": 2.057142857142857,
526
- "grad_norm": 9.766515731811523,
527
- "learning_rate": 3.142857142857143e-05,
528
- "loss": 1.1704,
529
  "step": 720
530
  },
531
  {
532
  "epoch": 2.085714285714286,
533
- "grad_norm": 9.135540008544922,
534
- "learning_rate": 3.0476190476190482e-05,
535
- "loss": 1.1694,
536
  "step": 730
537
  },
538
  {
539
  "epoch": 2.1142857142857143,
540
- "grad_norm": 9.21896743774414,
541
- "learning_rate": 2.9523809523809526e-05,
542
- "loss": 1.1738,
543
  "step": 740
544
  },
545
  {
546
  "epoch": 2.142857142857143,
547
- "grad_norm": 9.866166114807129,
548
- "learning_rate": 2.857142857142857e-05,
549
- "loss": 1.3546,
550
  "step": 750
551
  },
552
  {
553
  "epoch": 2.1714285714285713,
554
- "grad_norm": 10.150802612304688,
555
- "learning_rate": 2.7619047619047622e-05,
556
- "loss": 1.2242,
557
  "step": 760
558
  },
559
  {
560
  "epoch": 2.2,
561
- "grad_norm": 6.537503719329834,
562
- "learning_rate": 2.6666666666666667e-05,
563
- "loss": 1.2067,
564
  "step": 770
565
  },
566
  {
567
  "epoch": 2.2285714285714286,
568
- "grad_norm": 9.176528930664062,
569
- "learning_rate": 2.5714285714285714e-05,
570
- "loss": 1.3008,
571
  "step": 780
572
  },
573
  {
574
  "epoch": 2.257142857142857,
575
- "grad_norm": 16.931442260742188,
576
- "learning_rate": 2.4761904761904762e-05,
577
- "loss": 1.1371,
578
  "step": 790
579
  },
580
  {
581
  "epoch": 2.2857142857142856,
582
- "grad_norm": 14.350312232971191,
583
- "learning_rate": 2.380952380952381e-05,
584
- "loss": 1.1948,
585
  "step": 800
586
  },
587
  {
588
  "epoch": 2.314285714285714,
589
- "grad_norm": 8.385422706604004,
590
- "learning_rate": 2.2857142857142858e-05,
591
- "loss": 1.1904,
592
  "step": 810
593
  },
594
  {
595
  "epoch": 2.342857142857143,
596
- "grad_norm": 8.879819869995117,
597
- "learning_rate": 2.1904761904761906e-05,
598
- "loss": 1.1808,
599
  "step": 820
600
  },
601
  {
602
  "epoch": 2.3714285714285714,
603
- "grad_norm": 11.392715454101562,
604
- "learning_rate": 2.0952380952380954e-05,
605
- "loss": 1.2958,
606
  "step": 830
607
  },
608
  {
609
  "epoch": 2.4,
610
- "grad_norm": 12.15937614440918,
611
- "learning_rate": 2e-05,
612
- "loss": 1.1463,
613
  "step": 840
614
  },
615
  {
616
  "epoch": 2.4285714285714284,
617
- "grad_norm": 16.796415328979492,
618
- "learning_rate": 1.9047619047619046e-05,
619
- "loss": 1.1485,
620
  "step": 850
621
  },
622
  {
623
  "epoch": 2.4571428571428573,
624
- "grad_norm": 11.125741958618164,
625
- "learning_rate": 1.8095238095238094e-05,
626
- "loss": 1.2561,
627
  "step": 860
628
  },
629
  {
630
  "epoch": 2.4857142857142858,
631
- "grad_norm": 5.766628265380859,
632
- "learning_rate": 1.7142857142857145e-05,
633
- "loss": 1.265,
634
  "step": 870
635
  },
636
  {
637
  "epoch": 2.5142857142857142,
638
- "grad_norm": 7.663820266723633,
639
- "learning_rate": 1.6190476190476193e-05,
640
- "loss": 1.2337,
641
  "step": 880
642
  },
643
  {
644
  "epoch": 2.5428571428571427,
645
- "grad_norm": 8.046570777893066,
646
- "learning_rate": 1.5238095238095241e-05,
647
- "loss": 1.3382,
648
  "step": 890
649
  },
650
  {
651
  "epoch": 2.571428571428571,
652
- "grad_norm": 8.289289474487305,
653
- "learning_rate": 1.4285714285714285e-05,
654
- "loss": 1.0253,
655
  "step": 900
656
  },
657
  {
658
  "epoch": 2.6,
659
- "grad_norm": 6.765757083892822,
660
- "learning_rate": 1.3333333333333333e-05,
661
- "loss": 1.1975,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 2.6285714285714286,
666
- "grad_norm": 5.381499290466309,
667
- "learning_rate": 1.2380952380952381e-05,
668
- "loss": 1.2505,
669
  "step": 920
670
  },
671
  {
672
  "epoch": 2.657142857142857,
673
- "grad_norm": 9.704530715942383,
674
- "learning_rate": 1.1428571428571429e-05,
675
- "loss": 1.143,
676
  "step": 930
677
  },
678
  {
679
  "epoch": 2.685714285714286,
680
- "grad_norm": 9.753270149230957,
681
- "learning_rate": 1.0476190476190477e-05,
682
- "loss": 1.2898,
683
  "step": 940
684
  },
685
  {
686
  "epoch": 2.7142857142857144,
687
- "grad_norm": 8.451569557189941,
688
- "learning_rate": 9.523809523809523e-06,
689
- "loss": 1.1701,
690
  "step": 950
691
  },
692
  {
693
  "epoch": 2.742857142857143,
694
- "grad_norm": 15.7116060256958,
695
- "learning_rate": 8.571428571428573e-06,
696
- "loss": 1.1194,
697
  "step": 960
698
  },
699
  {
700
  "epoch": 2.7714285714285714,
701
- "grad_norm": 10.655159950256348,
702
- "learning_rate": 7.6190476190476205e-06,
703
- "loss": 1.252,
704
  "step": 970
705
  },
706
  {
707
  "epoch": 2.8,
708
- "grad_norm": 7.193419456481934,
709
- "learning_rate": 6.666666666666667e-06,
710
- "loss": 1.2624,
711
  "step": 980
712
  },
713
  {
714
  "epoch": 2.8285714285714287,
715
- "grad_norm": 5.822306156158447,
716
- "learning_rate": 5.7142857142857145e-06,
717
- "loss": 1.2784,
718
  "step": 990
719
  },
720
  {
721
  "epoch": 2.857142857142857,
722
- "grad_norm": 7.571345806121826,
723
- "learning_rate": 4.7619047619047615e-06,
724
- "loss": 1.2404,
725
  "step": 1000
726
  },
727
  {
728
  "epoch": 2.8857142857142857,
729
- "grad_norm": 8.050336837768555,
730
- "learning_rate": 3.8095238095238102e-06,
731
- "loss": 1.2279,
732
  "step": 1010
733
  },
734
  {
735
  "epoch": 2.914285714285714,
736
- "grad_norm": 5.833733558654785,
737
- "learning_rate": 2.8571428571428573e-06,
738
- "loss": 1.1379,
739
  "step": 1020
740
  },
741
  {
742
  "epoch": 2.942857142857143,
743
- "grad_norm": 6.999922275543213,
744
- "learning_rate": 1.9047619047619051e-06,
745
- "loss": 1.0732,
746
  "step": 1030
747
  },
748
  {
749
  "epoch": 2.9714285714285715,
750
- "grad_norm": 7.655697822570801,
751
- "learning_rate": 9.523809523809526e-07,
752
- "loss": 1.1059,
753
  "step": 1040
754
  },
755
  {
756
  "epoch": 3.0,
757
- "grad_norm": 7.054152488708496,
758
  "learning_rate": 0.0,
759
- "loss": 1.2239,
760
  "step": 1050
761
  },
762
  {
763
  "epoch": 3.0,
764
- "eval_loss": 1.1520273685455322,
765
- "eval_runtime": 16.1113,
766
- "eval_samples_per_second": 12.414,
767
- "eval_steps_per_second": 1.552,
768
  "step": 1050
769
  }
770
  ],
@@ -785,7 +785,7 @@
785
  "attributes": {}
786
  }
787
  },
788
- "total_flos": 6337314611527680.0,
789
  "train_batch_size": 8,
790
  "trial_name": null,
791
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.02857142857142857,
13
+ "grad_norm": 5.210493087768555,
14
+ "learning_rate": 0.0004952380952380952,
15
+ "loss": 4.4131,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.05714285714285714,
20
+ "grad_norm": 3.571948528289795,
21
+ "learning_rate": 0.0004904761904761905,
22
+ "loss": 2.5476,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.08571428571428572,
27
+ "grad_norm": 2.640848398208618,
28
+ "learning_rate": 0.0004857142857142857,
29
+ "loss": 2.2491,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.11428571428571428,
34
+ "grad_norm": 2.6611335277557373,
35
+ "learning_rate": 0.00048095238095238095,
36
+ "loss": 2.0477,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.14285714285714285,
41
+ "grad_norm": 2.608750581741333,
42
+ "learning_rate": 0.0004761904761904762,
43
+ "loss": 2.028,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.17142857142857143,
48
+ "grad_norm": 5.034807205200195,
49
+ "learning_rate": 0.0004714285714285714,
50
+ "loss": 1.7871,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.2,
55
+ "grad_norm": 3.5893642902374268,
56
+ "learning_rate": 0.00046666666666666666,
57
+ "loss": 1.7879,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.22857142857142856,
62
+ "grad_norm": 5.368953227996826,
63
+ "learning_rate": 0.00046190476190476195,
64
+ "loss": 1.8194,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.2571428571428571,
69
+ "grad_norm": 3.7377731800079346,
70
+ "learning_rate": 0.00045714285714285713,
71
+ "loss": 1.6159,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.2857142857142857,
76
+ "grad_norm": 7.883308410644531,
77
+ "learning_rate": 0.00045238095238095237,
78
+ "loss": 1.6244,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.3142857142857143,
83
+ "grad_norm": 4.968689918518066,
84
+ "learning_rate": 0.00044761904761904766,
85
+ "loss": 1.6614,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.34285714285714286,
90
+ "grad_norm": 2.592766046524048,
91
+ "learning_rate": 0.00044285714285714284,
92
+ "loss": 1.349,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.37142857142857144,
97
+ "grad_norm": 2.783951997756958,
98
+ "learning_rate": 0.0004380952380952381,
99
+ "loss": 1.5065,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.4,
104
+ "grad_norm": 5.2810378074646,
105
+ "learning_rate": 0.00043333333333333337,
106
+ "loss": 1.3934,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.42857142857142855,
111
+ "grad_norm": 4.448086261749268,
112
+ "learning_rate": 0.00042857142857142855,
113
+ "loss": 1.5198,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.45714285714285713,
118
+ "grad_norm": 3.211707353591919,
119
+ "learning_rate": 0.0004238095238095238,
120
+ "loss": 1.6207,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.4857142857142857,
125
+ "grad_norm": 3.586463212966919,
126
+ "learning_rate": 0.0004190476190476191,
127
+ "loss": 1.2687,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.5142857142857142,
132
+ "grad_norm": 2.6598610877990723,
133
+ "learning_rate": 0.0004142857142857143,
134
+ "loss": 1.3957,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.5428571428571428,
139
+ "grad_norm": 3.480663776397705,
140
+ "learning_rate": 0.00040952380952380955,
141
+ "loss": 1.3543,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.5714285714285714,
146
+ "grad_norm": 3.168818712234497,
147
+ "learning_rate": 0.0004047619047619048,
148
+ "loss": 1.2619,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.6,
153
+ "grad_norm": 2.7529921531677246,
154
+ "learning_rate": 0.0004,
155
+ "loss": 1.1871,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.6285714285714286,
160
+ "grad_norm": 6.538937568664551,
161
+ "learning_rate": 0.00039523809523809526,
162
+ "loss": 1.268,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.6571428571428571,
167
+ "grad_norm": 3.3910727500915527,
168
+ "learning_rate": 0.0003904761904761905,
169
+ "loss": 1.4192,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.6857142857142857,
174
+ "grad_norm": 2.31817889213562,
175
+ "learning_rate": 0.0003857142857142857,
176
+ "loss": 1.2339,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.7142857142857143,
181
+ "grad_norm": 2.8181235790252686,
182
+ "learning_rate": 0.00038095238095238096,
183
+ "loss": 1.3153,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.7428571428571429,
188
+ "grad_norm": 2.5716922283172607,
189
+ "learning_rate": 0.0003761904761904762,
190
+ "loss": 1.2066,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.7714285714285715,
195
+ "grad_norm": 5.403870105743408,
196
+ "learning_rate": 0.00037142857142857143,
197
+ "loss": 1.2784,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.8,
202
+ "grad_norm": 3.347729206085205,
203
+ "learning_rate": 0.00036666666666666667,
204
+ "loss": 1.2273,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.8285714285714286,
209
+ "grad_norm": 2.7995996475219727,
210
+ "learning_rate": 0.0003619047619047619,
211
+ "loss": 1.159,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.8571428571428571,
216
+ "grad_norm": 3.817213535308838,
217
+ "learning_rate": 0.00035714285714285714,
218
+ "loss": 1.1146,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.8857142857142857,
223
+ "grad_norm": 3.3239715099334717,
224
+ "learning_rate": 0.00035238095238095243,
225
+ "loss": 1.1513,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.9142857142857143,
230
+ "grad_norm": 3.042973518371582,
231
+ "learning_rate": 0.0003476190476190476,
232
+ "loss": 1.3906,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.9428571428571428,
237
+ "grad_norm": 2.8079681396484375,
238
+ "learning_rate": 0.00034285714285714285,
239
+ "loss": 1.1547,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.9714285714285714,
244
+ "grad_norm": 6.39453649520874,
245
+ "learning_rate": 0.00033809523809523814,
246
+ "loss": 1.0989,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 1.0,
251
+ "grad_norm": 5.739945411682129,
252
+ "learning_rate": 0.0003333333333333333,
253
+ "loss": 1.1563,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 1.0,
258
+ "eval_loss": 1.064656376838684,
259
+ "eval_runtime": 16.5324,
260
+ "eval_samples_per_second": 12.097,
261
+ "eval_steps_per_second": 1.512,
262
  "step": 350
263
  },
264
  {
265
  "epoch": 1.0285714285714285,
266
+ "grad_norm": 5.899299144744873,
267
+ "learning_rate": 0.00032857142857142856,
268
+ "loss": 1.0649,
269
  "step": 360
270
  },
271
  {
272
  "epoch": 1.0571428571428572,
273
+ "grad_norm": 3.3639721870422363,
274
+ "learning_rate": 0.00032380952380952385,
275
+ "loss": 1.1595,
276
  "step": 370
277
  },
278
  {
279
  "epoch": 1.0857142857142856,
280
+ "grad_norm": 2.811561346054077,
281
+ "learning_rate": 0.00031904761904761903,
282
+ "loss": 1.0871,
283
  "step": 380
284
  },
285
  {
286
  "epoch": 1.1142857142857143,
287
+ "grad_norm": 2.6730055809020996,
288
+ "learning_rate": 0.00031428571428571427,
289
+ "loss": 1.0294,
290
  "step": 390
291
  },
292
  {
293
  "epoch": 1.1428571428571428,
294
+ "grad_norm": 5.02337646484375,
295
+ "learning_rate": 0.00030952380952380956,
296
+ "loss": 1.0844,
297
  "step": 400
298
  },
299
  {
300
  "epoch": 1.1714285714285715,
301
+ "grad_norm": 4.188799858093262,
302
+ "learning_rate": 0.0003047619047619048,
303
+ "loss": 1.0464,
304
  "step": 410
305
  },
306
  {
307
  "epoch": 1.2,
308
+ "grad_norm": 5.055841445922852,
309
+ "learning_rate": 0.0003,
310
+ "loss": 1.1078,
311
  "step": 420
312
  },
313
  {
314
  "epoch": 1.2285714285714286,
315
+ "grad_norm": 8.09494400024414,
316
+ "learning_rate": 0.00029523809523809526,
317
+ "loss": 1.0539,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.2571428571428571,
322
+ "grad_norm": 3.012653112411499,
323
+ "learning_rate": 0.0002904761904761905,
324
+ "loss": 0.927,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.2857142857142856,
329
+ "grad_norm": 8.38936710357666,
330
+ "learning_rate": 0.0002857142857142857,
331
+ "loss": 1.2209,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.3142857142857143,
336
+ "grad_norm": 4.191105365753174,
337
+ "learning_rate": 0.00028095238095238097,
338
+ "loss": 1.0086,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.342857142857143,
343
+ "grad_norm": 2.9886045455932617,
344
+ "learning_rate": 0.0002761904761904762,
345
+ "loss": 1.0788,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.3714285714285714,
350
+ "grad_norm": 3.5892975330352783,
351
+ "learning_rate": 0.0002714285714285714,
352
+ "loss": 1.0877,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.4,
357
+ "grad_norm": 4.504238128662109,
358
+ "learning_rate": 0.0002666666666666667,
359
+ "loss": 1.0066,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.4285714285714286,
364
+ "grad_norm": 3.601853132247925,
365
+ "learning_rate": 0.0002619047619047619,
366
+ "loss": 0.9915,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.457142857142857,
371
+ "grad_norm": 4.008484840393066,
372
+ "learning_rate": 0.0002571428571428571,
373
+ "loss": 0.9651,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.4857142857142858,
378
+ "grad_norm": 5.5403900146484375,
379
+ "learning_rate": 0.0002523809523809524,
380
+ "loss": 0.9018,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.5142857142857142,
385
+ "grad_norm": 3.526982069015503,
386
+ "learning_rate": 0.0002476190476190476,
387
+ "loss": 0.9588,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.5428571428571427,
392
+ "grad_norm": 3.666804075241089,
393
+ "learning_rate": 0.00024285714285714286,
394
+ "loss": 1.0092,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.5714285714285714,
399
+ "grad_norm": 3.4340310096740723,
400
+ "learning_rate": 0.0002380952380952381,
401
+ "loss": 0.9342,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.6,
406
+ "grad_norm": 5.815408229827881,
407
+ "learning_rate": 0.00023333333333333333,
408
+ "loss": 1.0666,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.6285714285714286,
413
+ "grad_norm": 2.7780840396881104,
414
+ "learning_rate": 0.00022857142857142857,
415
+ "loss": 1.1026,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.657142857142857,
420
+ "grad_norm": 3.8484044075012207,
421
+ "learning_rate": 0.00022380952380952383,
422
+ "loss": 0.937,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.6857142857142857,
427
+ "grad_norm": 3.819007635116577,
428
+ "learning_rate": 0.00021904761904761904,
429
+ "loss": 1.1247,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.7142857142857144,
434
+ "grad_norm": 3.8324477672576904,
435
+ "learning_rate": 0.00021428571428571427,
436
+ "loss": 0.9961,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.7428571428571429,
441
+ "grad_norm": 3.2769389152526855,
442
+ "learning_rate": 0.00020952380952380954,
443
+ "loss": 0.9987,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.7714285714285714,
448
+ "grad_norm": 5.3248443603515625,
449
+ "learning_rate": 0.00020476190476190477,
450
+ "loss": 0.9219,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 1.8,
455
+ "grad_norm": 2.6656081676483154,
456
+ "learning_rate": 0.0002,
457
+ "loss": 1.0293,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 1.8285714285714287,
462
+ "grad_norm": 2.6429567337036133,
463
+ "learning_rate": 0.00019523809523809525,
464
+ "loss": 1.0106,
465
  "step": 640
466
  },
467
  {
468
  "epoch": 1.8571428571428572,
469
+ "grad_norm": 4.4341864585876465,
470
+ "learning_rate": 0.00019047619047619048,
471
+ "loss": 0.8426,
472
  "step": 650
473
  },
474
  {
475
  "epoch": 1.8857142857142857,
476
+ "grad_norm": 4.535873889923096,
477
+ "learning_rate": 0.00018571428571428572,
478
+ "loss": 0.9335,
479
  "step": 660
480
  },
481
  {
482
  "epoch": 1.9142857142857141,
483
+ "grad_norm": 4.100979328155518,
484
+ "learning_rate": 0.00018095238095238095,
485
+ "loss": 0.9936,
486
  "step": 670
487
  },
488
  {
489
  "epoch": 1.9428571428571428,
490
+ "grad_norm": 4.034025192260742,
491
+ "learning_rate": 0.00017619047619047622,
492
+ "loss": 0.9305,
493
  "step": 680
494
  },
495
  {
496
  "epoch": 1.9714285714285715,
497
+ "grad_norm": 3.618300676345825,
498
+ "learning_rate": 0.00017142857142857143,
499
+ "loss": 0.975,
500
  "step": 690
501
  },
502
  {
503
  "epoch": 2.0,
504
+ "grad_norm": 3.0254437923431396,
505
+ "learning_rate": 0.00016666666666666666,
506
+ "loss": 0.997,
507
  "step": 700
508
  },
509
  {
510
  "epoch": 2.0,
511
+ "eval_loss": 0.893572211265564,
512
+ "eval_runtime": 16.613,
513
+ "eval_samples_per_second": 12.039,
514
+ "eval_steps_per_second": 1.505,
515
  "step": 700
516
  },
517
  {
518
  "epoch": 2.0285714285714285,
519
+ "grad_norm": 2.8559610843658447,
520
+ "learning_rate": 0.00016190476190476192,
521
+ "loss": 0.8578,
522
  "step": 710
523
  },
524
  {
525
  "epoch": 2.057142857142857,
526
+ "grad_norm": 2.623955488204956,
527
+ "learning_rate": 0.00015714285714285713,
528
+ "loss": 0.8208,
529
  "step": 720
530
  },
531
  {
532
  "epoch": 2.085714285714286,
533
+ "grad_norm": 4.081460475921631,
534
+ "learning_rate": 0.0001523809523809524,
535
+ "loss": 0.8064,
536
  "step": 730
537
  },
538
  {
539
  "epoch": 2.1142857142857143,
540
+ "grad_norm": 4.917724609375,
541
+ "learning_rate": 0.00014761904761904763,
542
+ "loss": 0.8172,
543
  "step": 740
544
  },
545
  {
546
  "epoch": 2.142857142857143,
547
+ "grad_norm": 5.27565336227417,
548
+ "learning_rate": 0.00014285714285714284,
549
+ "loss": 1.0486,
550
  "step": 750
551
  },
552
  {
553
  "epoch": 2.1714285714285713,
554
+ "grad_norm": 4.373621463775635,
555
+ "learning_rate": 0.0001380952380952381,
556
+ "loss": 0.8948,
557
  "step": 760
558
  },
559
  {
560
  "epoch": 2.2,
561
+ "grad_norm": 3.5879688262939453,
562
+ "learning_rate": 0.00013333333333333334,
563
+ "loss": 0.8612,
564
  "step": 770
565
  },
566
  {
567
  "epoch": 2.2285714285714286,
568
+ "grad_norm": 3.1236374378204346,
569
+ "learning_rate": 0.00012857142857142855,
570
+ "loss": 0.906,
571
  "step": 780
572
  },
573
  {
574
  "epoch": 2.257142857142857,
575
+ "grad_norm": 3.8395140171051025,
576
+ "learning_rate": 0.0001238095238095238,
577
+ "loss": 0.7673,
578
  "step": 790
579
  },
580
  {
581
  "epoch": 2.2857142857142856,
582
+ "grad_norm": 3.8085057735443115,
583
+ "learning_rate": 0.00011904761904761905,
584
+ "loss": 0.8075,
585
  "step": 800
586
  },
587
  {
588
  "epoch": 2.314285714285714,
589
+ "grad_norm": 3.388486385345459,
590
+ "learning_rate": 0.00011428571428571428,
591
+ "loss": 0.8651,
592
  "step": 810
593
  },
594
  {
595
  "epoch": 2.342857142857143,
596
+ "grad_norm": 3.3502166271209717,
597
+ "learning_rate": 0.00010952380952380952,
598
+ "loss": 0.8036,
599
  "step": 820
600
  },
601
  {
602
  "epoch": 2.3714285714285714,
603
+ "grad_norm": 3.833613634109497,
604
+ "learning_rate": 0.00010476190476190477,
605
+ "loss": 0.9703,
606
  "step": 830
607
  },
608
  {
609
  "epoch": 2.4,
610
+ "grad_norm": 3.7099878787994385,
611
+ "learning_rate": 0.0001,
612
+ "loss": 0.8167,
613
  "step": 840
614
  },
615
  {
616
  "epoch": 2.4285714285714284,
617
+ "grad_norm": 6.892563343048096,
618
+ "learning_rate": 9.523809523809524e-05,
619
+ "loss": 0.7719,
620
  "step": 850
621
  },
622
  {
623
  "epoch": 2.4571428571428573,
624
+ "grad_norm": 4.409047603607178,
625
+ "learning_rate": 9.047619047619048e-05,
626
+ "loss": 0.9031,
627
  "step": 860
628
  },
629
  {
630
  "epoch": 2.4857142857142858,
631
+ "grad_norm": 1.9474296569824219,
632
+ "learning_rate": 8.571428571428571e-05,
633
+ "loss": 0.9334,
634
  "step": 870
635
  },
636
  {
637
  "epoch": 2.5142857142857142,
638
+ "grad_norm": 2.9493942260742188,
639
+ "learning_rate": 8.095238095238096e-05,
640
+ "loss": 0.8624,
641
  "step": 880
642
  },
643
  {
644
  "epoch": 2.5428571428571427,
645
+ "grad_norm": 4.493179798126221,
646
+ "learning_rate": 7.61904761904762e-05,
647
+ "loss": 0.9769,
648
  "step": 890
649
  },
650
  {
651
  "epoch": 2.571428571428571,
652
+ "grad_norm": 5.590090751647949,
653
+ "learning_rate": 7.142857142857142e-05,
654
+ "loss": 0.6596,
655
  "step": 900
656
  },
657
  {
658
  "epoch": 2.6,
659
+ "grad_norm": 2.4073421955108643,
660
+ "learning_rate": 6.666666666666667e-05,
661
+ "loss": 0.8155,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 2.6285714285714286,
666
+ "grad_norm": 3.83565092086792,
667
+ "learning_rate": 6.19047619047619e-05,
668
+ "loss": 0.8871,
669
  "step": 920
670
  },
671
  {
672
  "epoch": 2.657142857142857,
673
+ "grad_norm": 5.591251850128174,
674
+ "learning_rate": 5.714285714285714e-05,
675
+ "loss": 0.7536,
676
  "step": 930
677
  },
678
  {
679
  "epoch": 2.685714285714286,
680
+ "grad_norm": 4.405236721038818,
681
+ "learning_rate": 5.2380952380952384e-05,
682
+ "loss": 0.9314,
683
  "step": 940
684
  },
685
  {
686
  "epoch": 2.7142857142857144,
687
+ "grad_norm": 4.951947212219238,
688
+ "learning_rate": 4.761904761904762e-05,
689
+ "loss": 0.796,
690
  "step": 950
691
  },
692
  {
693
  "epoch": 2.742857142857143,
694
+ "grad_norm": 5.001076698303223,
695
+ "learning_rate": 4.2857142857142856e-05,
696
+ "loss": 0.7446,
697
  "step": 960
698
  },
699
  {
700
  "epoch": 2.7714285714285714,
701
+ "grad_norm": 5.915081024169922,
702
+ "learning_rate": 3.80952380952381e-05,
703
+ "loss": 0.9323,
704
  "step": 970
705
  },
706
  {
707
  "epoch": 2.8,
708
+ "grad_norm": 2.5451648235321045,
709
+ "learning_rate": 3.3333333333333335e-05,
710
+ "loss": 0.9232,
711
  "step": 980
712
  },
713
  {
714
  "epoch": 2.8285714285714287,
715
+ "grad_norm": 3.71482515335083,
716
+ "learning_rate": 2.857142857142857e-05,
717
+ "loss": 0.8619,
718
  "step": 990
719
  },
720
  {
721
  "epoch": 2.857142857142857,
722
+ "grad_norm": 3.601994037628174,
723
+ "learning_rate": 2.380952380952381e-05,
724
+ "loss": 0.8866,
725
  "step": 1000
726
  },
727
  {
728
  "epoch": 2.8857142857142857,
729
+ "grad_norm": 4.266910552978516,
730
+ "learning_rate": 1.904761904761905e-05,
731
+ "loss": 0.7992,
732
  "step": 1010
733
  },
734
  {
735
  "epoch": 2.914285714285714,
736
+ "grad_norm": 2.968625783920288,
737
+ "learning_rate": 1.4285714285714285e-05,
738
+ "loss": 0.7861,
739
  "step": 1020
740
  },
741
  {
742
  "epoch": 2.942857142857143,
743
+ "grad_norm": 3.180135488510132,
744
+ "learning_rate": 9.523809523809525e-06,
745
+ "loss": 0.6923,
746
  "step": 1030
747
  },
748
  {
749
  "epoch": 2.9714285714285715,
750
+ "grad_norm": 3.609273910522461,
751
+ "learning_rate": 4.761904761904762e-06,
752
+ "loss": 0.6872,
753
  "step": 1040
754
  },
755
  {
756
  "epoch": 3.0,
757
+ "grad_norm": 4.716108798980713,
758
  "learning_rate": 0.0,
759
+ "loss": 0.8321,
760
  "step": 1050
761
  },
762
  {
763
  "epoch": 3.0,
764
+ "eval_loss": 0.7921221852302551,
765
+ "eval_runtime": 16.5336,
766
+ "eval_samples_per_second": 12.097,
767
+ "eval_steps_per_second": 1.512,
768
  "step": 1050
769
  }
770
  ],
 
785
  "attributes": {}
786
  }
787
  },
788
+ "total_flos": 6283456413696000.0,
789
  "train_batch_size": 8,
790
  "trial_name": null,
791
  "trial_params": null
checkpoint-1050/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13e1dfa9107963f8185e69ba39492321da63b95b2c62e54736ef9c90df528570
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1982b07d8277b0bc7aaf4f34307bfaa24bd97b8994cd2c48e972769e68324f20
3
  size 5240
checkpoint-350/adapter_config.json CHANGED
@@ -20,8 +20,8 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "q_proj",
24
- "v_proj"
25
  ],
26
  "task_type": "CAUSAL_LM",
27
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "v_proj",
24
+ "q_proj"
25
  ],
26
  "task_type": "CAUSAL_LM",
27
  "use_dora": false,
checkpoint-350/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7134e8571a6eb9a671f924ca5254574877103a854e2fb418241240873e61c8d1
3
  size 3416264
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7425737072ba8acbe6d2f9c94c69f458d16d0a7f53fa0676b6840781f5a4b2fc
3
  size 3416264
checkpoint-350/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4dab7d1739bea84cef24eed0f552f33b6c7fa803e7c2988607daf9049e70ec9c
3
  size 6869818
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5ce96627e6b7708b4c7f689eccadb34c4192d05616b766eab604b48141e5e66
3
  size 6869818
checkpoint-350/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd178daa6f4d440d8189bbd31bca1f40c0bb1df60aa27e585e4e6653d855a91b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a60eb44815728d1c05bb3472d8ee196dc9c4a01b9978bca3eb64637848fdb4c7
3
  size 14244
checkpoint-350/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33aefed7c277ae39c1954cde2ddf39a160d3f27df9bf151a72089b407fe3071e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:def6878723ae6de031424f87c5cc1de1a9c00f84dc97979090ddbd977a6d40b3
3
  size 1064
checkpoint-350/trainer_state.json CHANGED
@@ -10,255 +10,255 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.02857142857142857,
13
- "grad_norm": 9.284814834594727,
14
- "learning_rate": 9.904761904761905e-05,
15
- "loss": 2.0184,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.05714285714285714,
20
- "grad_norm": 12.879095077514648,
21
- "learning_rate": 9.80952380952381e-05,
22
- "loss": 1.7222,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.08571428571428572,
27
- "grad_norm": 17.29737663269043,
28
- "learning_rate": 9.714285714285715e-05,
29
- "loss": 1.7591,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.11428571428571428,
34
- "grad_norm": 5.025689125061035,
35
- "learning_rate": 9.61904761904762e-05,
36
- "loss": 1.7304,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.14285714285714285,
41
- "grad_norm": 10.627418518066406,
42
- "learning_rate": 9.523809523809524e-05,
43
- "loss": 1.8282,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.17142857142857143,
48
- "grad_norm": 5.618676662445068,
49
- "learning_rate": 9.428571428571429e-05,
50
- "loss": 1.6407,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.2,
55
- "grad_norm": 13.271527290344238,
56
- "learning_rate": 9.333333333333334e-05,
57
- "loss": 1.7611,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.22857142857142856,
62
- "grad_norm": 7.040605545043945,
63
- "learning_rate": 9.238095238095239e-05,
64
- "loss": 1.8149,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.2571428571428571,
69
- "grad_norm": 9.282252311706543,
70
- "learning_rate": 9.142857142857143e-05,
71
- "loss": 1.656,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.2857142857142857,
76
- "grad_norm": 7.04727029800415,
77
- "learning_rate": 9.047619047619048e-05,
78
- "loss": 1.6419,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.3142857142857143,
83
- "grad_norm": 5.5171122550964355,
84
- "learning_rate": 8.952380952380953e-05,
85
- "loss": 1.682,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.34285714285714286,
90
- "grad_norm": 9.009190559387207,
91
- "learning_rate": 8.857142857142857e-05,
92
- "loss": 1.4019,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.37142857142857144,
97
- "grad_norm": 11.160100936889648,
98
- "learning_rate": 8.761904761904762e-05,
99
- "loss": 1.5941,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.4,
104
- "grad_norm": 6.663743019104004,
105
- "learning_rate": 8.666666666666667e-05,
106
- "loss": 1.4869,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.42857142857142855,
111
- "grad_norm": 7.955056667327881,
112
- "learning_rate": 8.571428571428571e-05,
113
- "loss": 1.6137,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.45714285714285713,
118
- "grad_norm": 7.169972896575928,
119
- "learning_rate": 8.476190476190477e-05,
120
- "loss": 1.7435,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.4857142857142857,
125
- "grad_norm": 4.778314113616943,
126
- "learning_rate": 8.380952380952382e-05,
127
- "loss": 1.3997,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.5142857142857142,
132
- "grad_norm": 8.540094375610352,
133
- "learning_rate": 8.285714285714287e-05,
134
- "loss": 1.5345,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.5428571428571428,
139
- "grad_norm": 8.377891540527344,
140
- "learning_rate": 8.19047619047619e-05,
141
- "loss": 1.4992,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.5714285714285714,
146
- "grad_norm": 6.053997039794922,
147
- "learning_rate": 8.095238095238096e-05,
148
- "loss": 1.4046,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.6,
153
- "grad_norm": 11.835803985595703,
154
- "learning_rate": 8e-05,
155
- "loss": 1.3463,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.6285714285714286,
160
- "grad_norm": 7.073652744293213,
161
- "learning_rate": 7.904761904761905e-05,
162
- "loss": 1.3764,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.6571428571428571,
167
- "grad_norm": 10.55458927154541,
168
- "learning_rate": 7.80952380952381e-05,
169
- "loss": 1.5288,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.6857142857142857,
174
- "grad_norm": 6.555107593536377,
175
- "learning_rate": 7.714285714285715e-05,
176
- "loss": 1.3578,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.7142857142857143,
181
- "grad_norm": 5.762172222137451,
182
- "learning_rate": 7.619047619047618e-05,
183
- "loss": 1.4635,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.7428571428571429,
188
- "grad_norm": 5.80485200881958,
189
- "learning_rate": 7.523809523809524e-05,
190
- "loss": 1.3706,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.7714285714285715,
195
- "grad_norm": 9.550569534301758,
196
- "learning_rate": 7.428571428571429e-05,
197
- "loss": 1.4227,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.8,
202
- "grad_norm": 6.621246814727783,
203
- "learning_rate": 7.333333333333333e-05,
204
- "loss": 1.3751,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.8285714285714286,
209
- "grad_norm": 7.168375492095947,
210
- "learning_rate": 7.238095238095238e-05,
211
- "loss": 1.3472,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.8571428571428571,
216
- "grad_norm": 11.96556568145752,
217
- "learning_rate": 7.142857142857143e-05,
218
- "loss": 1.33,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.8857142857142857,
223
- "grad_norm": 10.02365779876709,
224
- "learning_rate": 7.047619047619048e-05,
225
- "loss": 1.3553,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.9142857142857143,
230
- "grad_norm": 9.006269454956055,
231
- "learning_rate": 6.952380952380952e-05,
232
- "loss": 1.5504,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.9428571428571428,
237
- "grad_norm": 7.228586673736572,
238
- "learning_rate": 6.857142857142858e-05,
239
- "loss": 1.3597,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.9714285714285714,
244
- "grad_norm": 18.761024475097656,
245
- "learning_rate": 6.761904761904763e-05,
246
- "loss": 1.332,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 1.0,
251
- "grad_norm": 9.89370346069336,
252
- "learning_rate": 6.666666666666667e-05,
253
- "loss": 1.4027,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 1.0,
258
- "eval_loss": 1.2965246438980103,
259
- "eval_runtime": 16.113,
260
- "eval_samples_per_second": 12.412,
261
- "eval_steps_per_second": 1.552,
262
  "step": 350
263
  }
264
  ],
@@ -279,7 +279,7 @@
279
  "attributes": {}
280
  }
281
  },
282
- "total_flos": 2148343669063680.0,
283
  "train_batch_size": 8,
284
  "trial_name": null,
285
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.02857142857142857,
13
+ "grad_norm": 5.210493087768555,
14
+ "learning_rate": 0.0004952380952380952,
15
+ "loss": 4.4131,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.05714285714285714,
20
+ "grad_norm": 3.571948528289795,
21
+ "learning_rate": 0.0004904761904761905,
22
+ "loss": 2.5476,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.08571428571428572,
27
+ "grad_norm": 2.640848398208618,
28
+ "learning_rate": 0.0004857142857142857,
29
+ "loss": 2.2491,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.11428571428571428,
34
+ "grad_norm": 2.6611335277557373,
35
+ "learning_rate": 0.00048095238095238095,
36
+ "loss": 2.0477,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.14285714285714285,
41
+ "grad_norm": 2.608750581741333,
42
+ "learning_rate": 0.0004761904761904762,
43
+ "loss": 2.028,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.17142857142857143,
48
+ "grad_norm": 5.034807205200195,
49
+ "learning_rate": 0.0004714285714285714,
50
+ "loss": 1.7871,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.2,
55
+ "grad_norm": 3.5893642902374268,
56
+ "learning_rate": 0.00046666666666666666,
57
+ "loss": 1.7879,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.22857142857142856,
62
+ "grad_norm": 5.368953227996826,
63
+ "learning_rate": 0.00046190476190476195,
64
+ "loss": 1.8194,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.2571428571428571,
69
+ "grad_norm": 3.7377731800079346,
70
+ "learning_rate": 0.00045714285714285713,
71
+ "loss": 1.6159,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.2857142857142857,
76
+ "grad_norm": 7.883308410644531,
77
+ "learning_rate": 0.00045238095238095237,
78
+ "loss": 1.6244,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.3142857142857143,
83
+ "grad_norm": 4.968689918518066,
84
+ "learning_rate": 0.00044761904761904766,
85
+ "loss": 1.6614,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.34285714285714286,
90
+ "grad_norm": 2.592766046524048,
91
+ "learning_rate": 0.00044285714285714284,
92
+ "loss": 1.349,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.37142857142857144,
97
+ "grad_norm": 2.783951997756958,
98
+ "learning_rate": 0.0004380952380952381,
99
+ "loss": 1.5065,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.4,
104
+ "grad_norm": 5.2810378074646,
105
+ "learning_rate": 0.00043333333333333337,
106
+ "loss": 1.3934,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.42857142857142855,
111
+ "grad_norm": 4.448086261749268,
112
+ "learning_rate": 0.00042857142857142855,
113
+ "loss": 1.5198,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.45714285714285713,
118
+ "grad_norm": 3.211707353591919,
119
+ "learning_rate": 0.0004238095238095238,
120
+ "loss": 1.6207,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.4857142857142857,
125
+ "grad_norm": 3.586463212966919,
126
+ "learning_rate": 0.0004190476190476191,
127
+ "loss": 1.2687,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.5142857142857142,
132
+ "grad_norm": 2.6598610877990723,
133
+ "learning_rate": 0.0004142857142857143,
134
+ "loss": 1.3957,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.5428571428571428,
139
+ "grad_norm": 3.480663776397705,
140
+ "learning_rate": 0.00040952380952380955,
141
+ "loss": 1.3543,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.5714285714285714,
146
+ "grad_norm": 3.168818712234497,
147
+ "learning_rate": 0.0004047619047619048,
148
+ "loss": 1.2619,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.6,
153
+ "grad_norm": 2.7529921531677246,
154
+ "learning_rate": 0.0004,
155
+ "loss": 1.1871,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.6285714285714286,
160
+ "grad_norm": 6.538937568664551,
161
+ "learning_rate": 0.00039523809523809526,
162
+ "loss": 1.268,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.6571428571428571,
167
+ "grad_norm": 3.3910727500915527,
168
+ "learning_rate": 0.0003904761904761905,
169
+ "loss": 1.4192,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.6857142857142857,
174
+ "grad_norm": 2.31817889213562,
175
+ "learning_rate": 0.0003857142857142857,
176
+ "loss": 1.2339,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.7142857142857143,
181
+ "grad_norm": 2.8181235790252686,
182
+ "learning_rate": 0.00038095238095238096,
183
+ "loss": 1.3153,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.7428571428571429,
188
+ "grad_norm": 2.5716922283172607,
189
+ "learning_rate": 0.0003761904761904762,
190
+ "loss": 1.2066,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.7714285714285715,
195
+ "grad_norm": 5.403870105743408,
196
+ "learning_rate": 0.00037142857142857143,
197
+ "loss": 1.2784,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.8,
202
+ "grad_norm": 3.347729206085205,
203
+ "learning_rate": 0.00036666666666666667,
204
+ "loss": 1.2273,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.8285714285714286,
209
+ "grad_norm": 2.7995996475219727,
210
+ "learning_rate": 0.0003619047619047619,
211
+ "loss": 1.159,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.8571428571428571,
216
+ "grad_norm": 3.817213535308838,
217
+ "learning_rate": 0.00035714285714285714,
218
+ "loss": 1.1146,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.8857142857142857,
223
+ "grad_norm": 3.3239715099334717,
224
+ "learning_rate": 0.00035238095238095243,
225
+ "loss": 1.1513,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.9142857142857143,
230
+ "grad_norm": 3.042973518371582,
231
+ "learning_rate": 0.0003476190476190476,
232
+ "loss": 1.3906,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.9428571428571428,
237
+ "grad_norm": 2.8079681396484375,
238
+ "learning_rate": 0.00034285714285714285,
239
+ "loss": 1.1547,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.9714285714285714,
244
+ "grad_norm": 6.39453649520874,
245
+ "learning_rate": 0.00033809523809523814,
246
+ "loss": 1.0989,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 1.0,
251
+ "grad_norm": 5.739945411682129,
252
+ "learning_rate": 0.0003333333333333333,
253
+ "loss": 1.1563,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 1.0,
258
+ "eval_loss": 1.064656376838684,
259
+ "eval_runtime": 16.5324,
260
+ "eval_samples_per_second": 12.097,
261
+ "eval_steps_per_second": 1.512,
262
  "step": 350
263
  }
264
  ],
 
279
  "attributes": {}
280
  }
281
  },
282
+ "total_flos": 2094485471232000.0,
283
  "train_batch_size": 8,
284
  "trial_name": null,
285
  "trial_params": null
checkpoint-350/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13e1dfa9107963f8185e69ba39492321da63b95b2c62e54736ef9c90df528570
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1982b07d8277b0bc7aaf4f34307bfaa24bd97b8994cd2c48e972769e68324f20
3
  size 5240
checkpoint-700/adapter_config.json CHANGED
@@ -20,8 +20,8 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "q_proj",
24
- "v_proj"
25
  ],
26
  "task_type": "CAUSAL_LM",
27
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "v_proj",
24
+ "q_proj"
25
  ],
26
  "task_type": "CAUSAL_LM",
27
  "use_dora": false,
checkpoint-700/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3dbe85e2dd4f1f5fc2eb17da5a265bddf136411ea27e85a75c0f7eea7ce2f2e0
3
  size 3416264
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:272d892f181091df9c53c56b733b46487117c9ff3f709e43844e4925d4302ea6
3
  size 3416264
checkpoint-700/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b37faece466f9bb9c07d1d8555f770f47ea66562a46a581b8b8f4f8c5da2e6b
3
  size 6869818
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fdad2f22ddadb3c8130f49e48198958bf03d74f744708327044990e56b3d21a
3
  size 6869818
checkpoint-700/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2bd29ae77322656ada7c1bb2d3d83cc8190b748cdd472f783b591d1bfc8cb7c
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b50e1d829b6f58094a5a1390d8e3ee0aa39e25fa2ec141d6471b70ffb58c1ac3
3
  size 14244
checkpoint-700/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7f2210d0183813c5f239e54b2c5c45a2e710e979039235ab9d6882e70c940d8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28b00ddbc3d3a3abe006434542557db9d921f0260b12618e776b614466247588
3
  size 1064
checkpoint-700/trainer_state.json CHANGED
@@ -10,508 +10,508 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.02857142857142857,
13
- "grad_norm": 9.284814834594727,
14
- "learning_rate": 9.904761904761905e-05,
15
- "loss": 2.0184,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.05714285714285714,
20
- "grad_norm": 12.879095077514648,
21
- "learning_rate": 9.80952380952381e-05,
22
- "loss": 1.7222,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.08571428571428572,
27
- "grad_norm": 17.29737663269043,
28
- "learning_rate": 9.714285714285715e-05,
29
- "loss": 1.7591,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.11428571428571428,
34
- "grad_norm": 5.025689125061035,
35
- "learning_rate": 9.61904761904762e-05,
36
- "loss": 1.7304,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.14285714285714285,
41
- "grad_norm": 10.627418518066406,
42
- "learning_rate": 9.523809523809524e-05,
43
- "loss": 1.8282,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.17142857142857143,
48
- "grad_norm": 5.618676662445068,
49
- "learning_rate": 9.428571428571429e-05,
50
- "loss": 1.6407,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.2,
55
- "grad_norm": 13.271527290344238,
56
- "learning_rate": 9.333333333333334e-05,
57
- "loss": 1.7611,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.22857142857142856,
62
- "grad_norm": 7.040605545043945,
63
- "learning_rate": 9.238095238095239e-05,
64
- "loss": 1.8149,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.2571428571428571,
69
- "grad_norm": 9.282252311706543,
70
- "learning_rate": 9.142857142857143e-05,
71
- "loss": 1.656,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.2857142857142857,
76
- "grad_norm": 7.04727029800415,
77
- "learning_rate": 9.047619047619048e-05,
78
- "loss": 1.6419,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.3142857142857143,
83
- "grad_norm": 5.5171122550964355,
84
- "learning_rate": 8.952380952380953e-05,
85
- "loss": 1.682,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.34285714285714286,
90
- "grad_norm": 9.009190559387207,
91
- "learning_rate": 8.857142857142857e-05,
92
- "loss": 1.4019,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.37142857142857144,
97
- "grad_norm": 11.160100936889648,
98
- "learning_rate": 8.761904761904762e-05,
99
- "loss": 1.5941,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.4,
104
- "grad_norm": 6.663743019104004,
105
- "learning_rate": 8.666666666666667e-05,
106
- "loss": 1.4869,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.42857142857142855,
111
- "grad_norm": 7.955056667327881,
112
- "learning_rate": 8.571428571428571e-05,
113
- "loss": 1.6137,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.45714285714285713,
118
- "grad_norm": 7.169972896575928,
119
- "learning_rate": 8.476190476190477e-05,
120
- "loss": 1.7435,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.4857142857142857,
125
- "grad_norm": 4.778314113616943,
126
- "learning_rate": 8.380952380952382e-05,
127
- "loss": 1.3997,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.5142857142857142,
132
- "grad_norm": 8.540094375610352,
133
- "learning_rate": 8.285714285714287e-05,
134
- "loss": 1.5345,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.5428571428571428,
139
- "grad_norm": 8.377891540527344,
140
- "learning_rate": 8.19047619047619e-05,
141
- "loss": 1.4992,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.5714285714285714,
146
- "grad_norm": 6.053997039794922,
147
- "learning_rate": 8.095238095238096e-05,
148
- "loss": 1.4046,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.6,
153
- "grad_norm": 11.835803985595703,
154
- "learning_rate": 8e-05,
155
- "loss": 1.3463,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.6285714285714286,
160
- "grad_norm": 7.073652744293213,
161
- "learning_rate": 7.904761904761905e-05,
162
- "loss": 1.3764,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.6571428571428571,
167
- "grad_norm": 10.55458927154541,
168
- "learning_rate": 7.80952380952381e-05,
169
- "loss": 1.5288,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.6857142857142857,
174
- "grad_norm": 6.555107593536377,
175
- "learning_rate": 7.714285714285715e-05,
176
- "loss": 1.3578,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.7142857142857143,
181
- "grad_norm": 5.762172222137451,
182
- "learning_rate": 7.619047619047618e-05,
183
- "loss": 1.4635,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.7428571428571429,
188
- "grad_norm": 5.80485200881958,
189
- "learning_rate": 7.523809523809524e-05,
190
- "loss": 1.3706,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.7714285714285715,
195
- "grad_norm": 9.550569534301758,
196
- "learning_rate": 7.428571428571429e-05,
197
- "loss": 1.4227,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.8,
202
- "grad_norm": 6.621246814727783,
203
- "learning_rate": 7.333333333333333e-05,
204
- "loss": 1.3751,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.8285714285714286,
209
- "grad_norm": 7.168375492095947,
210
- "learning_rate": 7.238095238095238e-05,
211
- "loss": 1.3472,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.8571428571428571,
216
- "grad_norm": 11.96556568145752,
217
- "learning_rate": 7.142857142857143e-05,
218
- "loss": 1.33,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.8857142857142857,
223
- "grad_norm": 10.02365779876709,
224
- "learning_rate": 7.047619047619048e-05,
225
- "loss": 1.3553,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.9142857142857143,
230
- "grad_norm": 9.006269454956055,
231
- "learning_rate": 6.952380952380952e-05,
232
- "loss": 1.5504,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.9428571428571428,
237
- "grad_norm": 7.228586673736572,
238
- "learning_rate": 6.857142857142858e-05,
239
- "loss": 1.3597,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.9714285714285714,
244
- "grad_norm": 18.761024475097656,
245
- "learning_rate": 6.761904761904763e-05,
246
- "loss": 1.332,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 1.0,
251
- "grad_norm": 9.89370346069336,
252
- "learning_rate": 6.666666666666667e-05,
253
- "loss": 1.4027,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 1.0,
258
- "eval_loss": 1.2965246438980103,
259
- "eval_runtime": 16.113,
260
- "eval_samples_per_second": 12.412,
261
- "eval_steps_per_second": 1.552,
262
  "step": 350
263
  },
264
  {
265
  "epoch": 1.0285714285714285,
266
- "grad_norm": 14.148599624633789,
267
- "learning_rate": 6.571428571428571e-05,
268
- "loss": 1.322,
269
  "step": 360
270
  },
271
  {
272
  "epoch": 1.0571428571428572,
273
- "grad_norm": 6.457850456237793,
274
- "learning_rate": 6.476190476190477e-05,
275
- "loss": 1.4209,
276
  "step": 370
277
  },
278
  {
279
  "epoch": 1.0857142857142856,
280
- "grad_norm": 7.997476100921631,
281
- "learning_rate": 6.38095238095238e-05,
282
- "loss": 1.3045,
283
  "step": 380
284
  },
285
  {
286
  "epoch": 1.1142857142857143,
287
- "grad_norm": 11.429621696472168,
288
- "learning_rate": 6.285714285714286e-05,
289
- "loss": 1.3239,
290
  "step": 390
291
  },
292
  {
293
  "epoch": 1.1428571428571428,
294
- "grad_norm": 8.54537582397461,
295
- "learning_rate": 6.19047619047619e-05,
296
- "loss": 1.3079,
297
  "step": 400
298
  },
299
  {
300
  "epoch": 1.1714285714285715,
301
- "grad_norm": 9.194470405578613,
302
- "learning_rate": 6.0952380952380964e-05,
303
- "loss": 1.3014,
304
  "step": 410
305
  },
306
  {
307
  "epoch": 1.2,
308
- "grad_norm": 6.942721366882324,
309
- "learning_rate": 6e-05,
310
- "loss": 1.3108,
311
  "step": 420
312
  },
313
  {
314
  "epoch": 1.2285714285714286,
315
- "grad_norm": 17.95640754699707,
316
- "learning_rate": 5.904761904761905e-05,
317
- "loss": 1.2473,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.2571428571428571,
322
- "grad_norm": 6.065295696258545,
323
- "learning_rate": 5.8095238095238104e-05,
324
- "loss": 1.1563,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.2857142857142856,
329
- "grad_norm": 18.216981887817383,
330
- "learning_rate": 5.714285714285714e-05,
331
- "loss": 1.5198,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.3142857142857143,
336
- "grad_norm": 7.645397186279297,
337
- "learning_rate": 5.619047619047619e-05,
338
- "loss": 1.294,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.342857142857143,
343
- "grad_norm": 8.216632843017578,
344
- "learning_rate": 5.5238095238095244e-05,
345
- "loss": 1.3319,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.3714285714285714,
350
- "grad_norm": 11.892958641052246,
351
- "learning_rate": 5.428571428571428e-05,
352
- "loss": 1.349,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.4,
357
- "grad_norm": 7.598433494567871,
358
- "learning_rate": 5.333333333333333e-05,
359
- "loss": 1.2459,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.4285714285714286,
364
- "grad_norm": 7.4001898765563965,
365
- "learning_rate": 5.2380952380952384e-05,
366
- "loss": 1.2531,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.457142857142857,
371
- "grad_norm": 9.964533805847168,
372
- "learning_rate": 5.142857142857143e-05,
373
- "loss": 1.2326,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.4857142857142858,
378
- "grad_norm": 10.863306045532227,
379
- "learning_rate": 5.047619047619048e-05,
380
- "loss": 1.1964,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.5142857142857142,
385
- "grad_norm": 7.798760414123535,
386
- "learning_rate": 4.9523809523809525e-05,
387
- "loss": 1.2377,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.5428571428571427,
392
- "grad_norm": 5.575809001922607,
393
- "learning_rate": 4.8571428571428576e-05,
394
- "loss": 1.2545,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.5714285714285714,
399
- "grad_norm": 7.334414958953857,
400
- "learning_rate": 4.761904761904762e-05,
401
- "loss": 1.2441,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.6,
406
- "grad_norm": 7.7508745193481445,
407
- "learning_rate": 4.666666666666667e-05,
408
- "loss": 1.3679,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.6285714285714286,
413
- "grad_norm": 8.727417945861816,
414
- "learning_rate": 4.5714285714285716e-05,
415
- "loss": 1.366,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.657142857142857,
420
- "grad_norm": 11.305671691894531,
421
- "learning_rate": 4.476190476190477e-05,
422
- "loss": 1.2059,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.6857142857142857,
427
- "grad_norm": 12.226043701171875,
428
- "learning_rate": 4.380952380952381e-05,
429
- "loss": 1.3986,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.7142857142857144,
434
- "grad_norm": 9.918877601623535,
435
- "learning_rate": 4.2857142857142856e-05,
436
- "loss": 1.2768,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.7428571428571429,
441
- "grad_norm": 7.957796096801758,
442
- "learning_rate": 4.190476190476191e-05,
443
- "loss": 1.2662,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.7714285714285714,
448
- "grad_norm": 10.324079513549805,
449
- "learning_rate": 4.095238095238095e-05,
450
- "loss": 1.2392,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 1.8,
455
- "grad_norm": 8.535161972045898,
456
- "learning_rate": 4e-05,
457
- "loss": 1.2802,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 1.8285714285714287,
462
- "grad_norm": 6.386439800262451,
463
- "learning_rate": 3.904761904761905e-05,
464
- "loss": 1.2973,
465
  "step": 640
466
  },
467
  {
468
  "epoch": 1.8571428571428572,
469
- "grad_norm": 16.93861961364746,
470
- "learning_rate": 3.809523809523809e-05,
471
- "loss": 1.1805,
472
  "step": 650
473
  },
474
  {
475
  "epoch": 1.8857142857142857,
476
- "grad_norm": 17.099620819091797,
477
- "learning_rate": 3.7142857142857143e-05,
478
- "loss": 1.2095,
479
  "step": 660
480
  },
481
  {
482
  "epoch": 1.9142857142857141,
483
- "grad_norm": 9.49264144897461,
484
- "learning_rate": 3.619047619047619e-05,
485
- "loss": 1.2924,
486
  "step": 670
487
  },
488
  {
489
  "epoch": 1.9428571428571428,
490
- "grad_norm": 5.8994364738464355,
491
- "learning_rate": 3.523809523809524e-05,
492
- "loss": 1.2577,
493
  "step": 680
494
  },
495
  {
496
  "epoch": 1.9714285714285715,
497
- "grad_norm": 7.082160472869873,
498
- "learning_rate": 3.428571428571429e-05,
499
- "loss": 1.2792,
500
  "step": 690
501
  },
502
  {
503
  "epoch": 2.0,
504
- "grad_norm": 9.511951446533203,
505
- "learning_rate": 3.3333333333333335e-05,
506
- "loss": 1.2854,
507
  "step": 700
508
  },
509
  {
510
  "epoch": 2.0,
511
- "eval_loss": 1.2146018743515015,
512
- "eval_runtime": 16.0983,
513
- "eval_samples_per_second": 12.424,
514
- "eval_steps_per_second": 1.553,
515
  "step": 700
516
  }
517
  ],
@@ -532,7 +532,7 @@
532
  "attributes": {}
533
  }
534
  },
535
- "total_flos": 4242829140295680.0,
536
  "train_batch_size": 8,
537
  "trial_name": null,
538
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.02857142857142857,
13
+ "grad_norm": 5.210493087768555,
14
+ "learning_rate": 0.0004952380952380952,
15
+ "loss": 4.4131,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.05714285714285714,
20
+ "grad_norm": 3.571948528289795,
21
+ "learning_rate": 0.0004904761904761905,
22
+ "loss": 2.5476,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.08571428571428572,
27
+ "grad_norm": 2.640848398208618,
28
+ "learning_rate": 0.0004857142857142857,
29
+ "loss": 2.2491,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.11428571428571428,
34
+ "grad_norm": 2.6611335277557373,
35
+ "learning_rate": 0.00048095238095238095,
36
+ "loss": 2.0477,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.14285714285714285,
41
+ "grad_norm": 2.608750581741333,
42
+ "learning_rate": 0.0004761904761904762,
43
+ "loss": 2.028,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.17142857142857143,
48
+ "grad_norm": 5.034807205200195,
49
+ "learning_rate": 0.0004714285714285714,
50
+ "loss": 1.7871,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.2,
55
+ "grad_norm": 3.5893642902374268,
56
+ "learning_rate": 0.00046666666666666666,
57
+ "loss": 1.7879,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.22857142857142856,
62
+ "grad_norm": 5.368953227996826,
63
+ "learning_rate": 0.00046190476190476195,
64
+ "loss": 1.8194,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.2571428571428571,
69
+ "grad_norm": 3.7377731800079346,
70
+ "learning_rate": 0.00045714285714285713,
71
+ "loss": 1.6159,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.2857142857142857,
76
+ "grad_norm": 7.883308410644531,
77
+ "learning_rate": 0.00045238095238095237,
78
+ "loss": 1.6244,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.3142857142857143,
83
+ "grad_norm": 4.968689918518066,
84
+ "learning_rate": 0.00044761904761904766,
85
+ "loss": 1.6614,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.34285714285714286,
90
+ "grad_norm": 2.592766046524048,
91
+ "learning_rate": 0.00044285714285714284,
92
+ "loss": 1.349,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.37142857142857144,
97
+ "grad_norm": 2.783951997756958,
98
+ "learning_rate": 0.0004380952380952381,
99
+ "loss": 1.5065,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.4,
104
+ "grad_norm": 5.2810378074646,
105
+ "learning_rate": 0.00043333333333333337,
106
+ "loss": 1.3934,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.42857142857142855,
111
+ "grad_norm": 4.448086261749268,
112
+ "learning_rate": 0.00042857142857142855,
113
+ "loss": 1.5198,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.45714285714285713,
118
+ "grad_norm": 3.211707353591919,
119
+ "learning_rate": 0.0004238095238095238,
120
+ "loss": 1.6207,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.4857142857142857,
125
+ "grad_norm": 3.586463212966919,
126
+ "learning_rate": 0.0004190476190476191,
127
+ "loss": 1.2687,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.5142857142857142,
132
+ "grad_norm": 2.6598610877990723,
133
+ "learning_rate": 0.0004142857142857143,
134
+ "loss": 1.3957,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.5428571428571428,
139
+ "grad_norm": 3.480663776397705,
140
+ "learning_rate": 0.00040952380952380955,
141
+ "loss": 1.3543,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.5714285714285714,
146
+ "grad_norm": 3.168818712234497,
147
+ "learning_rate": 0.0004047619047619048,
148
+ "loss": 1.2619,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.6,
153
+ "grad_norm": 2.7529921531677246,
154
+ "learning_rate": 0.0004,
155
+ "loss": 1.1871,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.6285714285714286,
160
+ "grad_norm": 6.538937568664551,
161
+ "learning_rate": 0.00039523809523809526,
162
+ "loss": 1.268,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.6571428571428571,
167
+ "grad_norm": 3.3910727500915527,
168
+ "learning_rate": 0.0003904761904761905,
169
+ "loss": 1.4192,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.6857142857142857,
174
+ "grad_norm": 2.31817889213562,
175
+ "learning_rate": 0.0003857142857142857,
176
+ "loss": 1.2339,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.7142857142857143,
181
+ "grad_norm": 2.8181235790252686,
182
+ "learning_rate": 0.00038095238095238096,
183
+ "loss": 1.3153,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.7428571428571429,
188
+ "grad_norm": 2.5716922283172607,
189
+ "learning_rate": 0.0003761904761904762,
190
+ "loss": 1.2066,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.7714285714285715,
195
+ "grad_norm": 5.403870105743408,
196
+ "learning_rate": 0.00037142857142857143,
197
+ "loss": 1.2784,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.8,
202
+ "grad_norm": 3.347729206085205,
203
+ "learning_rate": 0.00036666666666666667,
204
+ "loss": 1.2273,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.8285714285714286,
209
+ "grad_norm": 2.7995996475219727,
210
+ "learning_rate": 0.0003619047619047619,
211
+ "loss": 1.159,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.8571428571428571,
216
+ "grad_norm": 3.817213535308838,
217
+ "learning_rate": 0.00035714285714285714,
218
+ "loss": 1.1146,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.8857142857142857,
223
+ "grad_norm": 3.3239715099334717,
224
+ "learning_rate": 0.00035238095238095243,
225
+ "loss": 1.1513,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.9142857142857143,
230
+ "grad_norm": 3.042973518371582,
231
+ "learning_rate": 0.0003476190476190476,
232
+ "loss": 1.3906,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.9428571428571428,
237
+ "grad_norm": 2.8079681396484375,
238
+ "learning_rate": 0.00034285714285714285,
239
+ "loss": 1.1547,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.9714285714285714,
244
+ "grad_norm": 6.39453649520874,
245
+ "learning_rate": 0.00033809523809523814,
246
+ "loss": 1.0989,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 1.0,
251
+ "grad_norm": 5.739945411682129,
252
+ "learning_rate": 0.0003333333333333333,
253
+ "loss": 1.1563,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 1.0,
258
+ "eval_loss": 1.064656376838684,
259
+ "eval_runtime": 16.5324,
260
+ "eval_samples_per_second": 12.097,
261
+ "eval_steps_per_second": 1.512,
262
  "step": 350
263
  },
264
  {
265
  "epoch": 1.0285714285714285,
266
+ "grad_norm": 5.899299144744873,
267
+ "learning_rate": 0.00032857142857142856,
268
+ "loss": 1.0649,
269
  "step": 360
270
  },
271
  {
272
  "epoch": 1.0571428571428572,
273
+ "grad_norm": 3.3639721870422363,
274
+ "learning_rate": 0.00032380952380952385,
275
+ "loss": 1.1595,
276
  "step": 370
277
  },
278
  {
279
  "epoch": 1.0857142857142856,
280
+ "grad_norm": 2.811561346054077,
281
+ "learning_rate": 0.00031904761904761903,
282
+ "loss": 1.0871,
283
  "step": 380
284
  },
285
  {
286
  "epoch": 1.1142857142857143,
287
+ "grad_norm": 2.6730055809020996,
288
+ "learning_rate": 0.00031428571428571427,
289
+ "loss": 1.0294,
290
  "step": 390
291
  },
292
  {
293
  "epoch": 1.1428571428571428,
294
+ "grad_norm": 5.02337646484375,
295
+ "learning_rate": 0.00030952380952380956,
296
+ "loss": 1.0844,
297
  "step": 400
298
  },
299
  {
300
  "epoch": 1.1714285714285715,
301
+ "grad_norm": 4.188799858093262,
302
+ "learning_rate": 0.0003047619047619048,
303
+ "loss": 1.0464,
304
  "step": 410
305
  },
306
  {
307
  "epoch": 1.2,
308
+ "grad_norm": 5.055841445922852,
309
+ "learning_rate": 0.0003,
310
+ "loss": 1.1078,
311
  "step": 420
312
  },
313
  {
314
  "epoch": 1.2285714285714286,
315
+ "grad_norm": 8.09494400024414,
316
+ "learning_rate": 0.00029523809523809526,
317
+ "loss": 1.0539,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.2571428571428571,
322
+ "grad_norm": 3.012653112411499,
323
+ "learning_rate": 0.0002904761904761905,
324
+ "loss": 0.927,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.2857142857142856,
329
+ "grad_norm": 8.38936710357666,
330
+ "learning_rate": 0.0002857142857142857,
331
+ "loss": 1.2209,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.3142857142857143,
336
+ "grad_norm": 4.191105365753174,
337
+ "learning_rate": 0.00028095238095238097,
338
+ "loss": 1.0086,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.342857142857143,
343
+ "grad_norm": 2.9886045455932617,
344
+ "learning_rate": 0.0002761904761904762,
345
+ "loss": 1.0788,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.3714285714285714,
350
+ "grad_norm": 3.5892975330352783,
351
+ "learning_rate": 0.0002714285714285714,
352
+ "loss": 1.0877,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.4,
357
+ "grad_norm": 4.504238128662109,
358
+ "learning_rate": 0.0002666666666666667,
359
+ "loss": 1.0066,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.4285714285714286,
364
+ "grad_norm": 3.601853132247925,
365
+ "learning_rate": 0.0002619047619047619,
366
+ "loss": 0.9915,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.457142857142857,
371
+ "grad_norm": 4.008484840393066,
372
+ "learning_rate": 0.0002571428571428571,
373
+ "loss": 0.9651,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.4857142857142858,
378
+ "grad_norm": 5.5403900146484375,
379
+ "learning_rate": 0.0002523809523809524,
380
+ "loss": 0.9018,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.5142857142857142,
385
+ "grad_norm": 3.526982069015503,
386
+ "learning_rate": 0.0002476190476190476,
387
+ "loss": 0.9588,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.5428571428571427,
392
+ "grad_norm": 3.666804075241089,
393
+ "learning_rate": 0.00024285714285714286,
394
+ "loss": 1.0092,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.5714285714285714,
399
+ "grad_norm": 3.4340310096740723,
400
+ "learning_rate": 0.0002380952380952381,
401
+ "loss": 0.9342,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.6,
406
+ "grad_norm": 5.815408229827881,
407
+ "learning_rate": 0.00023333333333333333,
408
+ "loss": 1.0666,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.6285714285714286,
413
+ "grad_norm": 2.7780840396881104,
414
+ "learning_rate": 0.00022857142857142857,
415
+ "loss": 1.1026,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.657142857142857,
420
+ "grad_norm": 3.8484044075012207,
421
+ "learning_rate": 0.00022380952380952383,
422
+ "loss": 0.937,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.6857142857142857,
427
+ "grad_norm": 3.819007635116577,
428
+ "learning_rate": 0.00021904761904761904,
429
+ "loss": 1.1247,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.7142857142857144,
434
+ "grad_norm": 3.8324477672576904,
435
+ "learning_rate": 0.00021428571428571427,
436
+ "loss": 0.9961,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.7428571428571429,
441
+ "grad_norm": 3.2769389152526855,
442
+ "learning_rate": 0.00020952380952380954,
443
+ "loss": 0.9987,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.7714285714285714,
448
+ "grad_norm": 5.3248443603515625,
449
+ "learning_rate": 0.00020476190476190477,
450
+ "loss": 0.9219,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 1.8,
455
+ "grad_norm": 2.6656081676483154,
456
+ "learning_rate": 0.0002,
457
+ "loss": 1.0293,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 1.8285714285714287,
462
+ "grad_norm": 2.6429567337036133,
463
+ "learning_rate": 0.00019523809523809525,
464
+ "loss": 1.0106,
465
  "step": 640
466
  },
467
  {
468
  "epoch": 1.8571428571428572,
469
+ "grad_norm": 4.4341864585876465,
470
+ "learning_rate": 0.00019047619047619048,
471
+ "loss": 0.8426,
472
  "step": 650
473
  },
474
  {
475
  "epoch": 1.8857142857142857,
476
+ "grad_norm": 4.535873889923096,
477
+ "learning_rate": 0.00018571428571428572,
478
+ "loss": 0.9335,
479
  "step": 660
480
  },
481
  {
482
  "epoch": 1.9142857142857141,
483
+ "grad_norm": 4.100979328155518,
484
+ "learning_rate": 0.00018095238095238095,
485
+ "loss": 0.9936,
486
  "step": 670
487
  },
488
  {
489
  "epoch": 1.9428571428571428,
490
+ "grad_norm": 4.034025192260742,
491
+ "learning_rate": 0.00017619047619047622,
492
+ "loss": 0.9305,
493
  "step": 680
494
  },
495
  {
496
  "epoch": 1.9714285714285715,
497
+ "grad_norm": 3.618300676345825,
498
+ "learning_rate": 0.00017142857142857143,
499
+ "loss": 0.975,
500
  "step": 690
501
  },
502
  {
503
  "epoch": 2.0,
504
+ "grad_norm": 3.0254437923431396,
505
+ "learning_rate": 0.00016666666666666666,
506
+ "loss": 0.997,
507
  "step": 700
508
  },
509
  {
510
  "epoch": 2.0,
511
+ "eval_loss": 0.893572211265564,
512
+ "eval_runtime": 16.613,
513
+ "eval_samples_per_second": 12.039,
514
+ "eval_steps_per_second": 1.505,
515
  "step": 700
516
  }
517
  ],
 
532
  "attributes": {}
533
  }
534
  },
535
+ "total_flos": 4188970942464000.0,
536
  "train_batch_size": 8,
537
  "trial_name": null,
538
  "trial_params": null
checkpoint-700/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13e1dfa9107963f8185e69ba39492321da63b95b2c62e54736ef9c90df528570
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1982b07d8277b0bc7aaf4f34307bfaa24bd97b8994cd2c48e972769e68324f20
3
  size 5240
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13e1dfa9107963f8185e69ba39492321da63b95b2c62e54736ef9c90df528570
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1982b07d8277b0bc7aaf4f34307bfaa24bd97b8994cd2c48e972769e68324f20
3
  size 5240