hajeong67 commited on
Commit
63fc94f
·
verified ·
1 Parent(s): 3732790

Upload folder using huggingface_hub

Browse files
checkpoint-1400/adapter_config.json CHANGED
@@ -10,20 +10,18 @@
10
  "layers_pattern": null,
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
- "lora_alpha": 16,
14
  "lora_dropout": 0.1,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
19
- "r": 64,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
  "v_proj",
24
- "o_proj",
25
- "q_proj",
26
- "k_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
 
10
  "layers_pattern": null,
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
+ "lora_alpha": 32,
14
  "lora_dropout": 0.1,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
19
+ "r": 8,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
  "v_proj",
24
+ "q_proj"
 
 
25
  ],
26
  "task_type": "CAUSAL_LM",
27
  "use_dora": false,
checkpoint-1400/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9401d1e629b1ab3d58f131e884be617a6d85bda9188213b3fe9a349e5b3de0c
3
- size 54543112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1be083b1e7d0a2ca73501e3a1fbce6b84a850044a233872d5b01cdcf0a65c58f
3
+ size 3416264
checkpoint-1400/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f025e413edc76837904c1e5cfc24e7d486f79382a2b2fdbfd642c2343b983d4
3
- size 109159930
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:642c174b47359c6527c32c721dc0bc13b6b0b98189bbc690582671c72af6696b
3
+ size 6869818
checkpoint-1400/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d5b74bc90e579e47da3877ca4510f5d87b4db22e3422b487a0da42e750bff90
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e654950e8ee96fc7cda636036720d303346a436a535ba00c826eceac02a302f
3
  size 14244
checkpoint-1400/trainer_state.json CHANGED
@@ -10,990 +10,990 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.007142857142857143,
13
- "grad_norm": 0.8348605036735535,
14
  "learning_rate": 9.92857142857143e-05,
15
- "loss": 1.2652,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.014285714285714285,
20
- "grad_norm": 0.41527849435806274,
21
  "learning_rate": 9.857142857142858e-05,
22
- "loss": 1.1228,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.02142857142857143,
27
- "grad_norm": 0.7952600717544556,
28
  "learning_rate": 9.785714285714286e-05,
29
- "loss": 0.9764,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.02857142857142857,
34
- "grad_norm": 0.2524937689304352,
35
  "learning_rate": 9.714285714285715e-05,
36
- "loss": 0.887,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.03571428571428571,
41
- "grad_norm": 0.7529658675193787,
42
  "learning_rate": 9.642857142857143e-05,
43
- "loss": 0.6871,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.04285714285714286,
48
- "grad_norm": 0.289154052734375,
49
  "learning_rate": 9.571428571428573e-05,
50
- "loss": 0.5856,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.05,
55
- "grad_norm": 0.6982300281524658,
56
  "learning_rate": 9.5e-05,
57
- "loss": 0.6057,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.05714285714285714,
62
- "grad_norm": 0.36802777647972107,
63
  "learning_rate": 9.428571428571429e-05,
64
- "loss": 0.5035,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.06428571428571428,
69
- "grad_norm": 0.7616478800773621,
70
  "learning_rate": 9.357142857142858e-05,
71
- "loss": 0.5143,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.07142857142857142,
76
- "grad_norm": 0.36089685559272766,
77
  "learning_rate": 9.285714285714286e-05,
78
- "loss": 0.5063,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.07857142857142857,
83
- "grad_norm": 0.31661227345466614,
84
  "learning_rate": 9.214285714285714e-05,
85
- "loss": 0.5535,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.08571428571428572,
90
- "grad_norm": 0.19921253621578217,
91
  "learning_rate": 9.142857142857143e-05,
92
- "loss": 0.5267,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.09285714285714286,
97
- "grad_norm": 0.4584966003894806,
98
  "learning_rate": 9.071428571428571e-05,
99
- "loss": 0.5315,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.1,
104
- "grad_norm": 0.4068152606487274,
105
  "learning_rate": 9e-05,
106
- "loss": 0.5072,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.10714285714285714,
111
- "grad_norm": 0.40502503514289856,
112
  "learning_rate": 8.92857142857143e-05,
113
- "loss": 0.4591,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.11428571428571428,
118
- "grad_norm": 0.22944864630699158,
119
  "learning_rate": 8.857142857142857e-05,
120
- "loss": 0.4721,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.12142857142857143,
125
- "grad_norm": 0.39912867546081543,
126
  "learning_rate": 8.785714285714286e-05,
127
- "loss": 0.4958,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.12857142857142856,
132
- "grad_norm": 0.4304330050945282,
133
  "learning_rate": 8.714285714285715e-05,
134
- "loss": 0.4957,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.1357142857142857,
139
- "grad_norm": 0.6040734648704529,
140
  "learning_rate": 8.642857142857143e-05,
141
- "loss": 0.5536,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.14285714285714285,
146
- "grad_norm": 0.7617625594139099,
147
  "learning_rate": 8.571428571428571e-05,
148
- "loss": 0.453,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.15,
153
- "grad_norm": 0.30004259943962097,
154
  "learning_rate": 8.5e-05,
155
- "loss": 0.4292,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.15714285714285714,
160
- "grad_norm": 0.6278651356697083,
161
  "learning_rate": 8.428571428571429e-05,
162
- "loss": 0.4667,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.16428571428571428,
167
- "grad_norm": 0.4061867892742157,
168
  "learning_rate": 8.357142857142858e-05,
169
- "loss": 0.4267,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.17142857142857143,
174
- "grad_norm": 0.2937834858894348,
175
  "learning_rate": 8.285714285714287e-05,
176
- "loss": 0.4398,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.17857142857142858,
181
- "grad_norm": 0.41685324907302856,
182
  "learning_rate": 8.214285714285714e-05,
183
- "loss": 0.4409,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.18571428571428572,
188
- "grad_norm": 0.5377163887023926,
189
  "learning_rate": 8.142857142857143e-05,
190
- "loss": 0.4335,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.19285714285714287,
195
- "grad_norm": 0.2825513184070587,
196
  "learning_rate": 8.071428571428573e-05,
197
- "loss": 0.4841,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.2,
202
- "grad_norm": 0.6520470380783081,
203
  "learning_rate": 8e-05,
204
- "loss": 0.427,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.20714285714285716,
209
- "grad_norm": 0.6439694762229919,
210
  "learning_rate": 7.928571428571429e-05,
211
- "loss": 0.4373,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.21428571428571427,
216
- "grad_norm": 0.7449022531509399,
217
  "learning_rate": 7.857142857142858e-05,
218
- "loss": 0.4622,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.22142857142857142,
223
- "grad_norm": 0.702427864074707,
224
  "learning_rate": 7.785714285714286e-05,
225
- "loss": 0.4139,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.22857142857142856,
230
- "grad_norm": 0.4422167241573334,
231
  "learning_rate": 7.714285714285715e-05,
232
- "loss": 0.5105,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.2357142857142857,
237
- "grad_norm": 0.3643128573894501,
238
  "learning_rate": 7.642857142857143e-05,
239
- "loss": 0.4018,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.24285714285714285,
244
- "grad_norm": 0.3324808180332184,
245
  "learning_rate": 7.571428571428571e-05,
246
- "loss": 0.3991,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.25,
251
- "grad_norm": 0.5424559116363525,
252
  "learning_rate": 7.500000000000001e-05,
253
- "loss": 0.3948,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.2571428571428571,
258
- "grad_norm": 0.7667222619056702,
259
  "learning_rate": 7.428571428571429e-05,
260
- "loss": 0.4075,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.2642857142857143,
265
- "grad_norm": 0.3680099844932556,
266
  "learning_rate": 7.357142857142858e-05,
267
- "loss": 0.4721,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.2714285714285714,
272
- "grad_norm": 0.4167829155921936,
273
  "learning_rate": 7.285714285714286e-05,
274
- "loss": 0.3657,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.2785714285714286,
279
- "grad_norm": 0.6765364408493042,
280
  "learning_rate": 7.214285714285714e-05,
281
- "loss": 0.4158,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.2857142857142857,
286
- "grad_norm": 0.5632287859916687,
287
  "learning_rate": 7.142857142857143e-05,
288
- "loss": 0.3808,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.29285714285714287,
293
- "grad_norm": 0.6470784544944763,
294
  "learning_rate": 7.071428571428573e-05,
295
- "loss": 0.3818,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.3,
300
- "grad_norm": 0.6362716555595398,
301
  "learning_rate": 7e-05,
302
- "loss": 0.382,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 0.30714285714285716,
307
- "grad_norm": 0.7216530442237854,
308
  "learning_rate": 6.928571428571429e-05,
309
- "loss": 0.4106,
310
  "step": 430
311
  },
312
  {
313
  "epoch": 0.3142857142857143,
314
- "grad_norm": 0.6893362402915955,
315
  "learning_rate": 6.857142857142858e-05,
316
- "loss": 0.4753,
317
  "step": 440
318
  },
319
  {
320
  "epoch": 0.32142857142857145,
321
- "grad_norm": 0.3167782127857208,
322
  "learning_rate": 6.785714285714286e-05,
323
- "loss": 0.3535,
324
  "step": 450
325
  },
326
  {
327
  "epoch": 0.32857142857142857,
328
- "grad_norm": 0.5213643908500671,
329
  "learning_rate": 6.714285714285714e-05,
330
- "loss": 0.355,
331
  "step": 460
332
  },
333
  {
334
  "epoch": 0.3357142857142857,
335
- "grad_norm": 0.44351714849472046,
336
  "learning_rate": 6.642857142857143e-05,
337
- "loss": 0.3341,
338
  "step": 470
339
  },
340
  {
341
  "epoch": 0.34285714285714286,
342
- "grad_norm": 0.45469874143600464,
343
  "learning_rate": 6.571428571428571e-05,
344
- "loss": 0.318,
345
  "step": 480
346
  },
347
  {
348
  "epoch": 0.35,
349
- "grad_norm": 0.6794070601463318,
350
  "learning_rate": 6.500000000000001e-05,
351
- "loss": 0.4363,
352
  "step": 490
353
  },
354
  {
355
  "epoch": 0.35714285714285715,
356
- "grad_norm": 0.35572198033332825,
357
  "learning_rate": 6.428571428571429e-05,
358
- "loss": 0.337,
359
  "step": 500
360
  },
361
  {
362
  "epoch": 0.36428571428571427,
363
- "grad_norm": 0.2703068256378174,
364
  "learning_rate": 6.357142857142857e-05,
365
- "loss": 0.3572,
366
  "step": 510
367
  },
368
  {
369
  "epoch": 0.37142857142857144,
370
- "grad_norm": 0.37365567684173584,
371
  "learning_rate": 6.285714285714286e-05,
372
- "loss": 0.3851,
373
  "step": 520
374
  },
375
  {
376
  "epoch": 0.37857142857142856,
377
- "grad_norm": 0.623077392578125,
378
  "learning_rate": 6.214285714285714e-05,
379
- "loss": 0.352,
380
  "step": 530
381
  },
382
  {
383
  "epoch": 0.38571428571428573,
384
- "grad_norm": 0.721612274646759,
385
  "learning_rate": 6.142857142857143e-05,
386
- "loss": 0.3283,
387
  "step": 540
388
  },
389
  {
390
  "epoch": 0.39285714285714285,
391
- "grad_norm": 0.45066478848457336,
392
  "learning_rate": 6.0714285714285715e-05,
393
- "loss": 0.3644,
394
  "step": 550
395
  },
396
  {
397
  "epoch": 0.4,
398
- "grad_norm": 0.5320075154304504,
399
  "learning_rate": 6e-05,
400
- "loss": 0.3458,
401
  "step": 560
402
  },
403
  {
404
  "epoch": 0.40714285714285714,
405
- "grad_norm": 0.669303297996521,
406
  "learning_rate": 5.928571428571429e-05,
407
- "loss": 0.4307,
408
  "step": 570
409
  },
410
  {
411
  "epoch": 0.4142857142857143,
412
- "grad_norm": 0.4207151234149933,
413
  "learning_rate": 5.8571428571428575e-05,
414
- "loss": 0.3836,
415
  "step": 580
416
  },
417
  {
418
  "epoch": 0.42142857142857143,
419
- "grad_norm": 0.992280125617981,
420
  "learning_rate": 5.785714285714287e-05,
421
- "loss": 0.3339,
422
  "step": 590
423
  },
424
  {
425
  "epoch": 0.42857142857142855,
426
- "grad_norm": 0.9569453001022339,
427
  "learning_rate": 5.714285714285714e-05,
428
- "loss": 0.3794,
429
  "step": 600
430
  },
431
  {
432
  "epoch": 0.4357142857142857,
433
- "grad_norm": 0.48947492241859436,
434
  "learning_rate": 5.642857142857143e-05,
435
- "loss": 0.4,
436
  "step": 610
437
  },
438
  {
439
  "epoch": 0.44285714285714284,
440
- "grad_norm": 0.5944671034812927,
441
  "learning_rate": 5.571428571428572e-05,
442
- "loss": 0.4114,
443
  "step": 620
444
  },
445
  {
446
  "epoch": 0.45,
447
- "grad_norm": 1.0119761228561401,
448
  "learning_rate": 5.500000000000001e-05,
449
- "loss": 0.3887,
450
  "step": 630
451
  },
452
  {
453
  "epoch": 0.45714285714285713,
454
- "grad_norm": 1.074351191520691,
455
  "learning_rate": 5.428571428571428e-05,
456
- "loss": 0.4314,
457
  "step": 640
458
  },
459
  {
460
  "epoch": 0.4642857142857143,
461
- "grad_norm": 0.36413517594337463,
462
  "learning_rate": 5.3571428571428575e-05,
463
- "loss": 0.3711,
464
  "step": 650
465
  },
466
  {
467
  "epoch": 0.4714285714285714,
468
- "grad_norm": 0.43678954243659973,
469
  "learning_rate": 5.285714285714286e-05,
470
- "loss": 0.3162,
471
  "step": 660
472
  },
473
  {
474
  "epoch": 0.4785714285714286,
475
- "grad_norm": 0.5720781683921814,
476
  "learning_rate": 5.214285714285715e-05,
477
- "loss": 0.303,
478
  "step": 670
479
  },
480
  {
481
  "epoch": 0.4857142857142857,
482
- "grad_norm": 0.59657883644104,
483
  "learning_rate": 5.142857142857143e-05,
484
- "loss": 0.2921,
485
  "step": 680
486
  },
487
  {
488
  "epoch": 0.4928571428571429,
489
- "grad_norm": 0.4800083637237549,
490
  "learning_rate": 5.0714285714285716e-05,
491
- "loss": 0.3375,
492
  "step": 690
493
  },
494
  {
495
  "epoch": 0.5,
496
- "grad_norm": 0.5119388699531555,
497
  "learning_rate": 5e-05,
498
- "loss": 0.337,
499
  "step": 700
500
  },
501
  {
502
  "epoch": 0.5071428571428571,
503
- "grad_norm": 0.43967753648757935,
504
  "learning_rate": 4.928571428571429e-05,
505
- "loss": 0.4005,
506
  "step": 710
507
  },
508
  {
509
  "epoch": 0.5142857142857142,
510
- "grad_norm": 0.4737344980239868,
511
  "learning_rate": 4.8571428571428576e-05,
512
- "loss": 0.3365,
513
  "step": 720
514
  },
515
  {
516
  "epoch": 0.5214285714285715,
517
- "grad_norm": 0.6178275346755981,
518
  "learning_rate": 4.785714285714286e-05,
519
- "loss": 0.3612,
520
  "step": 730
521
  },
522
  {
523
  "epoch": 0.5285714285714286,
524
- "grad_norm": 0.45657363533973694,
525
  "learning_rate": 4.714285714285714e-05,
526
- "loss": 0.3474,
527
  "step": 740
528
  },
529
  {
530
  "epoch": 0.5357142857142857,
531
- "grad_norm": 0.3173658847808838,
532
  "learning_rate": 4.642857142857143e-05,
533
- "loss": 0.3295,
534
  "step": 750
535
  },
536
  {
537
  "epoch": 0.5428571428571428,
538
- "grad_norm": 0.5634258389472961,
539
  "learning_rate": 4.5714285714285716e-05,
540
- "loss": 0.347,
541
  "step": 760
542
  },
543
  {
544
  "epoch": 0.55,
545
- "grad_norm": 0.48927634954452515,
546
  "learning_rate": 4.5e-05,
547
- "loss": 0.3665,
548
  "step": 770
549
  },
550
  {
551
  "epoch": 0.5571428571428572,
552
- "grad_norm": 0.4992411732673645,
553
  "learning_rate": 4.428571428571428e-05,
554
- "loss": 0.3229,
555
  "step": 780
556
  },
557
  {
558
  "epoch": 0.5642857142857143,
559
- "grad_norm": 0.2783334255218506,
560
  "learning_rate": 4.3571428571428576e-05,
561
- "loss": 0.3176,
562
  "step": 790
563
  },
564
  {
565
  "epoch": 0.5714285714285714,
566
- "grad_norm": 0.7770132422447205,
567
  "learning_rate": 4.2857142857142856e-05,
568
- "loss": 0.292,
569
  "step": 800
570
  },
571
  {
572
  "epoch": 0.5785714285714286,
573
- "grad_norm": 0.3741398751735687,
574
  "learning_rate": 4.214285714285714e-05,
575
- "loss": 0.2749,
576
  "step": 810
577
  },
578
  {
579
  "epoch": 0.5857142857142857,
580
- "grad_norm": 1.1872639656066895,
581
  "learning_rate": 4.1428571428571437e-05,
582
- "loss": 0.3475,
583
  "step": 820
584
  },
585
  {
586
  "epoch": 0.5928571428571429,
587
- "grad_norm": 0.5437417030334473,
588
  "learning_rate": 4.0714285714285717e-05,
589
- "loss": 0.2855,
590
  "step": 830
591
  },
592
  {
593
  "epoch": 0.6,
594
- "grad_norm": 0.6971333622932434,
595
  "learning_rate": 4e-05,
596
- "loss": 0.3026,
597
  "step": 840
598
  },
599
  {
600
  "epoch": 0.6071428571428571,
601
- "grad_norm": 0.4371561110019684,
602
  "learning_rate": 3.928571428571429e-05,
603
- "loss": 0.2841,
604
  "step": 850
605
  },
606
  {
607
  "epoch": 0.6142857142857143,
608
- "grad_norm": 0.7760091423988342,
609
  "learning_rate": 3.857142857142858e-05,
610
- "loss": 0.347,
611
  "step": 860
612
  },
613
  {
614
  "epoch": 0.6214285714285714,
615
- "grad_norm": 0.29647722840309143,
616
  "learning_rate": 3.785714285714286e-05,
617
- "loss": 0.3093,
618
  "step": 870
619
  },
620
  {
621
  "epoch": 0.6285714285714286,
622
- "grad_norm": 0.5774063467979431,
623
  "learning_rate": 3.7142857142857143e-05,
624
- "loss": 0.355,
625
  "step": 880
626
  },
627
  {
628
  "epoch": 0.6357142857142857,
629
- "grad_norm": 0.42540696263313293,
630
  "learning_rate": 3.642857142857143e-05,
631
- "loss": 0.4248,
632
  "step": 890
633
  },
634
  {
635
  "epoch": 0.6428571428571429,
636
- "grad_norm": 0.4201267659664154,
637
  "learning_rate": 3.571428571428572e-05,
638
- "loss": 0.3186,
639
  "step": 900
640
  },
641
  {
642
  "epoch": 0.65,
643
- "grad_norm": 0.6699719429016113,
644
  "learning_rate": 3.5e-05,
645
- "loss": 0.3345,
646
  "step": 910
647
  },
648
  {
649
  "epoch": 0.6571428571428571,
650
- "grad_norm": 0.9379881024360657,
651
  "learning_rate": 3.428571428571429e-05,
652
- "loss": 0.3523,
653
  "step": 920
654
  },
655
  {
656
  "epoch": 0.6642857142857143,
657
- "grad_norm": 0.4937261939048767,
658
  "learning_rate": 3.357142857142857e-05,
659
- "loss": 0.3424,
660
  "step": 930
661
  },
662
  {
663
  "epoch": 0.6714285714285714,
664
- "grad_norm": 0.4084944427013397,
665
  "learning_rate": 3.285714285714286e-05,
666
- "loss": 0.2488,
667
  "step": 940
668
  },
669
  {
670
  "epoch": 0.6785714285714286,
671
- "grad_norm": 0.3513486981391907,
672
  "learning_rate": 3.2142857142857144e-05,
673
- "loss": 0.308,
674
  "step": 950
675
  },
676
  {
677
  "epoch": 0.6857142857142857,
678
- "grad_norm": 0.6102254986763,
679
  "learning_rate": 3.142857142857143e-05,
680
- "loss": 0.326,
681
  "step": 960
682
  },
683
  {
684
  "epoch": 0.6928571428571428,
685
- "grad_norm": 1.241847038269043,
686
  "learning_rate": 3.071428571428572e-05,
687
- "loss": 0.3028,
688
  "step": 970
689
  },
690
  {
691
  "epoch": 0.7,
692
- "grad_norm": 1.8293100595474243,
693
  "learning_rate": 3e-05,
694
- "loss": 0.3235,
695
  "step": 980
696
  },
697
  {
698
  "epoch": 0.7071428571428572,
699
- "grad_norm": 0.691096842288971,
700
  "learning_rate": 2.9285714285714288e-05,
701
- "loss": 0.3713,
702
  "step": 990
703
  },
704
  {
705
  "epoch": 0.7142857142857143,
706
- "grad_norm": 0.47313550114631653,
707
  "learning_rate": 2.857142857142857e-05,
708
- "loss": 0.3652,
709
  "step": 1000
710
  },
711
  {
712
  "epoch": 0.7214285714285714,
713
- "grad_norm": 0.5191896557807922,
714
  "learning_rate": 2.785714285714286e-05,
715
- "loss": 0.331,
716
  "step": 1010
717
  },
718
  {
719
  "epoch": 0.7285714285714285,
720
- "grad_norm": 0.509594738483429,
721
  "learning_rate": 2.714285714285714e-05,
722
- "loss": 0.2926,
723
  "step": 1020
724
  },
725
  {
726
  "epoch": 0.7357142857142858,
727
- "grad_norm": 0.9162160754203796,
728
  "learning_rate": 2.642857142857143e-05,
729
- "loss": 0.2854,
730
  "step": 1030
731
  },
732
  {
733
  "epoch": 0.7428571428571429,
734
- "grad_norm": 0.5884422659873962,
735
  "learning_rate": 2.5714285714285714e-05,
736
- "loss": 0.3348,
737
  "step": 1040
738
  },
739
  {
740
  "epoch": 0.75,
741
- "grad_norm": 0.6538064479827881,
742
  "learning_rate": 2.5e-05,
743
- "loss": 0.2992,
744
  "step": 1050
745
  },
746
  {
747
  "epoch": 0.7571428571428571,
748
- "grad_norm": 0.49203699827194214,
749
  "learning_rate": 2.4285714285714288e-05,
750
- "loss": 0.3497,
751
  "step": 1060
752
  },
753
  {
754
  "epoch": 0.7642857142857142,
755
- "grad_norm": 0.798219621181488,
756
  "learning_rate": 2.357142857142857e-05,
757
- "loss": 0.3492,
758
  "step": 1070
759
  },
760
  {
761
  "epoch": 0.7714285714285715,
762
- "grad_norm": 0.6431849598884583,
763
  "learning_rate": 2.2857142857142858e-05,
764
- "loss": 0.2953,
765
  "step": 1080
766
  },
767
  {
768
  "epoch": 0.7785714285714286,
769
- "grad_norm": 0.5676552653312683,
770
  "learning_rate": 2.214285714285714e-05,
771
- "loss": 0.323,
772
  "step": 1090
773
  },
774
  {
775
  "epoch": 0.7857142857142857,
776
- "grad_norm": 0.4670363962650299,
777
  "learning_rate": 2.1428571428571428e-05,
778
- "loss": 0.3133,
779
  "step": 1100
780
  },
781
  {
782
  "epoch": 0.7928571428571428,
783
- "grad_norm": 0.6372350454330444,
784
  "learning_rate": 2.0714285714285718e-05,
785
- "loss": 0.3025,
786
  "step": 1110
787
  },
788
  {
789
  "epoch": 0.8,
790
- "grad_norm": 1.2349172830581665,
791
  "learning_rate": 2e-05,
792
- "loss": 0.3084,
793
  "step": 1120
794
  },
795
  {
796
  "epoch": 0.8071428571428572,
797
- "grad_norm": 0.6304973363876343,
798
  "learning_rate": 1.928571428571429e-05,
799
- "loss": 0.2793,
800
  "step": 1130
801
  },
802
  {
803
  "epoch": 0.8142857142857143,
804
- "grad_norm": 0.47565215826034546,
805
  "learning_rate": 1.8571428571428572e-05,
806
- "loss": 0.2974,
807
  "step": 1140
808
  },
809
  {
810
  "epoch": 0.8214285714285714,
811
- "grad_norm": 0.920874297618866,
812
  "learning_rate": 1.785714285714286e-05,
813
- "loss": 0.3422,
814
  "step": 1150
815
  },
816
  {
817
  "epoch": 0.8285714285714286,
818
- "grad_norm": 0.36493098735809326,
819
  "learning_rate": 1.7142857142857145e-05,
820
- "loss": 0.3024,
821
  "step": 1160
822
  },
823
  {
824
  "epoch": 0.8357142857142857,
825
- "grad_norm": 0.8869501352310181,
826
  "learning_rate": 1.642857142857143e-05,
827
- "loss": 0.2979,
828
  "step": 1170
829
  },
830
  {
831
  "epoch": 0.8428571428571429,
832
- "grad_norm": 0.7538784742355347,
833
  "learning_rate": 1.5714285714285715e-05,
834
- "loss": 0.3076,
835
  "step": 1180
836
  },
837
  {
838
  "epoch": 0.85,
839
- "grad_norm": 0.8670153617858887,
840
  "learning_rate": 1.5e-05,
841
- "loss": 0.2915,
842
  "step": 1190
843
  },
844
  {
845
  "epoch": 0.8571428571428571,
846
- "grad_norm": 0.6011945009231567,
847
  "learning_rate": 1.4285714285714285e-05,
848
- "loss": 0.3086,
849
  "step": 1200
850
  },
851
  {
852
  "epoch": 0.8642857142857143,
853
- "grad_norm": 0.364296555519104,
854
  "learning_rate": 1.357142857142857e-05,
855
- "loss": 0.3253,
856
  "step": 1210
857
  },
858
  {
859
  "epoch": 0.8714285714285714,
860
- "grad_norm": 0.7194721102714539,
861
  "learning_rate": 1.2857142857142857e-05,
862
- "loss": 0.3211,
863
  "step": 1220
864
  },
865
  {
866
  "epoch": 0.8785714285714286,
867
- "grad_norm": 0.41066431999206543,
868
  "learning_rate": 1.2142857142857144e-05,
869
- "loss": 0.2648,
870
  "step": 1230
871
  },
872
  {
873
  "epoch": 0.8857142857142857,
874
- "grad_norm": 0.7685422301292419,
875
  "learning_rate": 1.1428571428571429e-05,
876
- "loss": 0.3156,
877
  "step": 1240
878
  },
879
  {
880
  "epoch": 0.8928571428571429,
881
- "grad_norm": 0.5432989001274109,
882
  "learning_rate": 1.0714285714285714e-05,
883
- "loss": 0.3418,
884
  "step": 1250
885
  },
886
  {
887
  "epoch": 0.9,
888
- "grad_norm": 0.5999932885169983,
889
  "learning_rate": 1e-05,
890
- "loss": 0.381,
891
  "step": 1260
892
  },
893
  {
894
  "epoch": 0.9071428571428571,
895
- "grad_norm": 0.3930542767047882,
896
  "learning_rate": 9.285714285714286e-06,
897
- "loss": 0.384,
898
  "step": 1270
899
  },
900
  {
901
  "epoch": 0.9142857142857143,
902
- "grad_norm": 0.3380388021469116,
903
  "learning_rate": 8.571428571428573e-06,
904
- "loss": 0.3303,
905
  "step": 1280
906
  },
907
  {
908
  "epoch": 0.9214285714285714,
909
- "grad_norm": 1.0583548545837402,
910
  "learning_rate": 7.857142857142858e-06,
911
- "loss": 0.3573,
912
  "step": 1290
913
  },
914
  {
915
  "epoch": 0.9285714285714286,
916
- "grad_norm": 0.5554153323173523,
917
  "learning_rate": 7.142857142857143e-06,
918
- "loss": 0.3045,
919
  "step": 1300
920
  },
921
  {
922
  "epoch": 0.9357142857142857,
923
- "grad_norm": 0.6635413765907288,
924
  "learning_rate": 6.428571428571429e-06,
925
- "loss": 0.3179,
926
  "step": 1310
927
  },
928
  {
929
  "epoch": 0.9428571428571428,
930
- "grad_norm": 0.9328262805938721,
931
  "learning_rate": 5.7142857142857145e-06,
932
- "loss": 0.2692,
933
  "step": 1320
934
  },
935
  {
936
  "epoch": 0.95,
937
- "grad_norm": 0.6629207730293274,
938
  "learning_rate": 5e-06,
939
- "loss": 0.3041,
940
  "step": 1330
941
  },
942
  {
943
  "epoch": 0.9571428571428572,
944
- "grad_norm": 1.1389262676239014,
945
  "learning_rate": 4.285714285714286e-06,
946
- "loss": 0.321,
947
  "step": 1340
948
  },
949
  {
950
  "epoch": 0.9642857142857143,
951
- "grad_norm": 0.5038421750068665,
952
  "learning_rate": 3.5714285714285714e-06,
953
- "loss": 0.3192,
954
  "step": 1350
955
  },
956
  {
957
  "epoch": 0.9714285714285714,
958
- "grad_norm": 0.4525507986545563,
959
  "learning_rate": 2.8571428571428573e-06,
960
- "loss": 0.2807,
961
  "step": 1360
962
  },
963
  {
964
  "epoch": 0.9785714285714285,
965
- "grad_norm": 0.5145785212516785,
966
  "learning_rate": 2.142857142857143e-06,
967
- "loss": 0.3307,
968
  "step": 1370
969
  },
970
  {
971
  "epoch": 0.9857142857142858,
972
- "grad_norm": 0.6031885147094727,
973
  "learning_rate": 1.4285714285714286e-06,
974
- "loss": 0.3564,
975
  "step": 1380
976
  },
977
  {
978
  "epoch": 0.9928571428571429,
979
- "grad_norm": 0.38553744554519653,
980
  "learning_rate": 7.142857142857143e-07,
981
- "loss": 0.315,
982
  "step": 1390
983
  },
984
  {
985
  "epoch": 1.0,
986
- "grad_norm": 0.5251179933547974,
987
  "learning_rate": 0.0,
988
- "loss": 0.2818,
989
  "step": 1400
990
  },
991
  {
992
  "epoch": 1.0,
993
- "eval_loss": 0.29903435707092285,
994
- "eval_runtime": 66.066,
995
- "eval_samples_per_second": 3.027,
996
- "eval_steps_per_second": 0.378,
997
  "step": 1400
998
  }
999
  ],
@@ -1014,7 +1014,7 @@
1014
  "attributes": {}
1015
  }
1016
  },
1017
- "total_flos": 8487866204160000.0,
1018
  "train_batch_size": 2,
1019
  "trial_name": null,
1020
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.007142857142857143,
13
+ "grad_norm": 3.0706775188446045,
14
  "learning_rate": 9.92857142857143e-05,
15
+ "loss": 1.2883,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.014285714285714285,
20
+ "grad_norm": 2.5454561710357666,
21
  "learning_rate": 9.857142857142858e-05,
22
+ "loss": 1.1211,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.02142857142857143,
27
+ "grad_norm": 2.717937469482422,
28
  "learning_rate": 9.785714285714286e-05,
29
+ "loss": 0.9927,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.02857142857142857,
34
+ "grad_norm": 1.2683441638946533,
35
  "learning_rate": 9.714285714285715e-05,
36
+ "loss": 0.9406,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.03571428571428571,
41
+ "grad_norm": 7.902857303619385,
42
  "learning_rate": 9.642857142857143e-05,
43
+ "loss": 0.7558,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.04285714285714286,
48
+ "grad_norm": 2.552997350692749,
49
  "learning_rate": 9.571428571428573e-05,
50
+ "loss": 0.6445,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.05,
55
+ "grad_norm": 3.5295348167419434,
56
  "learning_rate": 9.5e-05,
57
+ "loss": 0.6534,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.05714285714285714,
62
+ "grad_norm": 1.8732738494873047,
63
  "learning_rate": 9.428571428571429e-05,
64
+ "loss": 0.537,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.06428571428571428,
69
+ "grad_norm": 4.299726963043213,
70
  "learning_rate": 9.357142857142858e-05,
71
+ "loss": 0.5508,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.07142857142857142,
76
+ "grad_norm": 2.1415882110595703,
77
  "learning_rate": 9.285714285714286e-05,
78
+ "loss": 0.5438,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.07857142857142857,
83
+ "grad_norm": 3.7711620330810547,
84
  "learning_rate": 9.214285714285714e-05,
85
+ "loss": 0.5906,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.08571428571428572,
90
+ "grad_norm": 1.4387797117233276,
91
  "learning_rate": 9.142857142857143e-05,
92
+ "loss": 0.541,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.09285714285714286,
97
+ "grad_norm": 4.010119915008545,
98
  "learning_rate": 9.071428571428571e-05,
99
+ "loss": 0.56,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.1,
104
+ "grad_norm": 4.021274089813232,
105
  "learning_rate": 9e-05,
106
+ "loss": 0.5323,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.10714285714285714,
111
+ "grad_norm": 2.138714551925659,
112
  "learning_rate": 8.92857142857143e-05,
113
+ "loss": 0.4784,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.11428571428571428,
118
+ "grad_norm": 1.371328353881836,
119
  "learning_rate": 8.857142857142857e-05,
120
+ "loss": 0.4861,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.12142857142857143,
125
+ "grad_norm": 3.1777868270874023,
126
  "learning_rate": 8.785714285714286e-05,
127
+ "loss": 0.504,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.12857142857142856,
132
+ "grad_norm": 2.0382461547851562,
133
  "learning_rate": 8.714285714285715e-05,
134
+ "loss": 0.496,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.1357142857142857,
139
+ "grad_norm": 2.8927695751190186,
140
  "learning_rate": 8.642857142857143e-05,
141
+ "loss": 0.5581,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.14285714285714285,
146
+ "grad_norm": 4.461900234222412,
147
  "learning_rate": 8.571428571428571e-05,
148
+ "loss": 0.4643,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.15,
153
+ "grad_norm": 2.028140068054199,
154
  "learning_rate": 8.5e-05,
155
+ "loss": 0.4384,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.15714285714285714,
160
+ "grad_norm": 2.4652626514434814,
161
  "learning_rate": 8.428571428571429e-05,
162
+ "loss": 0.4705,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.16428571428571428,
167
+ "grad_norm": 2.360128879547119,
168
  "learning_rate": 8.357142857142858e-05,
169
+ "loss": 0.4238,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.17142857142857143,
174
+ "grad_norm": 1.5631648302078247,
175
  "learning_rate": 8.285714285714287e-05,
176
+ "loss": 0.4364,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.17857142857142858,
181
+ "grad_norm": 1.9402961730957031,
182
  "learning_rate": 8.214285714285714e-05,
183
+ "loss": 0.4472,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.18571428571428572,
188
+ "grad_norm": 2.9574289321899414,
189
  "learning_rate": 8.142857142857143e-05,
190
+ "loss": 0.4319,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.19285714285714287,
195
+ "grad_norm": 1.7986479997634888,
196
  "learning_rate": 8.071428571428573e-05,
197
+ "loss": 0.4873,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.2,
202
+ "grad_norm": 4.539698123931885,
203
  "learning_rate": 8e-05,
204
+ "loss": 0.4222,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.20714285714285716,
209
+ "grad_norm": 4.539310932159424,
210
  "learning_rate": 7.928571428571429e-05,
211
+ "loss": 0.444,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.21428571428571427,
216
+ "grad_norm": 4.211949348449707,
217
  "learning_rate": 7.857142857142858e-05,
218
+ "loss": 0.4713,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.22142857142857142,
223
+ "grad_norm": 3.4612796306610107,
224
  "learning_rate": 7.785714285714286e-05,
225
+ "loss": 0.421,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.22857142857142856,
230
+ "grad_norm": 2.1370456218719482,
231
  "learning_rate": 7.714285714285715e-05,
232
+ "loss": 0.5117,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.2357142857142857,
237
+ "grad_norm": 2.022341012954712,
238
  "learning_rate": 7.642857142857143e-05,
239
+ "loss": 0.3968,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.24285714285714285,
244
+ "grad_norm": 1.5263112783432007,
245
  "learning_rate": 7.571428571428571e-05,
246
+ "loss": 0.4032,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.25,
251
+ "grad_norm": 3.152087450027466,
252
  "learning_rate": 7.500000000000001e-05,
253
+ "loss": 0.4024,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.2571428571428571,
258
+ "grad_norm": 3.5288479328155518,
259
  "learning_rate": 7.428571428571429e-05,
260
+ "loss": 0.4079,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.2642857142857143,
265
+ "grad_norm": 1.922340989112854,
266
  "learning_rate": 7.357142857142858e-05,
267
+ "loss": 0.4839,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.2714285714285714,
272
+ "grad_norm": 2.6698594093322754,
273
  "learning_rate": 7.285714285714286e-05,
274
+ "loss": 0.3772,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.2785714285714286,
279
+ "grad_norm": 2.9632909297943115,
280
  "learning_rate": 7.214285714285714e-05,
281
+ "loss": 0.4296,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.2857142857142857,
286
+ "grad_norm": 2.624523639678955,
287
  "learning_rate": 7.142857142857143e-05,
288
+ "loss": 0.3931,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.29285714285714287,
293
+ "grad_norm": 3.6488404273986816,
294
  "learning_rate": 7.071428571428573e-05,
295
+ "loss": 0.3986,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.3,
300
+ "grad_norm": 3.6739540100097656,
301
  "learning_rate": 7e-05,
302
+ "loss": 0.3979,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 0.30714285714285716,
307
+ "grad_norm": 2.866800546646118,
308
  "learning_rate": 6.928571428571429e-05,
309
+ "loss": 0.4203,
310
  "step": 430
311
  },
312
  {
313
  "epoch": 0.3142857142857143,
314
+ "grad_norm": 3.4130825996398926,
315
  "learning_rate": 6.857142857142858e-05,
316
+ "loss": 0.498,
317
  "step": 440
318
  },
319
  {
320
  "epoch": 0.32142857142857145,
321
+ "grad_norm": 1.6507424116134644,
322
  "learning_rate": 6.785714285714286e-05,
323
+ "loss": 0.3672,
324
  "step": 450
325
  },
326
  {
327
  "epoch": 0.32857142857142857,
328
+ "grad_norm": 2.9661030769348145,
329
  "learning_rate": 6.714285714285714e-05,
330
+ "loss": 0.372,
331
  "step": 460
332
  },
333
  {
334
  "epoch": 0.3357142857142857,
335
+ "grad_norm": 1.8931576013565063,
336
  "learning_rate": 6.642857142857143e-05,
337
+ "loss": 0.341,
338
  "step": 470
339
  },
340
  {
341
  "epoch": 0.34285714285714286,
342
+ "grad_norm": 2.128786563873291,
343
  "learning_rate": 6.571428571428571e-05,
344
+ "loss": 0.3235,
345
  "step": 480
346
  },
347
  {
348
  "epoch": 0.35,
349
+ "grad_norm": 3.9210121631622314,
350
  "learning_rate": 6.500000000000001e-05,
351
+ "loss": 0.4568,
352
  "step": 490
353
  },
354
  {
355
  "epoch": 0.35714285714285715,
356
+ "grad_norm": 2.2050564289093018,
357
  "learning_rate": 6.428571428571429e-05,
358
+ "loss": 0.3468,
359
  "step": 500
360
  },
361
  {
362
  "epoch": 0.36428571428571427,
363
+ "grad_norm": 1.4351139068603516,
364
  "learning_rate": 6.357142857142857e-05,
365
+ "loss": 0.3745,
366
  "step": 510
367
  },
368
  {
369
  "epoch": 0.37142857142857144,
370
+ "grad_norm": 1.60938560962677,
371
  "learning_rate": 6.285714285714286e-05,
372
+ "loss": 0.3944,
373
  "step": 520
374
  },
375
  {
376
  "epoch": 0.37857142857142856,
377
+ "grad_norm": 3.240180730819702,
378
  "learning_rate": 6.214285714285714e-05,
379
+ "loss": 0.3701,
380
  "step": 530
381
  },
382
  {
383
  "epoch": 0.38571428571428573,
384
+ "grad_norm": 2.8629231452941895,
385
  "learning_rate": 6.142857142857143e-05,
386
+ "loss": 0.3378,
387
  "step": 540
388
  },
389
  {
390
  "epoch": 0.39285714285714285,
391
+ "grad_norm": 2.2353878021240234,
392
  "learning_rate": 6.0714285714285715e-05,
393
+ "loss": 0.3783,
394
  "step": 550
395
  },
396
  {
397
  "epoch": 0.4,
398
+ "grad_norm": 5.000092029571533,
399
  "learning_rate": 6e-05,
400
+ "loss": 0.3664,
401
  "step": 560
402
  },
403
  {
404
  "epoch": 0.40714285714285714,
405
+ "grad_norm": 1.9683377742767334,
406
  "learning_rate": 5.928571428571429e-05,
407
+ "loss": 0.4366,
408
  "step": 570
409
  },
410
  {
411
  "epoch": 0.4142857142857143,
412
+ "grad_norm": 1.6545411348342896,
413
  "learning_rate": 5.8571428571428575e-05,
414
+ "loss": 0.4047,
415
  "step": 580
416
  },
417
  {
418
  "epoch": 0.42142857142857143,
419
+ "grad_norm": 6.393994331359863,
420
  "learning_rate": 5.785714285714287e-05,
421
+ "loss": 0.3513,
422
  "step": 590
423
  },
424
  {
425
  "epoch": 0.42857142857142855,
426
+ "grad_norm": 2.786479949951172,
427
  "learning_rate": 5.714285714285714e-05,
428
+ "loss": 0.3973,
429
  "step": 600
430
  },
431
  {
432
  "epoch": 0.4357142857142857,
433
+ "grad_norm": 2.1006624698638916,
434
  "learning_rate": 5.642857142857143e-05,
435
+ "loss": 0.413,
436
  "step": 610
437
  },
438
  {
439
  "epoch": 0.44285714285714284,
440
+ "grad_norm": 2.376877784729004,
441
  "learning_rate": 5.571428571428572e-05,
442
+ "loss": 0.427,
443
  "step": 620
444
  },
445
  {
446
  "epoch": 0.45,
447
+ "grad_norm": 3.311000347137451,
448
  "learning_rate": 5.500000000000001e-05,
449
+ "loss": 0.4043,
450
  "step": 630
451
  },
452
  {
453
  "epoch": 0.45714285714285713,
454
+ "grad_norm": 5.115108489990234,
455
  "learning_rate": 5.428571428571428e-05,
456
+ "loss": 0.4501,
457
  "step": 640
458
  },
459
  {
460
  "epoch": 0.4642857142857143,
461
+ "grad_norm": 1.7104153633117676,
462
  "learning_rate": 5.3571428571428575e-05,
463
+ "loss": 0.3966,
464
  "step": 650
465
  },
466
  {
467
  "epoch": 0.4714285714285714,
468
+ "grad_norm": 1.9817299842834473,
469
  "learning_rate": 5.285714285714286e-05,
470
+ "loss": 0.3308,
471
  "step": 660
472
  },
473
  {
474
  "epoch": 0.4785714285714286,
475
+ "grad_norm": 2.1275410652160645,
476
  "learning_rate": 5.214285714285715e-05,
477
+ "loss": 0.3219,
478
  "step": 670
479
  },
480
  {
481
  "epoch": 0.4857142857142857,
482
+ "grad_norm": 2.4896678924560547,
483
  "learning_rate": 5.142857142857143e-05,
484
+ "loss": 0.3084,
485
  "step": 680
486
  },
487
  {
488
  "epoch": 0.4928571428571429,
489
+ "grad_norm": 2.4593498706817627,
490
  "learning_rate": 5.0714285714285716e-05,
491
+ "loss": 0.3695,
492
  "step": 690
493
  },
494
  {
495
  "epoch": 0.5,
496
+ "grad_norm": 2.4440925121307373,
497
  "learning_rate": 5e-05,
498
+ "loss": 0.3584,
499
  "step": 700
500
  },
501
  {
502
  "epoch": 0.5071428571428571,
503
+ "grad_norm": 1.6439753770828247,
504
  "learning_rate": 4.928571428571429e-05,
505
+ "loss": 0.4272,
506
  "step": 710
507
  },
508
  {
509
  "epoch": 0.5142857142857142,
510
+ "grad_norm": 2.897373676300049,
511
  "learning_rate": 4.8571428571428576e-05,
512
+ "loss": 0.3496,
513
  "step": 720
514
  },
515
  {
516
  "epoch": 0.5214285714285715,
517
+ "grad_norm": 2.620026111602783,
518
  "learning_rate": 4.785714285714286e-05,
519
+ "loss": 0.3745,
520
  "step": 730
521
  },
522
  {
523
  "epoch": 0.5285714285714286,
524
+ "grad_norm": 2.081660032272339,
525
  "learning_rate": 4.714285714285714e-05,
526
+ "loss": 0.3608,
527
  "step": 740
528
  },
529
  {
530
  "epoch": 0.5357142857142857,
531
+ "grad_norm": 1.8209961652755737,
532
  "learning_rate": 4.642857142857143e-05,
533
+ "loss": 0.3429,
534
  "step": 750
535
  },
536
  {
537
  "epoch": 0.5428571428571428,
538
+ "grad_norm": 1.898141622543335,
539
  "learning_rate": 4.5714285714285716e-05,
540
+ "loss": 0.3671,
541
  "step": 760
542
  },
543
  {
544
  "epoch": 0.55,
545
+ "grad_norm": 2.252241611480713,
546
  "learning_rate": 4.5e-05,
547
+ "loss": 0.3843,
548
  "step": 770
549
  },
550
  {
551
  "epoch": 0.5571428571428572,
552
+ "grad_norm": 2.201845407485962,
553
  "learning_rate": 4.428571428571428e-05,
554
+ "loss": 0.3379,
555
  "step": 780
556
  },
557
  {
558
  "epoch": 0.5642857142857143,
559
+ "grad_norm": 1.9457173347473145,
560
  "learning_rate": 4.3571428571428576e-05,
561
+ "loss": 0.3366,
562
  "step": 790
563
  },
564
  {
565
  "epoch": 0.5714285714285714,
566
+ "grad_norm": 3.7929930686950684,
567
  "learning_rate": 4.2857142857142856e-05,
568
+ "loss": 0.3091,
569
  "step": 800
570
  },
571
  {
572
  "epoch": 0.5785714285714286,
573
+ "grad_norm": 1.8005012273788452,
574
  "learning_rate": 4.214285714285714e-05,
575
+ "loss": 0.2944,
576
  "step": 810
577
  },
578
  {
579
  "epoch": 0.5857142857142857,
580
+ "grad_norm": 6.003371715545654,
581
  "learning_rate": 4.1428571428571437e-05,
582
+ "loss": 0.3722,
583
  "step": 820
584
  },
585
  {
586
  "epoch": 0.5928571428571429,
587
+ "grad_norm": 2.1478536128997803,
588
  "learning_rate": 4.0714285714285717e-05,
589
+ "loss": 0.3081,
590
  "step": 830
591
  },
592
  {
593
  "epoch": 0.6,
594
+ "grad_norm": 2.839242935180664,
595
  "learning_rate": 4e-05,
596
+ "loss": 0.317,
597
  "step": 840
598
  },
599
  {
600
  "epoch": 0.6071428571428571,
601
+ "grad_norm": 1.891158938407898,
602
  "learning_rate": 3.928571428571429e-05,
603
+ "loss": 0.2995,
604
  "step": 850
605
  },
606
  {
607
  "epoch": 0.6142857142857143,
608
+ "grad_norm": 5.433562278747559,
609
  "learning_rate": 3.857142857142858e-05,
610
+ "loss": 0.3637,
611
  "step": 860
612
  },
613
  {
614
  "epoch": 0.6214285714285714,
615
+ "grad_norm": 1.653617024421692,
616
  "learning_rate": 3.785714285714286e-05,
617
+ "loss": 0.3285,
618
  "step": 870
619
  },
620
  {
621
  "epoch": 0.6285714285714286,
622
+ "grad_norm": 2.9657585620880127,
623
  "learning_rate": 3.7142857142857143e-05,
624
+ "loss": 0.3761,
625
  "step": 880
626
  },
627
  {
628
  "epoch": 0.6357142857142857,
629
+ "grad_norm": 1.5580040216445923,
630
  "learning_rate": 3.642857142857143e-05,
631
+ "loss": 0.4427,
632
  "step": 890
633
  },
634
  {
635
  "epoch": 0.6428571428571429,
636
+ "grad_norm": 2.411190986633301,
637
  "learning_rate": 3.571428571428572e-05,
638
+ "loss": 0.3322,
639
  "step": 900
640
  },
641
  {
642
  "epoch": 0.65,
643
+ "grad_norm": 4.375690937042236,
644
  "learning_rate": 3.5e-05,
645
+ "loss": 0.3546,
646
  "step": 910
647
  },
648
  {
649
  "epoch": 0.6571428571428571,
650
+ "grad_norm": 3.571958541870117,
651
  "learning_rate": 3.428571428571429e-05,
652
+ "loss": 0.3742,
653
  "step": 920
654
  },
655
  {
656
  "epoch": 0.6642857142857143,
657
+ "grad_norm": 3.132997512817383,
658
  "learning_rate": 3.357142857142857e-05,
659
+ "loss": 0.3667,
660
  "step": 930
661
  },
662
  {
663
  "epoch": 0.6714285714285714,
664
+ "grad_norm": 2.296008586883545,
665
  "learning_rate": 3.285714285714286e-05,
666
+ "loss": 0.2686,
667
  "step": 940
668
  },
669
  {
670
  "epoch": 0.6785714285714286,
671
+ "grad_norm": 1.7339441776275635,
672
  "learning_rate": 3.2142857142857144e-05,
673
+ "loss": 0.328,
674
  "step": 950
675
  },
676
  {
677
  "epoch": 0.6857142857142857,
678
+ "grad_norm": 2.5679969787597656,
679
  "learning_rate": 3.142857142857143e-05,
680
+ "loss": 0.3437,
681
  "step": 960
682
  },
683
  {
684
  "epoch": 0.6928571428571428,
685
+ "grad_norm": 6.115095615386963,
686
  "learning_rate": 3.071428571428572e-05,
687
+ "loss": 0.3264,
688
  "step": 970
689
  },
690
  {
691
  "epoch": 0.7,
692
+ "grad_norm": 9.149005889892578,
693
  "learning_rate": 3e-05,
694
+ "loss": 0.339,
695
  "step": 980
696
  },
697
  {
698
  "epoch": 0.7071428571428572,
699
+ "grad_norm": 3.723829746246338,
700
  "learning_rate": 2.9285714285714288e-05,
701
+ "loss": 0.3932,
702
  "step": 990
703
  },
704
  {
705
  "epoch": 0.7142857142857143,
706
+ "grad_norm": 2.2082722187042236,
707
  "learning_rate": 2.857142857142857e-05,
708
+ "loss": 0.3854,
709
  "step": 1000
710
  },
711
  {
712
  "epoch": 0.7214285714285714,
713
+ "grad_norm": 2.192686080932617,
714
  "learning_rate": 2.785714285714286e-05,
715
+ "loss": 0.352,
716
  "step": 1010
717
  },
718
  {
719
  "epoch": 0.7285714285714285,
720
+ "grad_norm": 2.3141751289367676,
721
  "learning_rate": 2.714285714285714e-05,
722
+ "loss": 0.3132,
723
  "step": 1020
724
  },
725
  {
726
  "epoch": 0.7357142857142858,
727
+ "grad_norm": 4.0575995445251465,
728
  "learning_rate": 2.642857142857143e-05,
729
+ "loss": 0.2929,
730
  "step": 1030
731
  },
732
  {
733
  "epoch": 0.7428571428571429,
734
+ "grad_norm": 2.5704588890075684,
735
  "learning_rate": 2.5714285714285714e-05,
736
+ "loss": 0.3549,
737
  "step": 1040
738
  },
739
  {
740
  "epoch": 0.75,
741
+ "grad_norm": 2.970313310623169,
742
  "learning_rate": 2.5e-05,
743
+ "loss": 0.3139,
744
  "step": 1050
745
  },
746
  {
747
  "epoch": 0.7571428571428571,
748
+ "grad_norm": 3.143388271331787,
749
  "learning_rate": 2.4285714285714288e-05,
750
+ "loss": 0.3732,
751
  "step": 1060
752
  },
753
  {
754
  "epoch": 0.7642857142857142,
755
+ "grad_norm": 3.7672691345214844,
756
  "learning_rate": 2.357142857142857e-05,
757
+ "loss": 0.3752,
758
  "step": 1070
759
  },
760
  {
761
  "epoch": 0.7714285714285715,
762
+ "grad_norm": 3.106049060821533,
763
  "learning_rate": 2.2857142857142858e-05,
764
+ "loss": 0.3174,
765
  "step": 1080
766
  },
767
  {
768
  "epoch": 0.7785714285714286,
769
+ "grad_norm": 3.5952601432800293,
770
  "learning_rate": 2.214285714285714e-05,
771
+ "loss": 0.3445,
772
  "step": 1090
773
  },
774
  {
775
  "epoch": 0.7857142857142857,
776
+ "grad_norm": 2.764934778213501,
777
  "learning_rate": 2.1428571428571428e-05,
778
+ "loss": 0.3343,
779
  "step": 1100
780
  },
781
  {
782
  "epoch": 0.7928571428571428,
783
+ "grad_norm": 3.4910130500793457,
784
  "learning_rate": 2.0714285714285718e-05,
785
+ "loss": 0.3265,
786
  "step": 1110
787
  },
788
  {
789
  "epoch": 0.8,
790
+ "grad_norm": 6.074836254119873,
791
  "learning_rate": 2e-05,
792
+ "loss": 0.334,
793
  "step": 1120
794
  },
795
  {
796
  "epoch": 0.8071428571428572,
797
+ "grad_norm": 3.0394229888916016,
798
  "learning_rate": 1.928571428571429e-05,
799
+ "loss": 0.2932,
800
  "step": 1130
801
  },
802
  {
803
  "epoch": 0.8142857142857143,
804
+ "grad_norm": 2.841371536254883,
805
  "learning_rate": 1.8571428571428572e-05,
806
+ "loss": 0.3143,
807
  "step": 1140
808
  },
809
  {
810
  "epoch": 0.8214285714285714,
811
+ "grad_norm": 3.9328362941741943,
812
  "learning_rate": 1.785714285714286e-05,
813
+ "loss": 0.3726,
814
  "step": 1150
815
  },
816
  {
817
  "epoch": 0.8285714285714286,
818
+ "grad_norm": 2.6354167461395264,
819
  "learning_rate": 1.7142857142857145e-05,
820
+ "loss": 0.325,
821
  "step": 1160
822
  },
823
  {
824
  "epoch": 0.8357142857142857,
825
+ "grad_norm": 4.113726615905762,
826
  "learning_rate": 1.642857142857143e-05,
827
+ "loss": 0.3199,
828
  "step": 1170
829
  },
830
  {
831
  "epoch": 0.8428571428571429,
832
+ "grad_norm": 3.8376033306121826,
833
  "learning_rate": 1.5714285714285715e-05,
834
+ "loss": 0.3299,
835
  "step": 1180
836
  },
837
  {
838
  "epoch": 0.85,
839
+ "grad_norm": 5.71781063079834,
840
  "learning_rate": 1.5e-05,
841
+ "loss": 0.317,
842
  "step": 1190
843
  },
844
  {
845
  "epoch": 0.8571428571428571,
846
+ "grad_norm": 3.487872362136841,
847
  "learning_rate": 1.4285714285714285e-05,
848
+ "loss": 0.3301,
849
  "step": 1200
850
  },
851
  {
852
  "epoch": 0.8642857142857143,
853
+ "grad_norm": 1.754288673400879,
854
  "learning_rate": 1.357142857142857e-05,
855
+ "loss": 0.3469,
856
  "step": 1210
857
  },
858
  {
859
  "epoch": 0.8714285714285714,
860
+ "grad_norm": 3.3238632678985596,
861
  "learning_rate": 1.2857142857142857e-05,
862
+ "loss": 0.3434,
863
  "step": 1220
864
  },
865
  {
866
  "epoch": 0.8785714285714286,
867
+ "grad_norm": 2.276270627975464,
868
  "learning_rate": 1.2142857142857144e-05,
869
+ "loss": 0.2825,
870
  "step": 1230
871
  },
872
  {
873
  "epoch": 0.8857142857142857,
874
+ "grad_norm": 4.0892720222473145,
875
  "learning_rate": 1.1428571428571429e-05,
876
+ "loss": 0.3364,
877
  "step": 1240
878
  },
879
  {
880
  "epoch": 0.8928571428571429,
881
+ "grad_norm": 2.1741929054260254,
882
  "learning_rate": 1.0714285714285714e-05,
883
+ "loss": 0.3592,
884
  "step": 1250
885
  },
886
  {
887
  "epoch": 0.9,
888
+ "grad_norm": 2.3750619888305664,
889
  "learning_rate": 1e-05,
890
+ "loss": 0.4065,
891
  "step": 1260
892
  },
893
  {
894
  "epoch": 0.9071428571428571,
895
+ "grad_norm": 2.2269678115844727,
896
  "learning_rate": 9.285714285714286e-06,
897
+ "loss": 0.4073,
898
  "step": 1270
899
  },
900
  {
901
  "epoch": 0.9142857142857143,
902
+ "grad_norm": 2.025587320327759,
903
  "learning_rate": 8.571428571428573e-06,
904
+ "loss": 0.3478,
905
  "step": 1280
906
  },
907
  {
908
  "epoch": 0.9214285714285714,
909
+ "grad_norm": 5.870853900909424,
910
  "learning_rate": 7.857142857142858e-06,
911
+ "loss": 0.386,
912
  "step": 1290
913
  },
914
  {
915
  "epoch": 0.9285714285714286,
916
+ "grad_norm": 2.9051337242126465,
917
  "learning_rate": 7.142857142857143e-06,
918
+ "loss": 0.3257,
919
  "step": 1300
920
  },
921
  {
922
  "epoch": 0.9357142857142857,
923
+ "grad_norm": 3.7982888221740723,
924
  "learning_rate": 6.428571428571429e-06,
925
+ "loss": 0.3405,
926
  "step": 1310
927
  },
928
  {
929
  "epoch": 0.9428571428571428,
930
+ "grad_norm": 5.413133144378662,
931
  "learning_rate": 5.7142857142857145e-06,
932
+ "loss": 0.2909,
933
  "step": 1320
934
  },
935
  {
936
  "epoch": 0.95,
937
+ "grad_norm": 2.1331217288970947,
938
  "learning_rate": 5e-06,
939
+ "loss": 0.3175,
940
  "step": 1330
941
  },
942
  {
943
  "epoch": 0.9571428571428572,
944
+ "grad_norm": 4.568922519683838,
945
  "learning_rate": 4.285714285714286e-06,
946
+ "loss": 0.3447,
947
  "step": 1340
948
  },
949
  {
950
  "epoch": 0.9642857142857143,
951
+ "grad_norm": 2.920402765274048,
952
  "learning_rate": 3.5714285714285714e-06,
953
+ "loss": 0.3357,
954
  "step": 1350
955
  },
956
  {
957
  "epoch": 0.9714285714285714,
958
+ "grad_norm": 1.938262939453125,
959
  "learning_rate": 2.8571428571428573e-06,
960
+ "loss": 0.2942,
961
  "step": 1360
962
  },
963
  {
964
  "epoch": 0.9785714285714285,
965
+ "grad_norm": 1.8624756336212158,
966
  "learning_rate": 2.142857142857143e-06,
967
+ "loss": 0.3498,
968
  "step": 1370
969
  },
970
  {
971
  "epoch": 0.9857142857142858,
972
+ "grad_norm": 2.570404291152954,
973
  "learning_rate": 1.4285714285714286e-06,
974
+ "loss": 0.3777,
975
  "step": 1380
976
  },
977
  {
978
  "epoch": 0.9928571428571429,
979
+ "grad_norm": 1.80099356174469,
980
  "learning_rate": 7.142857142857143e-07,
981
+ "loss": 0.3356,
982
  "step": 1390
983
  },
984
  {
985
  "epoch": 1.0,
986
+ "grad_norm": 2.955940008163452,
987
  "learning_rate": 0.0,
988
+ "loss": 0.3044,
989
  "step": 1400
990
  },
991
  {
992
  "epoch": 1.0,
993
+ "eval_loss": 0.3177724778652191,
994
+ "eval_runtime": 67.4116,
995
+ "eval_samples_per_second": 2.967,
996
+ "eval_steps_per_second": 0.371,
997
  "step": 1400
998
  }
999
  ],
 
1014
  "attributes": {}
1015
  }
1016
  },
1017
+ "total_flos": 8377941884928000.0,
1018
  "train_batch_size": 2,
1019
  "trial_name": null,
1020
  "trial_params": null