Namronaldo2004 commited on
Commit
cbb0219
·
1 Parent(s): 960d107

Update fine-tuned model

Browse files
adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "k_proj",
27
- "o_proj",
28
- "up_proj",
29
- "v_proj",
30
  "down_proj",
 
 
 
31
  "q_proj",
32
- "gate_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "gate_proj",
 
 
 
27
  "down_proj",
28
+ "v_proj",
29
+ "up_proj",
30
+ "o_proj",
31
  "q_proj",
32
+ "k_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1e39357b5e1933c0645027a46b91d0ecdc6f4bf8cd51738f54357c8987d67592
3
  size 159967880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc6a40ff3d37d8bf4cbd39b104eda057f3448f96f9e8547f016be36df6a3e524
3
  size 159967880
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:035fd60242408b013b171574b1e908c2df6aebaaeb341864628bf0b3695e99c2
3
  size 852876198
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:657628d5d155d79b9aad8789e0951663eaa204fb58bf12b323c023fffe2b0085
3
  size 852876198
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a9fdc09048aeb5e786b623e473b823031e30bdd8fc2c3f0655e8e64ce6286d57
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dbbb3800d4095b7540d07b5bcccd341ea22380b31ae2d3484b7f5c78f026c73
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:194456d3c9e165255d5406a0f3f62973b0bede79d91784f72431350783e27ae7
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a60c7d771c1fd156acee762fba03c724cb41829a3f71df370ecd1d20b134982
3
  size 1064
trainer_state.json CHANGED
@@ -1,1068 +1,718 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
  "eval_steps": 500,
6
- "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.02,
13
- "grad_norm": 0.5529273748397827,
14
- "learning_rate": 2.5e-05,
15
- "loss": 0.2724,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.04,
20
- "grad_norm": 0.5265244245529175,
21
- "learning_rate": 5e-05,
22
- "loss": 0.278,
23
  "step": 2
24
  },
25
  {
26
- "epoch": 0.06,
27
- "grad_norm": 0.49891576170921326,
28
- "learning_rate": 7.500000000000001e-05,
29
- "loss": 0.2999,
30
  "step": 3
31
  },
32
  {
33
- "epoch": 0.08,
34
- "grad_norm": 0.5106935501098633,
35
- "learning_rate": 0.0001,
36
- "loss": 0.2894,
37
  "step": 4
38
  },
39
  {
40
- "epoch": 0.1,
41
- "grad_norm": 0.5380656719207764,
42
- "learning_rate": 0.000125,
43
- "loss": 0.2936,
44
  "step": 5
45
  },
46
  {
47
- "epoch": 0.12,
48
- "grad_norm": 0.5809736847877502,
49
- "learning_rate": 0.00015000000000000001,
50
- "loss": 0.2693,
51
  "step": 6
52
  },
53
  {
54
- "epoch": 0.14,
55
- "grad_norm": 0.6198976039886475,
56
- "learning_rate": 0.000175,
57
- "loss": 0.2862,
58
  "step": 7
59
  },
60
  {
61
- "epoch": 0.16,
62
- "grad_norm": 0.5730974674224854,
63
- "learning_rate": 0.0002,
64
- "loss": 0.3018,
65
  "step": 8
66
  },
67
  {
68
- "epoch": 0.18,
69
- "grad_norm": 0.5895105004310608,
70
- "learning_rate": 0.00019997552766852432,
71
- "loss": 0.2923,
72
  "step": 9
73
  },
74
  {
75
- "epoch": 0.2,
76
- "grad_norm": 0.5411907434463501,
77
- "learning_rate": 0.00019990212265199738,
78
- "loss": 0.2853,
79
  "step": 10
80
  },
81
  {
82
- "epoch": 0.22,
83
- "grad_norm": 0.5928137302398682,
84
- "learning_rate": 0.00019977982087825713,
85
- "loss": 0.3421,
86
  "step": 11
87
  },
88
  {
89
- "epoch": 0.24,
90
- "grad_norm": 0.5884208083152771,
91
- "learning_rate": 0.00019960868220749448,
92
- "loss": 0.2963,
93
  "step": 12
94
  },
95
  {
96
- "epoch": 0.26,
97
- "grad_norm": 0.5560010075569153,
98
- "learning_rate": 0.00019938879040295508,
99
- "loss": 0.3257,
100
  "step": 13
101
  },
102
  {
103
- "epoch": 0.28,
104
- "grad_norm": 0.5500514507293701,
105
- "learning_rate": 0.00019912025308994148,
106
- "loss": 0.3122,
107
  "step": 14
108
  },
109
  {
110
- "epoch": 0.3,
111
- "grad_norm": 0.5563028454780579,
112
- "learning_rate": 0.0001988032017031364,
113
- "loss": 0.3222,
114
  "step": 15
115
  },
116
  {
117
- "epoch": 0.32,
118
- "grad_norm": 0.5426145792007446,
119
- "learning_rate": 0.00019843779142227256,
120
- "loss": 0.3383,
121
  "step": 16
122
  },
123
  {
124
- "epoch": 0.34,
125
- "grad_norm": 0.5534917116165161,
126
- "learning_rate": 0.0001980242010961803,
127
- "loss": 0.3199,
128
  "step": 17
129
  },
130
  {
131
- "epoch": 0.36,
132
- "grad_norm": 0.5668300986289978,
133
- "learning_rate": 0.0001975626331552507,
134
- "loss": 0.3568,
135
  "step": 18
136
  },
137
  {
138
- "epoch": 0.38,
139
- "grad_norm": 0.5552094578742981,
140
- "learning_rate": 0.00019705331351235674,
141
- "loss": 0.3289,
142
  "step": 19
143
  },
144
  {
145
- "epoch": 0.4,
146
- "grad_norm": 0.5791586637496948,
147
- "learning_rate": 0.00019649649145228102,
148
- "loss": 0.3168,
149
  "step": 20
150
  },
151
  {
152
- "epoch": 0.42,
153
- "grad_norm": 0.5354134440422058,
154
- "learning_rate": 0.00019589243950970402,
155
- "loss": 0.332,
156
  "step": 21
157
  },
158
  {
159
- "epoch": 0.44,
160
- "grad_norm": 0.5309009552001953,
161
- "learning_rate": 0.00019524145333581317,
162
- "loss": 0.3707,
163
  "step": 22
164
  },
165
  {
166
- "epoch": 0.46,
167
- "grad_norm": 0.5601043105125427,
168
- "learning_rate": 0.00019454385155359702,
169
- "loss": 0.3234,
170
  "step": 23
171
  },
172
  {
173
- "epoch": 0.48,
174
- "grad_norm": 0.5431475639343262,
175
- "learning_rate": 0.00019379997560189675,
176
- "loss": 0.2946,
177
  "step": 24
178
  },
179
  {
180
- "epoch": 0.5,
181
- "grad_norm": 0.5860360860824585,
182
- "learning_rate": 0.00019301018956828964,
183
- "loss": 0.3075,
184
  "step": 25
185
  },
186
  {
187
- "epoch": 0.52,
188
- "grad_norm": 0.5602842569351196,
189
- "learning_rate": 0.00019217488001088784,
190
- "loss": 0.3358,
191
  "step": 26
192
  },
193
  {
194
- "epoch": 0.54,
195
- "grad_norm": 0.5663277506828308,
196
- "learning_rate": 0.00019129445576913888,
197
- "loss": 0.3259,
198
  "step": 27
199
  },
200
  {
201
- "epoch": 0.56,
202
- "grad_norm": 0.5709177851676941,
203
- "learning_rate": 0.0001903693477637204,
204
- "loss": 0.3508,
205
  "step": 28
206
  },
207
  {
208
- "epoch": 0.58,
209
- "grad_norm": 0.5128621459007263,
210
- "learning_rate": 0.00018940000878562758,
211
- "loss": 0.3175,
212
  "step": 29
213
  },
214
  {
215
- "epoch": 0.6,
216
- "grad_norm": 0.5354805588722229,
217
- "learning_rate": 0.0001883869132745561,
218
- "loss": 0.3318,
219
  "step": 30
220
  },
221
  {
222
- "epoch": 0.62,
223
- "grad_norm": 0.5368937253952026,
224
- "learning_rate": 0.00018733055708668926,
225
- "loss": 0.3451,
226
  "step": 31
227
  },
228
  {
229
- "epoch": 0.64,
230
- "grad_norm": 0.5688962340354919,
231
- "learning_rate": 0.00018623145725200278,
232
- "loss": 0.3467,
233
  "step": 32
234
  },
235
  {
236
- "epoch": 0.66,
237
- "grad_norm": 0.5590227246284485,
238
- "learning_rate": 0.00018509015172120621,
239
- "loss": 0.3303,
240
  "step": 33
241
  },
242
  {
243
- "epoch": 0.68,
244
- "grad_norm": 0.5596187710762024,
245
- "learning_rate": 0.00018390719910244487,
246
- "loss": 0.3384,
247
  "step": 34
248
  },
249
  {
250
- "epoch": 0.7,
251
- "grad_norm": 0.5584146976470947,
252
- "learning_rate": 0.00018268317838789088,
253
- "loss": 0.3057,
254
  "step": 35
255
  },
256
  {
257
- "epoch": 0.72,
258
- "grad_norm": 0.5675226449966431,
259
- "learning_rate": 0.00018141868867035745,
260
- "loss": 0.3581,
261
  "step": 36
262
  },
263
  {
264
- "epoch": 0.74,
265
- "grad_norm": 0.5121241807937622,
266
- "learning_rate": 0.00018011434885007482,
267
- "loss": 0.3716,
268
  "step": 37
269
  },
270
  {
271
- "epoch": 0.76,
272
- "grad_norm": 0.540127694606781,
273
- "learning_rate": 0.00017877079733177184,
274
- "loss": 0.3458,
275
  "step": 38
276
  },
277
  {
278
- "epoch": 0.78,
279
- "grad_norm": 0.5629216432571411,
280
- "learning_rate": 0.00017738869171221068,
281
- "loss": 0.3457,
282
  "step": 39
283
  },
284
  {
285
- "epoch": 0.8,
286
- "grad_norm": 0.5627008080482483,
287
- "learning_rate": 0.0001759687084583285,
288
- "loss": 0.3462,
289
  "step": 40
290
  },
291
  {
292
- "epoch": 0.82,
293
- "grad_norm": 0.5851187705993652,
294
- "learning_rate": 0.00017451154257614287,
295
- "loss": 0.3455,
296
  "step": 41
297
  },
298
  {
299
- "epoch": 0.84,
300
- "grad_norm": 0.5467076301574707,
301
- "learning_rate": 0.00017301790727058345,
302
- "loss": 0.3289,
303
  "step": 42
304
  },
305
  {
306
- "epoch": 0.86,
307
- "grad_norm": 0.5140892267227173,
308
- "learning_rate": 0.00017148853359641626,
309
- "loss": 0.3478,
310
  "step": 43
311
  },
312
  {
313
- "epoch": 0.88,
314
- "grad_norm": 0.5295486450195312,
315
- "learning_rate": 0.00016992417010043142,
316
- "loss": 0.351,
317
  "step": 44
318
  },
319
  {
320
- "epoch": 0.9,
321
- "grad_norm": 0.5442476868629456,
322
- "learning_rate": 0.00016832558245506935,
323
- "loss": 0.3461,
324
  "step": 45
325
  },
326
  {
327
- "epoch": 0.92,
328
- "grad_norm": 0.5530596971511841,
329
- "learning_rate": 0.0001666935530836651,
330
- "loss": 0.3435,
331
  "step": 46
332
  },
333
  {
334
- "epoch": 0.94,
335
- "grad_norm": 0.5377740263938904,
336
- "learning_rate": 0.0001650288807774937,
337
- "loss": 0.3409,
338
  "step": 47
339
  },
340
  {
341
- "epoch": 0.96,
342
- "grad_norm": 0.5187397003173828,
343
- "learning_rate": 0.0001633323803048047,
344
- "loss": 0.338,
345
  "step": 48
346
  },
347
  {
348
- "epoch": 0.98,
349
- "grad_norm": 0.5382808446884155,
350
- "learning_rate": 0.00016160488201203644,
351
- "loss": 0.3398,
352
  "step": 49
353
  },
354
  {
355
- "epoch": 1.0,
356
- "grad_norm": 0.5559888482093811,
357
- "learning_rate": 0.00015984723141740576,
358
- "loss": 0.3366,
359
  "step": 50
360
  },
361
  {
362
- "epoch": 1.02,
363
- "grad_norm": 0.46435657143592834,
364
- "learning_rate": 0.0001580602887970721,
365
- "loss": 0.2239,
366
  "step": 51
367
  },
368
  {
369
- "epoch": 1.04,
370
- "grad_norm": 0.4675140678882599,
371
- "learning_rate": 0.0001562449287640781,
372
- "loss": 0.2007,
373
  "step": 52
374
  },
375
  {
376
- "epoch": 1.06,
377
- "grad_norm": 0.4767879247665405,
378
- "learning_rate": 0.00015440203984027324,
379
- "loss": 0.2223,
380
  "step": 53
381
  },
382
  {
383
- "epoch": 1.08,
384
- "grad_norm": 0.49781811237335205,
385
- "learning_rate": 0.00015253252402142988,
386
- "loss": 0.2262,
387
  "step": 54
388
  },
389
  {
390
- "epoch": 1.1,
391
- "grad_norm": 0.543898344039917,
392
- "learning_rate": 0.0001506372963357644,
393
- "loss": 0.19,
394
  "step": 55
395
  },
396
  {
397
- "epoch": 1.12,
398
- "grad_norm": 0.5789697170257568,
399
- "learning_rate": 0.00014871728439607966,
400
- "loss": 0.2011,
401
  "step": 56
402
  },
403
  {
404
- "epoch": 1.1400000000000001,
405
- "grad_norm": 0.5885554552078247,
406
- "learning_rate": 0.00014677342794574817,
407
- "loss": 0.2016,
408
  "step": 57
409
  },
410
  {
411
- "epoch": 1.16,
412
- "grad_norm": 0.5597030520439148,
413
- "learning_rate": 0.00014480667839875786,
414
- "loss": 0.1981,
415
  "step": 58
416
  },
417
  {
418
- "epoch": 1.18,
419
- "grad_norm": 0.5306347012519836,
420
- "learning_rate": 0.00014281799837404552,
421
- "loss": 0.1918,
422
  "step": 59
423
  },
424
  {
425
- "epoch": 1.2,
426
- "grad_norm": 0.5231032371520996,
427
- "learning_rate": 0.0001408083612243465,
428
- "loss": 0.1816,
429
  "step": 60
430
  },
431
  {
432
- "epoch": 1.22,
433
- "grad_norm": 0.4930267333984375,
434
- "learning_rate": 0.00013877875055979023,
435
- "loss": 0.1648,
436
  "step": 61
437
  },
438
  {
439
- "epoch": 1.24,
440
- "grad_norm": 0.5262163281440735,
441
- "learning_rate": 0.00013673015976647568,
442
- "loss": 0.2092,
443
  "step": 62
444
  },
445
  {
446
- "epoch": 1.26,
447
- "grad_norm": 0.5433812737464905,
448
- "learning_rate": 0.00013466359152026195,
449
- "loss": 0.2007,
450
  "step": 63
451
  },
452
  {
453
- "epoch": 1.28,
454
- "grad_norm": 0.47026363015174866,
455
- "learning_rate": 0.00013258005729601177,
456
- "loss": 0.1941,
457
  "step": 64
458
  },
459
  {
460
- "epoch": 1.3,
461
- "grad_norm": 0.4707397222518921,
462
- "learning_rate": 0.00013048057687252865,
463
- "loss": 0.2069,
464
  "step": 65
465
  },
466
  {
467
- "epoch": 1.32,
468
- "grad_norm": 0.48763400316238403,
469
- "learning_rate": 0.0001283661778334297,
470
- "loss": 0.1933,
471
  "step": 66
472
  },
473
  {
474
- "epoch": 1.34,
475
- "grad_norm": 0.4656035006046295,
476
- "learning_rate": 0.0001262378950641979,
477
- "loss": 0.1894,
478
  "step": 67
479
  },
480
  {
481
- "epoch": 1.3599999999999999,
482
- "grad_norm": 0.479379266500473,
483
- "learning_rate": 0.00012409677024566144,
484
- "loss": 0.2199,
485
  "step": 68
486
  },
487
  {
488
- "epoch": 1.38,
489
- "grad_norm": 0.5007523894309998,
490
- "learning_rate": 0.00012194385134414608,
491
- "loss": 0.2088,
492
  "step": 69
493
  },
494
  {
495
- "epoch": 1.4,
496
- "grad_norm": 0.48662200570106506,
497
- "learning_rate": 0.00011978019209855174,
498
- "loss": 0.2202,
499
  "step": 70
500
  },
501
  {
502
- "epoch": 1.42,
503
- "grad_norm": 0.46738380193710327,
504
- "learning_rate": 0.00011760685150460362,
505
- "loss": 0.1878,
506
  "step": 71
507
  },
508
  {
509
- "epoch": 1.44,
510
- "grad_norm": 0.4948503077030182,
511
- "learning_rate": 0.00011542489329653024,
512
- "loss": 0.1882,
513
  "step": 72
514
  },
515
  {
516
- "epoch": 1.46,
517
- "grad_norm": 0.4791596233844757,
518
- "learning_rate": 0.00011323538542642227,
519
- "loss": 0.1846,
520
  "step": 73
521
  },
522
  {
523
- "epoch": 1.48,
524
- "grad_norm": 0.4715379774570465,
525
- "learning_rate": 0.000111039399541527,
526
- "loss": 0.199,
527
  "step": 74
528
  },
529
  {
530
- "epoch": 1.5,
531
- "grad_norm": 0.4828183352947235,
532
- "learning_rate": 0.00010883801045973425,
533
- "loss": 0.2047,
534
  "step": 75
535
  },
536
  {
537
- "epoch": 1.52,
538
- "grad_norm": 0.48095616698265076,
539
- "learning_rate": 0.00010663229564351041,
540
- "loss": 0.218,
541
  "step": 76
542
  },
543
  {
544
- "epoch": 1.54,
545
- "grad_norm": 0.4789870083332062,
546
- "learning_rate": 0.00010442333467253789,
547
- "loss": 0.1924,
548
  "step": 77
549
  },
550
  {
551
- "epoch": 1.56,
552
- "grad_norm": 0.4771735966205597,
553
- "learning_rate": 0.00010221220871531869,
554
- "loss": 0.1864,
555
  "step": 78
556
  },
557
  {
558
- "epoch": 1.58,
559
- "grad_norm": 0.4639010727405548,
560
- "learning_rate": 0.0001,
561
- "loss": 0.1917,
562
  "step": 79
563
  },
564
  {
565
- "epoch": 1.6,
566
- "grad_norm": 0.49689823389053345,
567
- "learning_rate": 9.778779128468132e-05,
568
- "loss": 0.1769,
569
  "step": 80
570
  },
571
  {
572
- "epoch": 1.62,
573
- "grad_norm": 0.4767945408821106,
574
- "learning_rate": 9.557666532746213e-05,
575
- "loss": 0.1743,
576
  "step": 81
577
  },
578
  {
579
- "epoch": 1.6400000000000001,
580
- "grad_norm": 0.4821512699127197,
581
- "learning_rate": 9.336770435648964e-05,
582
- "loss": 0.1965,
583
  "step": 82
584
  },
585
  {
586
- "epoch": 1.6600000000000001,
587
- "grad_norm": 0.6080212593078613,
588
- "learning_rate": 9.116198954026577e-05,
589
- "loss": 0.1963,
590
  "step": 83
591
  },
592
  {
593
- "epoch": 1.6800000000000002,
594
- "grad_norm": 0.48027199506759644,
595
- "learning_rate": 8.896060045847304e-05,
596
- "loss": 0.2075,
597
  "step": 84
598
  },
599
  {
600
- "epoch": 1.7,
601
- "grad_norm": 0.5060182809829712,
602
- "learning_rate": 8.676461457357776e-05,
603
- "loss": 0.1882,
604
  "step": 85
605
  },
606
  {
607
- "epoch": 1.72,
608
- "grad_norm": 0.5219600796699524,
609
- "learning_rate": 8.457510670346976e-05,
610
- "loss": 0.2068,
611
  "step": 86
612
  },
613
  {
614
- "epoch": 1.74,
615
- "grad_norm": 0.47161611914634705,
616
- "learning_rate": 8.239314849539638e-05,
617
- "loss": 0.1924,
618
  "step": 87
619
  },
620
  {
621
- "epoch": 1.76,
622
- "grad_norm": 0.4543808400630951,
623
- "learning_rate": 8.021980790144827e-05,
624
- "loss": 0.1861,
625
  "step": 88
626
  },
627
  {
628
- "epoch": 1.78,
629
- "grad_norm": 0.4798787534236908,
630
- "learning_rate": 7.805614865585396e-05,
631
- "loss": 0.209,
632
  "step": 89
633
  },
634
  {
635
- "epoch": 1.8,
636
- "grad_norm": 0.4594615697860718,
637
- "learning_rate": 7.590322975433857e-05,
638
- "loss": 0.1721,
639
  "step": 90
640
  },
641
  {
642
- "epoch": 1.8199999999999998,
643
- "grad_norm": 0.49884089827537537,
644
- "learning_rate": 7.376210493580212e-05,
645
- "loss": 0.1964,
646
  "step": 91
647
  },
648
  {
649
- "epoch": 1.8399999999999999,
650
- "grad_norm": 0.4920552968978882,
651
- "learning_rate": 7.163382216657034e-05,
652
- "loss": 0.1944,
653
  "step": 92
654
  },
655
  {
656
- "epoch": 1.8599999999999999,
657
- "grad_norm": 0.4905566871166229,
658
- "learning_rate": 6.951942312747134e-05,
659
- "loss": 0.1876,
660
  "step": 93
661
  },
662
  {
663
- "epoch": 1.88,
664
- "grad_norm": 0.483819842338562,
665
- "learning_rate": 6.741994270398826e-05,
666
- "loss": 0.1825,
667
  "step": 94
668
  },
669
  {
670
- "epoch": 1.9,
671
- "grad_norm": 0.5140827894210815,
672
- "learning_rate": 6.533640847973808e-05,
673
- "loss": 0.1986,
674
  "step": 95
675
  },
676
  {
677
- "epoch": 1.92,
678
- "grad_norm": 0.49198630452156067,
679
- "learning_rate": 6.326984023352435e-05,
680
- "loss": 0.2113,
681
  "step": 96
682
  },
683
  {
684
- "epoch": 1.94,
685
- "grad_norm": 0.47631028294563293,
686
- "learning_rate": 6.122124944020977e-05,
687
- "loss": 0.1867,
688
  "step": 97
689
  },
690
  {
691
- "epoch": 1.96,
692
- "grad_norm": 0.47129324078559875,
693
- "learning_rate": 5.91916387756535e-05,
694
- "loss": 0.2016,
695
  "step": 98
696
  },
697
  {
698
- "epoch": 1.98,
699
- "grad_norm": 0.4497186839580536,
700
- "learning_rate": 5.718200162595449e-05,
701
- "loss": 0.1816,
702
  "step": 99
703
  },
704
  {
705
- "epoch": 2.0,
706
- "grad_norm": 0.4525175392627716,
707
- "learning_rate": 5.5193321601242156e-05,
708
- "loss": 0.1874,
709
- "step": 100
710
- },
711
- {
712
- "epoch": 2.02,
713
- "grad_norm": 0.35321784019470215,
714
- "learning_rate": 5.322657205425183e-05,
715
- "loss": 0.1384,
716
- "step": 101
717
- },
718
- {
719
- "epoch": 2.04,
720
- "grad_norm": 0.364757239818573,
721
- "learning_rate": 5.1282715603920374e-05,
722
- "loss": 0.1443,
723
- "step": 102
724
- },
725
- {
726
- "epoch": 2.06,
727
- "grad_norm": 0.3620398938655853,
728
- "learning_rate": 4.936270366423563e-05,
729
- "loss": 0.1378,
730
- "step": 103
731
- },
732
- {
733
- "epoch": 2.08,
734
- "grad_norm": 0.3393998444080353,
735
- "learning_rate": 4.746747597857014e-05,
736
- "loss": 0.1265,
737
- "step": 104
738
- },
739
- {
740
- "epoch": 2.1,
741
- "grad_norm": 0.3330179750919342,
742
- "learning_rate": 4.559796015972677e-05,
743
- "loss": 0.12,
744
- "step": 105
745
- },
746
- {
747
- "epoch": 2.12,
748
- "grad_norm": 0.3353124260902405,
749
- "learning_rate": 4.375507123592194e-05,
750
- "loss": 0.1203,
751
- "step": 106
752
- },
753
- {
754
- "epoch": 2.14,
755
- "grad_norm": 0.36515313386917114,
756
- "learning_rate": 4.1939711202927936e-05,
757
- "loss": 0.114,
758
- "step": 107
759
- },
760
- {
761
- "epoch": 2.16,
762
- "grad_norm": 0.3508182168006897,
763
- "learning_rate": 4.015276858259427e-05,
764
- "loss": 0.1231,
765
- "step": 108
766
- },
767
- {
768
- "epoch": 2.18,
769
- "grad_norm": 0.37940266728401184,
770
- "learning_rate": 3.839511798796357e-05,
771
- "loss": 0.1326,
772
- "step": 109
773
- },
774
- {
775
- "epoch": 2.2,
776
- "grad_norm": 0.35219907760620117,
777
- "learning_rate": 3.6667619695195285e-05,
778
- "loss": 0.1179,
779
- "step": 110
780
- },
781
- {
782
- "epoch": 2.22,
783
- "grad_norm": 0.34503066539764404,
784
- "learning_rate": 3.49711192225063e-05,
785
- "loss": 0.121,
786
- "step": 111
787
- },
788
- {
789
- "epoch": 2.24,
790
- "grad_norm": 0.36006444692611694,
791
- "learning_rate": 3.330644691633492e-05,
792
- "loss": 0.1167,
793
- "step": 112
794
- },
795
- {
796
- "epoch": 2.26,
797
- "grad_norm": 0.412020742893219,
798
- "learning_rate": 3.167441754493066e-05,
799
- "loss": 0.1367,
800
- "step": 113
801
- },
802
- {
803
- "epoch": 2.2800000000000002,
804
- "grad_norm": 0.45470723509788513,
805
- "learning_rate": 3.0075829899568597e-05,
806
- "loss": 0.1319,
807
- "step": 114
808
- },
809
- {
810
- "epoch": 2.3,
811
- "grad_norm": 0.3515799045562744,
812
- "learning_rate": 2.8511466403583766e-05,
813
- "loss": 0.1185,
814
- "step": 115
815
- },
816
- {
817
- "epoch": 2.32,
818
- "grad_norm": 0.37341800332069397,
819
- "learning_rate": 2.6982092729416587e-05,
820
- "loss": 0.1187,
821
- "step": 116
822
- },
823
- {
824
- "epoch": 2.34,
825
- "grad_norm": 0.3499296307563782,
826
- "learning_rate": 2.548845742385717e-05,
827
- "loss": 0.1113,
828
- "step": 117
829
- },
830
- {
831
- "epoch": 2.36,
832
- "grad_norm": 0.36934876441955566,
833
- "learning_rate": 2.403129154167153e-05,
834
- "loss": 0.115,
835
- "step": 118
836
- },
837
- {
838
- "epoch": 2.38,
839
- "grad_norm": 0.3478608727455139,
840
- "learning_rate": 2.2611308287789344e-05,
841
- "loss": 0.1207,
842
- "step": 119
843
- },
844
- {
845
- "epoch": 2.4,
846
- "grad_norm": 0.3456974923610687,
847
- "learning_rate": 2.1229202668228197e-05,
848
- "loss": 0.1139,
849
- "step": 120
850
- },
851
- {
852
- "epoch": 2.42,
853
- "grad_norm": 0.3334108293056488,
854
- "learning_rate": 1.988565114992519e-05,
855
- "loss": 0.1296,
856
- "step": 121
857
- },
858
- {
859
- "epoch": 2.44,
860
- "grad_norm": 0.348541259765625,
861
- "learning_rate": 1.858131132964259e-05,
862
- "loss": 0.1233,
863
- "step": 122
864
- },
865
- {
866
- "epoch": 2.46,
867
- "grad_norm": 0.33513638377189636,
868
- "learning_rate": 1.7316821612109136e-05,
869
- "loss": 0.1269,
870
- "step": 123
871
- },
872
- {
873
- "epoch": 2.48,
874
- "grad_norm": 0.35099372267723083,
875
- "learning_rate": 1.609280089755515e-05,
876
- "loss": 0.1132,
877
- "step": 124
878
- },
879
- {
880
- "epoch": 2.5,
881
- "grad_norm": 0.34724029898643494,
882
- "learning_rate": 1.4909848278793782e-05,
883
- "loss": 0.1381,
884
- "step": 125
885
- },
886
- {
887
- "epoch": 2.52,
888
- "grad_norm": 0.3333680033683777,
889
- "learning_rate": 1.3768542747997215e-05,
890
- "loss": 0.119,
891
- "step": 126
892
- },
893
- {
894
- "epoch": 2.54,
895
- "grad_norm": 0.35580337047576904,
896
- "learning_rate": 1.2669442913310725e-05,
897
- "loss": 0.1196,
898
- "step": 127
899
- },
900
- {
901
- "epoch": 2.56,
902
- "grad_norm": 0.35816505551338196,
903
- "learning_rate": 1.161308672544389e-05,
904
- "loss": 0.1259,
905
- "step": 128
906
- },
907
- {
908
- "epoch": 2.58,
909
- "grad_norm": 0.35583576560020447,
910
- "learning_rate": 1.059999121437244e-05,
911
- "loss": 0.1396,
912
- "step": 129
913
- },
914
- {
915
- "epoch": 2.6,
916
- "grad_norm": 0.37276527285575867,
917
- "learning_rate": 9.630652236279625e-06,
918
- "loss": 0.1369,
919
- "step": 130
920
- },
921
- {
922
- "epoch": 2.62,
923
- "grad_norm": 0.34090283513069153,
924
- "learning_rate": 8.70554423086114e-06,
925
- "loss": 0.1114,
926
- "step": 131
927
- },
928
- {
929
- "epoch": 2.64,
930
- "grad_norm": 0.32982465624809265,
931
- "learning_rate": 7.825119989112173e-06,
932
- "loss": 0.1232,
933
- "step": 132
934
- },
935
- {
936
- "epoch": 2.66,
937
- "grad_norm": 0.36222097277641296,
938
- "learning_rate": 6.989810431710375e-06,
939
- "loss": 0.1193,
940
- "step": 133
941
- },
942
- {
943
- "epoch": 2.68,
944
- "grad_norm": 0.33648136258125305,
945
- "learning_rate": 6.200024398103255e-06,
946
- "loss": 0.1046,
947
- "step": 134
948
- },
949
- {
950
- "epoch": 2.7,
951
- "grad_norm": 0.34745728969573975,
952
- "learning_rate": 5.456148446402976e-06,
953
- "loss": 0.1312,
954
- "step": 135
955
- },
956
- {
957
- "epoch": 2.7199999999999998,
958
- "grad_norm": 0.3730124235153198,
959
- "learning_rate": 4.758546664186869e-06,
960
- "loss": 0.1186,
961
- "step": 136
962
- },
963
- {
964
- "epoch": 2.74,
965
- "grad_norm": 0.3411957919597626,
966
- "learning_rate": 4.107560490295992e-06,
967
- "loss": 0.1282,
968
- "step": 137
969
- },
970
- {
971
- "epoch": 2.76,
972
- "grad_norm": 0.35611581802368164,
973
- "learning_rate": 3.5035085477190143e-06,
974
- "loss": 0.1286,
975
- "step": 138
976
- },
977
- {
978
- "epoch": 2.7800000000000002,
979
- "grad_norm": 0.37135371565818787,
980
- "learning_rate": 2.94668648764328e-06,
981
- "loss": 0.1335,
982
- "step": 139
983
- },
984
- {
985
- "epoch": 2.8,
986
- "grad_norm": 0.3284219205379486,
987
- "learning_rate": 2.4373668447493224e-06,
988
- "loss": 0.1089,
989
- "step": 140
990
- },
991
- {
992
- "epoch": 2.82,
993
- "grad_norm": 0.36662524938583374,
994
- "learning_rate": 1.9757989038197146e-06,
995
- "loss": 0.1355,
996
- "step": 141
997
- },
998
- {
999
- "epoch": 2.84,
1000
- "grad_norm": 0.36059364676475525,
1001
- "learning_rate": 1.562208577727442e-06,
1002
- "loss": 0.1229,
1003
- "step": 142
1004
- },
1005
- {
1006
- "epoch": 2.86,
1007
- "grad_norm": 0.3409566283226013,
1008
- "learning_rate": 1.1967982968635993e-06,
1009
- "loss": 0.1183,
1010
- "step": 143
1011
- },
1012
- {
1013
- "epoch": 2.88,
1014
- "grad_norm": 0.36703115701675415,
1015
- "learning_rate": 8.797469100585431e-07,
1016
- "loss": 0.1416,
1017
- "step": 144
1018
- },
1019
- {
1020
- "epoch": 2.9,
1021
- "grad_norm": 0.34956759214401245,
1022
- "learning_rate": 6.11209597044926e-07,
1023
- "loss": 0.1223,
1024
- "step": 145
1025
- },
1026
- {
1027
- "epoch": 2.92,
1028
- "grad_norm": 0.3447030782699585,
1029
- "learning_rate": 3.913177925055189e-07,
1030
- "loss": 0.1283,
1031
- "step": 146
1032
- },
1033
- {
1034
- "epoch": 2.94,
1035
- "grad_norm": 0.35175061225891113,
1036
- "learning_rate": 2.201791217428917e-07,
1037
- "loss": 0.131,
1038
- "step": 147
1039
- },
1040
- {
1041
- "epoch": 2.96,
1042
- "grad_norm": 0.335248738527298,
1043
- "learning_rate": 9.78773480026396e-08,
1044
- "loss": 0.1141,
1045
- "step": 148
1046
- },
1047
- {
1048
- "epoch": 2.98,
1049
- "grad_norm": 0.329517126083374,
1050
- "learning_rate": 2.447233147570005e-08,
1051
- "loss": 0.1162,
1052
- "step": 149
1053
- },
1054
- {
1055
- "epoch": 3.0,
1056
- "grad_norm": 0.3240882158279419,
1057
  "learning_rate": 0.0,
1058
- "loss": 0.1203,
1059
- "step": 150
1060
  }
1061
  ],
1062
  "logging_steps": 1,
1063
- "max_steps": 150,
1064
  "num_input_tokens_seen": 0,
1065
- "num_train_epochs": 3,
1066
  "save_steps": 500,
1067
  "stateful_callbacks": {
1068
  "TrainerControl": {
@@ -1076,7 +726,7 @@
1076
  "attributes": {}
1077
  }
1078
  },
1079
- "total_flos": 1.954170222658683e+17,
1080
  "train_batch_size": 1,
1081
  "trial_name": null,
1082
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.01,
13
+ "grad_norm": 0.8039671182632446,
14
+ "learning_rate": 4e-05,
15
+ "loss": 0.3593,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.02,
20
+ "grad_norm": 1.0575159788131714,
21
+ "learning_rate": 8e-05,
22
+ "loss": 0.4832,
23
  "step": 2
24
  },
25
  {
26
+ "epoch": 0.03,
27
+ "grad_norm": 0.9031288027763367,
28
+ "learning_rate": 0.00012,
29
+ "loss": 0.4249,
30
  "step": 3
31
  },
32
  {
33
+ "epoch": 0.04,
34
+ "grad_norm": 0.6510717868804932,
35
+ "learning_rate": 0.00016,
36
+ "loss": 0.4258,
37
  "step": 4
38
  },
39
  {
40
+ "epoch": 0.05,
41
+ "grad_norm": 0.577819287776947,
42
+ "learning_rate": 0.0002,
43
+ "loss": 0.3766,
44
  "step": 5
45
  },
46
  {
47
+ "epoch": 0.06,
48
+ "grad_norm": 0.5394201874732971,
49
+ "learning_rate": 0.00019994532573409262,
50
+ "loss": 0.3585,
51
  "step": 6
52
  },
53
  {
54
+ "epoch": 0.07,
55
+ "grad_norm": 0.4903915524482727,
56
+ "learning_rate": 0.00019978136272187747,
57
+ "loss": 0.3804,
58
  "step": 7
59
  },
60
  {
61
+ "epoch": 0.08,
62
+ "grad_norm": 0.48727235198020935,
63
+ "learning_rate": 0.00019950829025450114,
64
+ "loss": 0.4129,
65
  "step": 8
66
  },
67
  {
68
+ "epoch": 0.09,
69
+ "grad_norm": 0.5153201818466187,
70
+ "learning_rate": 0.00019912640693269752,
71
+ "loss": 0.4332,
72
  "step": 9
73
  },
74
  {
75
+ "epoch": 0.1,
76
+ "grad_norm": 0.502315104007721,
77
+ "learning_rate": 0.00019863613034027224,
78
+ "loss": 0.4077,
79
  "step": 10
80
  },
81
  {
82
+ "epoch": 0.11,
83
+ "grad_norm": 0.472267746925354,
84
+ "learning_rate": 0.00019803799658748094,
85
+ "loss": 0.4006,
86
  "step": 11
87
  },
88
  {
89
+ "epoch": 0.12,
90
+ "grad_norm": 0.5331538319587708,
91
+ "learning_rate": 0.0001973326597248006,
92
+ "loss": 0.4175,
93
  "step": 12
94
  },
95
  {
96
+ "epoch": 0.13,
97
+ "grad_norm": 0.5011341571807861,
98
+ "learning_rate": 0.00019652089102773488,
99
+ "loss": 0.4139,
100
  "step": 13
101
  },
102
  {
103
+ "epoch": 0.14,
104
+ "grad_norm": 0.5202248096466064,
105
+ "learning_rate": 0.00019560357815343577,
106
+ "loss": 0.3731,
107
  "step": 14
108
  },
109
  {
110
+ "epoch": 0.15,
111
+ "grad_norm": 0.5288619995117188,
112
+ "learning_rate": 0.00019458172417006347,
113
+ "loss": 0.3912,
114
  "step": 15
115
  },
116
  {
117
+ "epoch": 0.16,
118
+ "grad_norm": 0.5330358743667603,
119
+ "learning_rate": 0.0001934564464599461,
120
+ "loss": 0.4218,
121
  "step": 16
122
  },
123
  {
124
+ "epoch": 0.17,
125
+ "grad_norm": 0.528815507888794,
126
+ "learning_rate": 0.00019222897549773848,
127
+ "loss": 0.4527,
128
  "step": 17
129
  },
130
  {
131
+ "epoch": 0.18,
132
+ "grad_norm": 0.5266752243041992,
133
+ "learning_rate": 0.00019090065350491626,
134
+ "loss": 0.3762,
135
  "step": 18
136
  },
137
  {
138
+ "epoch": 0.19,
139
+ "grad_norm": 0.48899364471435547,
140
+ "learning_rate": 0.00018947293298207635,
141
+ "loss": 0.3838,
142
  "step": 19
143
  },
144
  {
145
+ "epoch": 0.2,
146
+ "grad_norm": 0.4758334159851074,
147
+ "learning_rate": 0.0001879473751206489,
148
+ "loss": 0.3798,
149
  "step": 20
150
  },
151
  {
152
+ "epoch": 0.21,
153
+ "grad_norm": 0.5223532319068909,
154
+ "learning_rate": 0.00018632564809575742,
155
+ "loss": 0.364,
156
  "step": 21
157
  },
158
  {
159
+ "epoch": 0.22,
160
+ "grad_norm": 0.5233363509178162,
161
+ "learning_rate": 0.00018460952524209355,
162
+ "loss": 0.373,
163
  "step": 22
164
  },
165
  {
166
+ "epoch": 0.23,
167
+ "grad_norm": 0.4868537485599518,
168
+ "learning_rate": 0.00018280088311480201,
169
+ "loss": 0.4269,
170
  "step": 23
171
  },
172
  {
173
+ "epoch": 0.24,
174
+ "grad_norm": 0.5180346965789795,
175
+ "learning_rate": 0.00018090169943749476,
176
+ "loss": 0.4044,
177
  "step": 24
178
  },
179
  {
180
+ "epoch": 0.25,
181
+ "grad_norm": 0.5078471899032593,
182
+ "learning_rate": 0.00017891405093963938,
183
+ "loss": 0.353,
184
  "step": 25
185
  },
186
  {
187
+ "epoch": 0.26,
188
+ "grad_norm": 0.4828091859817505,
189
+ "learning_rate": 0.00017684011108568592,
190
+ "loss": 0.373,
191
  "step": 26
192
  },
193
  {
194
+ "epoch": 0.27,
195
+ "grad_norm": 0.49357226490974426,
196
+ "learning_rate": 0.0001746821476984154,
197
+ "loss": 0.4001,
198
  "step": 27
199
  },
200
  {
201
+ "epoch": 0.28,
202
+ "grad_norm": 0.5360887050628662,
203
+ "learning_rate": 0.00017244252047910892,
204
+ "loss": 0.424,
205
  "step": 28
206
  },
207
  {
208
+ "epoch": 0.29,
209
+ "grad_norm": 0.5232270359992981,
210
+ "learning_rate": 0.00017012367842724887,
211
+ "loss": 0.3986,
212
  "step": 29
213
  },
214
  {
215
+ "epoch": 0.3,
216
+ "grad_norm": 0.5093458294868469,
217
+ "learning_rate": 0.00016772815716257412,
218
+ "loss": 0.3909,
219
  "step": 30
220
  },
221
  {
222
+ "epoch": 0.31,
223
+ "grad_norm": 0.49155759811401367,
224
+ "learning_rate": 0.00016525857615241687,
225
+ "loss": 0.409,
226
  "step": 31
227
  },
228
  {
229
+ "epoch": 0.32,
230
+ "grad_norm": 0.47392013669013977,
231
+ "learning_rate": 0.0001627176358473537,
232
+ "loss": 0.3658,
233
  "step": 32
234
  },
235
  {
236
+ "epoch": 0.33,
237
+ "grad_norm": 0.512052595615387,
238
+ "learning_rate": 0.00016010811472830252,
239
+ "loss": 0.3979,
240
  "step": 33
241
  },
242
  {
243
+ "epoch": 0.34,
244
+ "grad_norm": 0.49206939339637756,
245
+ "learning_rate": 0.00015743286626829437,
246
+ "loss": 0.3871,
247
  "step": 34
248
  },
249
  {
250
+ "epoch": 0.35,
251
+ "grad_norm": 0.5002603530883789,
252
+ "learning_rate": 0.00015469481581224272,
253
+ "loss": 0.3668,
254
  "step": 35
255
  },
256
  {
257
+ "epoch": 0.36,
258
+ "grad_norm": 0.5139701962471008,
259
+ "learning_rate": 0.00015189695737812152,
260
+ "loss": 0.4014,
261
  "step": 36
262
  },
263
  {
264
+ "epoch": 0.37,
265
+ "grad_norm": 0.5197802186012268,
266
+ "learning_rate": 0.00014904235038305083,
267
+ "loss": 0.3951,
268
  "step": 37
269
  },
270
  {
271
+ "epoch": 0.38,
272
+ "grad_norm": 0.5155587196350098,
273
+ "learning_rate": 0.0001461341162978688,
274
+ "loss": 0.4161,
275
  "step": 38
276
  },
277
  {
278
+ "epoch": 0.39,
279
+ "grad_norm": 0.49465492367744446,
280
+ "learning_rate": 0.00014317543523384928,
281
+ "loss": 0.3898,
282
  "step": 39
283
  },
284
  {
285
+ "epoch": 0.4,
286
+ "grad_norm": 0.4801078140735626,
287
+ "learning_rate": 0.00014016954246529696,
288
+ "loss": 0.3973,
289
  "step": 40
290
  },
291
  {
292
+ "epoch": 0.41,
293
+ "grad_norm": 0.48596322536468506,
294
+ "learning_rate": 0.00013711972489182208,
295
+ "loss": 0.4097,
296
  "step": 41
297
  },
298
  {
299
+ "epoch": 0.42,
300
+ "grad_norm": 0.5131967663764954,
301
+ "learning_rate": 0.00013402931744416433,
302
+ "loss": 0.3829,
303
  "step": 42
304
  },
305
  {
306
+ "epoch": 0.43,
307
+ "grad_norm": 0.4835667610168457,
308
+ "learning_rate": 0.00013090169943749476,
309
+ "loss": 0.3642,
310
  "step": 43
311
  },
312
  {
313
+ "epoch": 0.44,
314
+ "grad_norm": 0.5013434886932373,
315
+ "learning_rate": 0.00012774029087618446,
316
+ "loss": 0.3675,
317
  "step": 44
318
  },
319
  {
320
+ "epoch": 0.45,
321
+ "grad_norm": 0.4737671911716461,
322
+ "learning_rate": 0.00012454854871407994,
323
+ "loss": 0.4346,
324
  "step": 45
325
  },
326
  {
327
+ "epoch": 0.46,
328
+ "grad_norm": 0.544231116771698,
329
+ "learning_rate": 0.0001213299630743747,
330
+ "loss": 0.3914,
331
  "step": 46
332
  },
333
  {
334
+ "epoch": 0.47,
335
+ "grad_norm": 0.5040849447250366,
336
+ "learning_rate": 0.000118088053433211,
337
+ "loss": 0.3744,
338
  "step": 47
339
  },
340
  {
341
+ "epoch": 0.48,
342
+ "grad_norm": 0.5225382447242737,
343
+ "learning_rate": 0.0001148263647711842,
344
+ "loss": 0.3922,
345
  "step": 48
346
  },
347
  {
348
+ "epoch": 0.49,
349
+ "grad_norm": 0.4771358370780945,
350
+ "learning_rate": 0.00011154846369695863,
351
+ "loss": 0.3552,
352
  "step": 49
353
  },
354
  {
355
+ "epoch": 0.5,
356
+ "grad_norm": 0.4580378532409668,
357
+ "learning_rate": 0.00010825793454723325,
358
+ "loss": 0.3667,
359
  "step": 50
360
  },
361
  {
362
+ "epoch": 0.51,
363
+ "grad_norm": 0.5024124979972839,
364
+ "learning_rate": 0.00010495837546732224,
365
+ "loss": 0.3469,
366
  "step": 51
367
  },
368
  {
369
+ "epoch": 0.52,
370
+ "grad_norm": 0.4725678861141205,
371
+ "learning_rate": 0.00010165339447663587,
372
+ "loss": 0.3717,
373
  "step": 52
374
  },
375
  {
376
+ "epoch": 0.53,
377
+ "grad_norm": 0.47523003816604614,
378
+ "learning_rate": 9.834660552336415e-05,
379
+ "loss": 0.3779,
380
  "step": 53
381
  },
382
  {
383
+ "epoch": 0.54,
384
+ "grad_norm": 0.5019033551216125,
385
+ "learning_rate": 9.504162453267777e-05,
386
+ "loss": 0.3548,
387
  "step": 54
388
  },
389
  {
390
+ "epoch": 0.55,
391
+ "grad_norm": 0.5027766227722168,
392
+ "learning_rate": 9.174206545276677e-05,
393
+ "loss": 0.4236,
394
  "step": 55
395
  },
396
  {
397
+ "epoch": 0.56,
398
+ "grad_norm": 0.512511670589447,
399
+ "learning_rate": 8.845153630304139e-05,
400
+ "loss": 0.3492,
401
  "step": 56
402
  },
403
  {
404
+ "epoch": 0.57,
405
+ "grad_norm": 0.4811123311519623,
406
+ "learning_rate": 8.517363522881579e-05,
407
+ "loss": 0.3627,
408
  "step": 57
409
  },
410
  {
411
+ "epoch": 0.58,
412
+ "grad_norm": 0.5243905782699585,
413
+ "learning_rate": 8.191194656678904e-05,
414
+ "loss": 0.4194,
415
  "step": 58
416
  },
417
  {
418
+ "epoch": 0.59,
419
+ "grad_norm": 0.4740852117538452,
420
+ "learning_rate": 7.867003692562534e-05,
421
+ "loss": 0.3481,
422
  "step": 59
423
  },
424
  {
425
+ "epoch": 0.6,
426
+ "grad_norm": 0.4817480146884918,
427
+ "learning_rate": 7.54514512859201e-05,
428
+ "loss": 0.3156,
429
  "step": 60
430
  },
431
  {
432
+ "epoch": 0.61,
433
+ "grad_norm": 0.5003472566604614,
434
+ "learning_rate": 7.225970912381556e-05,
435
+ "loss": 0.3746,
436
  "step": 61
437
  },
438
  {
439
+ "epoch": 0.62,
440
+ "grad_norm": 0.4828045070171356,
441
+ "learning_rate": 6.909830056250527e-05,
442
+ "loss": 0.3616,
443
  "step": 62
444
  },
445
  {
446
+ "epoch": 0.63,
447
+ "grad_norm": 0.4666941463947296,
448
+ "learning_rate": 6.59706825558357e-05,
449
+ "loss": 0.3411,
450
  "step": 63
451
  },
452
  {
453
+ "epoch": 0.64,
454
+ "grad_norm": 0.5042151808738708,
455
+ "learning_rate": 6.28802751081779e-05,
456
+ "loss": 0.3658,
457
  "step": 64
458
  },
459
  {
460
+ "epoch": 0.65,
461
+ "grad_norm": 0.49939414858818054,
462
+ "learning_rate": 5.983045753470308e-05,
463
+ "loss": 0.3993,
464
  "step": 65
465
  },
466
  {
467
+ "epoch": 0.66,
468
+ "grad_norm": 0.48640677332878113,
469
+ "learning_rate": 5.6824564766150726e-05,
470
+ "loss": 0.3295,
471
  "step": 66
472
  },
473
  {
474
+ "epoch": 0.67,
475
+ "grad_norm": 0.4997316300868988,
476
+ "learning_rate": 5.386588370213124e-05,
477
+ "loss": 0.3571,
478
  "step": 67
479
  },
480
  {
481
+ "epoch": 0.68,
482
+ "grad_norm": 0.508797287940979,
483
+ "learning_rate": 5.095764961694922e-05,
484
+ "loss": 0.3626,
485
  "step": 68
486
  },
487
  {
488
+ "epoch": 0.69,
489
+ "grad_norm": 0.45878127217292786,
490
+ "learning_rate": 4.810304262187852e-05,
491
+ "loss": 0.3726,
492
  "step": 69
493
  },
494
  {
495
+ "epoch": 0.7,
496
+ "grad_norm": 0.49244609475135803,
497
+ "learning_rate": 4.530518418775733e-05,
498
+ "loss": 0.3577,
499
  "step": 70
500
  },
501
  {
502
+ "epoch": 0.71,
503
+ "grad_norm": 0.46602892875671387,
504
+ "learning_rate": 4.256713373170564e-05,
505
+ "loss": 0.3403,
506
  "step": 71
507
  },
508
  {
509
+ "epoch": 0.72,
510
+ "grad_norm": 0.502491295337677,
511
+ "learning_rate": 3.9891885271697496e-05,
512
+ "loss": 0.3662,
513
  "step": 72
514
  },
515
  {
516
+ "epoch": 0.73,
517
+ "grad_norm": 0.47285720705986023,
518
+ "learning_rate": 3.7282364152646297e-05,
519
+ "loss": 0.3187,
520
  "step": 73
521
  },
522
  {
523
+ "epoch": 0.74,
524
+ "grad_norm": 0.4815748333930969,
525
+ "learning_rate": 3.4741423847583134e-05,
526
+ "loss": 0.3742,
527
  "step": 74
528
  },
529
  {
530
+ "epoch": 0.75,
531
+ "grad_norm": 0.5235660672187805,
532
+ "learning_rate": 3.227184283742591e-05,
533
+ "loss": 0.3809,
534
  "step": 75
535
  },
536
  {
537
+ "epoch": 0.76,
538
+ "grad_norm": 0.46197509765625,
539
+ "learning_rate": 2.9876321572751144e-05,
540
+ "loss": 0.3298,
541
  "step": 76
542
  },
543
  {
544
+ "epoch": 0.77,
545
+ "grad_norm": 0.455169141292572,
546
+ "learning_rate": 2.7557479520891104e-05,
547
+ "loss": 0.3544,
548
  "step": 77
549
  },
550
  {
551
+ "epoch": 0.78,
552
+ "grad_norm": 0.4787601828575134,
553
+ "learning_rate": 2.5317852301584643e-05,
554
+ "loss": 0.3466,
555
  "step": 78
556
  },
557
  {
558
+ "epoch": 0.79,
559
+ "grad_norm": 0.47747695446014404,
560
+ "learning_rate": 2.315988891431412e-05,
561
+ "loss": 0.3189,
562
  "step": 79
563
  },
564
  {
565
+ "epoch": 0.8,
566
+ "grad_norm": 0.475917786359787,
567
+ "learning_rate": 2.1085949060360654e-05,
568
+ "loss": 0.3832,
569
  "step": 80
570
  },
571
  {
572
+ "epoch": 0.81,
573
+ "grad_norm": 0.4622023105621338,
574
+ "learning_rate": 1.9098300562505266e-05,
575
+ "loss": 0.3414,
576
  "step": 81
577
  },
578
  {
579
+ "epoch": 0.82,
580
+ "grad_norm": 0.48533836007118225,
581
+ "learning_rate": 1.7199116885197995e-05,
582
+ "loss": 0.3587,
583
  "step": 82
584
  },
585
  {
586
+ "epoch": 0.83,
587
+ "grad_norm": 0.45964503288269043,
588
+ "learning_rate": 1.5390474757906446e-05,
589
+ "loss": 0.3244,
590
  "step": 83
591
  },
592
  {
593
+ "epoch": 0.84,
594
+ "grad_norm": 0.5024991631507874,
595
+ "learning_rate": 1.3674351904242611e-05,
596
+ "loss": 0.4067,
597
  "step": 84
598
  },
599
  {
600
+ "epoch": 0.85,
601
+ "grad_norm": 0.4903584420681,
602
+ "learning_rate": 1.2052624879351104e-05,
603
+ "loss": 0.3641,
604
  "step": 85
605
  },
606
  {
607
+ "epoch": 0.86,
608
+ "grad_norm": 0.44721242785453796,
609
+ "learning_rate": 1.0527067017923654e-05,
610
+ "loss": 0.3121,
611
  "step": 86
612
  },
613
  {
614
+ "epoch": 0.87,
615
+ "grad_norm": 0.48709988594055176,
616
+ "learning_rate": 9.09934649508375e-06,
617
+ "loss": 0.3692,
618
  "step": 87
619
  },
620
  {
621
+ "epoch": 0.88,
622
+ "grad_norm": 0.4607761800289154,
623
+ "learning_rate": 7.771024502261526e-06,
624
+ "loss": 0.3523,
625
  "step": 88
626
  },
627
  {
628
+ "epoch": 0.89,
629
+ "grad_norm": 0.4875771999359131,
630
+ "learning_rate": 6.543553540053926e-06,
631
+ "loss": 0.3445,
632
  "step": 89
633
  },
634
  {
635
+ "epoch": 0.9,
636
+ "grad_norm": 0.4596504271030426,
637
+ "learning_rate": 5.418275829936537e-06,
638
+ "loss": 0.3349,
639
  "step": 90
640
  },
641
  {
642
+ "epoch": 0.91,
643
+ "grad_norm": 0.46433568000793457,
644
+ "learning_rate": 4.3964218465642355e-06,
645
+ "loss": 0.323,
646
  "step": 91
647
  },
648
  {
649
+ "epoch": 0.92,
650
+ "grad_norm": 0.4667503833770752,
651
+ "learning_rate": 3.4791089722651436e-06,
652
+ "loss": 0.3228,
653
  "step": 92
654
  },
655
  {
656
+ "epoch": 0.93,
657
+ "grad_norm": 0.490509033203125,
658
+ "learning_rate": 2.667340275199426e-06,
659
+ "loss": 0.3673,
660
  "step": 93
661
  },
662
  {
663
+ "epoch": 0.94,
664
+ "grad_norm": 0.4769146144390106,
665
+ "learning_rate": 1.9620034125190644e-06,
666
+ "loss": 0.3211,
667
  "step": 94
668
  },
669
  {
670
+ "epoch": 0.95,
671
+ "grad_norm": 0.5546551942825317,
672
+ "learning_rate": 1.3638696597277679e-06,
673
+ "loss": 0.3613,
674
  "step": 95
675
  },
676
  {
677
+ "epoch": 0.96,
678
+ "grad_norm": 0.47154128551483154,
679
+ "learning_rate": 8.735930673024806e-07,
680
+ "loss": 0.35,
681
  "step": 96
682
  },
683
  {
684
+ "epoch": 0.97,
685
+ "grad_norm": 0.48646360635757446,
686
+ "learning_rate": 4.917097454988584e-07,
687
+ "loss": 0.349,
688
  "step": 97
689
  },
690
  {
691
+ "epoch": 0.98,
692
+ "grad_norm": 0.48640263080596924,
693
+ "learning_rate": 2.1863727812254653e-07,
694
+ "loss": 0.3713,
695
  "step": 98
696
  },
697
  {
698
+ "epoch": 0.99,
699
+ "grad_norm": 0.47486788034439087,
700
+ "learning_rate": 5.467426590739511e-08,
701
+ "loss": 0.296,
702
  "step": 99
703
  },
704
  {
705
+ "epoch": 1.0,
706
+ "grad_norm": 0.4658459424972534,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
707
  "learning_rate": 0.0,
708
+ "loss": 0.3369,
709
+ "step": 100
710
  }
711
  ],
712
  "logging_steps": 1,
713
+ "max_steps": 100,
714
  "num_input_tokens_seen": 0,
715
+ "num_train_epochs": 1,
716
  "save_steps": 500,
717
  "stateful_callbacks": {
718
  "TrainerControl": {
 
726
  "attributes": {}
727
  }
728
  },
729
+ "total_flos": 1.3312303903280333e+17,
730
  "train_batch_size": 1,
731
  "trial_name": null,
732
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f50a1f8500c6886b194f3cb1f9dec14b859ae7d726b38257900cde0c2d2f4eef
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84840f0c9d05975fb66dbe0df3b9fc3f7f2326fff03a93716d61f75cc3024fc7
3
  size 5240