LoMicha commited on
Commit
7ce8f7b
1 Parent(s): b7115d8

Training in progress, step 1700, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca6d7b29067a6f1802f95f9ae9a63da450c0fea767f9619e952dfe7ded4bbac1
3
  size 159967880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0930d6e64875820c22c7cffca09ca1acaf6cdcc1e7968d0c5a856968a87824e8
3
  size 159967880
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b22c239721aee44efc685d4b1334c0286d18e2d4e1a7cdd018c60a67b16bf89c
3
  size 81735892
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b9d593632762fa95cd51142f1adb4c560e59f1f9d92ecb7de05d6485c887205
3
  size 81735892
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ca599b893276769adc2f6c1d4d029dfc6620646247d84a40c0c82d7f07af46e
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b283d316b0c499174401fc8457651f1fb183c6003c46a4d25e29dfecd151147
3
  size 15024
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3cac2327cb924e4f297c23ce5eb4e23debff02122db63c77a3bdcac410a9ffd
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a6797f0f81e1d80bc4d2d6295ad3c421b4b433370ca9e0c209b11267f3ef64f
3
  size 15024
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ae270426c2cd8e1894df96f1ceb38a20e29481d4f6f7b2903c1da7e94dcbed0a
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bd17fa23f67ef7fbf6e377f7e0c23474bf385755bb96f63949a2752039f1de4
3
  size 15024
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:333db600402aeeecc3fab07acc6d09f62217db914fcc898662b20b1afa147c61
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:212929e3bfad92319ef54b8b509922f96991c6c7d7791e9983b6f58b96c35aff
3
  size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa0f2c31dda0951ee9cc2b00073ac9a44026193943adb54ebced21cb99784765
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:671918de7ffc87d6187292033f79bb1cacaa6a7d5996a986d5989df4cdad43d1
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9324009324009324,
5
  "eval_steps": 500,
6
- "global_step": 1600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -11207,6 +11207,706 @@
11207
  "learning_rate": 6.759906759906761e-05,
11208
  "loss": 0.0034,
11209
  "step": 1600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11210
  }
11211
  ],
11212
  "logging_steps": 1,
@@ -11226,7 +11926,7 @@
11226
  "attributes": {}
11227
  }
11228
  },
11229
- "total_flos": 6.6914779581854515e+19,
11230
  "train_batch_size": 4,
11231
  "trial_name": null,
11232
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9906759906759907,
5
  "eval_steps": 500,
6
+ "global_step": 1700,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
11207
  "learning_rate": 6.759906759906761e-05,
11208
  "loss": 0.0034,
11209
  "step": 1600
11210
+ },
11211
+ {
11212
+ "epoch": 0.932983682983683,
11213
+ "grad_norm": 0.0013091769069433212,
11214
+ "learning_rate": 6.701631701631703e-05,
11215
+ "loss": 0.0038,
11216
+ "step": 1601
11217
+ },
11218
+ {
11219
+ "epoch": 0.9335664335664335,
11220
+ "grad_norm": 0.0009162210044451058,
11221
+ "learning_rate": 6.643356643356644e-05,
11222
+ "loss": 0.0028,
11223
+ "step": 1602
11224
+ },
11225
+ {
11226
+ "epoch": 0.9341491841491841,
11227
+ "grad_norm": 0.0008717044838704169,
11228
+ "learning_rate": 6.585081585081586e-05,
11229
+ "loss": 0.003,
11230
+ "step": 1603
11231
+ },
11232
+ {
11233
+ "epoch": 0.9347319347319347,
11234
+ "grad_norm": 0.0010865787044167519,
11235
+ "learning_rate": 6.526806526806527e-05,
11236
+ "loss": 0.0035,
11237
+ "step": 1604
11238
+ },
11239
+ {
11240
+ "epoch": 0.9353146853146853,
11241
+ "grad_norm": 0.0007628489984199405,
11242
+ "learning_rate": 6.46853146853147e-05,
11243
+ "loss": 0.0025,
11244
+ "step": 1605
11245
+ },
11246
+ {
11247
+ "epoch": 0.9358974358974359,
11248
+ "grad_norm": 0.0009772854391485453,
11249
+ "learning_rate": 6.41025641025641e-05,
11250
+ "loss": 0.0041,
11251
+ "step": 1606
11252
+ },
11253
+ {
11254
+ "epoch": 0.9364801864801865,
11255
+ "grad_norm": 0.0008730532717891037,
11256
+ "learning_rate": 6.351981351981353e-05,
11257
+ "loss": 0.0028,
11258
+ "step": 1607
11259
+ },
11260
+ {
11261
+ "epoch": 0.9370629370629371,
11262
+ "grad_norm": 0.0007411614060401917,
11263
+ "learning_rate": 6.293706293706295e-05,
11264
+ "loss": 0.0032,
11265
+ "step": 1608
11266
+ },
11267
+ {
11268
+ "epoch": 0.9376456876456877,
11269
+ "grad_norm": 0.0009279727819375694,
11270
+ "learning_rate": 6.235431235431236e-05,
11271
+ "loss": 0.0022,
11272
+ "step": 1609
11273
+ },
11274
+ {
11275
+ "epoch": 0.9382284382284383,
11276
+ "grad_norm": 0.0010616903891786933,
11277
+ "learning_rate": 6.177156177156177e-05,
11278
+ "loss": 0.0044,
11279
+ "step": 1610
11280
+ },
11281
+ {
11282
+ "epoch": 0.9388111888111889,
11283
+ "grad_norm": 0.0009532080148346722,
11284
+ "learning_rate": 6.118881118881119e-05,
11285
+ "loss": 0.0026,
11286
+ "step": 1611
11287
+ },
11288
+ {
11289
+ "epoch": 0.9393939393939394,
11290
+ "grad_norm": 0.0007426452939398587,
11291
+ "learning_rate": 6.060606060606061e-05,
11292
+ "loss": 0.0027,
11293
+ "step": 1612
11294
+ },
11295
+ {
11296
+ "epoch": 0.9399766899766899,
11297
+ "grad_norm": 0.0010714689269661903,
11298
+ "learning_rate": 6.002331002331003e-05,
11299
+ "loss": 0.0041,
11300
+ "step": 1613
11301
+ },
11302
+ {
11303
+ "epoch": 0.9405594405594405,
11304
+ "grad_norm": 0.0008739576442167163,
11305
+ "learning_rate": 5.944055944055944e-05,
11306
+ "loss": 0.0035,
11307
+ "step": 1614
11308
+ },
11309
+ {
11310
+ "epoch": 0.9411421911421911,
11311
+ "grad_norm": 0.0018457169644534588,
11312
+ "learning_rate": 5.885780885780886e-05,
11313
+ "loss": 0.0048,
11314
+ "step": 1615
11315
+ },
11316
+ {
11317
+ "epoch": 0.9417249417249417,
11318
+ "grad_norm": 0.0008747098036110401,
11319
+ "learning_rate": 5.8275058275058275e-05,
11320
+ "loss": 0.003,
11321
+ "step": 1616
11322
+ },
11323
+ {
11324
+ "epoch": 0.9423076923076923,
11325
+ "grad_norm": 0.001420872751623392,
11326
+ "learning_rate": 5.76923076923077e-05,
11327
+ "loss": 0.0048,
11328
+ "step": 1617
11329
+ },
11330
+ {
11331
+ "epoch": 0.9428904428904429,
11332
+ "grad_norm": 0.0008954692748375237,
11333
+ "learning_rate": 5.7109557109557114e-05,
11334
+ "loss": 0.0026,
11335
+ "step": 1618
11336
+ },
11337
+ {
11338
+ "epoch": 0.9434731934731935,
11339
+ "grad_norm": 0.000952814007177949,
11340
+ "learning_rate": 5.652680652680653e-05,
11341
+ "loss": 0.004,
11342
+ "step": 1619
11343
+ },
11344
+ {
11345
+ "epoch": 0.9440559440559441,
11346
+ "grad_norm": 0.0011160552967339754,
11347
+ "learning_rate": 5.5944055944055945e-05,
11348
+ "loss": 0.0034,
11349
+ "step": 1620
11350
+ },
11351
+ {
11352
+ "epoch": 0.9446386946386947,
11353
+ "grad_norm": 0.0009834656957536936,
11354
+ "learning_rate": 5.536130536130536e-05,
11355
+ "loss": 0.0042,
11356
+ "step": 1621
11357
+ },
11358
+ {
11359
+ "epoch": 0.9452214452214452,
11360
+ "grad_norm": 0.0007495367899537086,
11361
+ "learning_rate": 5.477855477855478e-05,
11362
+ "loss": 0.0028,
11363
+ "step": 1622
11364
+ },
11365
+ {
11366
+ "epoch": 0.9458041958041958,
11367
+ "grad_norm": 0.0011641675373539329,
11368
+ "learning_rate": 5.419580419580419e-05,
11369
+ "loss": 0.004,
11370
+ "step": 1623
11371
+ },
11372
+ {
11373
+ "epoch": 0.9463869463869464,
11374
+ "grad_norm": 0.0013844856293871999,
11375
+ "learning_rate": 5.3613053613053616e-05,
11376
+ "loss": 0.014,
11377
+ "step": 1624
11378
+ },
11379
+ {
11380
+ "epoch": 0.946969696969697,
11381
+ "grad_norm": 0.0008486348669975996,
11382
+ "learning_rate": 5.303030303030303e-05,
11383
+ "loss": 0.0029,
11384
+ "step": 1625
11385
+ },
11386
+ {
11387
+ "epoch": 0.9475524475524476,
11388
+ "grad_norm": 0.001107304822653532,
11389
+ "learning_rate": 5.244755244755245e-05,
11390
+ "loss": 0.0028,
11391
+ "step": 1626
11392
+ },
11393
+ {
11394
+ "epoch": 0.9481351981351981,
11395
+ "grad_norm": 0.001052669482305646,
11396
+ "learning_rate": 5.1864801864801863e-05,
11397
+ "loss": 0.0035,
11398
+ "step": 1627
11399
+ },
11400
+ {
11401
+ "epoch": 0.9487179487179487,
11402
+ "grad_norm": 0.001127295778132975,
11403
+ "learning_rate": 5.128205128205128e-05,
11404
+ "loss": 0.0033,
11405
+ "step": 1628
11406
+ },
11407
+ {
11408
+ "epoch": 0.9493006993006993,
11409
+ "grad_norm": 0.00099327159114182,
11410
+ "learning_rate": 5.0699300699300695e-05,
11411
+ "loss": 0.0029,
11412
+ "step": 1629
11413
+ },
11414
+ {
11415
+ "epoch": 0.9498834498834499,
11416
+ "grad_norm": 0.0008510001935064793,
11417
+ "learning_rate": 5.011655011655012e-05,
11418
+ "loss": 0.0031,
11419
+ "step": 1630
11420
+ },
11421
+ {
11422
+ "epoch": 0.9504662004662005,
11423
+ "grad_norm": 0.0006990230758674443,
11424
+ "learning_rate": 4.9533799533799534e-05,
11425
+ "loss": 0.0028,
11426
+ "step": 1631
11427
+ },
11428
+ {
11429
+ "epoch": 0.951048951048951,
11430
+ "grad_norm": 0.0008159316494129598,
11431
+ "learning_rate": 4.895104895104895e-05,
11432
+ "loss": 0.0028,
11433
+ "step": 1632
11434
+ },
11435
+ {
11436
+ "epoch": 0.9516317016317016,
11437
+ "grad_norm": 0.0008230661042034626,
11438
+ "learning_rate": 4.836829836829837e-05,
11439
+ "loss": 0.0033,
11440
+ "step": 1633
11441
+ },
11442
+ {
11443
+ "epoch": 0.9522144522144522,
11444
+ "grad_norm": 0.0009854782838374376,
11445
+ "learning_rate": 4.778554778554779e-05,
11446
+ "loss": 0.0034,
11447
+ "step": 1634
11448
+ },
11449
+ {
11450
+ "epoch": 0.9527972027972028,
11451
+ "grad_norm": 0.0010709573980420828,
11452
+ "learning_rate": 4.7202797202797204e-05,
11453
+ "loss": 0.0029,
11454
+ "step": 1635
11455
+ },
11456
+ {
11457
+ "epoch": 0.9533799533799534,
11458
+ "grad_norm": 0.0010027334792539477,
11459
+ "learning_rate": 4.662004662004663e-05,
11460
+ "loss": 0.0033,
11461
+ "step": 1636
11462
+ },
11463
+ {
11464
+ "epoch": 0.953962703962704,
11465
+ "grad_norm": 0.0009396614041179419,
11466
+ "learning_rate": 4.603729603729604e-05,
11467
+ "loss": 0.0088,
11468
+ "step": 1637
11469
+ },
11470
+ {
11471
+ "epoch": 0.9545454545454546,
11472
+ "grad_norm": 0.0007274977397173643,
11473
+ "learning_rate": 4.545454545454546e-05,
11474
+ "loss": 0.0032,
11475
+ "step": 1638
11476
+ },
11477
+ {
11478
+ "epoch": 0.9551282051282052,
11479
+ "grad_norm": 0.0010892600985243917,
11480
+ "learning_rate": 4.4871794871794874e-05,
11481
+ "loss": 0.0042,
11482
+ "step": 1639
11483
+ },
11484
+ {
11485
+ "epoch": 0.9557109557109557,
11486
+ "grad_norm": 0.0007885160739533603,
11487
+ "learning_rate": 4.428904428904429e-05,
11488
+ "loss": 0.0031,
11489
+ "step": 1640
11490
+ },
11491
+ {
11492
+ "epoch": 0.9562937062937062,
11493
+ "grad_norm": 0.0012906527845188975,
11494
+ "learning_rate": 4.3706293706293706e-05,
11495
+ "loss": 0.0044,
11496
+ "step": 1641
11497
+ },
11498
+ {
11499
+ "epoch": 0.9568764568764568,
11500
+ "grad_norm": 0.0012814976507797837,
11501
+ "learning_rate": 4.312354312354312e-05,
11502
+ "loss": 0.0038,
11503
+ "step": 1642
11504
+ },
11505
+ {
11506
+ "epoch": 0.9574592074592074,
11507
+ "grad_norm": 0.0011575610842555761,
11508
+ "learning_rate": 4.2540792540792545e-05,
11509
+ "loss": 0.004,
11510
+ "step": 1643
11511
+ },
11512
+ {
11513
+ "epoch": 0.958041958041958,
11514
+ "grad_norm": 0.0012061079032719135,
11515
+ "learning_rate": 4.195804195804196e-05,
11516
+ "loss": 0.0043,
11517
+ "step": 1644
11518
+ },
11519
+ {
11520
+ "epoch": 0.9586247086247086,
11521
+ "grad_norm": 0.0008972581708803773,
11522
+ "learning_rate": 4.1375291375291377e-05,
11523
+ "loss": 0.0038,
11524
+ "step": 1645
11525
+ },
11526
+ {
11527
+ "epoch": 0.9592074592074592,
11528
+ "grad_norm": 0.0008104901062324643,
11529
+ "learning_rate": 4.079254079254079e-05,
11530
+ "loss": 0.0028,
11531
+ "step": 1646
11532
+ },
11533
+ {
11534
+ "epoch": 0.9597902097902098,
11535
+ "grad_norm": 0.0007863112259656191,
11536
+ "learning_rate": 4.020979020979021e-05,
11537
+ "loss": 0.003,
11538
+ "step": 1647
11539
+ },
11540
+ {
11541
+ "epoch": 0.9603729603729604,
11542
+ "grad_norm": 0.0006893305107951164,
11543
+ "learning_rate": 3.9627039627039624e-05,
11544
+ "loss": 0.0026,
11545
+ "step": 1648
11546
+ },
11547
+ {
11548
+ "epoch": 0.960955710955711,
11549
+ "grad_norm": 0.0009396909736096859,
11550
+ "learning_rate": 3.904428904428905e-05,
11551
+ "loss": 0.0029,
11552
+ "step": 1649
11553
+ },
11554
+ {
11555
+ "epoch": 0.9615384615384616,
11556
+ "grad_norm": 0.0007228578324429691,
11557
+ "learning_rate": 3.846153846153846e-05,
11558
+ "loss": 0.0023,
11559
+ "step": 1650
11560
+ },
11561
+ {
11562
+ "epoch": 0.9621212121212122,
11563
+ "grad_norm": 0.0011825780384242535,
11564
+ "learning_rate": 3.787878787878788e-05,
11565
+ "loss": 0.0044,
11566
+ "step": 1651
11567
+ },
11568
+ {
11569
+ "epoch": 0.9627039627039627,
11570
+ "grad_norm": 0.0007647788152098656,
11571
+ "learning_rate": 3.7296037296037295e-05,
11572
+ "loss": 0.003,
11573
+ "step": 1652
11574
+ },
11575
+ {
11576
+ "epoch": 0.9632867132867133,
11577
+ "grad_norm": 0.0010042464127764106,
11578
+ "learning_rate": 3.671328671328671e-05,
11579
+ "loss": 0.0041,
11580
+ "step": 1653
11581
+ },
11582
+ {
11583
+ "epoch": 0.9638694638694638,
11584
+ "grad_norm": 0.0009029952925629914,
11585
+ "learning_rate": 3.6130536130536126e-05,
11586
+ "loss": 0.0032,
11587
+ "step": 1654
11588
+ },
11589
+ {
11590
+ "epoch": 0.9644522144522144,
11591
+ "grad_norm": 0.0010356158018112183,
11592
+ "learning_rate": 3.554778554778554e-05,
11593
+ "loss": 0.0031,
11594
+ "step": 1655
11595
+ },
11596
+ {
11597
+ "epoch": 0.965034965034965,
11598
+ "grad_norm": 0.0010094497120007873,
11599
+ "learning_rate": 3.4965034965034965e-05,
11600
+ "loss": 0.0043,
11601
+ "step": 1656
11602
+ },
11603
+ {
11604
+ "epoch": 0.9656177156177156,
11605
+ "grad_norm": 0.0008370497962459922,
11606
+ "learning_rate": 3.438228438228439e-05,
11607
+ "loss": 0.0035,
11608
+ "step": 1657
11609
+ },
11610
+ {
11611
+ "epoch": 0.9662004662004662,
11612
+ "grad_norm": 0.000803111121058464,
11613
+ "learning_rate": 3.3799533799533804e-05,
11614
+ "loss": 0.0029,
11615
+ "step": 1658
11616
+ },
11617
+ {
11618
+ "epoch": 0.9667832167832168,
11619
+ "grad_norm": 0.0011143162846565247,
11620
+ "learning_rate": 3.321678321678322e-05,
11621
+ "loss": 0.0035,
11622
+ "step": 1659
11623
+ },
11624
+ {
11625
+ "epoch": 0.9673659673659674,
11626
+ "grad_norm": 0.0008031773613765836,
11627
+ "learning_rate": 3.2634032634032635e-05,
11628
+ "loss": 0.0031,
11629
+ "step": 1660
11630
+ },
11631
+ {
11632
+ "epoch": 0.967948717948718,
11633
+ "grad_norm": 0.0011737227905541658,
11634
+ "learning_rate": 3.205128205128205e-05,
11635
+ "loss": 0.004,
11636
+ "step": 1661
11637
+ },
11638
+ {
11639
+ "epoch": 0.9685314685314685,
11640
+ "grad_norm": 0.001068048644810915,
11641
+ "learning_rate": 3.1468531468531474e-05,
11642
+ "loss": 0.0038,
11643
+ "step": 1662
11644
+ },
11645
+ {
11646
+ "epoch": 0.9691142191142191,
11647
+ "grad_norm": 0.0008520625997334719,
11648
+ "learning_rate": 3.088578088578088e-05,
11649
+ "loss": 0.0032,
11650
+ "step": 1663
11651
+ },
11652
+ {
11653
+ "epoch": 0.9696969696969697,
11654
+ "grad_norm": 0.0006241014925763011,
11655
+ "learning_rate": 3.0303030303030306e-05,
11656
+ "loss": 0.0023,
11657
+ "step": 1664
11658
+ },
11659
+ {
11660
+ "epoch": 0.9702797202797203,
11661
+ "grad_norm": 0.0013019571779295802,
11662
+ "learning_rate": 2.972027972027972e-05,
11663
+ "loss": 0.0048,
11664
+ "step": 1665
11665
+ },
11666
+ {
11667
+ "epoch": 0.9708624708624709,
11668
+ "grad_norm": 0.0011066205333918333,
11669
+ "learning_rate": 2.9137529137529138e-05,
11670
+ "loss": 0.0031,
11671
+ "step": 1666
11672
+ },
11673
+ {
11674
+ "epoch": 0.9714452214452215,
11675
+ "grad_norm": 0.0010444342624396086,
11676
+ "learning_rate": 2.8554778554778557e-05,
11677
+ "loss": 0.0033,
11678
+ "step": 1667
11679
+ },
11680
+ {
11681
+ "epoch": 0.972027972027972,
11682
+ "grad_norm": 0.0007983744144439697,
11683
+ "learning_rate": 2.7972027972027973e-05,
11684
+ "loss": 0.0032,
11685
+ "step": 1668
11686
+ },
11687
+ {
11688
+ "epoch": 0.9726107226107226,
11689
+ "grad_norm": 0.0008884937269613147,
11690
+ "learning_rate": 2.738927738927739e-05,
11691
+ "loss": 0.0037,
11692
+ "step": 1669
11693
+ },
11694
+ {
11695
+ "epoch": 0.9731934731934732,
11696
+ "grad_norm": 0.0012803805293515325,
11697
+ "learning_rate": 2.6806526806526808e-05,
11698
+ "loss": 0.0034,
11699
+ "step": 1670
11700
+ },
11701
+ {
11702
+ "epoch": 0.9737762237762237,
11703
+ "grad_norm": 0.0011096763191744685,
11704
+ "learning_rate": 2.6223776223776224e-05,
11705
+ "loss": 0.0035,
11706
+ "step": 1671
11707
+ },
11708
+ {
11709
+ "epoch": 0.9743589743589743,
11710
+ "grad_norm": 0.0011874607298523188,
11711
+ "learning_rate": 2.564102564102564e-05,
11712
+ "loss": 0.0043,
11713
+ "step": 1672
11714
+ },
11715
+ {
11716
+ "epoch": 0.9749417249417249,
11717
+ "grad_norm": 0.0009078698931261897,
11718
+ "learning_rate": 2.505827505827506e-05,
11719
+ "loss": 0.0027,
11720
+ "step": 1673
11721
+ },
11722
+ {
11723
+ "epoch": 0.9755244755244755,
11724
+ "grad_norm": 0.0009188731200993061,
11725
+ "learning_rate": 2.4475524475524475e-05,
11726
+ "loss": 0.0046,
11727
+ "step": 1674
11728
+ },
11729
+ {
11730
+ "epoch": 0.9761072261072261,
11731
+ "grad_norm": 0.0009996923618018627,
11732
+ "learning_rate": 2.3892773892773894e-05,
11733
+ "loss": 0.006,
11734
+ "step": 1675
11735
+ },
11736
+ {
11737
+ "epoch": 0.9766899766899767,
11738
+ "grad_norm": 0.0008249058737419546,
11739
+ "learning_rate": 2.3310023310023313e-05,
11740
+ "loss": 0.0033,
11741
+ "step": 1676
11742
+ },
11743
+ {
11744
+ "epoch": 0.9772727272727273,
11745
+ "grad_norm": 0.0008003967232070863,
11746
+ "learning_rate": 2.272727272727273e-05,
11747
+ "loss": 0.003,
11748
+ "step": 1677
11749
+ },
11750
+ {
11751
+ "epoch": 0.9778554778554779,
11752
+ "grad_norm": 0.0014063924318179488,
11753
+ "learning_rate": 2.2144522144522145e-05,
11754
+ "loss": 0.0036,
11755
+ "step": 1678
11756
+ },
11757
+ {
11758
+ "epoch": 0.9784382284382285,
11759
+ "grad_norm": 0.0008004964329302311,
11760
+ "learning_rate": 2.156177156177156e-05,
11761
+ "loss": 0.0028,
11762
+ "step": 1679
11763
+ },
11764
+ {
11765
+ "epoch": 0.9790209790209791,
11766
+ "grad_norm": 0.0009626666433177888,
11767
+ "learning_rate": 2.097902097902098e-05,
11768
+ "loss": 0.0027,
11769
+ "step": 1680
11770
+ },
11771
+ {
11772
+ "epoch": 0.9796037296037297,
11773
+ "grad_norm": 0.0008628361392766237,
11774
+ "learning_rate": 2.0396270396270396e-05,
11775
+ "loss": 0.0027,
11776
+ "step": 1681
11777
+ },
11778
+ {
11779
+ "epoch": 0.9801864801864801,
11780
+ "grad_norm": 0.0008874722989276052,
11781
+ "learning_rate": 1.9813519813519812e-05,
11782
+ "loss": 0.0042,
11783
+ "step": 1682
11784
+ },
11785
+ {
11786
+ "epoch": 0.9807692307692307,
11787
+ "grad_norm": 0.0015157037414610386,
11788
+ "learning_rate": 1.923076923076923e-05,
11789
+ "loss": 0.0052,
11790
+ "step": 1683
11791
+ },
11792
+ {
11793
+ "epoch": 0.9813519813519813,
11794
+ "grad_norm": 0.0011951492633670568,
11795
+ "learning_rate": 1.8648018648018647e-05,
11796
+ "loss": 0.0049,
11797
+ "step": 1684
11798
+ },
11799
+ {
11800
+ "epoch": 0.9819347319347319,
11801
+ "grad_norm": 0.0009483549511060119,
11802
+ "learning_rate": 1.8065268065268063e-05,
11803
+ "loss": 0.0026,
11804
+ "step": 1685
11805
+ },
11806
+ {
11807
+ "epoch": 0.9825174825174825,
11808
+ "grad_norm": 0.000737398280762136,
11809
+ "learning_rate": 1.7482517482517483e-05,
11810
+ "loss": 0.0027,
11811
+ "step": 1686
11812
+ },
11813
+ {
11814
+ "epoch": 0.9831002331002331,
11815
+ "grad_norm": 0.001031695050187409,
11816
+ "learning_rate": 1.6899766899766902e-05,
11817
+ "loss": 0.0025,
11818
+ "step": 1687
11819
+ },
11820
+ {
11821
+ "epoch": 0.9836829836829837,
11822
+ "grad_norm": 0.0009539015591144562,
11823
+ "learning_rate": 1.6317016317016318e-05,
11824
+ "loss": 0.0027,
11825
+ "step": 1688
11826
+ },
11827
+ {
11828
+ "epoch": 0.9842657342657343,
11829
+ "grad_norm": 0.0007205713191069663,
11830
+ "learning_rate": 1.5734265734265737e-05,
11831
+ "loss": 0.0026,
11832
+ "step": 1689
11833
+ },
11834
+ {
11835
+ "epoch": 0.9848484848484849,
11836
+ "grad_norm": 0.0009316445211879909,
11837
+ "learning_rate": 1.5151515151515153e-05,
11838
+ "loss": 0.0032,
11839
+ "step": 1690
11840
+ },
11841
+ {
11842
+ "epoch": 0.9854312354312355,
11843
+ "grad_norm": 0.0010426414664834738,
11844
+ "learning_rate": 1.4568764568764569e-05,
11845
+ "loss": 0.0041,
11846
+ "step": 1691
11847
+ },
11848
+ {
11849
+ "epoch": 0.986013986013986,
11850
+ "grad_norm": 0.0006400300771929324,
11851
+ "learning_rate": 1.3986013986013986e-05,
11852
+ "loss": 0.0024,
11853
+ "step": 1692
11854
+ },
11855
+ {
11856
+ "epoch": 0.9865967365967366,
11857
+ "grad_norm": 0.0009383864235132933,
11858
+ "learning_rate": 1.3403263403263404e-05,
11859
+ "loss": 0.0029,
11860
+ "step": 1693
11861
+ },
11862
+ {
11863
+ "epoch": 0.9871794871794872,
11864
+ "grad_norm": 0.0008284033392556012,
11865
+ "learning_rate": 1.282051282051282e-05,
11866
+ "loss": 0.0029,
11867
+ "step": 1694
11868
+ },
11869
+ {
11870
+ "epoch": 0.9877622377622378,
11871
+ "grad_norm": 0.0010577579960227013,
11872
+ "learning_rate": 1.2237762237762237e-05,
11873
+ "loss": 0.0052,
11874
+ "step": 1695
11875
+ },
11876
+ {
11877
+ "epoch": 0.9883449883449883,
11878
+ "grad_norm": 0.0009795171208679676,
11879
+ "learning_rate": 1.1655011655011657e-05,
11880
+ "loss": 0.0046,
11881
+ "step": 1696
11882
+ },
11883
+ {
11884
+ "epoch": 0.9889277389277389,
11885
+ "grad_norm": 0.0007197922095656395,
11886
+ "learning_rate": 1.1072261072261073e-05,
11887
+ "loss": 0.0031,
11888
+ "step": 1697
11889
+ },
11890
+ {
11891
+ "epoch": 0.9895104895104895,
11892
+ "grad_norm": 0.0010326344054192305,
11893
+ "learning_rate": 1.048951048951049e-05,
11894
+ "loss": 0.0028,
11895
+ "step": 1698
11896
+ },
11897
+ {
11898
+ "epoch": 0.9900932400932401,
11899
+ "grad_norm": 0.0008666233043186367,
11900
+ "learning_rate": 9.906759906759906e-06,
11901
+ "loss": 0.003,
11902
+ "step": 1699
11903
+ },
11904
+ {
11905
+ "epoch": 0.9906759906759907,
11906
+ "grad_norm": 0.0008774434681981802,
11907
+ "learning_rate": 9.324009324009324e-06,
11908
+ "loss": 0.003,
11909
+ "step": 1700
11910
  }
11911
  ],
11912
  "logging_steps": 1,
 
11926
  "attributes": {}
11927
  }
11928
  },
11929
+ "total_flos": 7.109695330572042e+19,
11930
  "train_batch_size": 4,
11931
  "trial_name": null,
11932
  "trial_params": null