{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.992248062015504,
"eval_steps": 1000,
"global_step": 579,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05167958656330749,
"grad_norm": 19073.798828125,
"learning_rate": 6.896551724137932e-06,
"loss": 5744.6172,
"step": 10
},
{
"epoch": 0.10335917312661498,
"grad_norm": 28021.30859375,
"learning_rate": 1.3793103448275863e-05,
"loss": 5765.657,
"step": 20
},
{
"epoch": 0.15503875968992248,
"grad_norm": 16764.3125,
"learning_rate": 2.0689655172413797e-05,
"loss": 5782.6844,
"step": 30
},
{
"epoch": 0.20671834625322996,
"grad_norm": 3233.643798828125,
"learning_rate": 2.7586206896551727e-05,
"loss": 5258.6949,
"step": 40
},
{
"epoch": 0.25839793281653745,
"grad_norm": 2031.9879150390625,
"learning_rate": 3.4482758620689657e-05,
"loss": 878.5299,
"step": 50
},
{
"epoch": 0.31007751937984496,
"grad_norm": 1673.1912841796875,
"learning_rate": 3.999854561620655e-05,
"loss": 709.5174,
"step": 60
},
{
"epoch": 0.36175710594315247,
"grad_norm": 1925.0462646484375,
"learning_rate": 3.994766438992882e-05,
"loss": 607.4455,
"step": 70
},
{
"epoch": 0.4134366925064599,
"grad_norm": 307.27044677734375,
"learning_rate": 3.982427535895982e-05,
"loss": 546.8521,
"step": 80
},
{
"epoch": 0.46511627906976744,
"grad_norm": 212.57342529296875,
"learning_rate": 3.962882703033195e-05,
"loss": 528.296,
"step": 90
},
{
"epoch": 0.5167958656330749,
"grad_norm": 150.5155487060547,
"learning_rate": 3.936202983956098e-05,
"loss": 485.7102,
"step": 100
},
{
"epoch": 0.5684754521963824,
"grad_norm": 214.31271362304688,
"learning_rate": 3.9024853568282615e-05,
"loss": 461.3895,
"step": 110
},
{
"epoch": 0.6201550387596899,
"grad_norm": 188.0852813720703,
"learning_rate": 3.861852381919132e-05,
"loss": 445.7223,
"step": 120
},
{
"epoch": 0.6718346253229974,
"grad_norm": 210.2677764892578,
"learning_rate": 3.8144517561094635e-05,
"loss": 432.4087,
"step": 130
},
{
"epoch": 0.7235142118863049,
"grad_norm": 147.7620086669922,
"learning_rate": 3.760455776027636e-05,
"loss": 425.508,
"step": 140
},
{
"epoch": 0.7751937984496124,
"grad_norm": 82.30839538574219,
"learning_rate": 3.700060711768302e-05,
"loss": 418.1849,
"step": 150
},
{
"epoch": 0.8268733850129198,
"grad_norm": 150.20848083496094,
"learning_rate": 3.633486093469829e-05,
"loss": 407.9412,
"step": 160
},
{
"epoch": 0.8785529715762274,
"grad_norm": 78.9103775024414,
"learning_rate": 3.5609739133437666e-05,
"loss": 403.488,
"step": 170
},
{
"epoch": 0.9302325581395349,
"grad_norm": 342.4495544433594,
"learning_rate": 3.482787746056881e-05,
"loss": 416.6063,
"step": 180
},
{
"epoch": 0.9819121447028424,
"grad_norm": 129.02053833007812,
"learning_rate": 3.3992117906630744e-05,
"loss": 423.7297,
"step": 190
},
{
"epoch": 1.0335917312661498,
"grad_norm": 219.63763427734375,
"learning_rate": 3.310549837567685e-05,
"loss": 402.1201,
"step": 200
},
{
"epoch": 1.0852713178294573,
"grad_norm": 88.36170959472656,
"learning_rate": 3.2171241642791443e-05,
"loss": 394.8549,
"step": 210
},
{
"epoch": 1.1369509043927648,
"grad_norm": 156.26210021972656,
"learning_rate": 3.119274363961821e-05,
"loss": 393.7007,
"step": 220
},
{
"epoch": 1.1886304909560723,
"grad_norm": 99.02982330322266,
"learning_rate": 3.0173561110481606e-05,
"loss": 386.1269,
"step": 230
},
{
"epoch": 1.2403100775193798,
"grad_norm": 72.19245147705078,
"learning_rate": 2.9117398683969857e-05,
"loss": 380.2696,
"step": 240
},
{
"epoch": 1.2919896640826873,
"grad_norm": 174.1908721923828,
"learning_rate": 2.80280954069732e-05,
"loss": 385.1289,
"step": 250
},
{
"epoch": 1.3436692506459949,
"grad_norm": 172.32257080078125,
"learning_rate": 2.6909610790124772e-05,
"loss": 382.0893,
"step": 260
},
{
"epoch": 1.3953488372093024,
"grad_norm": 147.81851196289062,
"learning_rate": 2.5766010415367567e-05,
"loss": 381.5982,
"step": 270
},
{
"epoch": 1.4470284237726099,
"grad_norm": 178.86167907714844,
"learning_rate": 2.4601451157962616e-05,
"loss": 381.9054,
"step": 280
},
{
"epoch": 1.4987080103359174,
"grad_norm": 63.298179626464844,
"learning_rate": 2.3420166076654873e-05,
"loss": 382.8202,
"step": 290
},
{
"epoch": 1.550387596899225,
"grad_norm": 40.09732437133789,
"learning_rate": 2.2226449026919637e-05,
"loss": 367.5341,
"step": 300
},
{
"epoch": 1.6020671834625322,
"grad_norm": 78.8369369506836,
"learning_rate": 2.102463905321881e-05,
"loss": 378.6904,
"step": 310
},
{
"epoch": 1.65374677002584,
"grad_norm": 99.34815216064453,
"learning_rate": 1.9819104616999584e-05,
"loss": 373.315,
"step": 320
},
{
"epoch": 1.7054263565891472,
"grad_norm": 59.13042449951172,
"learning_rate": 1.8614227717765327e-05,
"loss": 372.912,
"step": 330
},
{
"epoch": 1.757105943152455,
"grad_norm": 105.593017578125,
"learning_rate": 1.7414387964936913e-05,
"loss": 365.8675,
"step": 340
},
{
"epoch": 1.8087855297157622,
"grad_norm": 37.02623748779297,
"learning_rate": 1.6223946658401818e-05,
"loss": 363.1097,
"step": 350
},
{
"epoch": 1.8604651162790697,
"grad_norm": 70.97673797607422,
"learning_rate": 1.5047230935616497e-05,
"loss": 365.041,
"step": 360
},
{
"epoch": 1.9121447028423773,
"grad_norm": 98.18872833251953,
"learning_rate": 1.3888518042885934e-05,
"loss": 363.4152,
"step": 370
},
{
"epoch": 1.9638242894056848,
"grad_norm": 72.34111785888672,
"learning_rate": 1.2752019787992587e-05,
"loss": 367.2338,
"step": 380
},
{
"epoch": 2.0155038759689923,
"grad_norm": 85.07888793945312,
"learning_rate": 1.164186723068795e-05,
"loss": 354.9984,
"step": 390
},
{
"epoch": 2.0671834625322996,
"grad_norm": 61.755882263183594,
"learning_rate": 1.0562095666695352e-05,
"loss": 360.0696,
"step": 400
},
{
"epoch": 2.1188630490956073,
"grad_norm": 122.23174285888672,
"learning_rate": 9.516629959805468e-06,
"loss": 355.7537,
"step": 410
},
{
"epoch": 2.1705426356589146,
"grad_norm": 65.74703216552734,
"learning_rate": 8.50927027538128e-06,
"loss": 356.938,
"step": 420
},
{
"epoch": 2.2222222222222223,
"grad_norm": 43.818634033203125,
"learning_rate": 7.543678267129408e-06,
"loss": 358.646,
"step": 430
},
{
"epoch": 2.2739018087855296,
"grad_norm": 46.30788040161133,
"learning_rate": 6.623363767347874e-06,
"loss": 360.0613,
"step": 440
},
{
"epoch": 2.3255813953488373,
"grad_norm": 43.955589294433594,
"learning_rate": 5.751672029029734e-06,
"loss": 353.4506,
"step": 450
},
{
"epoch": 2.3772609819121446,
"grad_norm": 128.83164978027344,
"learning_rate": 4.931771566196332e-06,
"loss": 353.9634,
"step": 460
},
{
"epoch": 2.4289405684754524,
"grad_norm": 66.79071807861328,
"learning_rate": 4.166642636659495e-06,
"loss": 355.8042,
"step": 470
},
{
"epoch": 2.4806201550387597,
"grad_norm": 39.45721435546875,
"learning_rate": 3.459066409076448e-06,
"loss": 351.7647,
"step": 480
},
{
"epoch": 2.532299741602067,
"grad_norm": 42.53300094604492,
"learning_rate": 2.8116148536744448e-06,
"loss": 356.8211,
"step": 490
},
{
"epoch": 2.5839793281653747,
"grad_norm": 26.914024353027344,
"learning_rate": 2.2266413933910426e-06,
"loss": 354.2936,
"step": 500
},
{
"epoch": 2.6356589147286824,
"grad_norm": 28.409818649291992,
"learning_rate": 1.7062723494124545e-06,
"loss": 354.7556,
"step": 510
},
{
"epoch": 2.6873385012919897,
"grad_norm": 29.75973129272461,
"learning_rate": 1.252399212204467e-06,
"loss": 353.6137,
"step": 520
},
{
"epoch": 2.739018087855297,
"grad_norm": 24.494775772094727,
"learning_rate": 8.666717661299917e-07,
"loss": 354.3009,
"step": 530
},
{
"epoch": 2.7906976744186047,
"grad_norm": 34.55462646484375,
"learning_rate": 5.504920926446611e-07,
"loss": 356.5468,
"step": 540
},
{
"epoch": 2.842377260981912,
"grad_norm": 72.49002075195312,
"learning_rate": 3.0500947386812973e-07,
"loss": 351.6954,
"step": 550
},
{
"epoch": 2.8940568475452197,
"grad_norm": 23.444374084472656,
"learning_rate": 1.3111621505626616e-07,
"loss": 351.0193,
"step": 560
},
{
"epoch": 2.945736434108527,
"grad_norm": 18.557331085205078,
"learning_rate": 2.9444401158995606e-08,
"loss": 355.1985,
"step": 570
}
],
"logging_steps": 10,
"max_steps": 579,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.089388751627223e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}