{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.992248062015504,
  "eval_steps": 1000,
  "global_step": 579,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05167958656330749,
      "grad_norm": 19073.798828125,
      "learning_rate": 6.896551724137932e-06,
      "loss": 5744.6172,
      "step": 10
    },
    {
      "epoch": 0.10335917312661498,
      "grad_norm": 28021.30859375,
      "learning_rate": 1.3793103448275863e-05,
      "loss": 5765.657,
      "step": 20
    },
    {
      "epoch": 0.15503875968992248,
      "grad_norm": 16764.3125,
      "learning_rate": 2.0689655172413797e-05,
      "loss": 5782.6844,
      "step": 30
    },
    {
      "epoch": 0.20671834625322996,
      "grad_norm": 3233.643798828125,
      "learning_rate": 2.7586206896551727e-05,
      "loss": 5258.6949,
      "step": 40
    },
    {
      "epoch": 0.25839793281653745,
      "grad_norm": 2031.9879150390625,
      "learning_rate": 3.4482758620689657e-05,
      "loss": 878.5299,
      "step": 50
    },
    {
      "epoch": 0.31007751937984496,
      "grad_norm": 1673.1912841796875,
      "learning_rate": 3.999854561620655e-05,
      "loss": 709.5174,
      "step": 60
    },
    {
      "epoch": 0.36175710594315247,
      "grad_norm": 1925.0462646484375,
      "learning_rate": 3.994766438992882e-05,
      "loss": 607.4455,
      "step": 70
    },
    {
      "epoch": 0.4134366925064599,
      "grad_norm": 307.27044677734375,
      "learning_rate": 3.982427535895982e-05,
      "loss": 546.8521,
      "step": 80
    },
    {
      "epoch": 0.46511627906976744,
      "grad_norm": 212.57342529296875,
      "learning_rate": 3.962882703033195e-05,
      "loss": 528.296,
      "step": 90
    },
    {
      "epoch": 0.5167958656330749,
      "grad_norm": 150.5155487060547,
      "learning_rate": 3.936202983956098e-05,
      "loss": 485.7102,
      "step": 100
    },
    {
      "epoch": 0.5684754521963824,
      "grad_norm": 214.31271362304688,
      "learning_rate": 3.9024853568282615e-05,
      "loss": 461.3895,
      "step": 110
    },
    {
      "epoch": 0.6201550387596899,
      "grad_norm": 188.0852813720703,
      "learning_rate": 3.861852381919132e-05,
      "loss": 445.7223,
      "step": 120
    },
    {
      "epoch": 0.6718346253229974,
      "grad_norm": 210.2677764892578,
      "learning_rate": 3.8144517561094635e-05,
      "loss": 432.4087,
      "step": 130
    },
    {
      "epoch": 0.7235142118863049,
      "grad_norm": 147.7620086669922,
      "learning_rate": 3.760455776027636e-05,
      "loss": 425.508,
      "step": 140
    },
    {
      "epoch": 0.7751937984496124,
      "grad_norm": 82.30839538574219,
      "learning_rate": 3.700060711768302e-05,
      "loss": 418.1849,
      "step": 150
    },
    {
      "epoch": 0.8268733850129198,
      "grad_norm": 150.20848083496094,
      "learning_rate": 3.633486093469829e-05,
      "loss": 407.9412,
      "step": 160
    },
    {
      "epoch": 0.8785529715762274,
      "grad_norm": 78.9103775024414,
      "learning_rate": 3.5609739133437666e-05,
      "loss": 403.488,
      "step": 170
    },
    {
      "epoch": 0.9302325581395349,
      "grad_norm": 342.4495544433594,
      "learning_rate": 3.482787746056881e-05,
      "loss": 416.6063,
      "step": 180
    },
    {
      "epoch": 0.9819121447028424,
      "grad_norm": 129.02053833007812,
      "learning_rate": 3.3992117906630744e-05,
      "loss": 423.7297,
      "step": 190
    },
    {
      "epoch": 1.0335917312661498,
      "grad_norm": 219.63763427734375,
      "learning_rate": 3.310549837567685e-05,
      "loss": 402.1201,
      "step": 200
    },
    {
      "epoch": 1.0852713178294573,
      "grad_norm": 88.36170959472656,
      "learning_rate": 3.2171241642791443e-05,
      "loss": 394.8549,
      "step": 210
    },
    {
      "epoch": 1.1369509043927648,
      "grad_norm": 156.26210021972656,
      "learning_rate": 3.119274363961821e-05,
      "loss": 393.7007,
      "step": 220
    },
    {
      "epoch": 1.1886304909560723,
      "grad_norm": 99.02982330322266,
      "learning_rate": 3.0173561110481606e-05,
      "loss": 386.1269,
      "step": 230
    },
    {
      "epoch": 1.2403100775193798,
      "grad_norm": 72.19245147705078,
      "learning_rate": 2.9117398683969857e-05,
      "loss": 380.2696,
      "step": 240
    },
    {
      "epoch": 1.2919896640826873,
      "grad_norm": 174.1908721923828,
      "learning_rate": 2.80280954069732e-05,
      "loss": 385.1289,
      "step": 250
    },
    {
      "epoch": 1.3436692506459949,
      "grad_norm": 172.32257080078125,
      "learning_rate": 2.6909610790124772e-05,
      "loss": 382.0893,
      "step": 260
    },
    {
      "epoch": 1.3953488372093024,
      "grad_norm": 147.81851196289062,
      "learning_rate": 2.5766010415367567e-05,
      "loss": 381.5982,
      "step": 270
    },
    {
      "epoch": 1.4470284237726099,
      "grad_norm": 178.86167907714844,
      "learning_rate": 2.4601451157962616e-05,
      "loss": 381.9054,
      "step": 280
    },
    {
      "epoch": 1.4987080103359174,
      "grad_norm": 63.298179626464844,
      "learning_rate": 2.3420166076654873e-05,
      "loss": 382.8202,
      "step": 290
    },
    {
      "epoch": 1.550387596899225,
      "grad_norm": 40.09732437133789,
      "learning_rate": 2.2226449026919637e-05,
      "loss": 367.5341,
      "step": 300
    },
    {
      "epoch": 1.6020671834625322,
      "grad_norm": 78.8369369506836,
      "learning_rate": 2.102463905321881e-05,
      "loss": 378.6904,
      "step": 310
    },
    {
      "epoch": 1.65374677002584,
      "grad_norm": 99.34815216064453,
      "learning_rate": 1.9819104616999584e-05,
      "loss": 373.315,
      "step": 320
    },
    {
      "epoch": 1.7054263565891472,
      "grad_norm": 59.13042449951172,
      "learning_rate": 1.8614227717765327e-05,
      "loss": 372.912,
      "step": 330
    },
    {
      "epoch": 1.757105943152455,
      "grad_norm": 105.593017578125,
      "learning_rate": 1.7414387964936913e-05,
      "loss": 365.8675,
      "step": 340
    },
    {
      "epoch": 1.8087855297157622,
      "grad_norm": 37.02623748779297,
      "learning_rate": 1.6223946658401818e-05,
      "loss": 363.1097,
      "step": 350
    },
    {
      "epoch": 1.8604651162790697,
      "grad_norm": 70.97673797607422,
      "learning_rate": 1.5047230935616497e-05,
      "loss": 365.041,
      "step": 360
    },
    {
      "epoch": 1.9121447028423773,
      "grad_norm": 98.18872833251953,
      "learning_rate": 1.3888518042885934e-05,
      "loss": 363.4152,
      "step": 370
    },
    {
      "epoch": 1.9638242894056848,
      "grad_norm": 72.34111785888672,
      "learning_rate": 1.2752019787992587e-05,
      "loss": 367.2338,
      "step": 380
    },
    {
      "epoch": 2.0155038759689923,
      "grad_norm": 85.07888793945312,
      "learning_rate": 1.164186723068795e-05,
      "loss": 354.9984,
      "step": 390
    },
    {
      "epoch": 2.0671834625322996,
      "grad_norm": 61.755882263183594,
      "learning_rate": 1.0562095666695352e-05,
      "loss": 360.0696,
      "step": 400
    },
    {
      "epoch": 2.1188630490956073,
      "grad_norm": 122.23174285888672,
      "learning_rate": 9.516629959805468e-06,
      "loss": 355.7537,
      "step": 410
    },
    {
      "epoch": 2.1705426356589146,
      "grad_norm": 65.74703216552734,
      "learning_rate": 8.50927027538128e-06,
      "loss": 356.938,
      "step": 420
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 43.818634033203125,
      "learning_rate": 7.543678267129408e-06,
      "loss": 358.646,
      "step": 430
    },
    {
      "epoch": 2.2739018087855296,
      "grad_norm": 46.30788040161133,
      "learning_rate": 6.623363767347874e-06,
      "loss": 360.0613,
      "step": 440
    },
    {
      "epoch": 2.3255813953488373,
      "grad_norm": 43.955589294433594,
      "learning_rate": 5.751672029029734e-06,
      "loss": 353.4506,
      "step": 450
    },
    {
      "epoch": 2.3772609819121446,
      "grad_norm": 128.83164978027344,
      "learning_rate": 4.931771566196332e-06,
      "loss": 353.9634,
      "step": 460
    },
    {
      "epoch": 2.4289405684754524,
      "grad_norm": 66.79071807861328,
      "learning_rate": 4.166642636659495e-06,
      "loss": 355.8042,
      "step": 470
    },
    {
      "epoch": 2.4806201550387597,
      "grad_norm": 39.45721435546875,
      "learning_rate": 3.459066409076448e-06,
      "loss": 351.7647,
      "step": 480
    },
    {
      "epoch": 2.532299741602067,
      "grad_norm": 42.53300094604492,
      "learning_rate": 2.8116148536744448e-06,
      "loss": 356.8211,
      "step": 490
    },
    {
      "epoch": 2.5839793281653747,
      "grad_norm": 26.914024353027344,
      "learning_rate": 2.2266413933910426e-06,
      "loss": 354.2936,
      "step": 500
    },
    {
      "epoch": 2.6356589147286824,
      "grad_norm": 28.409818649291992,
      "learning_rate": 1.7062723494124545e-06,
      "loss": 354.7556,
      "step": 510
    },
    {
      "epoch": 2.6873385012919897,
      "grad_norm": 29.75973129272461,
      "learning_rate": 1.252399212204467e-06,
      "loss": 353.6137,
      "step": 520
    },
    {
      "epoch": 2.739018087855297,
      "grad_norm": 24.494775772094727,
      "learning_rate": 8.666717661299917e-07,
      "loss": 354.3009,
      "step": 530
    },
    {
      "epoch": 2.7906976744186047,
      "grad_norm": 34.55462646484375,
      "learning_rate": 5.504920926446611e-07,
      "loss": 356.5468,
      "step": 540
    },
    {
      "epoch": 2.842377260981912,
      "grad_norm": 72.49002075195312,
      "learning_rate": 3.0500947386812973e-07,
      "loss": 351.6954,
      "step": 550
    },
    {
      "epoch": 2.8940568475452197,
      "grad_norm": 23.444374084472656,
      "learning_rate": 1.3111621505626616e-07,
      "loss": 351.0193,
      "step": 560
    },
    {
      "epoch": 2.945736434108527,
      "grad_norm": 18.557331085205078,
      "learning_rate": 2.9444401158995606e-08,
      "loss": 355.1985,
      "step": 570
    }
  ],
  "logging_steps": 10,
  "max_steps": 579,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.089388751627223e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}