{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.992248062015504, "eval_steps": 1000, "global_step": 579, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05167958656330749, "grad_norm": 19073.798828125, "learning_rate": 6.896551724137932e-06, "loss": 5744.6172, "step": 10 }, { "epoch": 0.10335917312661498, "grad_norm": 28021.30859375, "learning_rate": 1.3793103448275863e-05, "loss": 5765.657, "step": 20 }, { "epoch": 0.15503875968992248, "grad_norm": 16764.3125, "learning_rate": 2.0689655172413797e-05, "loss": 5782.6844, "step": 30 }, { "epoch": 0.20671834625322996, "grad_norm": 3233.643798828125, "learning_rate": 2.7586206896551727e-05, "loss": 5258.6949, "step": 40 }, { "epoch": 0.25839793281653745, "grad_norm": 2031.9879150390625, "learning_rate": 3.4482758620689657e-05, "loss": 878.5299, "step": 50 }, { "epoch": 0.31007751937984496, "grad_norm": 1673.1912841796875, "learning_rate": 3.999854561620655e-05, "loss": 709.5174, "step": 60 }, { "epoch": 0.36175710594315247, "grad_norm": 1925.0462646484375, "learning_rate": 3.994766438992882e-05, "loss": 607.4455, "step": 70 }, { "epoch": 0.4134366925064599, "grad_norm": 307.27044677734375, "learning_rate": 3.982427535895982e-05, "loss": 546.8521, "step": 80 }, { "epoch": 0.46511627906976744, "grad_norm": 212.57342529296875, "learning_rate": 3.962882703033195e-05, "loss": 528.296, "step": 90 }, { "epoch": 0.5167958656330749, "grad_norm": 150.5155487060547, "learning_rate": 3.936202983956098e-05, "loss": 485.7102, "step": 100 }, { "epoch": 0.5684754521963824, "grad_norm": 214.31271362304688, "learning_rate": 3.9024853568282615e-05, "loss": 461.3895, "step": 110 }, { "epoch": 0.6201550387596899, "grad_norm": 188.0852813720703, "learning_rate": 3.861852381919132e-05, "loss": 445.7223, "step": 120 }, { "epoch": 0.6718346253229974, "grad_norm": 210.2677764892578, "learning_rate": 3.8144517561094635e-05, "loss": 432.4087, "step": 130 }, { "epoch": 0.7235142118863049, "grad_norm": 147.7620086669922, "learning_rate": 3.760455776027636e-05, "loss": 425.508, "step": 140 }, { "epoch": 0.7751937984496124, "grad_norm": 82.30839538574219, "learning_rate": 3.700060711768302e-05, "loss": 418.1849, "step": 150 }, { "epoch": 0.8268733850129198, "grad_norm": 150.20848083496094, "learning_rate": 3.633486093469829e-05, "loss": 407.9412, "step": 160 }, { "epoch": 0.8785529715762274, "grad_norm": 78.9103775024414, "learning_rate": 3.5609739133437666e-05, "loss": 403.488, "step": 170 }, { "epoch": 0.9302325581395349, "grad_norm": 342.4495544433594, "learning_rate": 3.482787746056881e-05, "loss": 416.6063, "step": 180 }, { "epoch": 0.9819121447028424, "grad_norm": 129.02053833007812, "learning_rate": 3.3992117906630744e-05, "loss": 423.7297, "step": 190 }, { "epoch": 1.0335917312661498, "grad_norm": 219.63763427734375, "learning_rate": 3.310549837567685e-05, "loss": 402.1201, "step": 200 }, { "epoch": 1.0852713178294573, "grad_norm": 88.36170959472656, "learning_rate": 3.2171241642791443e-05, "loss": 394.8549, "step": 210 }, { "epoch": 1.1369509043927648, "grad_norm": 156.26210021972656, "learning_rate": 3.119274363961821e-05, "loss": 393.7007, "step": 220 }, { "epoch": 1.1886304909560723, "grad_norm": 99.02982330322266, "learning_rate": 3.0173561110481606e-05, "loss": 386.1269, "step": 230 }, { "epoch": 1.2403100775193798, "grad_norm": 72.19245147705078, "learning_rate": 2.9117398683969857e-05, "loss": 380.2696, "step": 240 }, { "epoch": 1.2919896640826873, "grad_norm": 174.1908721923828, "learning_rate": 2.80280954069732e-05, "loss": 385.1289, "step": 250 }, { "epoch": 1.3436692506459949, "grad_norm": 172.32257080078125, "learning_rate": 2.6909610790124772e-05, "loss": 382.0893, "step": 260 }, { "epoch": 1.3953488372093024, "grad_norm": 147.81851196289062, "learning_rate": 2.5766010415367567e-05, "loss": 381.5982, "step": 270 }, { "epoch": 1.4470284237726099, "grad_norm": 178.86167907714844, "learning_rate": 2.4601451157962616e-05, "loss": 381.9054, "step": 280 }, { "epoch": 1.4987080103359174, "grad_norm": 63.298179626464844, "learning_rate": 2.3420166076654873e-05, "loss": 382.8202, "step": 290 }, { "epoch": 1.550387596899225, "grad_norm": 40.09732437133789, "learning_rate": 2.2226449026919637e-05, "loss": 367.5341, "step": 300 }, { "epoch": 1.6020671834625322, "grad_norm": 78.8369369506836, "learning_rate": 2.102463905321881e-05, "loss": 378.6904, "step": 310 }, { "epoch": 1.65374677002584, "grad_norm": 99.34815216064453, "learning_rate": 1.9819104616999584e-05, "loss": 373.315, "step": 320 }, { "epoch": 1.7054263565891472, "grad_norm": 59.13042449951172, "learning_rate": 1.8614227717765327e-05, "loss": 372.912, "step": 330 }, { "epoch": 1.757105943152455, "grad_norm": 105.593017578125, "learning_rate": 1.7414387964936913e-05, "loss": 365.8675, "step": 340 }, { "epoch": 1.8087855297157622, "grad_norm": 37.02623748779297, "learning_rate": 1.6223946658401818e-05, "loss": 363.1097, "step": 350 }, { "epoch": 1.8604651162790697, "grad_norm": 70.97673797607422, "learning_rate": 1.5047230935616497e-05, "loss": 365.041, "step": 360 }, { "epoch": 1.9121447028423773, "grad_norm": 98.18872833251953, "learning_rate": 1.3888518042885934e-05, "loss": 363.4152, "step": 370 }, { "epoch": 1.9638242894056848, "grad_norm": 72.34111785888672, "learning_rate": 1.2752019787992587e-05, "loss": 367.2338, "step": 380 }, { "epoch": 2.0155038759689923, "grad_norm": 85.07888793945312, "learning_rate": 1.164186723068795e-05, "loss": 354.9984, "step": 390 }, { "epoch": 2.0671834625322996, "grad_norm": 61.755882263183594, "learning_rate": 1.0562095666695352e-05, "loss": 360.0696, "step": 400 }, { "epoch": 2.1188630490956073, "grad_norm": 122.23174285888672, "learning_rate": 9.516629959805468e-06, "loss": 355.7537, "step": 410 }, { "epoch": 2.1705426356589146, "grad_norm": 65.74703216552734, "learning_rate": 8.50927027538128e-06, "loss": 356.938, "step": 420 }, { "epoch": 2.2222222222222223, "grad_norm": 43.818634033203125, "learning_rate": 7.543678267129408e-06, "loss": 358.646, "step": 430 }, { "epoch": 2.2739018087855296, "grad_norm": 46.30788040161133, "learning_rate": 6.623363767347874e-06, "loss": 360.0613, "step": 440 }, { "epoch": 2.3255813953488373, "grad_norm": 43.955589294433594, "learning_rate": 5.751672029029734e-06, "loss": 353.4506, "step": 450 }, { "epoch": 2.3772609819121446, "grad_norm": 128.83164978027344, "learning_rate": 4.931771566196332e-06, "loss": 353.9634, "step": 460 }, { "epoch": 2.4289405684754524, "grad_norm": 66.79071807861328, "learning_rate": 4.166642636659495e-06, "loss": 355.8042, "step": 470 }, { "epoch": 2.4806201550387597, "grad_norm": 39.45721435546875, "learning_rate": 3.459066409076448e-06, "loss": 351.7647, "step": 480 }, { "epoch": 2.532299741602067, "grad_norm": 42.53300094604492, "learning_rate": 2.8116148536744448e-06, "loss": 356.8211, "step": 490 }, { "epoch": 2.5839793281653747, "grad_norm": 26.914024353027344, "learning_rate": 2.2266413933910426e-06, "loss": 354.2936, "step": 500 }, { "epoch": 2.6356589147286824, "grad_norm": 28.409818649291992, "learning_rate": 1.7062723494124545e-06, "loss": 354.7556, "step": 510 }, { "epoch": 2.6873385012919897, "grad_norm": 29.75973129272461, "learning_rate": 1.252399212204467e-06, "loss": 353.6137, "step": 520 }, { "epoch": 2.739018087855297, "grad_norm": 24.494775772094727, "learning_rate": 8.666717661299917e-07, "loss": 354.3009, "step": 530 }, { "epoch": 2.7906976744186047, "grad_norm": 34.55462646484375, "learning_rate": 5.504920926446611e-07, "loss": 356.5468, "step": 540 }, { "epoch": 2.842377260981912, "grad_norm": 72.49002075195312, "learning_rate": 3.0500947386812973e-07, "loss": 351.6954, "step": 550 }, { "epoch": 2.8940568475452197, "grad_norm": 23.444374084472656, "learning_rate": 1.3111621505626616e-07, "loss": 351.0193, "step": 560 }, { "epoch": 2.945736434108527, "grad_norm": 18.557331085205078, "learning_rate": 2.9444401158995606e-08, "loss": 355.1985, "step": 570 } ], "logging_steps": 10, "max_steps": 579, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.089388751627223e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }