{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 500,
  "global_step": 740,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005405405405405406,
      "grad_norm": 61.001103964725324,
      "learning_rate": 1.0526315789473685e-06,
      "loss": 1.9097,
      "step": 1
    },
    {
      "epoch": 0.02702702702702703,
      "grad_norm": 35.97997402423919,
      "learning_rate": 5.263157894736842e-06,
      "loss": 1.7102,
      "step": 5
    },
    {
      "epoch": 0.05405405405405406,
      "grad_norm": 30.10156193198182,
      "learning_rate": 1.0526315789473684e-05,
      "loss": 1.5516,
      "step": 10
    },
    {
      "epoch": 0.08108108108108109,
      "grad_norm": 6.532111146208257,
      "learning_rate": 1.578947368421053e-05,
      "loss": 1.4536,
      "step": 15
    },
    {
      "epoch": 0.10810810810810811,
      "grad_norm": 4.983904851506608,
      "learning_rate": 1.999820922669738e-05,
      "loss": 1.4112,
      "step": 20
    },
    {
      "epoch": 0.13513513513513514,
      "grad_norm": 3.8732866550483545,
      "learning_rate": 1.993559947963185e-05,
      "loss": 1.4056,
      "step": 25
    },
    {
      "epoch": 0.16216216216216217,
      "grad_norm": 3.0620367608079846,
      "learning_rate": 1.9784091409455728e-05,
      "loss": 1.4016,
      "step": 30
    },
    {
      "epoch": 0.1891891891891892,
      "grad_norm": 3.2352358324557167,
      "learning_rate": 1.9545040627715554e-05,
      "loss": 1.4128,
      "step": 35
    },
    {
      "epoch": 0.21621621621621623,
      "grad_norm": 3.022703286965415,
      "learning_rate": 1.9220586030376135e-05,
      "loss": 1.4148,
      "step": 40
    },
    {
      "epoch": 0.24324324324324326,
      "grad_norm": 2.8955611484982033,
      "learning_rate": 1.881363066014649e-05,
      "loss": 1.4159,
      "step": 45
    },
    {
      "epoch": 0.2702702702702703,
      "grad_norm": 2.8660620524615688,
      "learning_rate": 1.8327815731637612e-05,
      "loss": 1.42,
      "step": 50
    },
    {
      "epoch": 0.2972972972972973,
      "grad_norm": 2.632475843797219,
      "learning_rate": 1.7767488051760858e-05,
      "loss": 1.4078,
      "step": 55
    },
    {
      "epoch": 0.32432432432432434,
      "grad_norm": 2.5436272914892872,
      "learning_rate": 1.713766112687139e-05,
      "loss": 1.4015,
      "step": 60
    },
    {
      "epoch": 0.35135135135135137,
      "grad_norm": 2.431239149211266,
      "learning_rate": 1.644397030464877e-05,
      "loss": 1.3807,
      "step": 65
    },
    {
      "epoch": 0.3783783783783784,
      "grad_norm": 2.328539582829742,
      "learning_rate": 1.5692622352080662e-05,
      "loss": 1.3639,
      "step": 70
    },
    {
      "epoch": 0.40540540540540543,
      "grad_norm": 2.708625235728292,
      "learning_rate": 1.4890339920698334e-05,
      "loss": 1.3836,
      "step": 75
    },
    {
      "epoch": 0.43243243243243246,
      "grad_norm": 2.4202434897618317,
      "learning_rate": 1.404430139595877e-05,
      "loss": 1.3823,
      "step": 80
    },
    {
      "epoch": 0.4594594594594595,
      "grad_norm": 2.243346482513894,
      "learning_rate": 1.316207666896824e-05,
      "loss": 1.3538,
      "step": 85
    },
    {
      "epoch": 0.4864864864864865,
      "grad_norm": 2.4109049967489433,
      "learning_rate": 1.2251559405226943e-05,
      "loss": 1.3503,
      "step": 90
    },
    {
      "epoch": 0.5135135135135135,
      "grad_norm": 2.242883116591874,
      "learning_rate": 1.1320896416417026e-05,
      "loss": 1.3562,
      "step": 95
    },
    {
      "epoch": 0.5405405405405406,
      "grad_norm": 2.1959662600808203,
      "learning_rate": 1.0378414767176706e-05,
      "loss": 1.3447,
      "step": 100
    },
    {
      "epoch": 0.5675675675675675,
      "grad_norm": 2.2348646553535714,
      "learning_rate": 9.43254726906926e-06,
      "loss": 1.3398,
      "step": 105
    },
    {
      "epoch": 0.5945945945945946,
      "grad_norm": 2.21405256524929,
      "learning_rate": 8.491757028386262e-06,
      "loss": 1.3383,
      "step": 110
    },
    {
      "epoch": 0.6216216216216216,
      "grad_norm": 2.1710850419967263,
      "learning_rate": 7.564461722890082e-06,
      "loss": 1.3271,
      "step": 115
    },
    {
      "epoch": 0.6486486486486487,
      "grad_norm": 2.1752417729097338,
      "learning_rate": 6.6589582850261025e-06,
      "loss": 1.3176,
      "step": 120
    },
    {
      "epoch": 0.6756756756756757,
      "grad_norm": 2.1322248363189686,
      "learning_rate": 5.78334866549816e-06,
      "loss": 1.321,
      "step": 125
    },
    {
      "epoch": 0.7027027027027027,
      "grad_norm": 2.1516864548066525,
      "learning_rate": 4.9454673414341945e-06,
      "loss": 1.3074,
      "step": 130
    },
    {
      "epoch": 0.7297297297297297,
      "grad_norm": 2.04797883118049,
      "learning_rate": 4.152811217759529e-06,
      "loss": 1.3123,
      "step": 135
    },
    {
      "epoch": 0.7567567567567568,
      "grad_norm": 2.1202023783270367,
      "learning_rate": 3.4124725489820643e-06,
      "loss": 1.3026,
      "step": 140
    },
    {
      "epoch": 0.7837837837837838,
      "grad_norm": 2.1203746539007158,
      "learning_rate": 2.7310754815685627e-06,
      "loss": 1.2838,
      "step": 145
    },
    {
      "epoch": 0.8108108108108109,
      "grad_norm": 2.058016060238832,
      "learning_rate": 2.114716784696342e-06,
      "loss": 1.2805,
      "step": 150
    },
    {
      "epoch": 0.8378378378378378,
      "grad_norm": 2.072702697896577,
      "learning_rate": 1.5689112996891576e-06,
      "loss": 1.276,
      "step": 155
    },
    {
      "epoch": 0.8648648648648649,
      "grad_norm": 2.070358297476048,
      "learning_rate": 1.0985425962260342e-06,
      "loss": 1.2768,
      "step": 160
    },
    {
      "epoch": 0.8918918918918919,
      "grad_norm": 1.993421902652665,
      "learning_rate": 7.078192768243486e-07,
      "loss": 1.2755,
      "step": 165
    },
    {
      "epoch": 0.918918918918919,
      "grad_norm": 2.02321205406663,
      "learning_rate": 4.0023732056077235e-07,
      "loss": 1.276,
      "step": 170
    },
    {
      "epoch": 0.9459459459459459,
      "grad_norm": 2.0216958269361154,
      "learning_rate": 1.7854880295797406e-07,
      "loss": 1.2694,
      "step": 175
    },
    {
      "epoch": 0.972972972972973,
      "grad_norm": 2.0002791761840157,
      "learning_rate": 4.473727191441124e-08,
      "loss": 1.2669,
      "step": 180
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.9847900492979156,
      "learning_rate": 0.0,
      "loss": 1.2583,
      "step": 185
    },
    {
      "epoch": 1.027027027027027,
      "grad_norm": 5.410199324361583,
      "learning_rate": 1.853992867931721e-05,
      "loss": 0.944,
      "step": 190
    },
    {
      "epoch": 1.054054054054054,
      "grad_norm": 5.152995409876598,
      "learning_rate": 1.8414852973000503e-05,
      "loss": 0.956,
      "step": 195
    },
    {
      "epoch": 1.0810810810810811,
      "grad_norm": 3.933917659184313,
      "learning_rate": 1.8285096492438424e-05,
      "loss": 0.9283,
      "step": 200
    },
    {
      "epoch": 1.1081081081081081,
      "grad_norm": 2.759086157792593,
      "learning_rate": 1.8150731414862623e-05,
      "loss": 0.9151,
      "step": 205
    },
    {
      "epoch": 1.135135135135135,
      "grad_norm": 2.910077750970637,
      "learning_rate": 1.8011832481043577e-05,
      "loss": 0.9413,
      "step": 210
    },
    {
      "epoch": 1.1621621621621623,
      "grad_norm": 3.1297209827761985,
      "learning_rate": 1.78684769537159e-05,
      "loss": 0.9243,
      "step": 215
    },
    {
      "epoch": 1.1891891891891893,
      "grad_norm": 3.406559269295354,
      "learning_rate": 1.7720744574600865e-05,
      "loss": 0.9474,
      "step": 220
    },
    {
      "epoch": 1.2162162162162162,
      "grad_norm": 3.2575862603317876,
      "learning_rate": 1.756871752004992e-05,
      "loss": 0.9461,
      "step": 225
    },
    {
      "epoch": 1.2432432432432432,
      "grad_norm": 3.2529118198639644,
      "learning_rate": 1.7412480355334006e-05,
      "loss": 0.9385,
      "step": 230
    },
    {
      "epoch": 1.2702702702702702,
      "grad_norm": 3.096374104973297,
      "learning_rate": 1.7252119987603976e-05,
      "loss": 0.9564,
      "step": 235
    },
    {
      "epoch": 1.2972972972972974,
      "grad_norm": 2.9780724324068695,
      "learning_rate": 1.7087725617548385e-05,
      "loss": 0.9456,
      "step": 240
    },
    {
      "epoch": 1.3243243243243243,
      "grad_norm": 2.887431987514818,
      "learning_rate": 1.6919388689775463e-05,
      "loss": 0.9608,
      "step": 245
    },
    {
      "epoch": 1.3513513513513513,
      "grad_norm": 3.420843173177096,
      "learning_rate": 1.6747202841946928e-05,
      "loss": 0.9817,
      "step": 250
    },
    {
      "epoch": 1.3783783783783785,
      "grad_norm": 2.817422226047505,
      "learning_rate": 1.6571263852691887e-05,
      "loss": 0.9794,
      "step": 255
    },
    {
      "epoch": 1.4054054054054055,
      "grad_norm": 2.8475793868122135,
      "learning_rate": 1.639166958832985e-05,
      "loss": 0.9249,
      "step": 260
    },
    {
      "epoch": 1.4324324324324325,
      "grad_norm": 3.084978828155213,
      "learning_rate": 1.6208519948432438e-05,
      "loss": 0.9694,
      "step": 265
    },
    {
      "epoch": 1.4594594594594594,
      "grad_norm": 2.9443084757542923,
      "learning_rate": 1.6021916810254096e-05,
      "loss": 0.9646,
      "step": 270
    },
    {
      "epoch": 1.4864864864864864,
      "grad_norm": 2.7978723842230555,
      "learning_rate": 1.5831963972062734e-05,
      "loss": 0.9614,
      "step": 275
    },
    {
      "epoch": 1.5135135135135136,
      "grad_norm": 2.8201773773365155,
      "learning_rate": 1.5638767095401778e-05,
      "loss": 0.9785,
      "step": 280
    },
    {
      "epoch": 1.5405405405405406,
      "grad_norm": 2.6511276981563907,
      "learning_rate": 1.5442433646315792e-05,
      "loss": 0.956,
      "step": 285
    },
    {
      "epoch": 1.5675675675675675,
      "grad_norm": 2.907386340027783,
      "learning_rate": 1.5243072835572319e-05,
      "loss": 0.9386,
      "step": 290
    },
    {
      "epoch": 1.5945945945945947,
      "grad_norm": 3.2379041081891455,
      "learning_rate": 1.5040795557913246e-05,
      "loss": 0.9891,
      "step": 295
    },
    {
      "epoch": 1.6216216216216215,
      "grad_norm": 2.659956186011908,
      "learning_rate": 1.4835714330369445e-05,
      "loss": 0.9806,
      "step": 300
    },
    {
      "epoch": 1.6486486486486487,
      "grad_norm": 2.794287463461555,
      "learning_rate": 1.4627943229672992e-05,
      "loss": 0.9856,
      "step": 305
    },
    {
      "epoch": 1.6756756756756757,
      "grad_norm": 2.9825237321465647,
      "learning_rate": 1.4417597828801833e-05,
      "loss": 1.011,
      "step": 310
    },
    {
      "epoch": 1.7027027027027026,
      "grad_norm": 2.6599834596055145,
      "learning_rate": 1.4204795132692146e-05,
      "loss": 0.9682,
      "step": 315
    },
    {
      "epoch": 1.7297297297297298,
      "grad_norm": 2.63763723653628,
      "learning_rate": 1.3989653513154165e-05,
      "loss": 0.9703,
      "step": 320
    },
    {
      "epoch": 1.7567567567567568,
      "grad_norm": 2.702492331093115,
      "learning_rate": 1.37722926430277e-05,
      "loss": 0.9695,
      "step": 325
    },
    {
      "epoch": 1.7837837837837838,
      "grad_norm": 2.6668927523262234,
      "learning_rate": 1.3552833429613939e-05,
      "loss": 0.9621,
      "step": 330
    },
    {
      "epoch": 1.810810810810811,
      "grad_norm": 2.830202295132874,
      "learning_rate": 1.3331397947420578e-05,
      "loss": 1.0006,
      "step": 335
    },
    {
      "epoch": 1.8378378378378377,
      "grad_norm": 2.960495369045161,
      "learning_rate": 1.3108109370257714e-05,
      "loss": 0.9658,
      "step": 340
    },
    {
      "epoch": 1.864864864864865,
      "grad_norm": 2.8089453201114005,
      "learning_rate": 1.288309190272222e-05,
      "loss": 0.978,
      "step": 345
    },
    {
      "epoch": 1.8918918918918919,
      "grad_norm": 2.690571777156308,
      "learning_rate": 1.2656470711108763e-05,
      "loss": 0.9694,
      "step": 350
    },
    {
      "epoch": 1.9189189189189189,
      "grad_norm": 3.072726161364189,
      "learning_rate": 1.2428371853785872e-05,
      "loss": 0.995,
      "step": 355
    },
    {
      "epoch": 1.945945945945946,
      "grad_norm": 2.650492212953216,
      "learning_rate": 1.2198922211075779e-05,
      "loss": 0.9633,
      "step": 360
    },
    {
      "epoch": 1.972972972972973,
      "grad_norm": 2.4600728300935524,
      "learning_rate": 1.1968249414677055e-05,
      "loss": 0.9518,
      "step": 365
    },
    {
      "epoch": 2.0,
      "grad_norm": 2.660921417467904,
      "learning_rate": 1.1736481776669307e-05,
      "loss": 0.9697,
      "step": 370
    },
    {
      "epoch": 2.0,
      "eval_loss": 1.2888436317443848,
      "eval_runtime": 49.8535,
      "eval_samples_per_second": 13.219,
      "eval_steps_per_second": 0.421,
      "step": 370
    },
    {
      "epoch": 2.027027027027027,
      "grad_norm": 4.650300715219466,
      "learning_rate": 1.150374821813937e-05,
      "loss": 0.4819,
      "step": 375
    },
    {
      "epoch": 2.054054054054054,
      "grad_norm": 3.6035388443171055,
      "learning_rate": 1.1270178197468788e-05,
      "loss": 0.4496,
      "step": 380
    },
    {
      "epoch": 2.081081081081081,
      "grad_norm": 3.2195791970064738,
      "learning_rate": 1.1035901638322392e-05,
      "loss": 0.4485,
      "step": 385
    },
    {
      "epoch": 2.108108108108108,
      "grad_norm": 3.969479182342283,
      "learning_rate": 1.080104885737807e-05,
      "loss": 0.431,
      "step": 390
    },
    {
      "epoch": 2.135135135135135,
      "grad_norm": 3.5220560829841108,
      "learning_rate": 1.0565750491837925e-05,
      "loss": 0.4358,
      "step": 395
    },
    {
      "epoch": 2.1621621621621623,
      "grad_norm": 3.054414879545972,
      "learning_rate": 1.0330137426761136e-05,
      "loss": 0.4195,
      "step": 400
    },
    {
      "epoch": 2.189189189189189,
      "grad_norm": 2.8407451960442844,
      "learning_rate": 1.0094340722258969e-05,
      "loss": 0.4139,
      "step": 405
    },
    {
      "epoch": 2.2162162162162162,
      "grad_norm": 3.1380887590616835,
      "learning_rate": 9.858491540592383e-06,
      "loss": 0.4288,
      "step": 410
    },
    {
      "epoch": 2.2432432432432434,
      "grad_norm": 3.0000811662471607,
      "learning_rate": 9.622721073212831e-06,
      "loss": 0.4307,
      "step": 415
    },
    {
      "epoch": 2.27027027027027,
      "grad_norm": 2.9078632951242,
      "learning_rate": 9.38716046778684e-06,
      "loss": 0.4346,
      "step": 420
    },
    {
      "epoch": 2.2972972972972974,
      "grad_norm": 3.0788575744315363,
      "learning_rate": 9.151940755244912e-06,
      "loss": 0.4354,
      "step": 425
    },
    {
      "epoch": 2.3243243243243246,
      "grad_norm": 3.2452582293940764,
      "learning_rate": 8.917192776895382e-06,
      "loss": 0.43,
      "step": 430
    },
    {
      "epoch": 2.3513513513513513,
      "grad_norm": 2.9959040185970345,
      "learning_rate": 8.683047111643764e-06,
      "loss": 0.4288,
      "step": 435
    },
    {
      "epoch": 2.3783783783783785,
      "grad_norm": 2.9976789493802642,
      "learning_rate": 8.449634003358022e-06,
      "loss": 0.4154,
      "step": 440
    },
    {
      "epoch": 2.4054054054054053,
      "grad_norm": 3.342720495579085,
      "learning_rate": 8.217083288420241e-06,
      "loss": 0.4184,
      "step": 445
    },
    {
      "epoch": 2.4324324324324325,
      "grad_norm": 2.9954997310319817,
      "learning_rate": 7.985524323504948e-06,
      "loss": 0.4244,
      "step": 450
    },
    {
      "epoch": 2.4594594594594597,
      "grad_norm": 3.1107476712409436,
      "learning_rate": 7.755085913624274e-06,
      "loss": 0.4201,
      "step": 455
    },
    {
      "epoch": 2.4864864864864864,
      "grad_norm": 2.982814359917317,
      "learning_rate": 7.525896240479977e-06,
      "loss": 0.4223,
      "step": 460
    },
    {
      "epoch": 2.5135135135135136,
      "grad_norm": 2.918536641642885,
      "learning_rate": 7.29808279116218e-06,
      "loss": 0.4355,
      "step": 465
    },
    {
      "epoch": 2.5405405405405403,
      "grad_norm": 3.0794937831504914,
      "learning_rate": 7.071772287234497e-06,
      "loss": 0.4182,
      "step": 470
    },
    {
      "epoch": 2.5675675675675675,
      "grad_norm": 3.350485105176484,
      "learning_rate": 6.8470906142449764e-06,
      "loss": 0.4173,
      "step": 475
    },
    {
      "epoch": 2.5945945945945947,
      "grad_norm": 2.978358242778936,
      "learning_rate": 6.624162751702077e-06,
      "loss": 0.4257,
      "step": 480
    },
    {
      "epoch": 2.6216216216216215,
      "grad_norm": 3.03990431149403,
      "learning_rate": 6.403112703554643e-06,
      "loss": 0.4198,
      "step": 485
    },
    {
      "epoch": 2.6486486486486487,
      "grad_norm": 3.2537909886176797,
      "learning_rate": 6.184063429214515e-06,
      "loss": 0.4289,
      "step": 490
    },
    {
      "epoch": 2.6756756756756754,
      "grad_norm": 2.9401645896134068,
      "learning_rate": 5.967136775160188e-06,
      "loss": 0.4229,
      "step": 495
    },
    {
      "epoch": 2.7027027027027026,
      "grad_norm": 2.8796497097799385,
      "learning_rate": 5.752453407159521e-06,
      "loss": 0.4134,
      "step": 500
    },
    {
      "epoch": 2.72972972972973,
      "grad_norm": 3.240398354047165,
      "learning_rate": 5.5401327431492425e-06,
      "loss": 0.4212,
      "step": 505
    },
    {
      "epoch": 2.756756756756757,
      "grad_norm": 3.0875752772388556,
      "learning_rate": 5.33029288680852e-06,
      "loss": 0.4366,
      "step": 510
    },
    {
      "epoch": 2.7837837837837838,
      "grad_norm": 2.950418599154064,
      "learning_rate": 5.1230505618636575e-06,
      "loss": 0.4132,
      "step": 515
    },
    {
      "epoch": 2.810810810810811,
      "grad_norm": 2.9691405536481708,
      "learning_rate": 4.918521047160309e-06,
      "loss": 0.421,
      "step": 520
    },
    {
      "epoch": 2.8378378378378377,
      "grad_norm": 3.288503082597523,
      "learning_rate": 4.716818112539485e-06,
      "loss": 0.4177,
      "step": 525
    },
    {
      "epoch": 2.864864864864865,
      "grad_norm": 2.8513863896448304,
      "learning_rate": 4.518053955552903e-06,
      "loss": 0.4027,
      "step": 530
    },
    {
      "epoch": 2.891891891891892,
      "grad_norm": 2.906451097506746,
      "learning_rate": 4.322339139052922e-06,
      "loss": 0.4104,
      "step": 535
    },
    {
      "epoch": 2.918918918918919,
      "grad_norm": 2.8739958546058255,
      "learning_rate": 4.1297825296918145e-06,
      "loss": 0.4006,
      "step": 540
    },
    {
      "epoch": 2.945945945945946,
      "grad_norm": 2.9696442817419113,
      "learning_rate": 3.940491237364519e-06,
      "loss": 0.4,
      "step": 545
    },
    {
      "epoch": 2.972972972972973,
      "grad_norm": 3.0645448850809793,
      "learning_rate": 3.754570555628613e-06,
      "loss": 0.4043,
      "step": 550
    },
    {
      "epoch": 3.0,
      "grad_norm": 2.9322335051873423,
      "learning_rate": 3.5721239031346067e-06,
      "loss": 0.4114,
      "step": 555
    },
    {
      "epoch": 3.0,
      "eval_loss": 1.4739221334457397,
      "eval_runtime": 47.6894,
      "eval_samples_per_second": 13.819,
      "eval_steps_per_second": 0.44,
      "step": 555
    },
    {
      "epoch": 3.027027027027027,
      "grad_norm": 2.239642555101549,
      "learning_rate": 3.3932527660991877e-06,
      "loss": 0.1581,
      "step": 560
    },
    {
      "epoch": 3.054054054054054,
      "grad_norm": 3.1082908400603464,
      "learning_rate": 3.2180566418533365e-06,
      "loss": 0.1515,
      "step": 565
    },
    {
      "epoch": 3.081081081081081,
      "grad_norm": 2.152743576813358,
      "learning_rate": 3.0466329834968234e-06,
      "loss": 0.1438,
      "step": 570
    },
    {
      "epoch": 3.108108108108108,
      "grad_norm": 2.232485171340327,
      "learning_rate": 2.879077145689746e-06,
      "loss": 0.1452,
      "step": 575
    },
    {
      "epoch": 3.135135135135135,
      "grad_norm": 2.1315167934046464,
      "learning_rate": 2.715482331611393e-06,
      "loss": 0.1483,
      "step": 580
    },
    {
      "epoch": 3.1621621621621623,
      "grad_norm": 1.9641583881749838,
      "learning_rate": 2.5559395411158116e-06,
      "loss": 0.1428,
      "step": 585
    },
    {
      "epoch": 3.189189189189189,
      "grad_norm": 2.136285665522953,
      "learning_rate": 2.4005375201130275e-06,
      "loss": 0.1456,
      "step": 590
    },
    {
      "epoch": 3.2162162162162162,
      "grad_norm": 2.3296203504449413,
      "learning_rate": 2.249362711203985e-06,
      "loss": 0.1451,
      "step": 595
    },
    {
      "epoch": 3.2432432432432434,
      "grad_norm": 2.1953889356463105,
      "learning_rate": 2.102499205596743e-06,
      "loss": 0.1412,
      "step": 600
    },
    {
      "epoch": 3.27027027027027,
      "grad_norm": 1.9846220715180103,
      "learning_rate": 1.960028696330596e-06,
      "loss": 0.139,
      "step": 605
    },
    {
      "epoch": 3.2972972972972974,
      "grad_norm": 2.0955721946100345,
      "learning_rate": 1.8220304328342253e-06,
      "loss": 0.1379,
      "step": 610
    },
    {
      "epoch": 3.3243243243243246,
      "grad_norm": 2.13188633069558,
      "learning_rate": 1.688581176843066e-06,
      "loss": 0.1401,
      "step": 615
    },
    {
      "epoch": 3.3513513513513513,
      "grad_norm": 2.152817300804356,
      "learning_rate": 1.5597551597004968e-06,
      "loss": 0.1389,
      "step": 620
    },
    {
      "epoch": 3.3783783783783785,
      "grad_norm": 2.0539946534038305,
      "learning_rate": 1.4356240410665435e-06,
      "loss": 0.141,
      "step": 625
    },
    {
      "epoch": 3.4054054054054053,
      "grad_norm": 2.162669615931553,
      "learning_rate": 1.3162568690570743e-06,
      "loss": 0.1437,
      "step": 630
    },
    {
      "epoch": 3.4324324324324325,
      "grad_norm": 2.153506447556338,
      "learning_rate": 1.2017200418357077e-06,
      "loss": 0.1417,
      "step": 635
    },
    {
      "epoch": 3.4594594594594597,
      "grad_norm": 2.1701177657529964,
      "learning_rate": 1.0920772706797166e-06,
      "loss": 0.142,
      "step": 640
    },
    {
      "epoch": 3.4864864864864864,
      "grad_norm": 2.0721670365744407,
      "learning_rate": 9.873895445405523e-07,
      "loss": 0.14,
      "step": 645
    },
    {
      "epoch": 3.5135135135135136,
      "grad_norm": 2.0406246464567612,
      "learning_rate": 8.87715096118642e-07,
      "loss": 0.1398,
      "step": 650
    },
    {
      "epoch": 3.5405405405405403,
      "grad_norm": 2.0428479721186035,
      "learning_rate": 7.931093694713687e-07,
      "loss": 0.1382,
      "step": 655
    },
    {
      "epoch": 3.5675675675675675,
      "grad_norm": 2.234111473839529,
      "learning_rate": 7.03624989172228e-07,
      "loss": 0.1428,
      "step": 660
    },
    {
      "epoch": 3.5945945945945947,
      "grad_norm": 1.9632293170603063,
      "learning_rate": 6.193117310383412e-07,
      "loss": 0.136,
      "step": 665
    },
    {
      "epoch": 3.6216216216216215,
      "grad_norm": 2.0540773804658286,
      "learning_rate": 5.402164944425758e-07,
      "loss": 0.1355,
      "step": 670
    },
    {
      "epoch": 3.6486486486486487,
      "grad_norm": 2.0789858869481144,
      "learning_rate": 4.66383276225707e-07,
      "loss": 0.1404,
      "step": 675
    },
    {
      "epoch": 3.6756756756756754,
      "grad_norm": 2.2440192963067678,
      "learning_rate": 3.97853146223105e-07,
      "loss": 0.1384,
      "step": 680
    },
    {
      "epoch": 3.7027027027027026,
      "grad_norm": 2.0732015708107703,
      "learning_rate": 3.346642244195863e-07,
      "loss": 0.1359,
      "step": 685
    },
    {
      "epoch": 3.72972972972973,
      "grad_norm": 1.982186914698493,
      "learning_rate": 2.7685165974510987e-07,
      "loss": 0.1367,
      "step": 690
    },
    {
      "epoch": 3.756756756756757,
      "grad_norm": 2.0636323678600306,
      "learning_rate": 2.2444761052313857e-07,
      "loss": 0.136,
      "step": 695
    },
    {
      "epoch": 3.7837837837837838,
      "grad_norm": 2.133975894121271,
      "learning_rate": 1.7748122658251877e-07,
      "loss": 0.1398,
      "step": 700
    },
    {
      "epoch": 3.810810810810811,
      "grad_norm": 1.9445931920281623,
      "learning_rate": 1.3597863304285475e-07,
      "loss": 0.1384,
      "step": 705
    },
    {
      "epoch": 3.8378378378378377,
      "grad_norm": 2.1247798692529414,
      "learning_rate": 9.996291578236228e-08,
      "loss": 0.1361,
      "step": 710
    },
    {
      "epoch": 3.864864864864865,
      "grad_norm": 2.128261177290376,
      "learning_rate": 6.945410859632295e-08,
      "loss": 0.1326,
      "step": 715
    },
    {
      "epoch": 3.891891891891892,
      "grad_norm": 2.059927006960735,
      "learning_rate": 4.44691820532539e-08,
      "loss": 0.1355,
      "step": 720
    },
    {
      "epoch": 3.918918918918919,
      "grad_norm": 1.942567955434284,
      "learning_rate": 2.5022034055003363e-08,
      "loss": 0.1415,
      "step": 725
    },
    {
      "epoch": 3.945945945945946,
      "grad_norm": 1.9857313689273417,
      "learning_rate": 1.1123482106021322e-08,
      "loss": 0.1356,
      "step": 730
    },
    {
      "epoch": 3.972972972972973,
      "grad_norm": 2.0940341606759816,
      "learning_rate": 2.7812572961127825e-09,
      "loss": 0.1399,
      "step": 735
    },
    {
      "epoch": 4.0,
      "grad_norm": 1.9231586161743661,
      "learning_rate": 0.0,
      "loss": 0.1361,
      "step": 740
    },
    {
      "epoch": 4.0,
      "eval_loss": 1.7972235679626465,
      "eval_runtime": 46.1514,
      "eval_samples_per_second": 14.279,
      "eval_steps_per_second": 0.455,
      "step": 740
    },
    {
      "epoch": 4.0,
      "step": 740,
      "total_flos": 77470472601600.0,
      "train_loss": 0.38161755220310106,
      "train_runtime": 4944.5179,
      "train_samples_per_second": 4.784,
      "train_steps_per_second": 0.15
    }
  ],
  "logging_steps": 5,
  "max_steps": 740,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 800,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 77470472601600.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}