{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.444444444444445,
  "eval_steps": 10,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08888888888888889,
      "grad_norm": 6.322239398956299,
      "learning_rate": 4e-05,
      "loss": 2.1156,
      "step": 10
    },
    {
      "epoch": 0.08888888888888889,
      "eval_loss": 1.5894473791122437,
      "eval_runtime": 9.6219,
      "eval_samples_per_second": 10.393,
      "eval_steps_per_second": 10.393,
      "step": 10
    },
    {
      "epoch": 0.17777777777777778,
      "grad_norm": 4.342599391937256,
      "learning_rate": 8e-05,
      "loss": 1.1893,
      "step": 20
    },
    {
      "epoch": 0.17777777777777778,
      "eval_loss": 0.6867849826812744,
      "eval_runtime": 9.506,
      "eval_samples_per_second": 10.52,
      "eval_steps_per_second": 10.52,
      "step": 20
    },
    {
      "epoch": 0.26666666666666666,
      "grad_norm": 2.0583019256591797,
      "learning_rate": 0.00012,
      "loss": 0.5218,
      "step": 30
    },
    {
      "epoch": 0.26666666666666666,
      "eval_loss": 0.45551854372024536,
      "eval_runtime": 9.5056,
      "eval_samples_per_second": 10.52,
      "eval_steps_per_second": 10.52,
      "step": 30
    },
    {
      "epoch": 0.35555555555555557,
      "grad_norm": 1.83012056350708,
      "learning_rate": 0.00016,
      "loss": 0.5292,
      "step": 40
    },
    {
      "epoch": 0.35555555555555557,
      "eval_loss": 0.3795148730278015,
      "eval_runtime": 9.3637,
      "eval_samples_per_second": 10.68,
      "eval_steps_per_second": 10.68,
      "step": 40
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 2.1445064544677734,
      "learning_rate": 0.0002,
      "loss": 0.3866,
      "step": 50
    },
    {
      "epoch": 0.4444444444444444,
      "eval_loss": 0.30648669600486755,
      "eval_runtime": 9.2398,
      "eval_samples_per_second": 10.823,
      "eval_steps_per_second": 10.823,
      "step": 50
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 1.3674068450927734,
      "learning_rate": 0.00019975640502598244,
      "loss": 0.3232,
      "step": 60
    },
    {
      "epoch": 0.5333333333333333,
      "eval_loss": 0.20737296342849731,
      "eval_runtime": 9.2548,
      "eval_samples_per_second": 10.805,
      "eval_steps_per_second": 10.805,
      "step": 60
    },
    {
      "epoch": 0.6222222222222222,
      "grad_norm": 1.0751285552978516,
      "learning_rate": 0.00019902680687415705,
      "loss": 0.1802,
      "step": 70
    },
    {
      "epoch": 0.6222222222222222,
      "eval_loss": 0.15315091609954834,
      "eval_runtime": 9.2724,
      "eval_samples_per_second": 10.785,
      "eval_steps_per_second": 10.785,
      "step": 70
    },
    {
      "epoch": 0.7111111111111111,
      "grad_norm": 0.568341076374054,
      "learning_rate": 0.00019781476007338058,
      "loss": 0.21,
      "step": 80
    },
    {
      "epoch": 0.7111111111111111,
      "eval_loss": 0.13480396568775177,
      "eval_runtime": 9.1987,
      "eval_samples_per_second": 10.871,
      "eval_steps_per_second": 10.871,
      "step": 80
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.9985604286193848,
      "learning_rate": 0.0001961261695938319,
      "loss": 0.158,
      "step": 90
    },
    {
      "epoch": 0.8,
      "eval_loss": 0.13721750676631927,
      "eval_runtime": 9.2578,
      "eval_samples_per_second": 10.802,
      "eval_steps_per_second": 10.802,
      "step": 90
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.8096336722373962,
      "learning_rate": 0.00019396926207859084,
      "loss": 0.1629,
      "step": 100
    },
    {
      "epoch": 0.8888888888888888,
      "eval_loss": 0.12762245535850525,
      "eval_runtime": 9.2566,
      "eval_samples_per_second": 10.803,
      "eval_steps_per_second": 10.803,
      "step": 100
    },
    {
      "epoch": 0.9777777777777777,
      "grad_norm": 1.2173279523849487,
      "learning_rate": 0.0001913545457642601,
      "loss": 0.0966,
      "step": 110
    },
    {
      "epoch": 0.9777777777777777,
      "eval_loss": 0.10031093657016754,
      "eval_runtime": 9.365,
      "eval_samples_per_second": 10.678,
      "eval_steps_per_second": 10.678,
      "step": 110
    },
    {
      "epoch": 1.0666666666666667,
      "grad_norm": 0.7404722571372986,
      "learning_rate": 0.00018829475928589271,
      "loss": 0.0643,
      "step": 120
    },
    {
      "epoch": 1.0666666666666667,
      "eval_loss": 0.08794313669204712,
      "eval_runtime": 9.3908,
      "eval_samples_per_second": 10.649,
      "eval_steps_per_second": 10.649,
      "step": 120
    },
    {
      "epoch": 1.1555555555555554,
      "grad_norm": 0.9548615217208862,
      "learning_rate": 0.0001848048096156426,
      "loss": 0.0726,
      "step": 130
    },
    {
      "epoch": 1.1555555555555554,
      "eval_loss": 0.08720792084932327,
      "eval_runtime": 9.3262,
      "eval_samples_per_second": 10.722,
      "eval_steps_per_second": 10.722,
      "step": 130
    },
    {
      "epoch": 1.2444444444444445,
      "grad_norm": 1.0252933502197266,
      "learning_rate": 0.00018090169943749476,
      "loss": 0.0493,
      "step": 140
    },
    {
      "epoch": 1.2444444444444445,
      "eval_loss": 0.09057007730007172,
      "eval_runtime": 9.3221,
      "eval_samples_per_second": 10.727,
      "eval_steps_per_second": 10.727,
      "step": 140
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.5175765156745911,
      "learning_rate": 0.0001766044443118978,
      "loss": 0.0746,
      "step": 150
    },
    {
      "epoch": 1.3333333333333333,
      "eval_loss": 0.058685798197984695,
      "eval_runtime": 9.3758,
      "eval_samples_per_second": 10.666,
      "eval_steps_per_second": 10.666,
      "step": 150
    },
    {
      "epoch": 1.4222222222222223,
      "grad_norm": 0.33382534980773926,
      "learning_rate": 0.0001719339800338651,
      "loss": 0.0473,
      "step": 160
    },
    {
      "epoch": 1.4222222222222223,
      "eval_loss": 0.0560651533305645,
      "eval_runtime": 9.6017,
      "eval_samples_per_second": 10.415,
      "eval_steps_per_second": 10.415,
      "step": 160
    },
    {
      "epoch": 1.511111111111111,
      "grad_norm": 1.0913978815078735,
      "learning_rate": 0.00016691306063588583,
      "loss": 0.0644,
      "step": 170
    },
    {
      "epoch": 1.511111111111111,
      "eval_loss": 0.05025744438171387,
      "eval_runtime": 9.3986,
      "eval_samples_per_second": 10.64,
      "eval_steps_per_second": 10.64,
      "step": 170
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.8013315796852112,
      "learning_rate": 0.0001615661475325658,
      "loss": 0.0366,
      "step": 180
    },
    {
      "epoch": 1.6,
      "eval_loss": 0.030684156343340874,
      "eval_runtime": 9.3927,
      "eval_samples_per_second": 10.647,
      "eval_steps_per_second": 10.647,
      "step": 180
    },
    {
      "epoch": 1.6888888888888889,
      "grad_norm": 0.2898242175579071,
      "learning_rate": 0.0001559192903470747,
      "loss": 0.0247,
      "step": 190
    },
    {
      "epoch": 1.6888888888888889,
      "eval_loss": 0.023328043520450592,
      "eval_runtime": 9.6997,
      "eval_samples_per_second": 10.31,
      "eval_steps_per_second": 10.31,
      "step": 190
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 0.0667782798409462,
      "learning_rate": 0.00015000000000000001,
      "loss": 0.01,
      "step": 200
    },
    {
      "epoch": 1.7777777777777777,
      "eval_loss": 0.021545417606830597,
      "eval_runtime": 9.6129,
      "eval_samples_per_second": 10.403,
      "eval_steps_per_second": 10.403,
      "step": 200
    },
    {
      "epoch": 1.8666666666666667,
      "grad_norm": 1.8411855697631836,
      "learning_rate": 0.00014383711467890774,
      "loss": 0.0393,
      "step": 210
    },
    {
      "epoch": 1.8666666666666667,
      "eval_loss": 0.012232878245413303,
      "eval_runtime": 9.4898,
      "eval_samples_per_second": 10.538,
      "eval_steps_per_second": 10.538,
      "step": 210
    },
    {
      "epoch": 1.9555555555555557,
      "grad_norm": 2.9970459938049316,
      "learning_rate": 0.00013746065934159123,
      "loss": 0.0299,
      "step": 220
    },
    {
      "epoch": 1.9555555555555557,
      "eval_loss": 0.01798514649271965,
      "eval_runtime": 9.5905,
      "eval_samples_per_second": 10.427,
      "eval_steps_per_second": 10.427,
      "step": 220
    },
    {
      "epoch": 2.0444444444444443,
      "grad_norm": 1.0432640314102173,
      "learning_rate": 0.00013090169943749476,
      "loss": 0.0166,
      "step": 230
    },
    {
      "epoch": 2.0444444444444443,
      "eval_loss": 0.008207106962800026,
      "eval_runtime": 9.2773,
      "eval_samples_per_second": 10.779,
      "eval_steps_per_second": 10.779,
      "step": 230
    },
    {
      "epoch": 2.1333333333333333,
      "grad_norm": 1.9301716089248657,
      "learning_rate": 0.00012419218955996676,
      "loss": 0.0319,
      "step": 240
    },
    {
      "epoch": 2.1333333333333333,
      "eval_loss": 0.008276881650090218,
      "eval_runtime": 9.327,
      "eval_samples_per_second": 10.722,
      "eval_steps_per_second": 10.722,
      "step": 240
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 0.07122544944286346,
      "learning_rate": 0.00011736481776669306,
      "loss": 0.0077,
      "step": 250
    },
    {
      "epoch": 2.2222222222222223,
      "eval_loss": 0.007150276098400354,
      "eval_runtime": 9.3785,
      "eval_samples_per_second": 10.663,
      "eval_steps_per_second": 10.663,
      "step": 250
    },
    {
      "epoch": 2.311111111111111,
      "grad_norm": 0.16882538795471191,
      "learning_rate": 0.00011045284632676536,
      "loss": 0.0141,
      "step": 260
    },
    {
      "epoch": 2.311111111111111,
      "eval_loss": 0.003109171986579895,
      "eval_runtime": 9.3703,
      "eval_samples_per_second": 10.672,
      "eval_steps_per_second": 10.672,
      "step": 260
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.037282537668943405,
      "learning_rate": 0.00010348994967025012,
      "loss": 0.0017,
      "step": 270
    },
    {
      "epoch": 2.4,
      "eval_loss": 0.012033730745315552,
      "eval_runtime": 9.2982,
      "eval_samples_per_second": 10.755,
      "eval_steps_per_second": 10.755,
      "step": 270
    },
    {
      "epoch": 2.488888888888889,
      "grad_norm": 0.022591086104512215,
      "learning_rate": 9.651005032974994e-05,
      "loss": 0.0015,
      "step": 280
    },
    {
      "epoch": 2.488888888888889,
      "eval_loss": 0.015280201099812984,
      "eval_runtime": 9.4013,
      "eval_samples_per_second": 10.637,
      "eval_steps_per_second": 10.637,
      "step": 280
    },
    {
      "epoch": 2.5777777777777775,
      "grad_norm": 1.2939496040344238,
      "learning_rate": 8.954715367323468e-05,
      "loss": 0.0126,
      "step": 290
    },
    {
      "epoch": 2.5777777777777775,
      "eval_loss": 0.01406156551092863,
      "eval_runtime": 9.3181,
      "eval_samples_per_second": 10.732,
      "eval_steps_per_second": 10.732,
      "step": 290
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.009937470778822899,
      "learning_rate": 8.263518223330697e-05,
      "loss": 0.0043,
      "step": 300
    },
    {
      "epoch": 2.6666666666666665,
      "eval_loss": 0.0021963752806186676,
      "eval_runtime": 9.3813,
      "eval_samples_per_second": 10.659,
      "eval_steps_per_second": 10.659,
      "step": 300
    },
    {
      "epoch": 2.7555555555555555,
      "grad_norm": 0.022694729268550873,
      "learning_rate": 7.580781044003324e-05,
      "loss": 0.0068,
      "step": 310
    },
    {
      "epoch": 2.7555555555555555,
      "eval_loss": 0.001871286309324205,
      "eval_runtime": 9.7104,
      "eval_samples_per_second": 10.298,
      "eval_steps_per_second": 10.298,
      "step": 310
    },
    {
      "epoch": 2.8444444444444446,
      "grad_norm": 0.11103329062461853,
      "learning_rate": 6.909830056250527e-05,
      "loss": 0.0018,
      "step": 320
    },
    {
      "epoch": 2.8444444444444446,
      "eval_loss": 0.002184124430641532,
      "eval_runtime": 9.5703,
      "eval_samples_per_second": 10.449,
      "eval_steps_per_second": 10.449,
      "step": 320
    },
    {
      "epoch": 2.9333333333333336,
      "grad_norm": 0.02097630314528942,
      "learning_rate": 6.25393406584088e-05,
      "loss": 0.0026,
      "step": 330
    },
    {
      "epoch": 2.9333333333333336,
      "eval_loss": 0.0034216546919196844,
      "eval_runtime": 9.647,
      "eval_samples_per_second": 10.366,
      "eval_steps_per_second": 10.366,
      "step": 330
    },
    {
      "epoch": 3.022222222222222,
      "grad_norm": 0.0426625981926918,
      "learning_rate": 5.616288532109225e-05,
      "loss": 0.0017,
      "step": 340
    },
    {
      "epoch": 3.022222222222222,
      "eval_loss": 0.007565508596599102,
      "eval_runtime": 9.5797,
      "eval_samples_per_second": 10.439,
      "eval_steps_per_second": 10.439,
      "step": 340
    },
    {
      "epoch": 3.111111111111111,
      "grad_norm": 0.0036587081849575043,
      "learning_rate": 5.000000000000002e-05,
      "loss": 0.0002,
      "step": 350
    },
    {
      "epoch": 3.111111111111111,
      "eval_loss": 0.010203778743743896,
      "eval_runtime": 9.5558,
      "eval_samples_per_second": 10.465,
      "eval_steps_per_second": 10.465,
      "step": 350
    },
    {
      "epoch": 3.2,
      "grad_norm": 0.008132525719702244,
      "learning_rate": 4.4080709652925336e-05,
      "loss": 0.0004,
      "step": 360
    },
    {
      "epoch": 3.2,
      "eval_loss": 0.011154056526720524,
      "eval_runtime": 9.3832,
      "eval_samples_per_second": 10.657,
      "eval_steps_per_second": 10.657,
      "step": 360
    },
    {
      "epoch": 3.2888888888888888,
      "grad_norm": 0.00727389520034194,
      "learning_rate": 3.843385246743417e-05,
      "loss": 0.006,
      "step": 370
    },
    {
      "epoch": 3.2888888888888888,
      "eval_loss": 0.00937813799828291,
      "eval_runtime": 9.3745,
      "eval_samples_per_second": 10.667,
      "eval_steps_per_second": 10.667,
      "step": 370
    },
    {
      "epoch": 3.3777777777777778,
      "grad_norm": 0.004803340416401625,
      "learning_rate": 3.308693936411421e-05,
      "loss": 0.0003,
      "step": 380
    },
    {
      "epoch": 3.3777777777777778,
      "eval_loss": 0.007455301936715841,
      "eval_runtime": 9.5178,
      "eval_samples_per_second": 10.507,
      "eval_steps_per_second": 10.507,
      "step": 380
    },
    {
      "epoch": 3.466666666666667,
      "grad_norm": 0.005293034482747316,
      "learning_rate": 2.8066019966134904e-05,
      "loss": 0.0003,
      "step": 390
    },
    {
      "epoch": 3.466666666666667,
      "eval_loss": 0.006887929514050484,
      "eval_runtime": 9.5617,
      "eval_samples_per_second": 10.458,
      "eval_steps_per_second": 10.458,
      "step": 390
    },
    {
      "epoch": 3.5555555555555554,
      "grad_norm": 0.01216947752982378,
      "learning_rate": 2.339555568810221e-05,
      "loss": 0.0002,
      "step": 400
    },
    {
      "epoch": 3.5555555555555554,
      "eval_loss": 0.006745634134858847,
      "eval_runtime": 9.5509,
      "eval_samples_per_second": 10.47,
      "eval_steps_per_second": 10.47,
      "step": 400
    },
    {
      "epoch": 3.6444444444444444,
      "grad_norm": 0.008572138845920563,
      "learning_rate": 1.9098300562505266e-05,
      "loss": 0.0005,
      "step": 410
    },
    {
      "epoch": 3.6444444444444444,
      "eval_loss": 0.006609635427594185,
      "eval_runtime": 9.6817,
      "eval_samples_per_second": 10.329,
      "eval_steps_per_second": 10.329,
      "step": 410
    },
    {
      "epoch": 3.7333333333333334,
      "grad_norm": 0.004443774465471506,
      "learning_rate": 1.5195190384357404e-05,
      "loss": 0.0003,
      "step": 420
    },
    {
      "epoch": 3.7333333333333334,
      "eval_loss": 0.007157918065786362,
      "eval_runtime": 9.5559,
      "eval_samples_per_second": 10.465,
      "eval_steps_per_second": 10.465,
      "step": 420
    },
    {
      "epoch": 3.822222222222222,
      "grad_norm": 0.0011405730620026588,
      "learning_rate": 1.1705240714107302e-05,
      "loss": 0.0037,
      "step": 430
    },
    {
      "epoch": 3.822222222222222,
      "eval_loss": 0.006302958354353905,
      "eval_runtime": 9.2431,
      "eval_samples_per_second": 10.819,
      "eval_steps_per_second": 10.819,
      "step": 430
    },
    {
      "epoch": 3.911111111111111,
      "grad_norm": 0.004764024633914232,
      "learning_rate": 8.645454235739903e-06,
      "loss": 0.004,
      "step": 440
    },
    {
      "epoch": 3.911111111111111,
      "eval_loss": 0.005341523326933384,
      "eval_runtime": 9.5532,
      "eval_samples_per_second": 10.468,
      "eval_steps_per_second": 10.468,
      "step": 440
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.004020992666482925,
      "learning_rate": 6.030737921409169e-06,
      "loss": 0.0003,
      "step": 450
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.005209792871028185,
      "eval_runtime": 9.7502,
      "eval_samples_per_second": 10.256,
      "eval_steps_per_second": 10.256,
      "step": 450
    },
    {
      "epoch": 4.088888888888889,
      "grad_norm": 0.013617518357932568,
      "learning_rate": 3.873830406168111e-06,
      "loss": 0.0002,
      "step": 460
    },
    {
      "epoch": 4.088888888888889,
      "eval_loss": 0.005074501037597656,
      "eval_runtime": 9.3359,
      "eval_samples_per_second": 10.711,
      "eval_steps_per_second": 10.711,
      "step": 460
    },
    {
      "epoch": 4.177777777777778,
      "grad_norm": 0.003857893170788884,
      "learning_rate": 2.1852399266194314e-06,
      "loss": 0.0002,
      "step": 470
    },
    {
      "epoch": 4.177777777777778,
      "eval_loss": 0.004982742480933666,
      "eval_runtime": 9.5873,
      "eval_samples_per_second": 10.43,
      "eval_steps_per_second": 10.43,
      "step": 470
    },
    {
      "epoch": 4.266666666666667,
      "grad_norm": 0.006583555601537228,
      "learning_rate": 9.731931258429638e-07,
      "loss": 0.0006,
      "step": 480
    },
    {
      "epoch": 4.266666666666667,
      "eval_loss": 0.004874282516539097,
      "eval_runtime": 9.6607,
      "eval_samples_per_second": 10.351,
      "eval_steps_per_second": 10.351,
      "step": 480
    },
    {
      "epoch": 4.355555555555555,
      "grad_norm": 0.004742850083857775,
      "learning_rate": 2.4359497401758024e-07,
      "loss": 0.0005,
      "step": 490
    },
    {
      "epoch": 4.355555555555555,
      "eval_loss": 0.004786411300301552,
      "eval_runtime": 9.6649,
      "eval_samples_per_second": 10.347,
      "eval_steps_per_second": 10.347,
      "step": 490
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 0.010840805247426033,
      "learning_rate": 0.0,
      "loss": 0.0002,
      "step": 500
    },
    {
      "epoch": 4.444444444444445,
      "eval_loss": 0.005105508491396904,
      "eval_runtime": 9.3806,
      "eval_samples_per_second": 10.66,
      "eval_steps_per_second": 10.66,
      "step": 500
    },
    {
      "epoch": 4.444444444444445,
      "step": 500,
      "total_flos": 1.038706354200576e+16,
      "train_loss": 0.1301481350355316,
      "train_runtime": 1904.5573,
      "train_samples_per_second": 2.1,
      "train_steps_per_second": 0.263
    }
  ],
  "logging_steps": 10,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.038706354200576e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}