|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 50, |
|
"global_step": 3000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 6.887116432189941, |
|
"learning_rate": 1.4285714285714286e-06, |
|
"loss": 1.6635, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 0.9456449747085571, |
|
"eval_runtime": 5.3786, |
|
"eval_samples_per_second": 29.004, |
|
"eval_steps_per_second": 1.487, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 8.283559799194336, |
|
"learning_rate": 2.8571428571428573e-06, |
|
"loss": 0.8201, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 0.7792918682098389, |
|
"eval_runtime": 5.331, |
|
"eval_samples_per_second": 29.263, |
|
"eval_steps_per_second": 1.501, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 2.8983874320983887, |
|
"learning_rate": 4.2857142857142855e-06, |
|
"loss": 0.7613, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"eval_loss": 0.7527902722358704, |
|
"eval_runtime": 5.3406, |
|
"eval_samples_per_second": 29.21, |
|
"eval_steps_per_second": 1.498, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 3.2681150436401367, |
|
"learning_rate": 5.7142857142857145e-06, |
|
"loss": 0.7325, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 0.7382247447967529, |
|
"eval_runtime": 5.3526, |
|
"eval_samples_per_second": 29.145, |
|
"eval_steps_per_second": 1.495, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 3.064232587814331, |
|
"learning_rate": 7.1428571428571436e-06, |
|
"loss": 0.7304, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 0.7373641729354858, |
|
"eval_runtime": 5.3943, |
|
"eval_samples_per_second": 28.92, |
|
"eval_steps_per_second": 1.483, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 3.030766248703003, |
|
"learning_rate": 8.571428571428571e-06, |
|
"loss": 0.732, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"eval_loss": 0.7380843758583069, |
|
"eval_runtime": 5.3592, |
|
"eval_samples_per_second": 29.109, |
|
"eval_steps_per_second": 1.493, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 2.9495913982391357, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7425, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"eval_loss": 0.7450628280639648, |
|
"eval_runtime": 5.3499, |
|
"eval_samples_per_second": 29.159, |
|
"eval_steps_per_second": 1.495, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.047274351119995, |
|
"learning_rate": 1.1428571428571429e-05, |
|
"loss": 0.727, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.7441740036010742, |
|
"eval_runtime": 5.3431, |
|
"eval_samples_per_second": 29.197, |
|
"eval_steps_per_second": 1.497, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.082712411880493, |
|
"learning_rate": 1.2857142857142859e-05, |
|
"loss": 0.7059, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"eval_loss": 0.7533753514289856, |
|
"eval_runtime": 5.3551, |
|
"eval_samples_per_second": 29.131, |
|
"eval_steps_per_second": 1.494, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.9532545804977417, |
|
"learning_rate": 1.4285714285714287e-05, |
|
"loss": 0.7125, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 0.7590879797935486, |
|
"eval_runtime": 5.3446, |
|
"eval_samples_per_second": 29.188, |
|
"eval_steps_per_second": 1.497, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 2.6566646099090576, |
|
"learning_rate": 1.5714285714285715e-05, |
|
"loss": 0.751, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"eval_loss": 0.7629729509353638, |
|
"eval_runtime": 5.3745, |
|
"eval_samples_per_second": 29.026, |
|
"eval_steps_per_second": 1.489, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.752972960472107, |
|
"learning_rate": 1.7142857142857142e-05, |
|
"loss": 0.7929, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 0.7665635943412781, |
|
"eval_runtime": 5.3696, |
|
"eval_samples_per_second": 29.052, |
|
"eval_steps_per_second": 1.49, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.8425724506378174, |
|
"learning_rate": 1.8571428571428575e-05, |
|
"loss": 0.7803, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"eval_loss": 0.7683877944946289, |
|
"eval_runtime": 5.3553, |
|
"eval_samples_per_second": 29.13, |
|
"eval_steps_per_second": 1.494, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.2257883548736572, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7182, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_loss": 0.7802882790565491, |
|
"eval_runtime": 5.3398, |
|
"eval_samples_per_second": 29.214, |
|
"eval_steps_per_second": 1.498, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 2.124093532562256, |
|
"learning_rate": 1.9996891820008165e-05, |
|
"loss": 0.7318, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 0.7831437587738037, |
|
"eval_runtime": 5.3755, |
|
"eval_samples_per_second": 29.02, |
|
"eval_steps_per_second": 1.488, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.1694772243499756, |
|
"learning_rate": 1.9987569212189224e-05, |
|
"loss": 0.7509, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.7850997447967529, |
|
"eval_runtime": 5.3457, |
|
"eval_samples_per_second": 29.183, |
|
"eval_steps_per_second": 1.497, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.7642978429794312, |
|
"learning_rate": 1.9972037971811802e-05, |
|
"loss": 0.7617, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"eval_loss": 0.78228759765625, |
|
"eval_runtime": 5.3474, |
|
"eval_samples_per_second": 29.173, |
|
"eval_steps_per_second": 1.496, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.7806785106658936, |
|
"learning_rate": 1.9950307753654016e-05, |
|
"loss": 0.7865, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_loss": 0.784203827381134, |
|
"eval_runtime": 5.3402, |
|
"eval_samples_per_second": 29.212, |
|
"eval_steps_per_second": 1.498, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.1907761096954346, |
|
"learning_rate": 1.9922392066001724e-05, |
|
"loss": 0.7619, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_loss": 0.7823160886764526, |
|
"eval_runtime": 5.3429, |
|
"eval_samples_per_second": 29.197, |
|
"eval_steps_per_second": 1.497, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.591108798980713, |
|
"learning_rate": 1.9888308262251286e-05, |
|
"loss": 0.7799, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.7811964154243469, |
|
"eval_runtime": 5.3406, |
|
"eval_samples_per_second": 29.21, |
|
"eval_steps_per_second": 1.498, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 1.7470197677612305, |
|
"learning_rate": 1.9848077530122083e-05, |
|
"loss": 0.4817, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"eval_loss": 0.8039106726646423, |
|
"eval_runtime": 5.3508, |
|
"eval_samples_per_second": 29.154, |
|
"eval_steps_per_second": 1.495, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 1.3562027215957642, |
|
"learning_rate": 1.9801724878485438e-05, |
|
"loss": 0.4391, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"eval_loss": 0.8185109496116638, |
|
"eval_runtime": 5.3553, |
|
"eval_samples_per_second": 29.13, |
|
"eval_steps_per_second": 1.494, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 2.015491247177124, |
|
"learning_rate": 1.9749279121818235e-05, |
|
"loss": 0.4847, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"eval_loss": 0.8041576743125916, |
|
"eval_runtime": 5.3543, |
|
"eval_samples_per_second": 29.135, |
|
"eval_steps_per_second": 1.494, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.7701632976531982, |
|
"learning_rate": 1.969077286229078e-05, |
|
"loss": 0.4489, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 0.8140569925308228, |
|
"eval_runtime": 5.3403, |
|
"eval_samples_per_second": 29.212, |
|
"eval_steps_per_second": 1.498, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.6276088953018188, |
|
"learning_rate": 1.962624246950012e-05, |
|
"loss": 0.4593, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 0.8152586221694946, |
|
"eval_runtime": 5.3674, |
|
"eval_samples_per_second": 29.064, |
|
"eval_steps_per_second": 1.49, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 1.7220439910888672, |
|
"learning_rate": 1.955572805786141e-05, |
|
"loss": 0.4547, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"eval_loss": 0.8013993501663208, |
|
"eval_runtime": 5.3412, |
|
"eval_samples_per_second": 29.207, |
|
"eval_steps_per_second": 1.498, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 1.9381102323532104, |
|
"learning_rate": 1.947927346167132e-05, |
|
"loss": 0.4732, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"eval_loss": 0.8070553541183472, |
|
"eval_runtime": 5.3586, |
|
"eval_samples_per_second": 29.112, |
|
"eval_steps_per_second": 1.493, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 1.3193718194961548, |
|
"learning_rate": 1.9396926207859085e-05, |
|
"loss": 0.4751, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"eval_loss": 0.8212087750434875, |
|
"eval_runtime": 5.3584, |
|
"eval_samples_per_second": 29.113, |
|
"eval_steps_per_second": 1.493, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 1.4072144031524658, |
|
"learning_rate": 1.9308737486442045e-05, |
|
"loss": 0.4711, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"eval_loss": 0.8135795593261719, |
|
"eval_runtime": 5.3462, |
|
"eval_samples_per_second": 29.18, |
|
"eval_steps_per_second": 1.496, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.503187656402588, |
|
"learning_rate": 1.921476211870408e-05, |
|
"loss": 0.4486, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_loss": 0.8082787990570068, |
|
"eval_runtime": 5.356, |
|
"eval_samples_per_second": 29.126, |
|
"eval_steps_per_second": 1.494, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 1.4871103763580322, |
|
"learning_rate": 1.9115058523116734e-05, |
|
"loss": 0.458, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"eval_loss": 0.8071246147155762, |
|
"eval_runtime": 5.5657, |
|
"eval_samples_per_second": 28.029, |
|
"eval_steps_per_second": 1.437, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.1439411640167236, |
|
"learning_rate": 1.900968867902419e-05, |
|
"loss": 0.4716, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 0.8042948246002197, |
|
"eval_runtime": 5.3482, |
|
"eval_samples_per_second": 29.169, |
|
"eval_steps_per_second": 1.496, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 1.5177081823349, |
|
"learning_rate": 1.8898718088114688e-05, |
|
"loss": 0.478, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"eval_loss": 0.8044167757034302, |
|
"eval_runtime": 5.3573, |
|
"eval_samples_per_second": 29.119, |
|
"eval_steps_per_second": 1.493, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 1.6844738721847534, |
|
"learning_rate": 1.8782215733702286e-05, |
|
"loss": 0.5012, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"eval_loss": 0.7973454594612122, |
|
"eval_runtime": 5.3529, |
|
"eval_samples_per_second": 29.143, |
|
"eval_steps_per_second": 1.495, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.1086294651031494, |
|
"learning_rate": 1.866025403784439e-05, |
|
"loss": 0.4798, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"eval_loss": 0.796762228012085, |
|
"eval_runtime": 5.3528, |
|
"eval_samples_per_second": 29.144, |
|
"eval_steps_per_second": 1.495, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 2.8831372261047363, |
|
"learning_rate": 1.8532908816321557e-05, |
|
"loss": 0.4658, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"eval_loss": 0.8035383224487305, |
|
"eval_runtime": 5.3427, |
|
"eval_samples_per_second": 29.199, |
|
"eval_steps_per_second": 1.497, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 1.7962303161621094, |
|
"learning_rate": 1.8400259231507716e-05, |
|
"loss": 0.4761, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"eval_loss": 0.8024702072143555, |
|
"eval_runtime": 5.3585, |
|
"eval_samples_per_second": 29.113, |
|
"eval_steps_per_second": 1.493, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 1.5731513500213623, |
|
"learning_rate": 1.826238774315995e-05, |
|
"loss": 0.5029, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"eval_loss": 0.8045446276664734, |
|
"eval_runtime": 5.3667, |
|
"eval_samples_per_second": 29.068, |
|
"eval_steps_per_second": 1.491, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 1.6794695854187012, |
|
"learning_rate": 1.811938005715857e-05, |
|
"loss": 0.4587, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"eval_loss": 0.8037970066070557, |
|
"eval_runtime": 5.3483, |
|
"eval_samples_per_second": 29.168, |
|
"eval_steps_per_second": 1.496, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.8006577491760254, |
|
"learning_rate": 1.7971325072229227e-05, |
|
"loss": 0.487, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.806293785572052, |
|
"eval_runtime": 5.3609, |
|
"eval_samples_per_second": 29.1, |
|
"eval_steps_per_second": 1.492, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 1.353041648864746, |
|
"learning_rate": 1.78183148246803e-05, |
|
"loss": 0.228, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"eval_loss": 0.8969957232475281, |
|
"eval_runtime": 5.3463, |
|
"eval_samples_per_second": 29.179, |
|
"eval_steps_per_second": 1.496, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 1.695069432258606, |
|
"learning_rate": 1.766044443118978e-05, |
|
"loss": 0.238, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"eval_loss": 0.8847584128379822, |
|
"eval_runtime": 5.3661, |
|
"eval_samples_per_second": 29.072, |
|
"eval_steps_per_second": 1.491, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 1.4685120582580566, |
|
"learning_rate": 1.7497812029677344e-05, |
|
"loss": 0.2289, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"eval_loss": 0.9000833034515381, |
|
"eval_runtime": 5.3374, |
|
"eval_samples_per_second": 29.228, |
|
"eval_steps_per_second": 1.499, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 1.1362169981002808, |
|
"learning_rate": 1.7330518718298263e-05, |
|
"loss": 0.228, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"eval_loss": 0.9024194478988647, |
|
"eval_runtime": 5.3582, |
|
"eval_samples_per_second": 29.114, |
|
"eval_steps_per_second": 1.493, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 1.252875804901123, |
|
"learning_rate": 1.7158668492597186e-05, |
|
"loss": 0.2262, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"eval_loss": 0.9039773941040039, |
|
"eval_runtime": 5.3439, |
|
"eval_samples_per_second": 29.192, |
|
"eval_steps_per_second": 1.497, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 1.2293896675109863, |
|
"learning_rate": 1.698236818086073e-05, |
|
"loss": 0.2401, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"eval_loss": 0.9025784134864807, |
|
"eval_runtime": 5.3641, |
|
"eval_samples_per_second": 29.082, |
|
"eval_steps_per_second": 1.491, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 1.5813263654708862, |
|
"learning_rate": 1.6801727377709195e-05, |
|
"loss": 0.2397, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"eval_loss": 0.9166079759597778, |
|
"eval_runtime": 5.3804, |
|
"eval_samples_per_second": 28.994, |
|
"eval_steps_per_second": 1.487, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.4593902826309204, |
|
"learning_rate": 1.6616858375968596e-05, |
|
"loss": 0.2341, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"eval_loss": 0.9137348532676697, |
|
"eval_runtime": 5.3708, |
|
"eval_samples_per_second": 29.046, |
|
"eval_steps_per_second": 1.49, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 1.8817037343978882, |
|
"learning_rate": 1.6427876096865394e-05, |
|
"loss": 0.2434, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"eval_loss": 0.9133093357086182, |
|
"eval_runtime": 5.353, |
|
"eval_samples_per_second": 29.143, |
|
"eval_steps_per_second": 1.494, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.1660922765731812, |
|
"learning_rate": 1.6234898018587336e-05, |
|
"loss": 0.238, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 0.9203407168388367, |
|
"eval_runtime": 5.3478, |
|
"eval_samples_per_second": 29.171, |
|
"eval_steps_per_second": 1.496, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 1.1189944744110107, |
|
"learning_rate": 1.6038044103254775e-05, |
|
"loss": 0.2555, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"eval_loss": 0.9188438057899475, |
|
"eval_runtime": 5.3606, |
|
"eval_samples_per_second": 29.101, |
|
"eval_steps_per_second": 1.492, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 1.388344407081604, |
|
"learning_rate": 1.5837436722347902e-05, |
|
"loss": 0.245, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"eval_loss": 0.9264757037162781, |
|
"eval_runtime": 5.3585, |
|
"eval_samples_per_second": 29.112, |
|
"eval_steps_per_second": 1.493, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 1.4485737085342407, |
|
"learning_rate": 1.563320058063622e-05, |
|
"loss": 0.2443, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"eval_loss": 0.9161367416381836, |
|
"eval_runtime": 5.3818, |
|
"eval_samples_per_second": 28.986, |
|
"eval_steps_per_second": 1.486, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 1.536858081817627, |
|
"learning_rate": 1.5425462638657597e-05, |
|
"loss": 0.2377, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"eval_loss": 0.919867992401123, |
|
"eval_runtime": 5.3578, |
|
"eval_samples_per_second": 29.116, |
|
"eval_steps_per_second": 1.493, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 1.163506031036377, |
|
"learning_rate": 1.5214352033794981e-05, |
|
"loss": 0.2429, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"eval_loss": 0.9072566628456116, |
|
"eval_runtime": 5.362, |
|
"eval_samples_per_second": 29.094, |
|
"eval_steps_per_second": 1.492, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.5095975399017334, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.2437, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"eval_loss": 0.9278251528739929, |
|
"eval_runtime": 5.3835, |
|
"eval_samples_per_second": 28.977, |
|
"eval_steps_per_second": 1.486, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 1.2110488414764404, |
|
"learning_rate": 1.4782539786213184e-05, |
|
"loss": 0.2535, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"eval_loss": 0.9261165261268616, |
|
"eval_runtime": 5.3818, |
|
"eval_samples_per_second": 28.987, |
|
"eval_steps_per_second": 1.486, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 1.4795968532562256, |
|
"learning_rate": 1.4562106573531632e-05, |
|
"loss": 0.2416, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"eval_loss": 0.9134149551391602, |
|
"eval_runtime": 5.3707, |
|
"eval_samples_per_second": 29.046, |
|
"eval_steps_per_second": 1.49, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 1.4406273365020752, |
|
"learning_rate": 1.4338837391175582e-05, |
|
"loss": 0.2455, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"eval_loss": 0.911788821220398, |
|
"eval_runtime": 5.3593, |
|
"eval_samples_per_second": 29.108, |
|
"eval_steps_per_second": 1.493, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.336966633796692, |
|
"learning_rate": 1.4112871031306118e-05, |
|
"loss": 0.2367, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.9113795161247253, |
|
"eval_runtime": 5.3608, |
|
"eval_samples_per_second": 29.1, |
|
"eval_steps_per_second": 1.492, |
|
"step": 3000 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 7000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 7, |
|
"save_steps": 200, |
|
"total_flos": 2.7565823700343194e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|