|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.5384615384615383, |
|
"eval_steps": 10, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.003076923076923077, |
|
"eval_loss": 1.4537198543548584, |
|
"eval_runtime": 2.2676, |
|
"eval_samples_per_second": 50.275, |
|
"eval_steps_per_second": 2.646, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.015384615384615385, |
|
"grad_norm": 11.972827911376953, |
|
"learning_rate": 4.395604395604396e-07, |
|
"loss": 1.3965, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.015384615384615385, |
|
"eval_loss": 1.444534182548523, |
|
"eval_runtime": 2.0489, |
|
"eval_samples_per_second": 55.638, |
|
"eval_steps_per_second": 2.928, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03076923076923077, |
|
"grad_norm": 14.562355995178223, |
|
"learning_rate": 8.791208791208792e-07, |
|
"loss": 1.3983, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03076923076923077, |
|
"eval_loss": 1.3285874128341675, |
|
"eval_runtime": 2.033, |
|
"eval_samples_per_second": 56.075, |
|
"eval_steps_per_second": 2.951, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.046153846153846156, |
|
"grad_norm": 13.817586898803711, |
|
"learning_rate": 1.3186813186813187e-06, |
|
"loss": 1.2094, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.046153846153846156, |
|
"eval_loss": 0.9445623159408569, |
|
"eval_runtime": 2.0359, |
|
"eval_samples_per_second": 55.994, |
|
"eval_steps_per_second": 2.947, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.06153846153846154, |
|
"grad_norm": 7.5655837059021, |
|
"learning_rate": 1.7582417582417585e-06, |
|
"loss": 0.7584, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.06153846153846154, |
|
"eval_loss": 0.8109635710716248, |
|
"eval_runtime": 2.0521, |
|
"eval_samples_per_second": 55.553, |
|
"eval_steps_per_second": 2.924, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07692307692307693, |
|
"grad_norm": 12.146793365478516, |
|
"learning_rate": 2.197802197802198e-06, |
|
"loss": 0.7401, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07692307692307693, |
|
"eval_loss": 0.7573030591011047, |
|
"eval_runtime": 2.0481, |
|
"eval_samples_per_second": 55.661, |
|
"eval_steps_per_second": 2.93, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09230769230769231, |
|
"grad_norm": 9.491755485534668, |
|
"learning_rate": 2.6373626373626375e-06, |
|
"loss": 0.6296, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.09230769230769231, |
|
"eval_loss": 0.7523270845413208, |
|
"eval_runtime": 2.0501, |
|
"eval_samples_per_second": 55.606, |
|
"eval_steps_per_second": 2.927, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1076923076923077, |
|
"grad_norm": 5.857011795043945, |
|
"learning_rate": 3.0769230769230774e-06, |
|
"loss": 0.7523, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1076923076923077, |
|
"eval_loss": 0.7429385185241699, |
|
"eval_runtime": 2.051, |
|
"eval_samples_per_second": 55.583, |
|
"eval_steps_per_second": 2.925, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12307692307692308, |
|
"grad_norm": 4.9665045738220215, |
|
"learning_rate": 3.516483516483517e-06, |
|
"loss": 0.6073, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.12307692307692308, |
|
"eval_loss": 0.7138068079948425, |
|
"eval_runtime": 2.0478, |
|
"eval_samples_per_second": 55.67, |
|
"eval_steps_per_second": 2.93, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.13846153846153847, |
|
"grad_norm": 11.936062812805176, |
|
"learning_rate": 3.9560439560439565e-06, |
|
"loss": 0.6577, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.13846153846153847, |
|
"eval_loss": 0.6989916563034058, |
|
"eval_runtime": 2.0579, |
|
"eval_samples_per_second": 55.397, |
|
"eval_steps_per_second": 2.916, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"grad_norm": 13.15188980102539, |
|
"learning_rate": 4.395604395604396e-06, |
|
"loss": 0.698, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"eval_loss": 0.7036928534507751, |
|
"eval_runtime": 2.0376, |
|
"eval_samples_per_second": 55.949, |
|
"eval_steps_per_second": 2.945, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.16923076923076924, |
|
"grad_norm": 10.141645431518555, |
|
"learning_rate": 4.8351648351648355e-06, |
|
"loss": 0.6925, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.16923076923076924, |
|
"eval_loss": 0.6939440369606018, |
|
"eval_runtime": 2.0357, |
|
"eval_samples_per_second": 56.002, |
|
"eval_steps_per_second": 2.947, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.18461538461538463, |
|
"grad_norm": 3.710402488708496, |
|
"learning_rate": 5.274725274725275e-06, |
|
"loss": 0.6055, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.18461538461538463, |
|
"eval_loss": 0.7082720994949341, |
|
"eval_runtime": 2.0484, |
|
"eval_samples_per_second": 55.652, |
|
"eval_steps_per_second": 2.929, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.347709655761719, |
|
"learning_rate": 5.7142857142857145e-06, |
|
"loss": 0.5347, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 0.6877034306526184, |
|
"eval_runtime": 2.0344, |
|
"eval_samples_per_second": 56.037, |
|
"eval_steps_per_second": 2.949, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.2153846153846154, |
|
"grad_norm": 8.17250919342041, |
|
"learning_rate": 6.153846153846155e-06, |
|
"loss": 0.6877, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2153846153846154, |
|
"eval_loss": 0.6877639889717102, |
|
"eval_runtime": 2.0484, |
|
"eval_samples_per_second": 55.652, |
|
"eval_steps_per_second": 2.929, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.23076923076923078, |
|
"grad_norm": 6.628317356109619, |
|
"learning_rate": 6.5934065934065935e-06, |
|
"loss": 0.5965, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.23076923076923078, |
|
"eval_loss": 0.6875787973403931, |
|
"eval_runtime": 2.0524, |
|
"eval_samples_per_second": 55.545, |
|
"eval_steps_per_second": 2.923, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.24615384615384617, |
|
"grad_norm": 4.867484092712402, |
|
"learning_rate": 7.032967032967034e-06, |
|
"loss": 0.703, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.24615384615384617, |
|
"eval_loss": 0.6956614851951599, |
|
"eval_runtime": 2.0284, |
|
"eval_samples_per_second": 56.201, |
|
"eval_steps_per_second": 2.958, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.26153846153846155, |
|
"grad_norm": 3.452112913131714, |
|
"learning_rate": 7.472527472527473e-06, |
|
"loss": 0.6539, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.26153846153846155, |
|
"eval_loss": 0.6873570084571838, |
|
"eval_runtime": 2.0311, |
|
"eval_samples_per_second": 56.127, |
|
"eval_steps_per_second": 2.954, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.27692307692307694, |
|
"grad_norm": 5.422184944152832, |
|
"learning_rate": 7.912087912087913e-06, |
|
"loss": 0.6788, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.27692307692307694, |
|
"eval_loss": 0.6954818367958069, |
|
"eval_runtime": 2.051, |
|
"eval_samples_per_second": 55.583, |
|
"eval_steps_per_second": 2.925, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2923076923076923, |
|
"grad_norm": 4.284252643585205, |
|
"learning_rate": 8.351648351648353e-06, |
|
"loss": 0.659, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2923076923076923, |
|
"eval_loss": 0.6905339360237122, |
|
"eval_runtime": 2.0478, |
|
"eval_samples_per_second": 55.67, |
|
"eval_steps_per_second": 2.93, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 4.645473003387451, |
|
"learning_rate": 8.791208791208792e-06, |
|
"loss": 0.6795, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"eval_loss": 0.6910640001296997, |
|
"eval_runtime": 2.0348, |
|
"eval_samples_per_second": 56.026, |
|
"eval_steps_per_second": 2.949, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3230769230769231, |
|
"grad_norm": 3.9407358169555664, |
|
"learning_rate": 9.230769230769232e-06, |
|
"loss": 0.6608, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3230769230769231, |
|
"eval_loss": 0.6903011798858643, |
|
"eval_runtime": 2.0478, |
|
"eval_samples_per_second": 55.669, |
|
"eval_steps_per_second": 2.93, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3384615384615385, |
|
"grad_norm": 3.000676393508911, |
|
"learning_rate": 9.670329670329671e-06, |
|
"loss": 0.647, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3384615384615385, |
|
"eval_loss": 0.6933804154396057, |
|
"eval_runtime": 2.0377, |
|
"eval_samples_per_second": 55.946, |
|
"eval_steps_per_second": 2.945, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.35384615384615387, |
|
"grad_norm": 5.05721378326416, |
|
"learning_rate": 1.010989010989011e-05, |
|
"loss": 0.669, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.35384615384615387, |
|
"eval_loss": 0.6953541040420532, |
|
"eval_runtime": 2.0502, |
|
"eval_samples_per_second": 55.604, |
|
"eval_steps_per_second": 2.927, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.36923076923076925, |
|
"grad_norm": 2.5314972400665283, |
|
"learning_rate": 1.054945054945055e-05, |
|
"loss": 0.7088, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.36923076923076925, |
|
"eval_loss": 0.6936004757881165, |
|
"eval_runtime": 2.0376, |
|
"eval_samples_per_second": 55.948, |
|
"eval_steps_per_second": 2.945, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.38461538461538464, |
|
"grad_norm": 9.507554054260254, |
|
"learning_rate": 1.098901098901099e-05, |
|
"loss": 0.7007, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.38461538461538464, |
|
"eval_loss": 0.7747458815574646, |
|
"eval_runtime": 2.042, |
|
"eval_samples_per_second": 55.827, |
|
"eval_steps_per_second": 2.938, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.4734535217285156, |
|
"learning_rate": 1.1428571428571429e-05, |
|
"loss": 0.6827, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.6972140669822693, |
|
"eval_runtime": 2.0689, |
|
"eval_samples_per_second": 55.101, |
|
"eval_steps_per_second": 2.9, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4153846153846154, |
|
"grad_norm": 6.09801721572876, |
|
"learning_rate": 1.186813186813187e-05, |
|
"loss": 0.6556, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.4153846153846154, |
|
"eval_loss": 0.7027987837791443, |
|
"eval_runtime": 2.0282, |
|
"eval_samples_per_second": 56.208, |
|
"eval_steps_per_second": 2.958, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.4307692307692308, |
|
"grad_norm": 4.644292831420898, |
|
"learning_rate": 1.230769230769231e-05, |
|
"loss": 0.6851, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.4307692307692308, |
|
"eval_loss": 0.7334415316581726, |
|
"eval_runtime": 2.0372, |
|
"eval_samples_per_second": 55.958, |
|
"eval_steps_per_second": 2.945, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.4461538461538462, |
|
"grad_norm": 3.16902232170105, |
|
"learning_rate": 1.2747252747252747e-05, |
|
"loss": 0.7118, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.4461538461538462, |
|
"eval_loss": 0.7018752694129944, |
|
"eval_runtime": 2.0467, |
|
"eval_samples_per_second": 55.699, |
|
"eval_steps_per_second": 2.932, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"grad_norm": 3.3035457134246826, |
|
"learning_rate": 1.3186813186813187e-05, |
|
"loss": 0.6381, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"eval_loss": 0.7047535181045532, |
|
"eval_runtime": 2.0352, |
|
"eval_samples_per_second": 56.015, |
|
"eval_steps_per_second": 2.948, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.47692307692307695, |
|
"grad_norm": 2.248196840286255, |
|
"learning_rate": 1.3626373626373627e-05, |
|
"loss": 0.6167, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.47692307692307695, |
|
"eval_loss": 0.7115533351898193, |
|
"eval_runtime": 2.0425, |
|
"eval_samples_per_second": 55.815, |
|
"eval_steps_per_second": 2.938, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.49230769230769234, |
|
"grad_norm": 6.9102678298950195, |
|
"learning_rate": 1.4065934065934068e-05, |
|
"loss": 0.7308, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.49230769230769234, |
|
"eval_loss": 0.7213383913040161, |
|
"eval_runtime": 2.0226, |
|
"eval_samples_per_second": 56.362, |
|
"eval_steps_per_second": 2.966, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5076923076923077, |
|
"grad_norm": 3.5638887882232666, |
|
"learning_rate": 1.4505494505494506e-05, |
|
"loss": 0.6035, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5076923076923077, |
|
"eval_loss": 0.7181587815284729, |
|
"eval_runtime": 2.06, |
|
"eval_samples_per_second": 55.34, |
|
"eval_steps_per_second": 2.913, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5230769230769231, |
|
"grad_norm": 4.234628677368164, |
|
"learning_rate": 1.4945054945054947e-05, |
|
"loss": 0.5433, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5230769230769231, |
|
"eval_loss": 0.7234803438186646, |
|
"eval_runtime": 2.0282, |
|
"eval_samples_per_second": 56.207, |
|
"eval_steps_per_second": 2.958, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5384615384615384, |
|
"grad_norm": 2.9397354125976562, |
|
"learning_rate": 1.5384615384615387e-05, |
|
"loss": 0.6332, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5384615384615384, |
|
"eval_loss": 0.7224182486534119, |
|
"eval_runtime": 2.1213, |
|
"eval_samples_per_second": 53.74, |
|
"eval_steps_per_second": 2.828, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5538461538461539, |
|
"grad_norm": 3.2468433380126953, |
|
"learning_rate": 1.5824175824175826e-05, |
|
"loss": 0.7297, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5538461538461539, |
|
"eval_loss": 0.7258623838424683, |
|
"eval_runtime": 2.0648, |
|
"eval_samples_per_second": 55.211, |
|
"eval_steps_per_second": 2.906, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5692307692307692, |
|
"grad_norm": 4.74401330947876, |
|
"learning_rate": 1.6263736263736265e-05, |
|
"loss": 0.6346, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5692307692307692, |
|
"eval_loss": 0.7451629638671875, |
|
"eval_runtime": 2.143, |
|
"eval_samples_per_second": 53.196, |
|
"eval_steps_per_second": 2.8, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5846153846153846, |
|
"grad_norm": 3.286860942840576, |
|
"learning_rate": 1.6703296703296707e-05, |
|
"loss": 0.6819, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.5846153846153846, |
|
"eval_loss": 0.7284204959869385, |
|
"eval_runtime": 2.0361, |
|
"eval_samples_per_second": 55.989, |
|
"eval_steps_per_second": 2.947, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 4.673974514007568, |
|
"learning_rate": 1.7142857142857142e-05, |
|
"loss": 0.6864, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 0.7300311923027039, |
|
"eval_runtime": 2.1078, |
|
"eval_samples_per_second": 54.086, |
|
"eval_steps_per_second": 2.847, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 6.604970932006836, |
|
"learning_rate": 1.7582417582417584e-05, |
|
"loss": 0.6694, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"eval_loss": 0.7328219413757324, |
|
"eval_runtime": 2.1266, |
|
"eval_samples_per_second": 53.608, |
|
"eval_steps_per_second": 2.821, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6307692307692307, |
|
"grad_norm": 2.9900333881378174, |
|
"learning_rate": 1.8021978021978023e-05, |
|
"loss": 0.513, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.6307692307692307, |
|
"eval_loss": 0.7355452179908752, |
|
"eval_runtime": 2.0335, |
|
"eval_samples_per_second": 56.062, |
|
"eval_steps_per_second": 2.951, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.6461538461538462, |
|
"grad_norm": 4.437044143676758, |
|
"learning_rate": 1.8461538461538465e-05, |
|
"loss": 0.599, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6461538461538462, |
|
"eval_loss": 0.7413352727890015, |
|
"eval_runtime": 2.0588, |
|
"eval_samples_per_second": 55.371, |
|
"eval_steps_per_second": 2.914, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6615384615384615, |
|
"grad_norm": 2.5650463104248047, |
|
"learning_rate": 1.8901098901098903e-05, |
|
"loss": 0.765, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.6615384615384615, |
|
"eval_loss": 0.735922634601593, |
|
"eval_runtime": 2.0718, |
|
"eval_samples_per_second": 55.025, |
|
"eval_steps_per_second": 2.896, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.676923076923077, |
|
"grad_norm": 6.075387477874756, |
|
"learning_rate": 1.9340659340659342e-05, |
|
"loss": 0.6436, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.676923076923077, |
|
"eval_loss": 0.7397021651268005, |
|
"eval_runtime": 2.0454, |
|
"eval_samples_per_second": 55.734, |
|
"eval_steps_per_second": 2.933, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6923076923076923, |
|
"grad_norm": 4.346602916717529, |
|
"learning_rate": 1.9780219780219784e-05, |
|
"loss": 0.7325, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6923076923076923, |
|
"eval_loss": 0.7424116134643555, |
|
"eval_runtime": 2.1114, |
|
"eval_samples_per_second": 53.993, |
|
"eval_steps_per_second": 2.842, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.7076923076923077, |
|
"grad_norm": 2.532031774520874, |
|
"learning_rate": 1.9999926429888597e-05, |
|
"loss": 0.675, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.7076923076923077, |
|
"eval_loss": 0.7457496523857117, |
|
"eval_runtime": 2.069, |
|
"eval_samples_per_second": 55.099, |
|
"eval_steps_per_second": 2.9, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.7230769230769231, |
|
"grad_norm": 3.643791675567627, |
|
"learning_rate": 1.9999337875492412e-05, |
|
"loss": 0.7264, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.7230769230769231, |
|
"eval_loss": 0.7425748705863953, |
|
"eval_runtime": 2.0709, |
|
"eval_samples_per_second": 55.05, |
|
"eval_steps_per_second": 2.897, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.7384615384615385, |
|
"grad_norm": 1.3282146453857422, |
|
"learning_rate": 1.999816080133992e-05, |
|
"loss": 0.5571, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7384615384615385, |
|
"eval_loss": 0.7501851916313171, |
|
"eval_runtime": 2.107, |
|
"eval_samples_per_second": 54.106, |
|
"eval_steps_per_second": 2.848, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7538461538461538, |
|
"grad_norm": 2.987581968307495, |
|
"learning_rate": 1.9996395276708856e-05, |
|
"loss": 0.7058, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7538461538461538, |
|
"eval_loss": 0.7441815733909607, |
|
"eval_runtime": 2.0491, |
|
"eval_samples_per_second": 55.634, |
|
"eval_steps_per_second": 2.928, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 2.506946563720703, |
|
"learning_rate": 1.9994041405510705e-05, |
|
"loss": 0.6256, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"eval_loss": 0.7526156902313232, |
|
"eval_runtime": 2.0407, |
|
"eval_samples_per_second": 55.864, |
|
"eval_steps_per_second": 2.94, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7846153846153846, |
|
"grad_norm": 2.2201008796691895, |
|
"learning_rate": 1.9991099326284616e-05, |
|
"loss": 0.6102, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.7846153846153846, |
|
"eval_loss": 0.7576336860656738, |
|
"eval_runtime": 2.1027, |
|
"eval_samples_per_second": 54.217, |
|
"eval_steps_per_second": 2.854, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.3949899673461914, |
|
"learning_rate": 1.9987569212189224e-05, |
|
"loss": 0.7756, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.7575399279594421, |
|
"eval_runtime": 2.0449, |
|
"eval_samples_per_second": 55.749, |
|
"eval_steps_per_second": 2.934, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.8153846153846154, |
|
"grad_norm": 2.082437753677368, |
|
"learning_rate": 1.998345127099248e-05, |
|
"loss": 0.7127, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.8153846153846154, |
|
"eval_loss": 0.7561782598495483, |
|
"eval_runtime": 2.048, |
|
"eval_samples_per_second": 55.664, |
|
"eval_steps_per_second": 2.93, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.8307692307692308, |
|
"grad_norm": 2.132066488265991, |
|
"learning_rate": 1.99787457450594e-05, |
|
"loss": 0.6398, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8307692307692308, |
|
"eval_loss": 0.7557066679000854, |
|
"eval_runtime": 2.0659, |
|
"eval_samples_per_second": 55.182, |
|
"eval_steps_per_second": 2.904, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8461538461538461, |
|
"grad_norm": 2.816959857940674, |
|
"learning_rate": 1.997345291133783e-05, |
|
"loss": 0.6956, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8461538461538461, |
|
"eval_loss": 0.7591909170150757, |
|
"eval_runtime": 2.0335, |
|
"eval_samples_per_second": 56.061, |
|
"eval_steps_per_second": 2.951, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8615384615384616, |
|
"grad_norm": 1.8596259355545044, |
|
"learning_rate": 1.9967573081342103e-05, |
|
"loss": 0.656, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8615384615384616, |
|
"eval_loss": 0.76849365234375, |
|
"eval_runtime": 2.0556, |
|
"eval_samples_per_second": 55.457, |
|
"eval_steps_per_second": 2.919, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8769230769230769, |
|
"grad_norm": 2.643859386444092, |
|
"learning_rate": 1.996110660113475e-05, |
|
"loss": 0.6426, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.8769230769230769, |
|
"eval_loss": 0.7682607769966125, |
|
"eval_runtime": 2.0831, |
|
"eval_samples_per_second": 54.725, |
|
"eval_steps_per_second": 2.88, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.8923076923076924, |
|
"grad_norm": 3.8031773567199707, |
|
"learning_rate": 1.995405385130611e-05, |
|
"loss": 0.7259, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.8923076923076924, |
|
"eval_loss": 0.7607874274253845, |
|
"eval_runtime": 2.0943, |
|
"eval_samples_per_second": 54.433, |
|
"eval_steps_per_second": 2.865, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.9076923076923077, |
|
"grad_norm": 2.6620101928710938, |
|
"learning_rate": 1.9946415246951928e-05, |
|
"loss": 0.739, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.9076923076923077, |
|
"eval_loss": 0.7598668336868286, |
|
"eval_runtime": 2.0418, |
|
"eval_samples_per_second": 55.833, |
|
"eval_steps_per_second": 2.939, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 4.222265720367432, |
|
"learning_rate": 1.9938191237648924e-05, |
|
"loss": 0.7451, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"eval_loss": 0.7620775699615479, |
|
"eval_runtime": 2.0423, |
|
"eval_samples_per_second": 55.82, |
|
"eval_steps_per_second": 2.938, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9384615384615385, |
|
"grad_norm": 1.1073704957962036, |
|
"learning_rate": 1.992938230742835e-05, |
|
"loss": 0.705, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.9384615384615385, |
|
"eval_loss": 0.7568953633308411, |
|
"eval_runtime": 2.1052, |
|
"eval_samples_per_second": 54.153, |
|
"eval_steps_per_second": 2.85, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.9538461538461539, |
|
"grad_norm": 1.9715179204940796, |
|
"learning_rate": 1.9919988974747473e-05, |
|
"loss": 0.7293, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.9538461538461539, |
|
"eval_loss": 0.7628914713859558, |
|
"eval_runtime": 2.0473, |
|
"eval_samples_per_second": 55.684, |
|
"eval_steps_per_second": 2.931, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.9692307692307692, |
|
"grad_norm": 3.2774569988250732, |
|
"learning_rate": 1.9910011792459086e-05, |
|
"loss": 0.7437, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.9692307692307692, |
|
"eval_loss": 0.7696098685264587, |
|
"eval_runtime": 2.0418, |
|
"eval_samples_per_second": 55.833, |
|
"eval_steps_per_second": 2.939, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.9846153846153847, |
|
"grad_norm": 2.902707099914551, |
|
"learning_rate": 1.9899451347778962e-05, |
|
"loss": 0.7946, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.9846153846153847, |
|
"eval_loss": 0.760672390460968, |
|
"eval_runtime": 2.054, |
|
"eval_samples_per_second": 55.501, |
|
"eval_steps_per_second": 2.921, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 4.315918922424316, |
|
"learning_rate": 1.9888308262251286e-05, |
|
"loss": 0.7553, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.7670853734016418, |
|
"eval_runtime": 2.0439, |
|
"eval_samples_per_second": 55.775, |
|
"eval_steps_per_second": 2.936, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.0153846153846153, |
|
"grad_norm": 0.9407129287719727, |
|
"learning_rate": 1.9876583191712083e-05, |
|
"loss": 0.4016, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.0153846153846153, |
|
"eval_loss": 0.8053916096687317, |
|
"eval_runtime": 2.0973, |
|
"eval_samples_per_second": 54.355, |
|
"eval_steps_per_second": 2.861, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.0307692307692307, |
|
"grad_norm": 2.96834135055542, |
|
"learning_rate": 1.9864276826250608e-05, |
|
"loss": 0.3824, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.0307692307692307, |
|
"eval_loss": 0.7907722592353821, |
|
"eval_runtime": 2.0495, |
|
"eval_samples_per_second": 55.624, |
|
"eval_steps_per_second": 2.928, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.0461538461538462, |
|
"grad_norm": 3.3183233737945557, |
|
"learning_rate": 1.9851389890168738e-05, |
|
"loss": 0.3728, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.0461538461538462, |
|
"eval_loss": 0.785139799118042, |
|
"eval_runtime": 2.0474, |
|
"eval_samples_per_second": 55.682, |
|
"eval_steps_per_second": 2.931, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.0615384615384615, |
|
"grad_norm": 2.4374091625213623, |
|
"learning_rate": 1.983792314193835e-05, |
|
"loss": 0.3597, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.0615384615384615, |
|
"eval_loss": 0.7914989590644836, |
|
"eval_runtime": 2.0646, |
|
"eval_samples_per_second": 55.217, |
|
"eval_steps_per_second": 2.906, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.0769230769230769, |
|
"grad_norm": 1.1855288743972778, |
|
"learning_rate": 1.9823877374156647e-05, |
|
"loss": 0.2332, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.0769230769230769, |
|
"eval_loss": 0.8562365770339966, |
|
"eval_runtime": 2.1227, |
|
"eval_samples_per_second": 53.706, |
|
"eval_steps_per_second": 2.827, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.0923076923076924, |
|
"grad_norm": 1.8067868947982788, |
|
"learning_rate": 1.9809253413499565e-05, |
|
"loss": 0.3784, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.0923076923076924, |
|
"eval_loss": 0.7976874709129333, |
|
"eval_runtime": 2.0503, |
|
"eval_samples_per_second": 55.601, |
|
"eval_steps_per_second": 2.926, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.1076923076923078, |
|
"grad_norm": 1.8051857948303223, |
|
"learning_rate": 1.979405212067306e-05, |
|
"loss": 0.4172, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.1076923076923078, |
|
"eval_loss": 0.8003087639808655, |
|
"eval_runtime": 2.0576, |
|
"eval_samples_per_second": 55.403, |
|
"eval_steps_per_second": 2.916, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.123076923076923, |
|
"grad_norm": 3.1192572116851807, |
|
"learning_rate": 1.9778274390362488e-05, |
|
"loss": 0.3841, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.123076923076923, |
|
"eval_loss": 0.7872086763381958, |
|
"eval_runtime": 2.0523, |
|
"eval_samples_per_second": 55.546, |
|
"eval_steps_per_second": 2.923, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.1384615384615384, |
|
"grad_norm": 1.4112781286239624, |
|
"learning_rate": 1.9761921151179937e-05, |
|
"loss": 0.2886, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.1384615384615384, |
|
"eval_loss": 0.8142873048782349, |
|
"eval_runtime": 2.0242, |
|
"eval_samples_per_second": 56.319, |
|
"eval_steps_per_second": 2.964, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.1538461538461537, |
|
"grad_norm": 2.2796831130981445, |
|
"learning_rate": 1.9744993365609563e-05, |
|
"loss": 0.4226, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.1538461538461537, |
|
"eval_loss": 0.7786396741867065, |
|
"eval_runtime": 2.0271, |
|
"eval_samples_per_second": 56.237, |
|
"eval_steps_per_second": 2.96, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.1692307692307693, |
|
"grad_norm": 2.9644672870635986, |
|
"learning_rate": 1.9727492029950965e-05, |
|
"loss": 0.3813, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.1692307692307693, |
|
"eval_loss": 0.7970356941223145, |
|
"eval_runtime": 2.0329, |
|
"eval_samples_per_second": 56.077, |
|
"eval_steps_per_second": 2.951, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.1846153846153846, |
|
"grad_norm": 3.2686288356781006, |
|
"learning_rate": 1.9709418174260523e-05, |
|
"loss": 0.3995, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.1846153846153846, |
|
"eval_loss": 0.813422441482544, |
|
"eval_runtime": 2.1424, |
|
"eval_samples_per_second": 53.211, |
|
"eval_steps_per_second": 2.801, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 7.485370635986328, |
|
"learning_rate": 1.969077286229078e-05, |
|
"loss": 0.399, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 0.8145797252655029, |
|
"eval_runtime": 2.0591, |
|
"eval_samples_per_second": 55.365, |
|
"eval_steps_per_second": 2.914, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.2153846153846155, |
|
"grad_norm": 6.251548767089844, |
|
"learning_rate": 1.967155719142785e-05, |
|
"loss": 0.4973, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.2153846153846155, |
|
"eval_loss": 0.7992886900901794, |
|
"eval_runtime": 2.0414, |
|
"eval_samples_per_second": 55.844, |
|
"eval_steps_per_second": 2.939, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"grad_norm": 3.131122589111328, |
|
"learning_rate": 1.9651772292626804e-05, |
|
"loss": 0.3737, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"eval_loss": 0.7876585721969604, |
|
"eval_runtime": 2.0351, |
|
"eval_samples_per_second": 56.018, |
|
"eval_steps_per_second": 2.948, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.2461538461538462, |
|
"grad_norm": 2.9482033252716064, |
|
"learning_rate": 1.9631419330345128e-05, |
|
"loss": 0.3664, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.2461538461538462, |
|
"eval_loss": 0.8115803003311157, |
|
"eval_runtime": 2.0779, |
|
"eval_samples_per_second": 54.862, |
|
"eval_steps_per_second": 2.887, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.2615384615384615, |
|
"grad_norm": 2.246203660964966, |
|
"learning_rate": 1.961049950247418e-05, |
|
"loss": 0.3071, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.2615384615384615, |
|
"eval_loss": 0.8049949407577515, |
|
"eval_runtime": 2.0439, |
|
"eval_samples_per_second": 55.775, |
|
"eval_steps_per_second": 2.936, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.2769230769230768, |
|
"grad_norm": 4.451783180236816, |
|
"learning_rate": 1.9589014040268678e-05, |
|
"loss": 0.3496, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.2769230769230768, |
|
"eval_loss": 0.8066652417182922, |
|
"eval_runtime": 2.0386, |
|
"eval_samples_per_second": 55.92, |
|
"eval_steps_per_second": 2.943, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.2923076923076924, |
|
"grad_norm": 2.6668195724487305, |
|
"learning_rate": 1.9566964208274254e-05, |
|
"loss": 0.3639, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.2923076923076924, |
|
"eval_loss": 0.8018885850906372, |
|
"eval_runtime": 2.0355, |
|
"eval_samples_per_second": 56.007, |
|
"eval_steps_per_second": 2.948, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.3076923076923077, |
|
"grad_norm": 2.0115177631378174, |
|
"learning_rate": 1.954435130425301e-05, |
|
"loss": 0.3899, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.3076923076923077, |
|
"eval_loss": 0.7964260578155518, |
|
"eval_runtime": 2.0482, |
|
"eval_samples_per_second": 55.658, |
|
"eval_steps_per_second": 2.929, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.323076923076923, |
|
"grad_norm": 2.5273962020874023, |
|
"learning_rate": 1.952117665910714e-05, |
|
"loss": 0.3742, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.323076923076923, |
|
"eval_loss": 0.8197150826454163, |
|
"eval_runtime": 2.0642, |
|
"eval_samples_per_second": 55.227, |
|
"eval_steps_per_second": 2.907, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.3384615384615386, |
|
"grad_norm": 1.823403239250183, |
|
"learning_rate": 1.949744163680062e-05, |
|
"loss": 0.3971, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.3384615384615386, |
|
"eval_loss": 0.8292858004570007, |
|
"eval_runtime": 2.0374, |
|
"eval_samples_per_second": 55.955, |
|
"eval_steps_per_second": 2.945, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.353846153846154, |
|
"grad_norm": 3.496244430541992, |
|
"learning_rate": 1.9473147634278884e-05, |
|
"loss": 0.4178, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.353846153846154, |
|
"eval_loss": 0.8014177083969116, |
|
"eval_runtime": 2.0358, |
|
"eval_samples_per_second": 55.997, |
|
"eval_steps_per_second": 2.947, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.3692307692307693, |
|
"grad_norm": 1.5359822511672974, |
|
"learning_rate": 1.9448296081386656e-05, |
|
"loss": 0.3711, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.3692307692307693, |
|
"eval_loss": 0.8050973415374756, |
|
"eval_runtime": 2.0345, |
|
"eval_samples_per_second": 56.032, |
|
"eval_steps_per_second": 2.949, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"grad_norm": 3.2398881912231445, |
|
"learning_rate": 1.9422888440783773e-05, |
|
"loss": 0.3689, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"eval_loss": 0.7915543913841248, |
|
"eval_runtime": 2.042, |
|
"eval_samples_per_second": 55.827, |
|
"eval_steps_per_second": 2.938, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 2.8013181686401367, |
|
"learning_rate": 1.9396926207859085e-05, |
|
"loss": 0.497, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"eval_loss": 0.7924531698226929, |
|
"eval_runtime": 2.0313, |
|
"eval_samples_per_second": 56.12, |
|
"eval_steps_per_second": 2.954, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.4153846153846155, |
|
"grad_norm": 2.0076115131378174, |
|
"learning_rate": 1.9370410910642473e-05, |
|
"loss": 0.3979, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.4153846153846155, |
|
"eval_loss": 0.8060823678970337, |
|
"eval_runtime": 2.0451, |
|
"eval_samples_per_second": 55.743, |
|
"eval_steps_per_second": 2.934, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.4307692307692308, |
|
"grad_norm": 3.230391502380371, |
|
"learning_rate": 1.934334410971489e-05, |
|
"loss": 0.3975, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.4307692307692308, |
|
"eval_loss": 0.7960232496261597, |
|
"eval_runtime": 2.0487, |
|
"eval_samples_per_second": 55.644, |
|
"eval_steps_per_second": 2.929, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.4461538461538461, |
|
"grad_norm": 2.0214502811431885, |
|
"learning_rate": 1.9315727398116516e-05, |
|
"loss": 0.3024, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.4461538461538461, |
|
"eval_loss": 0.8080164790153503, |
|
"eval_runtime": 2.0367, |
|
"eval_samples_per_second": 55.972, |
|
"eval_steps_per_second": 2.946, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.4615384615384617, |
|
"grad_norm": 2.793571949005127, |
|
"learning_rate": 1.9287562401253023e-05, |
|
"loss": 0.42, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.4615384615384617, |
|
"eval_loss": 0.7998307943344116, |
|
"eval_runtime": 2.0337, |
|
"eval_samples_per_second": 56.056, |
|
"eval_steps_per_second": 2.95, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.476923076923077, |
|
"grad_norm": 3.0209238529205322, |
|
"learning_rate": 1.9258850776799875e-05, |
|
"loss": 0.4941, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.476923076923077, |
|
"eval_loss": 0.7900478839874268, |
|
"eval_runtime": 2.08, |
|
"eval_samples_per_second": 54.807, |
|
"eval_steps_per_second": 2.885, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.4923076923076923, |
|
"grad_norm": 1.660001516342163, |
|
"learning_rate": 1.9229594214604782e-05, |
|
"loss": 0.3708, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.4923076923076923, |
|
"eval_loss": 0.7939229607582092, |
|
"eval_runtime": 2.0329, |
|
"eval_samples_per_second": 56.078, |
|
"eval_steps_per_second": 2.951, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.5076923076923077, |
|
"grad_norm": 2.3737564086914062, |
|
"learning_rate": 1.9199794436588244e-05, |
|
"loss": 0.3624, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.5076923076923077, |
|
"eval_loss": 0.8061041831970215, |
|
"eval_runtime": 2.037, |
|
"eval_samples_per_second": 55.965, |
|
"eval_steps_per_second": 2.946, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.523076923076923, |
|
"grad_norm": 3.2474894523620605, |
|
"learning_rate": 1.9169453196642197e-05, |
|
"loss": 0.4532, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.523076923076923, |
|
"eval_loss": 0.8081569075584412, |
|
"eval_runtime": 2.0354, |
|
"eval_samples_per_second": 56.009, |
|
"eval_steps_per_second": 2.948, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 1.9144352674484253, |
|
"learning_rate": 1.9138572280526795e-05, |
|
"loss": 0.3437, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"eval_loss": 0.799223005771637, |
|
"eval_runtime": 2.0389, |
|
"eval_samples_per_second": 55.913, |
|
"eval_steps_per_second": 2.943, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 4550, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 7, |
|
"save_steps": 1000, |
|
"total_flos": 4.82382464459735e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|