|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.8461538461538463, |
|
"eval_steps": 10, |
|
"global_step": 1250, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006153846153846154, |
|
"eval_loss": 1.487821340560913, |
|
"eval_runtime": 1.2374, |
|
"eval_samples_per_second": 92.131, |
|
"eval_steps_per_second": 4.849, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.03076923076923077, |
|
"grad_norm": 20.007999420166016, |
|
"learning_rate": 6.153846153846155e-07, |
|
"loss": 1.4703, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03076923076923077, |
|
"eval_loss": 1.4221965074539185, |
|
"eval_runtime": 1.1701, |
|
"eval_samples_per_second": 97.431, |
|
"eval_steps_per_second": 5.128, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06153846153846154, |
|
"grad_norm": 29.657033920288086, |
|
"learning_rate": 1.230769230769231e-06, |
|
"loss": 1.4402, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06153846153846154, |
|
"eval_loss": 1.238189935684204, |
|
"eval_runtime": 1.1729, |
|
"eval_samples_per_second": 97.195, |
|
"eval_steps_per_second": 5.116, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09230769230769231, |
|
"grad_norm": 10.80688190460205, |
|
"learning_rate": 1.8461538461538465e-06, |
|
"loss": 1.0957, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.09230769230769231, |
|
"eval_loss": 0.908875048160553, |
|
"eval_runtime": 1.1739, |
|
"eval_samples_per_second": 97.111, |
|
"eval_steps_per_second": 5.111, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.12307692307692308, |
|
"grad_norm": 12.914667129516602, |
|
"learning_rate": 2.461538461538462e-06, |
|
"loss": 0.8136, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.12307692307692308, |
|
"eval_loss": 0.7436326146125793, |
|
"eval_runtime": 1.1712, |
|
"eval_samples_per_second": 97.34, |
|
"eval_steps_per_second": 5.123, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"grad_norm": 5.206438064575195, |
|
"learning_rate": 3.0769230769230774e-06, |
|
"loss": 0.7864, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"eval_loss": 0.6924600601196289, |
|
"eval_runtime": 1.1718, |
|
"eval_samples_per_second": 97.287, |
|
"eval_steps_per_second": 5.12, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.18461538461538463, |
|
"grad_norm": 5.670599460601807, |
|
"learning_rate": 3.692307692307693e-06, |
|
"loss": 0.622, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.18461538461538463, |
|
"eval_loss": 0.6791077852249146, |
|
"eval_runtime": 1.1678, |
|
"eval_samples_per_second": 97.62, |
|
"eval_steps_per_second": 5.138, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2153846153846154, |
|
"grad_norm": 3.6266868114471436, |
|
"learning_rate": 4.307692307692308e-06, |
|
"loss": 0.6672, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2153846153846154, |
|
"eval_loss": 0.6631948947906494, |
|
"eval_runtime": 1.1626, |
|
"eval_samples_per_second": 98.06, |
|
"eval_steps_per_second": 5.161, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.24615384615384617, |
|
"grad_norm": 3.1027884483337402, |
|
"learning_rate": 4.923076923076924e-06, |
|
"loss": 0.6097, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.24615384615384617, |
|
"eval_loss": 0.6625475883483887, |
|
"eval_runtime": 1.1717, |
|
"eval_samples_per_second": 97.294, |
|
"eval_steps_per_second": 5.121, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.27692307692307694, |
|
"grad_norm": 3.233839511871338, |
|
"learning_rate": 5.538461538461539e-06, |
|
"loss": 0.6957, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.27692307692307694, |
|
"eval_loss": 0.6590337157249451, |
|
"eval_runtime": 1.1664, |
|
"eval_samples_per_second": 97.738, |
|
"eval_steps_per_second": 5.144, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 4.932614803314209, |
|
"learning_rate": 6.153846153846155e-06, |
|
"loss": 0.6994, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"eval_loss": 0.6592673659324646, |
|
"eval_runtime": 1.1692, |
|
"eval_samples_per_second": 97.502, |
|
"eval_steps_per_second": 5.132, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3384615384615385, |
|
"grad_norm": 2.642303466796875, |
|
"learning_rate": 6.76923076923077e-06, |
|
"loss": 0.6501, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3384615384615385, |
|
"eval_loss": 0.6589992642402649, |
|
"eval_runtime": 1.1669, |
|
"eval_samples_per_second": 97.691, |
|
"eval_steps_per_second": 5.142, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.36923076923076925, |
|
"grad_norm": 3.45409893989563, |
|
"learning_rate": 7.384615384615386e-06, |
|
"loss": 0.7512, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.36923076923076925, |
|
"eval_loss": 0.6568591594696045, |
|
"eval_runtime": 1.1778, |
|
"eval_samples_per_second": 96.79, |
|
"eval_steps_per_second": 5.094, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 8.460351943969727, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.5996, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.6578297019004822, |
|
"eval_runtime": 1.1683, |
|
"eval_samples_per_second": 97.575, |
|
"eval_steps_per_second": 5.136, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4307692307692308, |
|
"grad_norm": 2.4495465755462646, |
|
"learning_rate": 8.615384615384617e-06, |
|
"loss": 0.6498, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4307692307692308, |
|
"eval_loss": 0.6588199138641357, |
|
"eval_runtime": 1.1664, |
|
"eval_samples_per_second": 97.739, |
|
"eval_steps_per_second": 5.144, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"grad_norm": 2.6779162883758545, |
|
"learning_rate": 9.230769230769232e-06, |
|
"loss": 0.703, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"eval_loss": 0.6626676321029663, |
|
"eval_runtime": 1.1764, |
|
"eval_samples_per_second": 96.904, |
|
"eval_steps_per_second": 5.1, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.49230769230769234, |
|
"grad_norm": 8.147880554199219, |
|
"learning_rate": 9.846153846153848e-06, |
|
"loss": 0.6649, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.49230769230769234, |
|
"eval_loss": 0.6644802689552307, |
|
"eval_runtime": 1.1701, |
|
"eval_samples_per_second": 97.424, |
|
"eval_steps_per_second": 5.128, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5230769230769231, |
|
"grad_norm": 2.7410728931427, |
|
"learning_rate": 1.0461538461538463e-05, |
|
"loss": 0.6704, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5230769230769231, |
|
"eval_loss": 0.6591455340385437, |
|
"eval_runtime": 1.1722, |
|
"eval_samples_per_second": 97.253, |
|
"eval_steps_per_second": 5.119, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5538461538461539, |
|
"grad_norm": 6.886598587036133, |
|
"learning_rate": 1.1076923076923079e-05, |
|
"loss": 0.6073, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5538461538461539, |
|
"eval_loss": 0.6617754697799683, |
|
"eval_runtime": 1.1682, |
|
"eval_samples_per_second": 97.588, |
|
"eval_steps_per_second": 5.136, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5846153846153846, |
|
"grad_norm": 4.26249361038208, |
|
"learning_rate": 1.1692307692307694e-05, |
|
"loss": 0.7556, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5846153846153846, |
|
"eval_loss": 0.6659030318260193, |
|
"eval_runtime": 1.1721, |
|
"eval_samples_per_second": 97.265, |
|
"eval_steps_per_second": 5.119, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 4.204222202301025, |
|
"learning_rate": 1.230769230769231e-05, |
|
"loss": 0.6583, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"eval_loss": 0.6660778522491455, |
|
"eval_runtime": 1.1728, |
|
"eval_samples_per_second": 97.201, |
|
"eval_steps_per_second": 5.116, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6461538461538462, |
|
"grad_norm": 3.154658079147339, |
|
"learning_rate": 1.2923076923076925e-05, |
|
"loss": 0.7288, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6461538461538462, |
|
"eval_loss": 0.6642422676086426, |
|
"eval_runtime": 1.166, |
|
"eval_samples_per_second": 97.772, |
|
"eval_steps_per_second": 5.146, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.676923076923077, |
|
"grad_norm": 3.248359203338623, |
|
"learning_rate": 1.353846153846154e-05, |
|
"loss": 0.6702, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.676923076923077, |
|
"eval_loss": 0.6680281758308411, |
|
"eval_runtime": 1.1691, |
|
"eval_samples_per_second": 97.507, |
|
"eval_steps_per_second": 5.132, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7076923076923077, |
|
"grad_norm": 2.5507216453552246, |
|
"learning_rate": 1.4153846153846156e-05, |
|
"loss": 0.66, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7076923076923077, |
|
"eval_loss": 0.6713310480117798, |
|
"eval_runtime": 1.167, |
|
"eval_samples_per_second": 97.683, |
|
"eval_steps_per_second": 5.141, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7384615384615385, |
|
"grad_norm": 2.1180007457733154, |
|
"learning_rate": 1.4769230769230772e-05, |
|
"loss": 0.6172, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7384615384615385, |
|
"eval_loss": 0.6759226322174072, |
|
"eval_runtime": 1.1758, |
|
"eval_samples_per_second": 96.954, |
|
"eval_steps_per_second": 5.103, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 2.8318371772766113, |
|
"learning_rate": 1.5384615384615387e-05, |
|
"loss": 0.5954, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"eval_loss": 0.6854314208030701, |
|
"eval_runtime": 1.1698, |
|
"eval_samples_per_second": 97.457, |
|
"eval_steps_per_second": 5.129, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.519829034805298, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.6639, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.6849851012229919, |
|
"eval_runtime": 1.1732, |
|
"eval_samples_per_second": 97.173, |
|
"eval_steps_per_second": 5.114, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8307692307692308, |
|
"grad_norm": 3.7909388542175293, |
|
"learning_rate": 1.6615384615384618e-05, |
|
"loss": 0.6892, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8307692307692308, |
|
"eval_loss": 0.6863855123519897, |
|
"eval_runtime": 1.1689, |
|
"eval_samples_per_second": 97.53, |
|
"eval_steps_per_second": 5.133, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8615384615384616, |
|
"grad_norm": 8.278326988220215, |
|
"learning_rate": 1.7230769230769234e-05, |
|
"loss": 0.6014, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8615384615384616, |
|
"eval_loss": 0.6890373826026917, |
|
"eval_runtime": 1.1722, |
|
"eval_samples_per_second": 97.251, |
|
"eval_steps_per_second": 5.118, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8923076923076924, |
|
"grad_norm": 2.616001605987549, |
|
"learning_rate": 1.784615384615385e-05, |
|
"loss": 0.6775, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.8923076923076924, |
|
"eval_loss": 0.6797110438346863, |
|
"eval_runtime": 1.1888, |
|
"eval_samples_per_second": 95.897, |
|
"eval_steps_per_second": 5.047, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 2.4450528621673584, |
|
"learning_rate": 1.8461538461538465e-05, |
|
"loss": 0.7434, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"eval_loss": 0.6779448986053467, |
|
"eval_runtime": 1.1722, |
|
"eval_samples_per_second": 97.254, |
|
"eval_steps_per_second": 5.119, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9538461538461539, |
|
"grad_norm": 4.121164798736572, |
|
"learning_rate": 1.907692307692308e-05, |
|
"loss": 0.6413, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.9538461538461539, |
|
"eval_loss": 0.6819297671318054, |
|
"eval_runtime": 1.1683, |
|
"eval_samples_per_second": 97.574, |
|
"eval_steps_per_second": 5.135, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.9846153846153847, |
|
"grad_norm": 2.057229995727539, |
|
"learning_rate": 1.9692307692307696e-05, |
|
"loss": 0.6511, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.9846153846153847, |
|
"eval_loss": 0.6861481666564941, |
|
"eval_runtime": 1.1699, |
|
"eval_samples_per_second": 97.444, |
|
"eval_steps_per_second": 5.129, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.0153846153846153, |
|
"grad_norm": 1.7295210361480713, |
|
"learning_rate": 1.9999855802751384e-05, |
|
"loss": 0.5872, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.0153846153846153, |
|
"eval_loss": 0.7481781244277954, |
|
"eval_runtime": 1.172, |
|
"eval_samples_per_second": 97.273, |
|
"eval_steps_per_second": 5.12, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.0461538461538462, |
|
"grad_norm": 3.6910336017608643, |
|
"learning_rate": 1.9998702249713747e-05, |
|
"loss": 0.438, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.0461538461538462, |
|
"eval_loss": 0.7471169829368591, |
|
"eval_runtime": 1.175, |
|
"eval_samples_per_second": 97.019, |
|
"eval_steps_per_second": 5.106, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.0769230769230769, |
|
"grad_norm": 3.9220831394195557, |
|
"learning_rate": 1.9996395276708856e-05, |
|
"loss": 0.4141, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.0769230769230769, |
|
"eval_loss": 0.7425897121429443, |
|
"eval_runtime": 1.1888, |
|
"eval_samples_per_second": 95.893, |
|
"eval_steps_per_second": 5.047, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.1076923076923078, |
|
"grad_norm": 2.1136879920959473, |
|
"learning_rate": 1.9992935149862116e-05, |
|
"loss": 0.4173, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.1076923076923078, |
|
"eval_loss": 0.7477542161941528, |
|
"eval_runtime": 1.1705, |
|
"eval_samples_per_second": 97.398, |
|
"eval_steps_per_second": 5.126, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.1384615384615384, |
|
"grad_norm": 2.187299966812134, |
|
"learning_rate": 1.998832226832327e-05, |
|
"loss": 0.4242, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.1384615384615384, |
|
"eval_loss": 0.7475994229316711, |
|
"eval_runtime": 1.1683, |
|
"eval_samples_per_second": 97.582, |
|
"eval_steps_per_second": 5.136, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.1692307692307693, |
|
"grad_norm": 2.683750629425049, |
|
"learning_rate": 1.9982557164220335e-05, |
|
"loss": 0.4413, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.1692307692307693, |
|
"eval_loss": 0.7552060484886169, |
|
"eval_runtime": 1.1689, |
|
"eval_samples_per_second": 97.529, |
|
"eval_steps_per_second": 5.133, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 3.312657117843628, |
|
"learning_rate": 1.9975640502598243e-05, |
|
"loss": 0.3361, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 0.7630029916763306, |
|
"eval_runtime": 1.1689, |
|
"eval_samples_per_second": 97.529, |
|
"eval_steps_per_second": 5.133, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"grad_norm": 2.3290536403656006, |
|
"learning_rate": 1.9967573081342103e-05, |
|
"loss": 0.4174, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"eval_loss": 0.7669293880462646, |
|
"eval_runtime": 1.1711, |
|
"eval_samples_per_second": 97.345, |
|
"eval_steps_per_second": 5.123, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.2615384615384615, |
|
"grad_norm": 2.4483697414398193, |
|
"learning_rate": 1.9958355831085155e-05, |
|
"loss": 0.4045, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.2615384615384615, |
|
"eval_loss": 0.7652024626731873, |
|
"eval_runtime": 1.1697, |
|
"eval_samples_per_second": 97.461, |
|
"eval_steps_per_second": 5.13, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.2923076923076924, |
|
"grad_norm": 1.703616976737976, |
|
"learning_rate": 1.9947989815101444e-05, |
|
"loss": 0.4292, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.2923076923076924, |
|
"eval_loss": 0.7553891539573669, |
|
"eval_runtime": 1.1719, |
|
"eval_samples_per_second": 97.277, |
|
"eval_steps_per_second": 5.12, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.323076923076923, |
|
"grad_norm": 1.6245399713516235, |
|
"learning_rate": 1.9936476229183133e-05, |
|
"loss": 0.4164, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.323076923076923, |
|
"eval_loss": 0.7711736559867859, |
|
"eval_runtime": 1.1776, |
|
"eval_samples_per_second": 96.806, |
|
"eval_steps_per_second": 5.095, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.353846153846154, |
|
"grad_norm": 1.9511611461639404, |
|
"learning_rate": 1.992381640150257e-05, |
|
"loss": 0.4337, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.353846153846154, |
|
"eval_loss": 0.7438372373580933, |
|
"eval_runtime": 1.1692, |
|
"eval_samples_per_second": 97.5, |
|
"eval_steps_per_second": 5.132, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"grad_norm": 2.466737747192383, |
|
"learning_rate": 1.9910011792459086e-05, |
|
"loss": 0.3712, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"eval_loss": 0.8045614361763, |
|
"eval_runtime": 1.1689, |
|
"eval_samples_per_second": 97.53, |
|
"eval_steps_per_second": 5.133, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.4153846153846155, |
|
"grad_norm": 2.473851203918457, |
|
"learning_rate": 1.9895063994510512e-05, |
|
"loss": 0.516, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.4153846153846155, |
|
"eval_loss": 0.7463352084159851, |
|
"eval_runtime": 1.1697, |
|
"eval_samples_per_second": 97.464, |
|
"eval_steps_per_second": 5.13, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.4461538461538461, |
|
"grad_norm": 2.757955312728882, |
|
"learning_rate": 1.9878974731989487e-05, |
|
"loss": 0.4965, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.4461538461538461, |
|
"eval_loss": 0.7571713924407959, |
|
"eval_runtime": 1.1688, |
|
"eval_samples_per_second": 97.534, |
|
"eval_steps_per_second": 5.133, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.476923076923077, |
|
"grad_norm": 3.9884915351867676, |
|
"learning_rate": 1.9861745860904538e-05, |
|
"loss": 0.3973, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.476923076923077, |
|
"eval_loss": 0.7763081789016724, |
|
"eval_runtime": 1.168, |
|
"eval_samples_per_second": 97.605, |
|
"eval_steps_per_second": 5.137, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.5076923076923077, |
|
"grad_norm": 1.9474631547927856, |
|
"learning_rate": 1.9843379368725978e-05, |
|
"loss": 0.4625, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.5076923076923077, |
|
"eval_loss": 0.7595367431640625, |
|
"eval_runtime": 1.1708, |
|
"eval_samples_per_second": 97.37, |
|
"eval_steps_per_second": 5.125, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 2.2907869815826416, |
|
"learning_rate": 1.9823877374156647e-05, |
|
"loss": 0.4371, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"eval_loss": 0.762865424156189, |
|
"eval_runtime": 1.1766, |
|
"eval_samples_per_second": 96.891, |
|
"eval_steps_per_second": 5.1, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.5692307692307692, |
|
"grad_norm": 3.067608594894409, |
|
"learning_rate": 1.9803242126887496e-05, |
|
"loss": 0.45, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.5692307692307692, |
|
"eval_loss": 0.7638718485832214, |
|
"eval_runtime": 1.1684, |
|
"eval_samples_per_second": 97.568, |
|
"eval_steps_per_second": 5.135, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.1113197803497314, |
|
"learning_rate": 1.9781476007338058e-05, |
|
"loss": 0.4524, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 0.7560412883758545, |
|
"eval_runtime": 1.1719, |
|
"eval_samples_per_second": 97.278, |
|
"eval_steps_per_second": 5.12, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.6307692307692307, |
|
"grad_norm": 1.889585256576538, |
|
"learning_rate": 1.9758581526381878e-05, |
|
"loss": 0.4287, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.6307692307692307, |
|
"eval_loss": 0.7591350674629211, |
|
"eval_runtime": 1.1693, |
|
"eval_samples_per_second": 97.495, |
|
"eval_steps_per_second": 5.131, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.6615384615384614, |
|
"grad_norm": 2.519418716430664, |
|
"learning_rate": 1.973456132505684e-05, |
|
"loss": 0.4603, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.6615384615384614, |
|
"eval_loss": 0.7536805868148804, |
|
"eval_runtime": 1.1799, |
|
"eval_samples_per_second": 96.617, |
|
"eval_steps_per_second": 5.085, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.6923076923076923, |
|
"grad_norm": 2.7455596923828125, |
|
"learning_rate": 1.9709418174260523e-05, |
|
"loss": 0.4293, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.6923076923076923, |
|
"eval_loss": 0.763169527053833, |
|
"eval_runtime": 1.1696, |
|
"eval_samples_per_second": 97.47, |
|
"eval_steps_per_second": 5.13, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.7230769230769232, |
|
"grad_norm": 1.9823411703109741, |
|
"learning_rate": 1.9683154974430544e-05, |
|
"loss": 0.4643, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.7230769230769232, |
|
"eval_loss": 0.7607314586639404, |
|
"eval_runtime": 1.1712, |
|
"eval_samples_per_second": 97.336, |
|
"eval_steps_per_second": 5.123, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.7538461538461538, |
|
"grad_norm": 2.2437684535980225, |
|
"learning_rate": 1.965577475520999e-05, |
|
"loss": 0.452, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.7538461538461538, |
|
"eval_loss": 0.7615717649459839, |
|
"eval_runtime": 1.1946, |
|
"eval_samples_per_second": 95.425, |
|
"eval_steps_per_second": 5.022, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.7846153846153845, |
|
"grad_norm": 3.0740182399749756, |
|
"learning_rate": 1.962728067509791e-05, |
|
"loss": 0.4885, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.7846153846153845, |
|
"eval_loss": 0.7764689326286316, |
|
"eval_runtime": 1.1714, |
|
"eval_samples_per_second": 97.32, |
|
"eval_steps_per_second": 5.122, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.8153846153846154, |
|
"grad_norm": 2.050313711166382, |
|
"learning_rate": 1.9597676021084962e-05, |
|
"loss": 0.4992, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.8153846153846154, |
|
"eval_loss": 0.7457339763641357, |
|
"eval_runtime": 1.1747, |
|
"eval_samples_per_second": 97.045, |
|
"eval_steps_per_second": 5.108, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.8461538461538463, |
|
"grad_norm": 2.349470853805542, |
|
"learning_rate": 1.9566964208274254e-05, |
|
"loss": 0.3963, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.8461538461538463, |
|
"eval_loss": 0.7712545990943909, |
|
"eval_runtime": 1.17, |
|
"eval_samples_per_second": 97.433, |
|
"eval_steps_per_second": 5.128, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.876923076923077, |
|
"grad_norm": 1.947912573814392, |
|
"learning_rate": 1.9535148779487365e-05, |
|
"loss": 0.4388, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.876923076923077, |
|
"eval_loss": 0.772519052028656, |
|
"eval_runtime": 1.1717, |
|
"eval_samples_per_second": 97.296, |
|
"eval_steps_per_second": 5.121, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.9076923076923076, |
|
"grad_norm": 1.8837225437164307, |
|
"learning_rate": 1.9502233404855672e-05, |
|
"loss": 0.3965, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.9076923076923076, |
|
"eval_loss": 0.7594517469406128, |
|
"eval_runtime": 1.2215, |
|
"eval_samples_per_second": 93.326, |
|
"eval_steps_per_second": 4.912, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.9384615384615385, |
|
"grad_norm": 1.8398792743682861, |
|
"learning_rate": 1.946822188139696e-05, |
|
"loss": 0.3898, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.9384615384615385, |
|
"eval_loss": 0.7748451828956604, |
|
"eval_runtime": 1.2222, |
|
"eval_samples_per_second": 93.273, |
|
"eval_steps_per_second": 4.909, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.9692307692307693, |
|
"grad_norm": 2.3971219062805176, |
|
"learning_rate": 1.9433118132577432e-05, |
|
"loss": 0.4253, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.9692307692307693, |
|
"eval_loss": 0.7729252576828003, |
|
"eval_runtime": 1.1743, |
|
"eval_samples_per_second": 97.081, |
|
"eval_steps_per_second": 5.11, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.657328724861145, |
|
"learning_rate": 1.9396926207859085e-05, |
|
"loss": 0.4898, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.7602188587188721, |
|
"eval_runtime": 1.1738, |
|
"eval_samples_per_second": 97.117, |
|
"eval_steps_per_second": 5.111, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.0307692307692307, |
|
"grad_norm": 1.3759773969650269, |
|
"learning_rate": 1.935965028223259e-05, |
|
"loss": 0.2182, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.0307692307692307, |
|
"eval_loss": 0.8499692678451538, |
|
"eval_runtime": 1.1719, |
|
"eval_samples_per_second": 97.276, |
|
"eval_steps_per_second": 5.12, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.0615384615384613, |
|
"grad_norm": 1.4757074117660522, |
|
"learning_rate": 1.932129465573568e-05, |
|
"loss": 0.2196, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.0615384615384613, |
|
"eval_loss": 0.866599977016449, |
|
"eval_runtime": 1.1739, |
|
"eval_samples_per_second": 97.115, |
|
"eval_steps_per_second": 5.111, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.0923076923076924, |
|
"grad_norm": 1.2500200271606445, |
|
"learning_rate": 1.9281863752957095e-05, |
|
"loss": 0.1772, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.0923076923076924, |
|
"eval_loss": 0.8642630577087402, |
|
"eval_runtime": 1.1753, |
|
"eval_samples_per_second": 96.999, |
|
"eval_steps_per_second": 5.105, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.123076923076923, |
|
"grad_norm": 2.561828851699829, |
|
"learning_rate": 1.92413621225262e-05, |
|
"loss": 0.2014, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.123076923076923, |
|
"eval_loss": 0.8872668743133545, |
|
"eval_runtime": 1.1761, |
|
"eval_samples_per_second": 96.933, |
|
"eval_steps_per_second": 5.102, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.1538461538461537, |
|
"grad_norm": 1.8431521654129028, |
|
"learning_rate": 1.9199794436588244e-05, |
|
"loss": 0.2245, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.1538461538461537, |
|
"eval_loss": 0.8799911737442017, |
|
"eval_runtime": 1.1766, |
|
"eval_samples_per_second": 96.89, |
|
"eval_steps_per_second": 5.099, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.184615384615385, |
|
"grad_norm": 1.4718531370162964, |
|
"learning_rate": 1.915716549026541e-05, |
|
"loss": 0.1911, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.184615384615385, |
|
"eval_loss": 0.8806982636451721, |
|
"eval_runtime": 1.1696, |
|
"eval_samples_per_second": 97.473, |
|
"eval_steps_per_second": 5.13, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.2153846153846155, |
|
"grad_norm": 2.061520576477051, |
|
"learning_rate": 1.9113480201103658e-05, |
|
"loss": 0.232, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.2153846153846155, |
|
"eval_loss": 0.8812103271484375, |
|
"eval_runtime": 1.1676, |
|
"eval_samples_per_second": 97.636, |
|
"eval_steps_per_second": 5.139, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.246153846153846, |
|
"grad_norm": 2.530285596847534, |
|
"learning_rate": 1.9068743608505454e-05, |
|
"loss": 0.2232, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.246153846153846, |
|
"eval_loss": 0.8682798147201538, |
|
"eval_runtime": 1.1687, |
|
"eval_samples_per_second": 97.545, |
|
"eval_steps_per_second": 5.134, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.276923076923077, |
|
"grad_norm": 1.8433207273483276, |
|
"learning_rate": 1.902296087314845e-05, |
|
"loss": 0.2315, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.276923076923077, |
|
"eval_loss": 0.8747490048408508, |
|
"eval_runtime": 1.1702, |
|
"eval_samples_per_second": 97.421, |
|
"eval_steps_per_second": 5.127, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"grad_norm": 2.1529664993286133, |
|
"learning_rate": 1.8976137276390145e-05, |
|
"loss": 0.239, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"eval_loss": 0.8778530359268188, |
|
"eval_runtime": 1.1674, |
|
"eval_samples_per_second": 97.656, |
|
"eval_steps_per_second": 5.14, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.3384615384615386, |
|
"grad_norm": 1.839979887008667, |
|
"learning_rate": 1.892827821965864e-05, |
|
"loss": 0.2228, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.3384615384615386, |
|
"eval_loss": 0.8769873976707458, |
|
"eval_runtime": 1.1718, |
|
"eval_samples_per_second": 97.288, |
|
"eval_steps_per_second": 5.12, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.3692307692307693, |
|
"grad_norm": 1.3814674615859985, |
|
"learning_rate": 1.8879389223829592e-05, |
|
"loss": 0.1999, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.3692307692307693, |
|
"eval_loss": 0.8731983304023743, |
|
"eval_runtime": 1.1723, |
|
"eval_samples_per_second": 97.242, |
|
"eval_steps_per_second": 5.118, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.5382564067840576, |
|
"learning_rate": 1.8829475928589272e-05, |
|
"loss": 0.1888, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"eval_loss": 0.8794592022895813, |
|
"eval_runtime": 1.1678, |
|
"eval_samples_per_second": 97.622, |
|
"eval_steps_per_second": 5.138, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.430769230769231, |
|
"grad_norm": 2.371277093887329, |
|
"learning_rate": 1.8778544091784047e-05, |
|
"loss": 0.2293, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.430769230769231, |
|
"eval_loss": 0.8893064260482788, |
|
"eval_runtime": 1.1684, |
|
"eval_samples_per_second": 97.567, |
|
"eval_steps_per_second": 5.135, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.4615384615384617, |
|
"grad_norm": 1.4892629384994507, |
|
"learning_rate": 1.8726599588756144e-05, |
|
"loss": 0.2326, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.4615384615384617, |
|
"eval_loss": 0.8676194548606873, |
|
"eval_runtime": 1.1699, |
|
"eval_samples_per_second": 97.441, |
|
"eval_steps_per_second": 5.128, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.4923076923076923, |
|
"grad_norm": 1.6047152280807495, |
|
"learning_rate": 1.8673648411665895e-05, |
|
"loss": 0.1986, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.4923076923076923, |
|
"eval_loss": 0.8942193984985352, |
|
"eval_runtime": 1.17, |
|
"eval_samples_per_second": 97.433, |
|
"eval_steps_per_second": 5.128, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.523076923076923, |
|
"grad_norm": 2.106461524963379, |
|
"learning_rate": 1.8619696668800494e-05, |
|
"loss": 0.238, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.523076923076923, |
|
"eval_loss": 0.8907297849655151, |
|
"eval_runtime": 1.1681, |
|
"eval_samples_per_second": 97.593, |
|
"eval_steps_per_second": 5.136, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.5538461538461537, |
|
"grad_norm": 1.7706941366195679, |
|
"learning_rate": 1.8564750583869374e-05, |
|
"loss": 0.2278, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.5538461538461537, |
|
"eval_loss": 0.8588199019432068, |
|
"eval_runtime": 1.1709, |
|
"eval_samples_per_second": 97.36, |
|
"eval_steps_per_second": 5.124, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.5846153846153848, |
|
"grad_norm": 1.4446183443069458, |
|
"learning_rate": 1.850881649528625e-05, |
|
"loss": 0.2033, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.5846153846153848, |
|
"eval_loss": 0.8720155954360962, |
|
"eval_runtime": 1.1716, |
|
"eval_samples_per_second": 97.304, |
|
"eval_steps_per_second": 5.121, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.6153846153846154, |
|
"grad_norm": 1.5155885219573975, |
|
"learning_rate": 1.845190085543795e-05, |
|
"loss": 0.2211, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.6153846153846154, |
|
"eval_loss": 0.8732415437698364, |
|
"eval_runtime": 1.1708, |
|
"eval_samples_per_second": 97.371, |
|
"eval_steps_per_second": 5.125, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.646153846153846, |
|
"grad_norm": 2.934957981109619, |
|
"learning_rate": 1.839401022994006e-05, |
|
"loss": 0.21, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.646153846153846, |
|
"eval_loss": 0.8790932297706604, |
|
"eval_runtime": 1.1671, |
|
"eval_samples_per_second": 97.68, |
|
"eval_steps_per_second": 5.141, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.676923076923077, |
|
"grad_norm": 2.930457353591919, |
|
"learning_rate": 1.8335151296879576e-05, |
|
"loss": 0.2313, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.676923076923077, |
|
"eval_loss": 0.8555310368537903, |
|
"eval_runtime": 1.1718, |
|
"eval_samples_per_second": 97.284, |
|
"eval_steps_per_second": 5.12, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.707692307692308, |
|
"grad_norm": 2.029303789138794, |
|
"learning_rate": 1.82753308460445e-05, |
|
"loss": 0.2285, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.707692307692308, |
|
"eval_loss": 0.870873749256134, |
|
"eval_runtime": 1.1696, |
|
"eval_samples_per_second": 97.47, |
|
"eval_steps_per_second": 5.13, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.7384615384615385, |
|
"grad_norm": 1.949743390083313, |
|
"learning_rate": 1.821455577814062e-05, |
|
"loss": 0.2134, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.7384615384615385, |
|
"eval_loss": 0.8873838186264038, |
|
"eval_runtime": 1.1688, |
|
"eval_samples_per_second": 97.538, |
|
"eval_steps_per_second": 5.134, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"grad_norm": 1.6945637464523315, |
|
"learning_rate": 1.8152833103995443e-05, |
|
"loss": 0.218, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"eval_loss": 0.8798539042472839, |
|
"eval_runtime": 1.1699, |
|
"eval_samples_per_second": 97.444, |
|
"eval_steps_per_second": 5.129, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 2.2784035205841064, |
|
"learning_rate": 1.8090169943749477e-05, |
|
"loss": 0.2426, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"eval_loss": 0.8753871321678162, |
|
"eval_runtime": 1.1723, |
|
"eval_samples_per_second": 97.246, |
|
"eval_steps_per_second": 5.118, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.830769230769231, |
|
"grad_norm": 1.750718593597412, |
|
"learning_rate": 1.802657352603483e-05, |
|
"loss": 0.2059, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.830769230769231, |
|
"eval_loss": 0.8761821985244751, |
|
"eval_runtime": 1.1762, |
|
"eval_samples_per_second": 96.92, |
|
"eval_steps_per_second": 5.101, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.8615384615384616, |
|
"grad_norm": 1.2204371690750122, |
|
"learning_rate": 1.7962051187141377e-05, |
|
"loss": 0.2229, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.8615384615384616, |
|
"eval_loss": 0.8772630095481873, |
|
"eval_runtime": 1.1697, |
|
"eval_samples_per_second": 97.458, |
|
"eval_steps_per_second": 5.129, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.8923076923076922, |
|
"grad_norm": 2.0748281478881836, |
|
"learning_rate": 1.7896610370170452e-05, |
|
"loss": 0.2582, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.8923076923076922, |
|
"eval_loss": 0.884286105632782, |
|
"eval_runtime": 1.1773, |
|
"eval_samples_per_second": 96.829, |
|
"eval_steps_per_second": 5.096, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.9230769230769234, |
|
"grad_norm": 1.9051027297973633, |
|
"learning_rate": 1.7830258624176224e-05, |
|
"loss": 0.2102, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.9230769230769234, |
|
"eval_loss": 0.878767192363739, |
|
"eval_runtime": 1.1684, |
|
"eval_samples_per_second": 97.571, |
|
"eval_steps_per_second": 5.135, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.953846153846154, |
|
"grad_norm": 1.8921849727630615, |
|
"learning_rate": 1.776300360329488e-05, |
|
"loss": 0.2331, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.953846153846154, |
|
"eval_loss": 0.8681402206420898, |
|
"eval_runtime": 1.1693, |
|
"eval_samples_per_second": 97.497, |
|
"eval_steps_per_second": 5.131, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.9846153846153847, |
|
"grad_norm": 1.5593761205673218, |
|
"learning_rate": 1.769485306586166e-05, |
|
"loss": 0.2226, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.9846153846153847, |
|
"eval_loss": 0.8726215958595276, |
|
"eval_runtime": 1.1712, |
|
"eval_samples_per_second": 97.336, |
|
"eval_steps_per_second": 5.123, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.0153846153846153, |
|
"grad_norm": 1.4075913429260254, |
|
"learning_rate": 1.762581487351587e-05, |
|
"loss": 0.1535, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.0153846153846153, |
|
"eval_loss": 0.9113463163375854, |
|
"eval_runtime": 1.1794, |
|
"eval_samples_per_second": 96.661, |
|
"eval_steps_per_second": 5.087, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.046153846153846, |
|
"grad_norm": 1.410906434059143, |
|
"learning_rate": 1.7555896990294003e-05, |
|
"loss": 0.1021, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.046153846153846, |
|
"eval_loss": 0.9633604288101196, |
|
"eval_runtime": 1.1702, |
|
"eval_samples_per_second": 97.416, |
|
"eval_steps_per_second": 5.127, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 1.0470328330993652, |
|
"learning_rate": 1.7485107481711014e-05, |
|
"loss": 0.1118, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"eval_loss": 0.9647616744041443, |
|
"eval_runtime": 1.1683, |
|
"eval_samples_per_second": 97.574, |
|
"eval_steps_per_second": 5.135, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.1076923076923078, |
|
"grad_norm": 2.098592758178711, |
|
"learning_rate": 1.741345451382992e-05, |
|
"loss": 0.1224, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 3.1076923076923078, |
|
"eval_loss": 0.9478728175163269, |
|
"eval_runtime": 1.1735, |
|
"eval_samples_per_second": 97.143, |
|
"eval_steps_per_second": 5.113, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 3.1384615384615384, |
|
"grad_norm": 1.6263700723648071, |
|
"learning_rate": 1.7340946352319795e-05, |
|
"loss": 0.1377, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.1384615384615384, |
|
"eval_loss": 0.922594428062439, |
|
"eval_runtime": 1.1721, |
|
"eval_samples_per_second": 97.262, |
|
"eval_steps_per_second": 5.119, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.169230769230769, |
|
"grad_norm": 1.0413360595703125, |
|
"learning_rate": 1.7267591361502233e-05, |
|
"loss": 0.1249, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 3.169230769230769, |
|
"eval_loss": 0.9370183944702148, |
|
"eval_runtime": 1.169, |
|
"eval_samples_per_second": 97.519, |
|
"eval_steps_per_second": 5.133, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 1.6745877265930176, |
|
"learning_rate": 1.7193398003386514e-05, |
|
"loss": 0.1186, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"eval_loss": 0.9573487639427185, |
|
"eval_runtime": 1.1681, |
|
"eval_samples_per_second": 97.596, |
|
"eval_steps_per_second": 5.137, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.230769230769231, |
|
"grad_norm": 1.3979861736297607, |
|
"learning_rate": 1.7118374836693407e-05, |
|
"loss": 0.135, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 3.230769230769231, |
|
"eval_loss": 0.9554935693740845, |
|
"eval_runtime": 1.1664, |
|
"eval_samples_per_second": 97.735, |
|
"eval_steps_per_second": 5.144, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 3.2615384615384615, |
|
"grad_norm": 1.4017126560211182, |
|
"learning_rate": 1.7042530515867897e-05, |
|
"loss": 0.1102, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 3.2615384615384615, |
|
"eval_loss": 0.9547311663627625, |
|
"eval_runtime": 1.1685, |
|
"eval_samples_per_second": 97.561, |
|
"eval_steps_per_second": 5.135, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 3.292307692307692, |
|
"grad_norm": 1.3390568494796753, |
|
"learning_rate": 1.6965873790080806e-05, |
|
"loss": 0.1358, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 3.292307692307692, |
|
"eval_loss": 0.9695214629173279, |
|
"eval_runtime": 1.1695, |
|
"eval_samples_per_second": 97.474, |
|
"eval_steps_per_second": 5.13, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 3.3230769230769233, |
|
"grad_norm": 1.9002848863601685, |
|
"learning_rate": 1.6888413502219534e-05, |
|
"loss": 0.1419, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 3.3230769230769233, |
|
"eval_loss": 0.9528248310089111, |
|
"eval_runtime": 1.174, |
|
"eval_samples_per_second": 97.107, |
|
"eval_steps_per_second": 5.111, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 3.353846153846154, |
|
"grad_norm": 1.7857142686843872, |
|
"learning_rate": 1.6810158587867973e-05, |
|
"loss": 0.128, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 3.353846153846154, |
|
"eval_loss": 0.9331367611885071, |
|
"eval_runtime": 1.1718, |
|
"eval_samples_per_second": 97.282, |
|
"eval_steps_per_second": 5.12, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 3.3846153846153846, |
|
"grad_norm": 1.5178117752075195, |
|
"learning_rate": 1.67311180742757e-05, |
|
"loss": 0.112, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.3846153846153846, |
|
"eval_loss": 0.9547919631004333, |
|
"eval_runtime": 1.1695, |
|
"eval_samples_per_second": 97.48, |
|
"eval_steps_per_second": 5.131, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.4153846153846152, |
|
"grad_norm": 1.3151490688323975, |
|
"learning_rate": 1.665130107931666e-05, |
|
"loss": 0.1119, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 3.4153846153846152, |
|
"eval_loss": 0.9812674522399902, |
|
"eval_runtime": 1.1666, |
|
"eval_samples_per_second": 97.721, |
|
"eval_steps_per_second": 5.143, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 3.4461538461538463, |
|
"grad_norm": 1.4334938526153564, |
|
"learning_rate": 1.657071681043731e-05, |
|
"loss": 0.1042, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.4461538461538463, |
|
"eval_loss": 0.9601649641990662, |
|
"eval_runtime": 1.1706, |
|
"eval_samples_per_second": 97.388, |
|
"eval_steps_per_second": 5.126, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.476923076923077, |
|
"grad_norm": 1.3328880071640015, |
|
"learning_rate": 1.648937456359451e-05, |
|
"loss": 0.1129, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 3.476923076923077, |
|
"eval_loss": 0.9889415502548218, |
|
"eval_runtime": 1.1689, |
|
"eval_samples_per_second": 97.527, |
|
"eval_steps_per_second": 5.133, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 3.5076923076923077, |
|
"grad_norm": 1.8561350107192993, |
|
"learning_rate": 1.640728372218317e-05, |
|
"loss": 0.1247, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.5076923076923077, |
|
"eval_loss": 0.9682590961456299, |
|
"eval_runtime": 1.1697, |
|
"eval_samples_per_second": 97.461, |
|
"eval_steps_per_second": 5.13, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.5384615384615383, |
|
"grad_norm": 1.1813424825668335, |
|
"learning_rate": 1.6324453755953772e-05, |
|
"loss": 0.1213, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 3.5384615384615383, |
|
"eval_loss": 0.9461174607276917, |
|
"eval_runtime": 1.1671, |
|
"eval_samples_per_second": 97.674, |
|
"eval_steps_per_second": 5.141, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 3.569230769230769, |
|
"grad_norm": 1.2129303216934204, |
|
"learning_rate": 1.624089421992003e-05, |
|
"loss": 0.1253, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.569230769230769, |
|
"eval_loss": 0.9655346274375916, |
|
"eval_runtime": 1.1837, |
|
"eval_samples_per_second": 96.312, |
|
"eval_steps_per_second": 5.069, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 2.2233059406280518, |
|
"learning_rate": 1.6156614753256583e-05, |
|
"loss": 0.1147, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"eval_loss": 0.9837222099304199, |
|
"eval_runtime": 1.168, |
|
"eval_samples_per_second": 97.6, |
|
"eval_steps_per_second": 5.137, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 3.6307692307692307, |
|
"grad_norm": 1.382871389389038, |
|
"learning_rate": 1.6071625078187113e-05, |
|
"loss": 0.1272, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.6307692307692307, |
|
"eval_loss": 0.9973061680793762, |
|
"eval_runtime": 1.1757, |
|
"eval_samples_per_second": 96.964, |
|
"eval_steps_per_second": 5.103, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.6615384615384614, |
|
"grad_norm": 1.2613117694854736, |
|
"learning_rate": 1.5985934998862775e-05, |
|
"loss": 0.1367, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 3.6615384615384614, |
|
"eval_loss": 0.9685854315757751, |
|
"eval_runtime": 1.1695, |
|
"eval_samples_per_second": 97.474, |
|
"eval_steps_per_second": 5.13, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 3.6923076923076925, |
|
"grad_norm": 1.0769059658050537, |
|
"learning_rate": 1.5899554400231233e-05, |
|
"loss": 0.1208, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.6923076923076925, |
|
"eval_loss": 0.9511159062385559, |
|
"eval_runtime": 1.1684, |
|
"eval_samples_per_second": 97.566, |
|
"eval_steps_per_second": 5.135, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.723076923076923, |
|
"grad_norm": 2.335827350616455, |
|
"learning_rate": 1.5812493246896368e-05, |
|
"loss": 0.1315, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 3.723076923076923, |
|
"eval_loss": 0.9450687766075134, |
|
"eval_runtime": 1.1715, |
|
"eval_samples_per_second": 97.309, |
|
"eval_steps_per_second": 5.122, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 3.753846153846154, |
|
"grad_norm": 1.2259918451309204, |
|
"learning_rate": 1.572476158196879e-05, |
|
"loss": 0.1252, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 3.753846153846154, |
|
"eval_loss": 0.9482319355010986, |
|
"eval_runtime": 1.1688, |
|
"eval_samples_per_second": 97.537, |
|
"eval_steps_per_second": 5.134, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 3.7846153846153845, |
|
"grad_norm": 1.4782612323760986, |
|
"learning_rate": 1.5636369525907297e-05, |
|
"loss": 0.1424, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 3.7846153846153845, |
|
"eval_loss": 0.9551723599433899, |
|
"eval_runtime": 1.1701, |
|
"eval_samples_per_second": 97.427, |
|
"eval_steps_per_second": 5.128, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 3.815384615384615, |
|
"grad_norm": 1.3023622035980225, |
|
"learning_rate": 1.554732727535139e-05, |
|
"loss": 0.121, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 3.815384615384615, |
|
"eval_loss": 0.9602871537208557, |
|
"eval_runtime": 1.1726, |
|
"eval_samples_per_second": 97.217, |
|
"eval_steps_per_second": 5.117, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"grad_norm": 1.2243452072143555, |
|
"learning_rate": 1.5457645101945046e-05, |
|
"loss": 0.1213, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"eval_loss": 0.9587708711624146, |
|
"eval_runtime": 1.1722, |
|
"eval_samples_per_second": 97.253, |
|
"eval_steps_per_second": 5.119, |
|
"step": 1250 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3250, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 50, |
|
"total_flos": 7.79652957166633e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|