|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.3846153846153846, |
|
"eval_steps": 10, |
|
"global_step": 450, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006153846153846154, |
|
"eval_loss": 1.487821340560913, |
|
"eval_runtime": 1.2374, |
|
"eval_samples_per_second": 92.131, |
|
"eval_steps_per_second": 4.849, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.03076923076923077, |
|
"grad_norm": 20.007999420166016, |
|
"learning_rate": 6.153846153846155e-07, |
|
"loss": 1.4703, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03076923076923077, |
|
"eval_loss": 1.4221965074539185, |
|
"eval_runtime": 1.1701, |
|
"eval_samples_per_second": 97.431, |
|
"eval_steps_per_second": 5.128, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06153846153846154, |
|
"grad_norm": 29.657033920288086, |
|
"learning_rate": 1.230769230769231e-06, |
|
"loss": 1.4402, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06153846153846154, |
|
"eval_loss": 1.238189935684204, |
|
"eval_runtime": 1.1729, |
|
"eval_samples_per_second": 97.195, |
|
"eval_steps_per_second": 5.116, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09230769230769231, |
|
"grad_norm": 10.80688190460205, |
|
"learning_rate": 1.8461538461538465e-06, |
|
"loss": 1.0957, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.09230769230769231, |
|
"eval_loss": 0.908875048160553, |
|
"eval_runtime": 1.1739, |
|
"eval_samples_per_second": 97.111, |
|
"eval_steps_per_second": 5.111, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.12307692307692308, |
|
"grad_norm": 12.914667129516602, |
|
"learning_rate": 2.461538461538462e-06, |
|
"loss": 0.8136, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.12307692307692308, |
|
"eval_loss": 0.7436326146125793, |
|
"eval_runtime": 1.1712, |
|
"eval_samples_per_second": 97.34, |
|
"eval_steps_per_second": 5.123, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"grad_norm": 5.206438064575195, |
|
"learning_rate": 3.0769230769230774e-06, |
|
"loss": 0.7864, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"eval_loss": 0.6924600601196289, |
|
"eval_runtime": 1.1718, |
|
"eval_samples_per_second": 97.287, |
|
"eval_steps_per_second": 5.12, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.18461538461538463, |
|
"grad_norm": 5.670599460601807, |
|
"learning_rate": 3.692307692307693e-06, |
|
"loss": 0.622, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.18461538461538463, |
|
"eval_loss": 0.6791077852249146, |
|
"eval_runtime": 1.1678, |
|
"eval_samples_per_second": 97.62, |
|
"eval_steps_per_second": 5.138, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2153846153846154, |
|
"grad_norm": 3.6266868114471436, |
|
"learning_rate": 4.307692307692308e-06, |
|
"loss": 0.6672, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2153846153846154, |
|
"eval_loss": 0.6631948947906494, |
|
"eval_runtime": 1.1626, |
|
"eval_samples_per_second": 98.06, |
|
"eval_steps_per_second": 5.161, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.24615384615384617, |
|
"grad_norm": 3.1027884483337402, |
|
"learning_rate": 4.923076923076924e-06, |
|
"loss": 0.6097, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.24615384615384617, |
|
"eval_loss": 0.6625475883483887, |
|
"eval_runtime": 1.1717, |
|
"eval_samples_per_second": 97.294, |
|
"eval_steps_per_second": 5.121, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.27692307692307694, |
|
"grad_norm": 3.233839511871338, |
|
"learning_rate": 5.538461538461539e-06, |
|
"loss": 0.6957, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.27692307692307694, |
|
"eval_loss": 0.6590337157249451, |
|
"eval_runtime": 1.1664, |
|
"eval_samples_per_second": 97.738, |
|
"eval_steps_per_second": 5.144, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 4.932614803314209, |
|
"learning_rate": 6.153846153846155e-06, |
|
"loss": 0.6994, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"eval_loss": 0.6592673659324646, |
|
"eval_runtime": 1.1692, |
|
"eval_samples_per_second": 97.502, |
|
"eval_steps_per_second": 5.132, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3384615384615385, |
|
"grad_norm": 2.642303466796875, |
|
"learning_rate": 6.76923076923077e-06, |
|
"loss": 0.6501, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3384615384615385, |
|
"eval_loss": 0.6589992642402649, |
|
"eval_runtime": 1.1669, |
|
"eval_samples_per_second": 97.691, |
|
"eval_steps_per_second": 5.142, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.36923076923076925, |
|
"grad_norm": 3.45409893989563, |
|
"learning_rate": 7.384615384615386e-06, |
|
"loss": 0.7512, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.36923076923076925, |
|
"eval_loss": 0.6568591594696045, |
|
"eval_runtime": 1.1778, |
|
"eval_samples_per_second": 96.79, |
|
"eval_steps_per_second": 5.094, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 8.460351943969727, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.5996, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.6578297019004822, |
|
"eval_runtime": 1.1683, |
|
"eval_samples_per_second": 97.575, |
|
"eval_steps_per_second": 5.136, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4307692307692308, |
|
"grad_norm": 2.4495465755462646, |
|
"learning_rate": 8.615384615384617e-06, |
|
"loss": 0.6498, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4307692307692308, |
|
"eval_loss": 0.6588199138641357, |
|
"eval_runtime": 1.1664, |
|
"eval_samples_per_second": 97.739, |
|
"eval_steps_per_second": 5.144, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"grad_norm": 2.6779162883758545, |
|
"learning_rate": 9.230769230769232e-06, |
|
"loss": 0.703, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"eval_loss": 0.6626676321029663, |
|
"eval_runtime": 1.1764, |
|
"eval_samples_per_second": 96.904, |
|
"eval_steps_per_second": 5.1, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.49230769230769234, |
|
"grad_norm": 8.147880554199219, |
|
"learning_rate": 9.846153846153848e-06, |
|
"loss": 0.6649, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.49230769230769234, |
|
"eval_loss": 0.6644802689552307, |
|
"eval_runtime": 1.1701, |
|
"eval_samples_per_second": 97.424, |
|
"eval_steps_per_second": 5.128, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5230769230769231, |
|
"grad_norm": 2.7410728931427, |
|
"learning_rate": 1.0461538461538463e-05, |
|
"loss": 0.6704, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5230769230769231, |
|
"eval_loss": 0.6591455340385437, |
|
"eval_runtime": 1.1722, |
|
"eval_samples_per_second": 97.253, |
|
"eval_steps_per_second": 5.119, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5538461538461539, |
|
"grad_norm": 6.886598587036133, |
|
"learning_rate": 1.1076923076923079e-05, |
|
"loss": 0.6073, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5538461538461539, |
|
"eval_loss": 0.6617754697799683, |
|
"eval_runtime": 1.1682, |
|
"eval_samples_per_second": 97.588, |
|
"eval_steps_per_second": 5.136, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5846153846153846, |
|
"grad_norm": 4.26249361038208, |
|
"learning_rate": 1.1692307692307694e-05, |
|
"loss": 0.7556, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5846153846153846, |
|
"eval_loss": 0.6659030318260193, |
|
"eval_runtime": 1.1721, |
|
"eval_samples_per_second": 97.265, |
|
"eval_steps_per_second": 5.119, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 4.204222202301025, |
|
"learning_rate": 1.230769230769231e-05, |
|
"loss": 0.6583, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"eval_loss": 0.6660778522491455, |
|
"eval_runtime": 1.1728, |
|
"eval_samples_per_second": 97.201, |
|
"eval_steps_per_second": 5.116, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6461538461538462, |
|
"grad_norm": 3.154658079147339, |
|
"learning_rate": 1.2923076923076925e-05, |
|
"loss": 0.7288, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6461538461538462, |
|
"eval_loss": 0.6642422676086426, |
|
"eval_runtime": 1.166, |
|
"eval_samples_per_second": 97.772, |
|
"eval_steps_per_second": 5.146, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.676923076923077, |
|
"grad_norm": 3.248359203338623, |
|
"learning_rate": 1.353846153846154e-05, |
|
"loss": 0.6702, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.676923076923077, |
|
"eval_loss": 0.6680281758308411, |
|
"eval_runtime": 1.1691, |
|
"eval_samples_per_second": 97.507, |
|
"eval_steps_per_second": 5.132, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7076923076923077, |
|
"grad_norm": 2.5507216453552246, |
|
"learning_rate": 1.4153846153846156e-05, |
|
"loss": 0.66, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7076923076923077, |
|
"eval_loss": 0.6713310480117798, |
|
"eval_runtime": 1.167, |
|
"eval_samples_per_second": 97.683, |
|
"eval_steps_per_second": 5.141, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7384615384615385, |
|
"grad_norm": 2.1180007457733154, |
|
"learning_rate": 1.4769230769230772e-05, |
|
"loss": 0.6172, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7384615384615385, |
|
"eval_loss": 0.6759226322174072, |
|
"eval_runtime": 1.1758, |
|
"eval_samples_per_second": 96.954, |
|
"eval_steps_per_second": 5.103, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 2.8318371772766113, |
|
"learning_rate": 1.5384615384615387e-05, |
|
"loss": 0.5954, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"eval_loss": 0.6854314208030701, |
|
"eval_runtime": 1.1698, |
|
"eval_samples_per_second": 97.457, |
|
"eval_steps_per_second": 5.129, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.519829034805298, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.6639, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.6849851012229919, |
|
"eval_runtime": 1.1732, |
|
"eval_samples_per_second": 97.173, |
|
"eval_steps_per_second": 5.114, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8307692307692308, |
|
"grad_norm": 3.7909388542175293, |
|
"learning_rate": 1.6615384615384618e-05, |
|
"loss": 0.6892, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8307692307692308, |
|
"eval_loss": 0.6863855123519897, |
|
"eval_runtime": 1.1689, |
|
"eval_samples_per_second": 97.53, |
|
"eval_steps_per_second": 5.133, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8615384615384616, |
|
"grad_norm": 8.278326988220215, |
|
"learning_rate": 1.7230769230769234e-05, |
|
"loss": 0.6014, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8615384615384616, |
|
"eval_loss": 0.6890373826026917, |
|
"eval_runtime": 1.1722, |
|
"eval_samples_per_second": 97.251, |
|
"eval_steps_per_second": 5.118, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8923076923076924, |
|
"grad_norm": 2.616001605987549, |
|
"learning_rate": 1.784615384615385e-05, |
|
"loss": 0.6775, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.8923076923076924, |
|
"eval_loss": 0.6797110438346863, |
|
"eval_runtime": 1.1888, |
|
"eval_samples_per_second": 95.897, |
|
"eval_steps_per_second": 5.047, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 2.4450528621673584, |
|
"learning_rate": 1.8461538461538465e-05, |
|
"loss": 0.7434, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"eval_loss": 0.6779448986053467, |
|
"eval_runtime": 1.1722, |
|
"eval_samples_per_second": 97.254, |
|
"eval_steps_per_second": 5.119, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9538461538461539, |
|
"grad_norm": 4.121164798736572, |
|
"learning_rate": 1.907692307692308e-05, |
|
"loss": 0.6413, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.9538461538461539, |
|
"eval_loss": 0.6819297671318054, |
|
"eval_runtime": 1.1683, |
|
"eval_samples_per_second": 97.574, |
|
"eval_steps_per_second": 5.135, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.9846153846153847, |
|
"grad_norm": 2.057229995727539, |
|
"learning_rate": 1.9692307692307696e-05, |
|
"loss": 0.6511, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.9846153846153847, |
|
"eval_loss": 0.6861481666564941, |
|
"eval_runtime": 1.1699, |
|
"eval_samples_per_second": 97.444, |
|
"eval_steps_per_second": 5.129, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.0153846153846153, |
|
"grad_norm": 1.7295210361480713, |
|
"learning_rate": 1.9999855802751384e-05, |
|
"loss": 0.5872, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.0153846153846153, |
|
"eval_loss": 0.7481781244277954, |
|
"eval_runtime": 1.172, |
|
"eval_samples_per_second": 97.273, |
|
"eval_steps_per_second": 5.12, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.0461538461538462, |
|
"grad_norm": 3.6910336017608643, |
|
"learning_rate": 1.9998702249713747e-05, |
|
"loss": 0.438, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.0461538461538462, |
|
"eval_loss": 0.7471169829368591, |
|
"eval_runtime": 1.175, |
|
"eval_samples_per_second": 97.019, |
|
"eval_steps_per_second": 5.106, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.0769230769230769, |
|
"grad_norm": 3.9220831394195557, |
|
"learning_rate": 1.9996395276708856e-05, |
|
"loss": 0.4141, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.0769230769230769, |
|
"eval_loss": 0.7425897121429443, |
|
"eval_runtime": 1.1888, |
|
"eval_samples_per_second": 95.893, |
|
"eval_steps_per_second": 5.047, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.1076923076923078, |
|
"grad_norm": 2.1136879920959473, |
|
"learning_rate": 1.9992935149862116e-05, |
|
"loss": 0.4173, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.1076923076923078, |
|
"eval_loss": 0.7477542161941528, |
|
"eval_runtime": 1.1705, |
|
"eval_samples_per_second": 97.398, |
|
"eval_steps_per_second": 5.126, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.1384615384615384, |
|
"grad_norm": 2.187299966812134, |
|
"learning_rate": 1.998832226832327e-05, |
|
"loss": 0.4242, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.1384615384615384, |
|
"eval_loss": 0.7475994229316711, |
|
"eval_runtime": 1.1683, |
|
"eval_samples_per_second": 97.582, |
|
"eval_steps_per_second": 5.136, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.1692307692307693, |
|
"grad_norm": 2.683750629425049, |
|
"learning_rate": 1.9982557164220335e-05, |
|
"loss": 0.4413, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.1692307692307693, |
|
"eval_loss": 0.7552060484886169, |
|
"eval_runtime": 1.1689, |
|
"eval_samples_per_second": 97.529, |
|
"eval_steps_per_second": 5.133, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 3.312657117843628, |
|
"learning_rate": 1.9975640502598243e-05, |
|
"loss": 0.3361, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 0.7630029916763306, |
|
"eval_runtime": 1.1689, |
|
"eval_samples_per_second": 97.529, |
|
"eval_steps_per_second": 5.133, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"grad_norm": 2.3290536403656006, |
|
"learning_rate": 1.9967573081342103e-05, |
|
"loss": 0.4174, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"eval_loss": 0.7669293880462646, |
|
"eval_runtime": 1.1711, |
|
"eval_samples_per_second": 97.345, |
|
"eval_steps_per_second": 5.123, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.2615384615384615, |
|
"grad_norm": 2.4483697414398193, |
|
"learning_rate": 1.9958355831085155e-05, |
|
"loss": 0.4045, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.2615384615384615, |
|
"eval_loss": 0.7652024626731873, |
|
"eval_runtime": 1.1697, |
|
"eval_samples_per_second": 97.461, |
|
"eval_steps_per_second": 5.13, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.2923076923076924, |
|
"grad_norm": 1.703616976737976, |
|
"learning_rate": 1.9947989815101444e-05, |
|
"loss": 0.4292, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.2923076923076924, |
|
"eval_loss": 0.7553891539573669, |
|
"eval_runtime": 1.1719, |
|
"eval_samples_per_second": 97.277, |
|
"eval_steps_per_second": 5.12, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.323076923076923, |
|
"grad_norm": 1.6245399713516235, |
|
"learning_rate": 1.9936476229183133e-05, |
|
"loss": 0.4164, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.323076923076923, |
|
"eval_loss": 0.7711736559867859, |
|
"eval_runtime": 1.1776, |
|
"eval_samples_per_second": 96.806, |
|
"eval_steps_per_second": 5.095, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.353846153846154, |
|
"grad_norm": 1.9511611461639404, |
|
"learning_rate": 1.992381640150257e-05, |
|
"loss": 0.4337, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.353846153846154, |
|
"eval_loss": 0.7438372373580933, |
|
"eval_runtime": 1.1692, |
|
"eval_samples_per_second": 97.5, |
|
"eval_steps_per_second": 5.132, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"grad_norm": 2.466737747192383, |
|
"learning_rate": 1.9910011792459086e-05, |
|
"loss": 0.3712, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"eval_loss": 0.8045614361763, |
|
"eval_runtime": 1.1689, |
|
"eval_samples_per_second": 97.53, |
|
"eval_steps_per_second": 5.133, |
|
"step": 450 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3250, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 50, |
|
"total_flos": 2.8101720768774144e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|