{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5384615384615383, "eval_steps": 10, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006153846153846154, "eval_loss": 1.487821340560913, "eval_runtime": 1.2374, "eval_samples_per_second": 92.131, "eval_steps_per_second": 4.849, "step": 2 }, { "epoch": 0.03076923076923077, "grad_norm": 20.007999420166016, "learning_rate": 6.153846153846155e-07, "loss": 1.4703, "step": 10 }, { "epoch": 0.03076923076923077, "eval_loss": 1.4221965074539185, "eval_runtime": 1.1701, "eval_samples_per_second": 97.431, "eval_steps_per_second": 5.128, "step": 10 }, { "epoch": 0.06153846153846154, "grad_norm": 29.657033920288086, "learning_rate": 1.230769230769231e-06, "loss": 1.4402, "step": 20 }, { "epoch": 0.06153846153846154, "eval_loss": 1.238189935684204, "eval_runtime": 1.1729, "eval_samples_per_second": 97.195, "eval_steps_per_second": 5.116, "step": 20 }, { "epoch": 0.09230769230769231, "grad_norm": 10.80688190460205, "learning_rate": 1.8461538461538465e-06, "loss": 1.0957, "step": 30 }, { "epoch": 0.09230769230769231, "eval_loss": 0.908875048160553, "eval_runtime": 1.1739, "eval_samples_per_second": 97.111, "eval_steps_per_second": 5.111, "step": 30 }, { "epoch": 0.12307692307692308, "grad_norm": 12.914667129516602, "learning_rate": 2.461538461538462e-06, "loss": 0.8136, "step": 40 }, { "epoch": 0.12307692307692308, "eval_loss": 0.7436326146125793, "eval_runtime": 1.1712, "eval_samples_per_second": 97.34, "eval_steps_per_second": 5.123, "step": 40 }, { "epoch": 0.15384615384615385, "grad_norm": 5.206438064575195, "learning_rate": 3.0769230769230774e-06, "loss": 0.7864, "step": 50 }, { "epoch": 0.15384615384615385, "eval_loss": 0.6924600601196289, "eval_runtime": 1.1718, "eval_samples_per_second": 97.287, "eval_steps_per_second": 5.12, "step": 50 }, { "epoch": 0.18461538461538463, "grad_norm": 5.670599460601807, "learning_rate": 3.692307692307693e-06, "loss": 0.622, "step": 60 }, { "epoch": 0.18461538461538463, "eval_loss": 0.6791077852249146, "eval_runtime": 1.1678, "eval_samples_per_second": 97.62, "eval_steps_per_second": 5.138, "step": 60 }, { "epoch": 0.2153846153846154, "grad_norm": 3.6266868114471436, "learning_rate": 4.307692307692308e-06, "loss": 0.6672, "step": 70 }, { "epoch": 0.2153846153846154, "eval_loss": 0.6631948947906494, "eval_runtime": 1.1626, "eval_samples_per_second": 98.06, "eval_steps_per_second": 5.161, "step": 70 }, { "epoch": 0.24615384615384617, "grad_norm": 3.1027884483337402, "learning_rate": 4.923076923076924e-06, "loss": 0.6097, "step": 80 }, { "epoch": 0.24615384615384617, "eval_loss": 0.6625475883483887, "eval_runtime": 1.1717, "eval_samples_per_second": 97.294, "eval_steps_per_second": 5.121, "step": 80 }, { "epoch": 0.27692307692307694, "grad_norm": 3.233839511871338, "learning_rate": 5.538461538461539e-06, "loss": 0.6957, "step": 90 }, { "epoch": 0.27692307692307694, "eval_loss": 0.6590337157249451, "eval_runtime": 1.1664, "eval_samples_per_second": 97.738, "eval_steps_per_second": 5.144, "step": 90 }, { "epoch": 0.3076923076923077, "grad_norm": 4.932614803314209, "learning_rate": 6.153846153846155e-06, "loss": 0.6994, "step": 100 }, { "epoch": 0.3076923076923077, "eval_loss": 0.6592673659324646, "eval_runtime": 1.1692, "eval_samples_per_second": 97.502, "eval_steps_per_second": 5.132, "step": 100 }, { "epoch": 0.3384615384615385, "grad_norm": 2.642303466796875, "learning_rate": 6.76923076923077e-06, "loss": 0.6501, "step": 110 }, { "epoch": 0.3384615384615385, "eval_loss": 0.6589992642402649, "eval_runtime": 1.1669, "eval_samples_per_second": 97.691, "eval_steps_per_second": 5.142, "step": 110 }, { "epoch": 0.36923076923076925, "grad_norm": 3.45409893989563, "learning_rate": 7.384615384615386e-06, "loss": 0.7512, "step": 120 }, { "epoch": 0.36923076923076925, "eval_loss": 0.6568591594696045, "eval_runtime": 1.1778, "eval_samples_per_second": 96.79, "eval_steps_per_second": 5.094, "step": 120 }, { "epoch": 0.4, "grad_norm": 8.460351943969727, "learning_rate": 8.000000000000001e-06, "loss": 0.5996, "step": 130 }, { "epoch": 0.4, "eval_loss": 0.6578297019004822, "eval_runtime": 1.1683, "eval_samples_per_second": 97.575, "eval_steps_per_second": 5.136, "step": 130 }, { "epoch": 0.4307692307692308, "grad_norm": 2.4495465755462646, "learning_rate": 8.615384615384617e-06, "loss": 0.6498, "step": 140 }, { "epoch": 0.4307692307692308, "eval_loss": 0.6588199138641357, "eval_runtime": 1.1664, "eval_samples_per_second": 97.739, "eval_steps_per_second": 5.144, "step": 140 }, { "epoch": 0.46153846153846156, "grad_norm": 2.6779162883758545, "learning_rate": 9.230769230769232e-06, "loss": 0.703, "step": 150 }, { "epoch": 0.46153846153846156, "eval_loss": 0.6626676321029663, "eval_runtime": 1.1764, "eval_samples_per_second": 96.904, "eval_steps_per_second": 5.1, "step": 150 }, { "epoch": 0.49230769230769234, "grad_norm": 8.147880554199219, "learning_rate": 9.846153846153848e-06, "loss": 0.6649, "step": 160 }, { "epoch": 0.49230769230769234, "eval_loss": 0.6644802689552307, "eval_runtime": 1.1701, "eval_samples_per_second": 97.424, "eval_steps_per_second": 5.128, "step": 160 }, { "epoch": 0.5230769230769231, "grad_norm": 2.7410728931427, "learning_rate": 1.0461538461538463e-05, "loss": 0.6704, "step": 170 }, { "epoch": 0.5230769230769231, "eval_loss": 0.6591455340385437, "eval_runtime": 1.1722, "eval_samples_per_second": 97.253, "eval_steps_per_second": 5.119, "step": 170 }, { "epoch": 0.5538461538461539, "grad_norm": 6.886598587036133, "learning_rate": 1.1076923076923079e-05, "loss": 0.6073, "step": 180 }, { "epoch": 0.5538461538461539, "eval_loss": 0.6617754697799683, "eval_runtime": 1.1682, "eval_samples_per_second": 97.588, "eval_steps_per_second": 5.136, "step": 180 }, { "epoch": 0.5846153846153846, "grad_norm": 4.26249361038208, "learning_rate": 1.1692307692307694e-05, "loss": 0.7556, "step": 190 }, { "epoch": 0.5846153846153846, "eval_loss": 0.6659030318260193, "eval_runtime": 1.1721, "eval_samples_per_second": 97.265, "eval_steps_per_second": 5.119, "step": 190 }, { "epoch": 0.6153846153846154, "grad_norm": 4.204222202301025, "learning_rate": 1.230769230769231e-05, "loss": 0.6583, "step": 200 }, { "epoch": 0.6153846153846154, "eval_loss": 0.6660778522491455, "eval_runtime": 1.1728, "eval_samples_per_second": 97.201, "eval_steps_per_second": 5.116, "step": 200 }, { "epoch": 0.6461538461538462, "grad_norm": 3.154658079147339, "learning_rate": 1.2923076923076925e-05, "loss": 0.7288, "step": 210 }, { "epoch": 0.6461538461538462, "eval_loss": 0.6642422676086426, "eval_runtime": 1.166, "eval_samples_per_second": 97.772, "eval_steps_per_second": 5.146, "step": 210 }, { "epoch": 0.676923076923077, "grad_norm": 3.248359203338623, "learning_rate": 1.353846153846154e-05, "loss": 0.6702, "step": 220 }, { "epoch": 0.676923076923077, "eval_loss": 0.6680281758308411, "eval_runtime": 1.1691, "eval_samples_per_second": 97.507, "eval_steps_per_second": 5.132, "step": 220 }, { "epoch": 0.7076923076923077, "grad_norm": 2.5507216453552246, "learning_rate": 1.4153846153846156e-05, "loss": 0.66, "step": 230 }, { "epoch": 0.7076923076923077, "eval_loss": 0.6713310480117798, "eval_runtime": 1.167, "eval_samples_per_second": 97.683, "eval_steps_per_second": 5.141, "step": 230 }, { "epoch": 0.7384615384615385, "grad_norm": 2.1180007457733154, "learning_rate": 1.4769230769230772e-05, "loss": 0.6172, "step": 240 }, { "epoch": 0.7384615384615385, "eval_loss": 0.6759226322174072, "eval_runtime": 1.1758, "eval_samples_per_second": 96.954, "eval_steps_per_second": 5.103, "step": 240 }, { "epoch": 0.7692307692307693, "grad_norm": 2.8318371772766113, "learning_rate": 1.5384615384615387e-05, "loss": 0.5954, "step": 250 }, { "epoch": 0.7692307692307693, "eval_loss": 0.6854314208030701, "eval_runtime": 1.1698, "eval_samples_per_second": 97.457, "eval_steps_per_second": 5.129, "step": 250 }, { "epoch": 0.8, "grad_norm": 3.519829034805298, "learning_rate": 1.6000000000000003e-05, "loss": 0.6639, "step": 260 }, { "epoch": 0.8, "eval_loss": 0.6849851012229919, "eval_runtime": 1.1732, "eval_samples_per_second": 97.173, "eval_steps_per_second": 5.114, "step": 260 }, { "epoch": 0.8307692307692308, "grad_norm": 3.7909388542175293, "learning_rate": 1.6615384615384618e-05, "loss": 0.6892, "step": 270 }, { "epoch": 0.8307692307692308, "eval_loss": 0.6863855123519897, "eval_runtime": 1.1689, "eval_samples_per_second": 97.53, "eval_steps_per_second": 5.133, "step": 270 }, { "epoch": 0.8615384615384616, "grad_norm": 8.278326988220215, "learning_rate": 1.7230769230769234e-05, "loss": 0.6014, "step": 280 }, { "epoch": 0.8615384615384616, "eval_loss": 0.6890373826026917, "eval_runtime": 1.1722, "eval_samples_per_second": 97.251, "eval_steps_per_second": 5.118, "step": 280 }, { "epoch": 0.8923076923076924, "grad_norm": 2.616001605987549, "learning_rate": 1.784615384615385e-05, "loss": 0.6775, "step": 290 }, { "epoch": 0.8923076923076924, "eval_loss": 0.6797110438346863, "eval_runtime": 1.1888, "eval_samples_per_second": 95.897, "eval_steps_per_second": 5.047, "step": 290 }, { "epoch": 0.9230769230769231, "grad_norm": 2.4450528621673584, "learning_rate": 1.8461538461538465e-05, "loss": 0.7434, "step": 300 }, { "epoch": 0.9230769230769231, "eval_loss": 0.6779448986053467, "eval_runtime": 1.1722, "eval_samples_per_second": 97.254, "eval_steps_per_second": 5.119, "step": 300 }, { "epoch": 0.9538461538461539, "grad_norm": 4.121164798736572, "learning_rate": 1.907692307692308e-05, "loss": 0.6413, "step": 310 }, { "epoch": 0.9538461538461539, "eval_loss": 0.6819297671318054, "eval_runtime": 1.1683, "eval_samples_per_second": 97.574, "eval_steps_per_second": 5.135, "step": 310 }, { "epoch": 0.9846153846153847, "grad_norm": 2.057229995727539, "learning_rate": 1.9692307692307696e-05, "loss": 0.6511, "step": 320 }, { "epoch": 0.9846153846153847, "eval_loss": 0.6861481666564941, "eval_runtime": 1.1699, "eval_samples_per_second": 97.444, "eval_steps_per_second": 5.129, "step": 320 }, { "epoch": 1.0153846153846153, "grad_norm": 1.7295210361480713, "learning_rate": 1.9999855802751384e-05, "loss": 0.5872, "step": 330 }, { "epoch": 1.0153846153846153, "eval_loss": 0.7481781244277954, "eval_runtime": 1.172, "eval_samples_per_second": 97.273, "eval_steps_per_second": 5.12, "step": 330 }, { "epoch": 1.0461538461538462, "grad_norm": 3.6910336017608643, "learning_rate": 1.9998702249713747e-05, "loss": 0.438, "step": 340 }, { "epoch": 1.0461538461538462, "eval_loss": 0.7471169829368591, "eval_runtime": 1.175, "eval_samples_per_second": 97.019, "eval_steps_per_second": 5.106, "step": 340 }, { "epoch": 1.0769230769230769, "grad_norm": 3.9220831394195557, "learning_rate": 1.9996395276708856e-05, "loss": 0.4141, "step": 350 }, { "epoch": 1.0769230769230769, "eval_loss": 0.7425897121429443, "eval_runtime": 1.1888, "eval_samples_per_second": 95.893, "eval_steps_per_second": 5.047, "step": 350 }, { "epoch": 1.1076923076923078, "grad_norm": 2.1136879920959473, "learning_rate": 1.9992935149862116e-05, "loss": 0.4173, "step": 360 }, { "epoch": 1.1076923076923078, "eval_loss": 0.7477542161941528, "eval_runtime": 1.1705, "eval_samples_per_second": 97.398, "eval_steps_per_second": 5.126, "step": 360 }, { "epoch": 1.1384615384615384, "grad_norm": 2.187299966812134, "learning_rate": 1.998832226832327e-05, "loss": 0.4242, "step": 370 }, { "epoch": 1.1384615384615384, "eval_loss": 0.7475994229316711, "eval_runtime": 1.1683, "eval_samples_per_second": 97.582, "eval_steps_per_second": 5.136, "step": 370 }, { "epoch": 1.1692307692307693, "grad_norm": 2.683750629425049, "learning_rate": 1.9982557164220335e-05, "loss": 0.4413, "step": 380 }, { "epoch": 1.1692307692307693, "eval_loss": 0.7552060484886169, "eval_runtime": 1.1689, "eval_samples_per_second": 97.529, "eval_steps_per_second": 5.133, "step": 380 }, { "epoch": 1.2, "grad_norm": 3.312657117843628, "learning_rate": 1.9975640502598243e-05, "loss": 0.3361, "step": 390 }, { "epoch": 1.2, "eval_loss": 0.7630029916763306, "eval_runtime": 1.1689, "eval_samples_per_second": 97.529, "eval_steps_per_second": 5.133, "step": 390 }, { "epoch": 1.2307692307692308, "grad_norm": 2.3290536403656006, "learning_rate": 1.9967573081342103e-05, "loss": 0.4174, "step": 400 }, { "epoch": 1.2307692307692308, "eval_loss": 0.7669293880462646, "eval_runtime": 1.1711, "eval_samples_per_second": 97.345, "eval_steps_per_second": 5.123, "step": 400 }, { "epoch": 1.2615384615384615, "grad_norm": 2.4483697414398193, "learning_rate": 1.9958355831085155e-05, "loss": 0.4045, "step": 410 }, { "epoch": 1.2615384615384615, "eval_loss": 0.7652024626731873, "eval_runtime": 1.1697, "eval_samples_per_second": 97.461, "eval_steps_per_second": 5.13, "step": 410 }, { "epoch": 1.2923076923076924, "grad_norm": 1.703616976737976, "learning_rate": 1.9947989815101444e-05, "loss": 0.4292, "step": 420 }, { "epoch": 1.2923076923076924, "eval_loss": 0.7553891539573669, "eval_runtime": 1.1719, "eval_samples_per_second": 97.277, "eval_steps_per_second": 5.12, "step": 420 }, { "epoch": 1.323076923076923, "grad_norm": 1.6245399713516235, "learning_rate": 1.9936476229183133e-05, "loss": 0.4164, "step": 430 }, { "epoch": 1.323076923076923, "eval_loss": 0.7711736559867859, "eval_runtime": 1.1776, "eval_samples_per_second": 96.806, "eval_steps_per_second": 5.095, "step": 430 }, { "epoch": 1.353846153846154, "grad_norm": 1.9511611461639404, "learning_rate": 1.992381640150257e-05, "loss": 0.4337, "step": 440 }, { "epoch": 1.353846153846154, "eval_loss": 0.7438372373580933, "eval_runtime": 1.1692, "eval_samples_per_second": 97.5, "eval_steps_per_second": 5.132, "step": 440 }, { "epoch": 1.3846153846153846, "grad_norm": 2.466737747192383, "learning_rate": 1.9910011792459086e-05, "loss": 0.3712, "step": 450 }, { "epoch": 1.3846153846153846, "eval_loss": 0.8045614361763, "eval_runtime": 1.1689, "eval_samples_per_second": 97.53, "eval_steps_per_second": 5.133, "step": 450 }, { "epoch": 1.4153846153846155, "grad_norm": 2.473851203918457, "learning_rate": 1.9895063994510512e-05, "loss": 0.516, "step": 460 }, { "epoch": 1.4153846153846155, "eval_loss": 0.7463352084159851, "eval_runtime": 1.1697, "eval_samples_per_second": 97.464, "eval_steps_per_second": 5.13, "step": 460 }, { "epoch": 1.4461538461538461, "grad_norm": 2.757955312728882, "learning_rate": 1.9878974731989487e-05, "loss": 0.4965, "step": 470 }, { "epoch": 1.4461538461538461, "eval_loss": 0.7571713924407959, "eval_runtime": 1.1688, "eval_samples_per_second": 97.534, "eval_steps_per_second": 5.133, "step": 470 }, { "epoch": 1.476923076923077, "grad_norm": 3.9884915351867676, "learning_rate": 1.9861745860904538e-05, "loss": 0.3973, "step": 480 }, { "epoch": 1.476923076923077, "eval_loss": 0.7763081789016724, "eval_runtime": 1.168, "eval_samples_per_second": 97.605, "eval_steps_per_second": 5.137, "step": 480 }, { "epoch": 1.5076923076923077, "grad_norm": 1.9474631547927856, "learning_rate": 1.9843379368725978e-05, "loss": 0.4625, "step": 490 }, { "epoch": 1.5076923076923077, "eval_loss": 0.7595367431640625, "eval_runtime": 1.1708, "eval_samples_per_second": 97.37, "eval_steps_per_second": 5.125, "step": 490 }, { "epoch": 1.5384615384615383, "grad_norm": 2.2907869815826416, "learning_rate": 1.9823877374156647e-05, "loss": 0.4371, "step": 500 }, { "epoch": 1.5384615384615383, "eval_loss": 0.762865424156189, "eval_runtime": 1.1766, "eval_samples_per_second": 96.891, "eval_steps_per_second": 5.1, "step": 500 } ], "logging_steps": 10, "max_steps": 3250, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 50, "total_flos": 3.118537414331597e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }