{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9230769230769231, "eval_steps": 10, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006153846153846154, "eval_loss": 1.487821340560913, "eval_runtime": 1.2374, "eval_samples_per_second": 92.131, "eval_steps_per_second": 4.849, "step": 2 }, { "epoch": 0.03076923076923077, "grad_norm": 20.007999420166016, "learning_rate": 6.153846153846155e-07, "loss": 1.4703, "step": 10 }, { "epoch": 0.03076923076923077, "eval_loss": 1.4221965074539185, "eval_runtime": 1.1701, "eval_samples_per_second": 97.431, "eval_steps_per_second": 5.128, "step": 10 }, { "epoch": 0.06153846153846154, "grad_norm": 29.657033920288086, "learning_rate": 1.230769230769231e-06, "loss": 1.4402, "step": 20 }, { "epoch": 0.06153846153846154, "eval_loss": 1.238189935684204, "eval_runtime": 1.1729, "eval_samples_per_second": 97.195, "eval_steps_per_second": 5.116, "step": 20 }, { "epoch": 0.09230769230769231, "grad_norm": 10.80688190460205, "learning_rate": 1.8461538461538465e-06, "loss": 1.0957, "step": 30 }, { "epoch": 0.09230769230769231, "eval_loss": 0.908875048160553, "eval_runtime": 1.1739, "eval_samples_per_second": 97.111, "eval_steps_per_second": 5.111, "step": 30 }, { "epoch": 0.12307692307692308, "grad_norm": 12.914667129516602, "learning_rate": 2.461538461538462e-06, "loss": 0.8136, "step": 40 }, { "epoch": 0.12307692307692308, "eval_loss": 0.7436326146125793, "eval_runtime": 1.1712, "eval_samples_per_second": 97.34, "eval_steps_per_second": 5.123, "step": 40 }, { "epoch": 0.15384615384615385, "grad_norm": 5.206438064575195, "learning_rate": 3.0769230769230774e-06, "loss": 0.7864, "step": 50 }, { "epoch": 0.15384615384615385, "eval_loss": 0.6924600601196289, "eval_runtime": 1.1718, "eval_samples_per_second": 97.287, "eval_steps_per_second": 5.12, "step": 50 }, { "epoch": 0.18461538461538463, "grad_norm": 5.670599460601807, "learning_rate": 3.692307692307693e-06, "loss": 0.622, "step": 60 }, { "epoch": 0.18461538461538463, "eval_loss": 0.6791077852249146, "eval_runtime": 1.1678, "eval_samples_per_second": 97.62, "eval_steps_per_second": 5.138, "step": 60 }, { "epoch": 0.2153846153846154, "grad_norm": 3.6266868114471436, "learning_rate": 4.307692307692308e-06, "loss": 0.6672, "step": 70 }, { "epoch": 0.2153846153846154, "eval_loss": 0.6631948947906494, "eval_runtime": 1.1626, "eval_samples_per_second": 98.06, "eval_steps_per_second": 5.161, "step": 70 }, { "epoch": 0.24615384615384617, "grad_norm": 3.1027884483337402, "learning_rate": 4.923076923076924e-06, "loss": 0.6097, "step": 80 }, { "epoch": 0.24615384615384617, "eval_loss": 0.6625475883483887, "eval_runtime": 1.1717, "eval_samples_per_second": 97.294, "eval_steps_per_second": 5.121, "step": 80 }, { "epoch": 0.27692307692307694, "grad_norm": 3.233839511871338, "learning_rate": 5.538461538461539e-06, "loss": 0.6957, "step": 90 }, { "epoch": 0.27692307692307694, "eval_loss": 0.6590337157249451, "eval_runtime": 1.1664, "eval_samples_per_second": 97.738, "eval_steps_per_second": 5.144, "step": 90 }, { "epoch": 0.3076923076923077, "grad_norm": 4.932614803314209, "learning_rate": 6.153846153846155e-06, "loss": 0.6994, "step": 100 }, { "epoch": 0.3076923076923077, "eval_loss": 0.6592673659324646, "eval_runtime": 1.1692, "eval_samples_per_second": 97.502, "eval_steps_per_second": 5.132, "step": 100 }, { "epoch": 0.3384615384615385, "grad_norm": 2.642303466796875, "learning_rate": 6.76923076923077e-06, "loss": 0.6501, "step": 110 }, { "epoch": 0.3384615384615385, "eval_loss": 0.6589992642402649, "eval_runtime": 1.1669, "eval_samples_per_second": 97.691, "eval_steps_per_second": 5.142, "step": 110 }, { "epoch": 0.36923076923076925, "grad_norm": 3.45409893989563, "learning_rate": 7.384615384615386e-06, "loss": 0.7512, "step": 120 }, { "epoch": 0.36923076923076925, "eval_loss": 0.6568591594696045, "eval_runtime": 1.1778, "eval_samples_per_second": 96.79, "eval_steps_per_second": 5.094, "step": 120 }, { "epoch": 0.4, "grad_norm": 8.460351943969727, "learning_rate": 8.000000000000001e-06, "loss": 0.5996, "step": 130 }, { "epoch": 0.4, "eval_loss": 0.6578297019004822, "eval_runtime": 1.1683, "eval_samples_per_second": 97.575, "eval_steps_per_second": 5.136, "step": 130 }, { "epoch": 0.4307692307692308, "grad_norm": 2.4495465755462646, "learning_rate": 8.615384615384617e-06, "loss": 0.6498, "step": 140 }, { "epoch": 0.4307692307692308, "eval_loss": 0.6588199138641357, "eval_runtime": 1.1664, "eval_samples_per_second": 97.739, "eval_steps_per_second": 5.144, "step": 140 }, { "epoch": 0.46153846153846156, "grad_norm": 2.6779162883758545, "learning_rate": 9.230769230769232e-06, "loss": 0.703, "step": 150 }, { "epoch": 0.46153846153846156, "eval_loss": 0.6626676321029663, "eval_runtime": 1.1764, "eval_samples_per_second": 96.904, "eval_steps_per_second": 5.1, "step": 150 }, { "epoch": 0.49230769230769234, "grad_norm": 8.147880554199219, "learning_rate": 9.846153846153848e-06, "loss": 0.6649, "step": 160 }, { "epoch": 0.49230769230769234, "eval_loss": 0.6644802689552307, "eval_runtime": 1.1701, "eval_samples_per_second": 97.424, "eval_steps_per_second": 5.128, "step": 160 }, { "epoch": 0.5230769230769231, "grad_norm": 2.7410728931427, "learning_rate": 1.0461538461538463e-05, "loss": 0.6704, "step": 170 }, { "epoch": 0.5230769230769231, "eval_loss": 0.6591455340385437, "eval_runtime": 1.1722, "eval_samples_per_second": 97.253, "eval_steps_per_second": 5.119, "step": 170 }, { "epoch": 0.5538461538461539, "grad_norm": 6.886598587036133, "learning_rate": 1.1076923076923079e-05, "loss": 0.6073, "step": 180 }, { "epoch": 0.5538461538461539, "eval_loss": 0.6617754697799683, "eval_runtime": 1.1682, "eval_samples_per_second": 97.588, "eval_steps_per_second": 5.136, "step": 180 }, { "epoch": 0.5846153846153846, "grad_norm": 4.26249361038208, "learning_rate": 1.1692307692307694e-05, "loss": 0.7556, "step": 190 }, { "epoch": 0.5846153846153846, "eval_loss": 0.6659030318260193, "eval_runtime": 1.1721, "eval_samples_per_second": 97.265, "eval_steps_per_second": 5.119, "step": 190 }, { "epoch": 0.6153846153846154, "grad_norm": 4.204222202301025, "learning_rate": 1.230769230769231e-05, "loss": 0.6583, "step": 200 }, { "epoch": 0.6153846153846154, "eval_loss": 0.6660778522491455, "eval_runtime": 1.1728, "eval_samples_per_second": 97.201, "eval_steps_per_second": 5.116, "step": 200 }, { "epoch": 0.6461538461538462, "grad_norm": 3.154658079147339, "learning_rate": 1.2923076923076925e-05, "loss": 0.7288, "step": 210 }, { "epoch": 0.6461538461538462, "eval_loss": 0.6642422676086426, "eval_runtime": 1.166, "eval_samples_per_second": 97.772, "eval_steps_per_second": 5.146, "step": 210 }, { "epoch": 0.676923076923077, "grad_norm": 3.248359203338623, "learning_rate": 1.353846153846154e-05, "loss": 0.6702, "step": 220 }, { "epoch": 0.676923076923077, "eval_loss": 0.6680281758308411, "eval_runtime": 1.1691, "eval_samples_per_second": 97.507, "eval_steps_per_second": 5.132, "step": 220 }, { "epoch": 0.7076923076923077, "grad_norm": 2.5507216453552246, "learning_rate": 1.4153846153846156e-05, "loss": 0.66, "step": 230 }, { "epoch": 0.7076923076923077, "eval_loss": 0.6713310480117798, "eval_runtime": 1.167, "eval_samples_per_second": 97.683, "eval_steps_per_second": 5.141, "step": 230 }, { "epoch": 0.7384615384615385, "grad_norm": 2.1180007457733154, "learning_rate": 1.4769230769230772e-05, "loss": 0.6172, "step": 240 }, { "epoch": 0.7384615384615385, "eval_loss": 0.6759226322174072, "eval_runtime": 1.1758, "eval_samples_per_second": 96.954, "eval_steps_per_second": 5.103, "step": 240 }, { "epoch": 0.7692307692307693, "grad_norm": 2.8318371772766113, "learning_rate": 1.5384615384615387e-05, "loss": 0.5954, "step": 250 }, { "epoch": 0.7692307692307693, "eval_loss": 0.6854314208030701, "eval_runtime": 1.1698, "eval_samples_per_second": 97.457, "eval_steps_per_second": 5.129, "step": 250 }, { "epoch": 0.8, "grad_norm": 3.519829034805298, "learning_rate": 1.6000000000000003e-05, "loss": 0.6639, "step": 260 }, { "epoch": 0.8, "eval_loss": 0.6849851012229919, "eval_runtime": 1.1732, "eval_samples_per_second": 97.173, "eval_steps_per_second": 5.114, "step": 260 }, { "epoch": 0.8307692307692308, "grad_norm": 3.7909388542175293, "learning_rate": 1.6615384615384618e-05, "loss": 0.6892, "step": 270 }, { "epoch": 0.8307692307692308, "eval_loss": 0.6863855123519897, "eval_runtime": 1.1689, "eval_samples_per_second": 97.53, "eval_steps_per_second": 5.133, "step": 270 }, { "epoch": 0.8615384615384616, "grad_norm": 8.278326988220215, "learning_rate": 1.7230769230769234e-05, "loss": 0.6014, "step": 280 }, { "epoch": 0.8615384615384616, "eval_loss": 0.6890373826026917, "eval_runtime": 1.1722, "eval_samples_per_second": 97.251, "eval_steps_per_second": 5.118, "step": 280 }, { "epoch": 0.8923076923076924, "grad_norm": 2.616001605987549, "learning_rate": 1.784615384615385e-05, "loss": 0.6775, "step": 290 }, { "epoch": 0.8923076923076924, "eval_loss": 0.6797110438346863, "eval_runtime": 1.1888, "eval_samples_per_second": 95.897, "eval_steps_per_second": 5.047, "step": 290 }, { "epoch": 0.9230769230769231, "grad_norm": 2.4450528621673584, "learning_rate": 1.8461538461538465e-05, "loss": 0.7434, "step": 300 }, { "epoch": 0.9230769230769231, "eval_loss": 0.6779448986053467, "eval_runtime": 1.1722, "eval_samples_per_second": 97.254, "eval_steps_per_second": 5.119, "step": 300 } ], "logging_steps": 10, "max_steps": 3250, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 50, "total_flos": 1.87056529604608e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }