{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 10695, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2, "grad_norm": 34.351593017578125, "learning_rate": 3.327102803738318e-05, "loss": 1.2122, "step": 712 }, { "epoch": 0.4, "grad_norm": 10.322906494140625, "learning_rate": 4.816103896103896e-05, "loss": 0.8692, "step": 1424 }, { "epoch": 0.6, "grad_norm": 23.850841522216797, "learning_rate": 4.446233766233767e-05, "loss": 0.747, "step": 2136 }, { "epoch": 0.8, "grad_norm": 26.398853302001953, "learning_rate": 4.076363636363636e-05, "loss": 0.6957, "step": 2848 }, { "epoch": 1.0, "grad_norm": 39.551414489746094, "learning_rate": 3.706493506493507e-05, "loss": 0.6323, "step": 3560 }, { "epoch": 1.2, "grad_norm": 20.38348388671875, "learning_rate": 3.3366233766233766e-05, "loss": 0.5724, "step": 4272 }, { "epoch": 1.4, "grad_norm": 18.263744354248047, "learning_rate": 2.9667532467532467e-05, "loss": 0.5256, "step": 4984 }, { "epoch": 1.6, "grad_norm": 7.960543632507324, "learning_rate": 2.596883116883117e-05, "loss": 0.522, "step": 5696 }, { "epoch": 1.8, "grad_norm": 5.682173252105713, "learning_rate": 2.227012987012987e-05, "loss": 0.4766, "step": 6408 }, { "epoch": 2.0, "grad_norm": 7.151096820831299, "learning_rate": 1.8571428571428572e-05, "loss": 0.466, "step": 7120 }, { "epoch": 2.2, "grad_norm": 48.665584564208984, "learning_rate": 1.4872727272727275e-05, "loss": 0.3953, "step": 7832 }, { "epoch": 2.4, "grad_norm": 3.8058860301971436, "learning_rate": 1.1174025974025975e-05, "loss": 0.3673, "step": 8544 }, { "epoch": 2.6, "grad_norm": 17.926410675048828, "learning_rate": 7.475324675324675e-06, "loss": 0.3625, "step": 9256 }, { "epoch": 2.8, "grad_norm": 25.310287475585938, "learning_rate": 3.776623376623377e-06, "loss": 0.348, "step": 9968 }, { "epoch": 3.0, "grad_norm": 18.61994743347168, "learning_rate": 7.792207792207792e-08, "loss": 0.3617, "step": 10680 } ], "logging_steps": 712, "max_steps": 10695, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 6.702780767795675e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }