{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2247, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04450378282153983, "grad_norm": 0.13366538286209106, "learning_rate": 2.0000000000000003e-06, "loss": 2.4614, "step": 100 }, { "epoch": 0.08900756564307966, "grad_norm": 0.40120357275009155, "learning_rate": 4.000000000000001e-06, "loss": 2.4038, "step": 200 }, { "epoch": 0.13351134846461948, "grad_norm": 0.207975834608078, "learning_rate": 6e-06, "loss": 2.4, "step": 300 }, { "epoch": 0.1780151312861593, "grad_norm": 0.2991727888584137, "learning_rate": 8.000000000000001e-06, "loss": 2.3774, "step": 400 }, { "epoch": 0.22251891410769917, "grad_norm": 0.3771696984767914, "learning_rate": 1e-05, "loss": 2.3316, "step": 500 }, { "epoch": 0.26702269692923897, "grad_norm": 0.5076782703399658, "learning_rate": 1.2e-05, "loss": 2.3049, "step": 600 }, { "epoch": 0.3115264797507788, "grad_norm": 0.46594780683517456, "learning_rate": 1.4e-05, "loss": 2.2432, "step": 700 }, { "epoch": 0.3560302625723186, "grad_norm": 0.5553820729255676, "learning_rate": 1.6000000000000003e-05, "loss": 2.2903, "step": 800 }, { "epoch": 0.40053404539385845, "grad_norm": 0.5826830863952637, "learning_rate": 1.8e-05, "loss": 2.2219, "step": 900 }, { "epoch": 0.44503782821539833, "grad_norm": 0.6430906057357788, "learning_rate": 2e-05, "loss": 2.2133, "step": 1000 }, { "epoch": 0.48954161103693816, "grad_norm": 0.5223941206932068, "learning_rate": 1.9684326170007365e-05, "loss": 2.2328, "step": 1100 }, { "epoch": 0.5340453938584779, "grad_norm": 0.6351125836372375, "learning_rate": 1.8757234673417892e-05, "loss": 2.181, "step": 1200 }, { "epoch": 0.5785491766800178, "grad_norm": 0.7824538350105286, "learning_rate": 1.7277257214927987e-05, "loss": 2.1317, "step": 1300 }, { "epoch": 0.6230529595015576, "grad_norm": 0.8329874277114868, "learning_rate": 1.5337831825062507e-05, "loss": 2.1331, "step": 1400 }, { "epoch": 0.6675567423230975, "grad_norm": 0.7226102352142334, "learning_rate": 1.306140367198221e-05, "loss": 2.0982, "step": 1500 }, { "epoch": 0.7120605251446372, "grad_norm": 0.758540689945221, "learning_rate": 1.0591694514444284e-05, "loss": 2.1411, "step": 1600 }, { "epoch": 0.7565643079661771, "grad_norm": 0.7841065526008606, "learning_rate": 8.084628862194305e-06, "loss": 2.1329, "step": 1700 }, { "epoch": 0.8010680907877169, "grad_norm": 0.7534396648406982, "learning_rate": 5.698489718530022e-06, "loss": 2.1179, "step": 1800 }, { "epoch": 0.8455718736092568, "grad_norm": 0.809190034866333, "learning_rate": 3.583925419926607e-06, "loss": 2.132, "step": 1900 }, { "epoch": 0.8900756564307967, "grad_norm": 0.8342956304550171, "learning_rate": 1.874438488565231e-06, "loss": 2.0948, "step": 2000 }, { "epoch": 0.9345794392523364, "grad_norm": 0.8505709171295166, "learning_rate": 6.779569818349296e-07, "loss": 2.1061, "step": 2100 }, { "epoch": 0.9790832220738763, "grad_norm": 0.9227008819580078, "learning_rate": 7.00204796846915e-08, "loss": 2.084, "step": 2200 }, { "epoch": 1.0, "step": 2247, "total_flos": 4.1045168553984e+16, "train_loss": 2.2167040354207086, "train_runtime": 618.2687, "train_samples_per_second": 7.269, "train_steps_per_second": 3.634 } ], "logging_steps": 100, "max_steps": 2247, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.1045168553984e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }