{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 185, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005405405405405406, "grad_norm": 61.001103964725324, "learning_rate": 1.0526315789473685e-06, "loss": 1.9097, "step": 1 }, { "epoch": 0.02702702702702703, "grad_norm": 35.97997402423919, "learning_rate": 5.263157894736842e-06, "loss": 1.7102, "step": 5 }, { "epoch": 0.05405405405405406, "grad_norm": 30.10156193198182, "learning_rate": 1.0526315789473684e-05, "loss": 1.5516, "step": 10 }, { "epoch": 0.08108108108108109, "grad_norm": 6.532111146208257, "learning_rate": 1.578947368421053e-05, "loss": 1.4536, "step": 15 }, { "epoch": 0.10810810810810811, "grad_norm": 4.983904851506608, "learning_rate": 1.999820922669738e-05, "loss": 1.4112, "step": 20 }, { "epoch": 0.13513513513513514, "grad_norm": 3.8732866550483545, "learning_rate": 1.993559947963185e-05, "loss": 1.4056, "step": 25 }, { "epoch": 0.16216216216216217, "grad_norm": 3.0620367608079846, "learning_rate": 1.9784091409455728e-05, "loss": 1.4016, "step": 30 }, { "epoch": 0.1891891891891892, "grad_norm": 3.2352358324557167, "learning_rate": 1.9545040627715554e-05, "loss": 1.4128, "step": 35 }, { "epoch": 0.21621621621621623, "grad_norm": 3.022703286965415, "learning_rate": 1.9220586030376135e-05, "loss": 1.4148, "step": 40 }, { "epoch": 0.24324324324324326, "grad_norm": 2.8955611484982033, "learning_rate": 1.881363066014649e-05, "loss": 1.4159, "step": 45 }, { "epoch": 0.2702702702702703, "grad_norm": 2.8660620524615688, "learning_rate": 1.8327815731637612e-05, "loss": 1.42, "step": 50 }, { "epoch": 0.2972972972972973, "grad_norm": 2.632475843797219, "learning_rate": 1.7767488051760858e-05, "loss": 1.4078, "step": 55 }, { "epoch": 0.32432432432432434, "grad_norm": 2.5436272914892872, "learning_rate": 1.713766112687139e-05, "loss": 1.4015, "step": 60 }, { "epoch": 0.35135135135135137, "grad_norm": 2.431239149211266, "learning_rate": 1.644397030464877e-05, "loss": 1.3807, "step": 65 }, { "epoch": 0.3783783783783784, "grad_norm": 2.328539582829742, "learning_rate": 1.5692622352080662e-05, "loss": 1.3639, "step": 70 }, { "epoch": 0.40540540540540543, "grad_norm": 2.708625235728292, "learning_rate": 1.4890339920698334e-05, "loss": 1.3836, "step": 75 }, { "epoch": 0.43243243243243246, "grad_norm": 2.4202434897618317, "learning_rate": 1.404430139595877e-05, "loss": 1.3823, "step": 80 }, { "epoch": 0.4594594594594595, "grad_norm": 2.243346482513894, "learning_rate": 1.316207666896824e-05, "loss": 1.3538, "step": 85 }, { "epoch": 0.4864864864864865, "grad_norm": 2.4109049967489433, "learning_rate": 1.2251559405226943e-05, "loss": 1.3503, "step": 90 }, { "epoch": 0.5135135135135135, "grad_norm": 2.242883116591874, "learning_rate": 1.1320896416417026e-05, "loss": 1.3562, "step": 95 }, { "epoch": 0.5405405405405406, "grad_norm": 2.1959662600808203, "learning_rate": 1.0378414767176706e-05, "loss": 1.3447, "step": 100 }, { "epoch": 0.5675675675675675, "grad_norm": 2.2348646553535714, "learning_rate": 9.43254726906926e-06, "loss": 1.3398, "step": 105 }, { "epoch": 0.5945945945945946, "grad_norm": 2.21405256524929, "learning_rate": 8.491757028386262e-06, "loss": 1.3383, "step": 110 }, { "epoch": 0.6216216216216216, "grad_norm": 2.1710850419967263, "learning_rate": 7.564461722890082e-06, "loss": 1.3271, "step": 115 }, { "epoch": 0.6486486486486487, "grad_norm": 2.1752417729097338, "learning_rate": 6.6589582850261025e-06, "loss": 1.3176, "step": 120 }, { "epoch": 0.6756756756756757, "grad_norm": 2.1322248363189686, "learning_rate": 5.78334866549816e-06, "loss": 1.321, "step": 125 }, { "epoch": 0.7027027027027027, "grad_norm": 2.1516864548066525, "learning_rate": 4.9454673414341945e-06, "loss": 1.3074, "step": 130 }, { "epoch": 0.7297297297297297, "grad_norm": 2.04797883118049, "learning_rate": 4.152811217759529e-06, "loss": 1.3123, "step": 135 }, { "epoch": 0.7567567567567568, "grad_norm": 2.1202023783270367, "learning_rate": 3.4124725489820643e-06, "loss": 1.3026, "step": 140 }, { "epoch": 0.7837837837837838, "grad_norm": 2.1203746539007158, "learning_rate": 2.7310754815685627e-06, "loss": 1.2838, "step": 145 }, { "epoch": 0.8108108108108109, "grad_norm": 2.058016060238832, "learning_rate": 2.114716784696342e-06, "loss": 1.2805, "step": 150 }, { "epoch": 0.8378378378378378, "grad_norm": 2.072702697896577, "learning_rate": 1.5689112996891576e-06, "loss": 1.276, "step": 155 }, { "epoch": 0.8648648648648649, "grad_norm": 2.070358297476048, "learning_rate": 1.0985425962260342e-06, "loss": 1.2768, "step": 160 }, { "epoch": 0.8918918918918919, "grad_norm": 1.993421902652665, "learning_rate": 7.078192768243486e-07, "loss": 1.2755, "step": 165 }, { "epoch": 0.918918918918919, "grad_norm": 2.02321205406663, "learning_rate": 4.0023732056077235e-07, "loss": 1.276, "step": 170 }, { "epoch": 0.9459459459459459, "grad_norm": 2.0216958269361154, "learning_rate": 1.7854880295797406e-07, "loss": 1.2694, "step": 175 }, { "epoch": 0.972972972972973, "grad_norm": 2.0002791761840157, "learning_rate": 4.473727191441124e-08, "loss": 1.2669, "step": 180 }, { "epoch": 1.0, "grad_norm": 1.9847900492979156, "learning_rate": 0.0, "loss": 1.2583, "step": 185 }, { "epoch": 1.0, "eval_loss": 1.2626361846923828, "eval_runtime": 57.4547, "eval_samples_per_second": 11.47, "eval_steps_per_second": 0.366, "step": 185 }, { "epoch": 1.0, "step": 185, "total_flos": 19367618150400.0, "train_loss": 1.3619260220914273, "train_runtime": 1878.7186, "train_samples_per_second": 3.148, "train_steps_per_second": 0.098 } ], "logging_steps": 5, "max_steps": 185, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 800, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 19367618150400.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }