{ "best_metric": 0.5153154134750366, "best_model_checkpoint": "/ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d2_iter3_model/checkpoint-200", "epoch": 2.0, "eval_steps": 100, "global_step": 338, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05917159763313609, "grad_norm": 0.9615801643973331, "learning_rate": 2.5e-05, "loss": 1.8771, "step": 10 }, { "epoch": 0.11834319526627218, "grad_norm": 3.7252569579401107, "learning_rate": 5e-05, "loss": 1.4266, "step": 20 }, { "epoch": 0.17751479289940827, "grad_norm": 0.4587282095179799, "learning_rate": 4.98781004037916e-05, "loss": 0.7228, "step": 30 }, { "epoch": 0.23668639053254437, "grad_norm": 0.9607481645216482, "learning_rate": 4.951359037609088e-05, "loss": 0.5857, "step": 40 }, { "epoch": 0.2958579881656805, "grad_norm": 1.277269346490998, "learning_rate": 4.891002460691306e-05, "loss": 0.558, "step": 50 }, { "epoch": 0.35502958579881655, "grad_norm": 1.2676470043435204, "learning_rate": 4.807328905014201e-05, "loss": 0.6275, "step": 60 }, { "epoch": 0.41420118343195267, "grad_norm": 0.43321439772469345, "learning_rate": 4.7011543523897996e-05, "loss": 0.5004, "step": 70 }, { "epoch": 0.47337278106508873, "grad_norm": 0.8638153501814058, "learning_rate": 4.573514213625505e-05, "loss": 0.5087, "step": 80 }, { "epoch": 0.5325443786982249, "grad_norm": 3.2295994507917754, "learning_rate": 4.425653231231344e-05, "loss": 0.5459, "step": 90 }, { "epoch": 0.591715976331361, "grad_norm": 1.0832832075215777, "learning_rate": 4.259013340731224e-05, "loss": 0.4759, "step": 100 }, { "epoch": 0.591715976331361, "eval_loss": 0.5315915942192078, "eval_runtime": 20.3569, "eval_samples_per_second": 14.737, "eval_steps_per_second": 1.867, "step": 100 }, { "epoch": 0.650887573964497, "grad_norm": 1.1344737325339866, "learning_rate": 4.075219608954278e-05, "loss": 0.4617, "step": 110 }, { "epoch": 0.7100591715976331, "grad_norm": 1.0747858639409258, "learning_rate": 3.876064386435646e-05, "loss": 0.4369, "step": 120 }, { "epoch": 0.7692307692307693, "grad_norm": 0.4410328402897703, "learning_rate": 3.663489828471953e-05, "loss": 0.4933, "step": 130 }, { "epoch": 0.8284023668639053, "grad_norm": 1.306125146854077, "learning_rate": 3.4395689552855955e-05, "loss": 0.4431, "step": 140 }, { "epoch": 0.8875739644970414, "grad_norm": 1.3505699464811827, "learning_rate": 3.206485435998498e-05, "loss": 0.4063, "step": 150 }, { "epoch": 0.9467455621301775, "grad_norm": 1.158959539769833, "learning_rate": 2.9665122935613727e-05, "loss": 0.4936, "step": 160 }, { "epoch": 1.0059171597633136, "grad_norm": 0.686278368841292, "learning_rate": 2.7219897383073373e-05, "loss": 0.4633, "step": 170 }, { "epoch": 1.0650887573964498, "grad_norm": 0.7329855489762843, "learning_rate": 2.475302346296336e-05, "loss": 0.3889, "step": 180 }, { "epoch": 1.1242603550295858, "grad_norm": 1.511269029215083, "learning_rate": 2.2288558050064367e-05, "loss": 0.457, "step": 190 }, { "epoch": 1.183431952662722, "grad_norm": 0.7905550148998332, "learning_rate": 1.9850534531472546e-05, "loss": 0.3967, "step": 200 }, { "epoch": 1.183431952662722, "eval_loss": 0.5153154134750366, "eval_runtime": 18.9227, "eval_samples_per_second": 15.854, "eval_steps_per_second": 2.008, "step": 200 }, { "epoch": 1.242603550295858, "grad_norm": 1.5247295502601703, "learning_rate": 1.746272843378493e-05, "loss": 0.3733, "step": 210 }, { "epoch": 1.301775147928994, "grad_norm": 0.8404429082517917, "learning_rate": 
1.5148425564932084e-05, "loss": 0.3933, "step": 220 }, { "epoch": 1.3609467455621302, "grad_norm": 1.064902485845328, "learning_rate": 1.2930194931731382e-05, "loss": 0.3892, "step": 230 }, { "epoch": 1.4201183431952662, "grad_norm": 0.8352671361637931, "learning_rate": 1.0829668647661559e-05, "loss": 0.3891, "step": 240 }, { "epoch": 1.4792899408284024, "grad_norm": 0.8368737776004409, "learning_rate": 8.867330977190877e-06, "loss": 0.3597, "step": 250 }, { "epoch": 1.5384615384615383, "grad_norm": 0.4958411343194272, "learning_rate": 7.062318573891716e-06, "loss": 0.4077, "step": 260 }, { "epoch": 1.5976331360946747, "grad_norm": 0.4641500517420258, "learning_rate": 5.4322338604131715e-06, "loss": 0.4247, "step": 270 }, { "epoch": 1.6568047337278107, "grad_norm": 0.8611324943813823, "learning_rate": 3.992973370223896e-06, "loss": 0.3805, "step": 280 }, { "epoch": 1.7159763313609466, "grad_norm": 1.2247521849952663, "learning_rate": 2.75857272513132e-06, "loss": 0.4078, "step": 290 }, { "epoch": 1.7751479289940828, "grad_norm": 1.2501388237565647, "learning_rate": 1.7410697603511383e-06, "loss": 0.3547, "step": 300 }, { "epoch": 1.7751479289940828, "eval_loss": 0.5177696347236633, "eval_runtime": 18.9794, "eval_samples_per_second": 15.807, "eval_steps_per_second": 2.002, "step": 300 }, { "epoch": 1.834319526627219, "grad_norm": 0.749968665840715, "learning_rate": 9.503871319271551e-07, "loss": 0.35, "step": 310 }, { "epoch": 1.893491124260355, "grad_norm": 0.9424148400717701, "learning_rate": 3.9423555131007925e-07, "loss": 0.3912, "step": 320 }, { "epoch": 1.952662721893491, "grad_norm": 0.8186461379045846, "learning_rate": 7.803859074854425e-08, "loss": 0.4186, "step": 330 }, { "epoch": 2.0, "step": 338, "total_flos": 84346414039040.0, "train_loss": 0.5209799306632499, "train_runtime": 1210.5031, "train_samples_per_second": 4.461, "train_steps_per_second": 0.279 } ], "logging_steps": 10, "max_steps": 338, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 84346414039040.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }