{ "best_metric": 0.5390731692314148, "best_model_checkpoint": "/ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d1_iter2_model/checkpoint-200", "epoch": 2.0, "eval_steps": 100, "global_step": 338, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05917159763313609, "grad_norm": 1.5490762637870956, "learning_rate": 2.5e-05, "loss": 1.39, "step": 10 }, { "epoch": 0.11834319526627218, "grad_norm": 2.180427642573537, "learning_rate": 5e-05, "loss": 1.2536, "step": 20 }, { "epoch": 0.17751479289940827, "grad_norm": 1.4938352528249466, "learning_rate": 4.98781004037916e-05, "loss": 0.6182, "step": 30 }, { "epoch": 0.23668639053254437, "grad_norm": 0.8266774666049563, "learning_rate": 4.951359037609088e-05, "loss": 0.5871, "step": 40 }, { "epoch": 0.2958579881656805, "grad_norm": 0.44331898296236955, "learning_rate": 4.891002460691306e-05, "loss": 0.6622, "step": 50 }, { "epoch": 0.35502958579881655, "grad_norm": 0.7348699943282603, "learning_rate": 4.807328905014201e-05, "loss": 0.5239, "step": 60 }, { "epoch": 0.41420118343195267, "grad_norm": 1.0233758530765529, "learning_rate": 4.7011543523897996e-05, "loss": 0.6031, "step": 70 }, { "epoch": 0.47337278106508873, "grad_norm": 0.44566662901156306, "learning_rate": 4.573514213625505e-05, "loss": 0.5342, "step": 80 }, { "epoch": 0.5325443786982249, "grad_norm": 0.32376173126197005, "learning_rate": 4.425653231231344e-05, "loss": 0.5295, "step": 90 }, { "epoch": 0.591715976331361, "grad_norm": 2.5050276715521953, "learning_rate": 4.259013340731224e-05, "loss": 0.5923, "step": 100 }, { "epoch": 0.591715976331361, "eval_loss": 0.5548861622810364, "eval_runtime": 20.5369, "eval_samples_per_second": 14.608, "eval_steps_per_second": 1.85, "step": 100 }, { "epoch": 0.650887573964497, "grad_norm": 0.6939641269685894, "learning_rate": 4.075219608954278e-05, "loss": 0.5283, "step": 110 }, { "epoch": 0.7100591715976331, "grad_norm": 0.3223403825561066, "learning_rate": 3.876064386435646e-05, "loss": 0.5123, "step": 120 }, { "epoch": 0.7692307692307693, "grad_norm": 1.242304136081819, "learning_rate": 3.663489828471953e-05, "loss": 0.5148, "step": 130 }, { "epoch": 0.8284023668639053, "grad_norm": 0.6162370328893566, "learning_rate": 3.4395689552855955e-05, "loss": 0.5706, "step": 140 }, { "epoch": 0.8875739644970414, "grad_norm": 2.854595963693751, "learning_rate": 3.206485435998498e-05, "loss": 0.5443, "step": 150 }, { "epoch": 0.9467455621301775, "grad_norm": 0.836450705626818, "learning_rate": 2.9665122935613727e-05, "loss": 0.5345, "step": 160 }, { "epoch": 1.0059171597633136, "grad_norm": 0.6854327873281724, "learning_rate": 2.7219897383073373e-05, "loss": 0.4759, "step": 170 }, { "epoch": 1.0650887573964498, "grad_norm": 0.3289222105005388, "learning_rate": 2.475302346296336e-05, "loss": 0.4207, "step": 180 }, { "epoch": 1.1242603550295858, "grad_norm": 0.6402929677890956, "learning_rate": 2.2288558050064367e-05, "loss": 0.4453, "step": 190 }, { "epoch": 1.183431952662722, "grad_norm": 0.4560254960950853, "learning_rate": 1.9850534531472546e-05, "loss": 0.485, "step": 200 }, { "epoch": 1.183431952662722, "eval_loss": 0.5390731692314148, "eval_runtime": 19.171, "eval_samples_per_second": 15.649, "eval_steps_per_second": 1.982, "step": 200 }, { "epoch": 1.242603550295858, "grad_norm": 0.4429967217295556, "learning_rate": 1.746272843378493e-05, "loss": 0.448, "step": 210 }, { "epoch": 1.301775147928994, "grad_norm": 0.5801927088218751, "learning_rate": 
1.5148425564932084e-05, "loss": 0.4257, "step": 220 }, { "epoch": 1.3609467455621302, "grad_norm": 1.6461621420887593, "learning_rate": 1.2930194931731382e-05, "loss": 0.5102, "step": 230 }, { "epoch": 1.4201183431952662, "grad_norm": 1.4629083533873057, "learning_rate": 1.0829668647661559e-05, "loss": 0.4632, "step": 240 }, { "epoch": 1.4792899408284024, "grad_norm": 1.6289265138854685, "learning_rate": 8.867330977190877e-06, "loss": 0.5108, "step": 250 }, { "epoch": 1.5384615384615383, "grad_norm": 0.6682669143854097, "learning_rate": 7.062318573891716e-06, "loss": 0.4304, "step": 260 }, { "epoch": 1.5976331360946747, "grad_norm": 0.5194393122476173, "learning_rate": 5.4322338604131715e-06, "loss": 0.4281, "step": 270 }, { "epoch": 1.6568047337278107, "grad_norm": 1.0124557614072613, "learning_rate": 3.992973370223896e-06, "loss": 0.4529, "step": 280 }, { "epoch": 1.7159763313609466, "grad_norm": 0.42873082380830013, "learning_rate": 2.75857272513132e-06, "loss": 0.4698, "step": 290 }, { "epoch": 1.7751479289940828, "grad_norm": 0.9876326963962394, "learning_rate": 1.7410697603511383e-06, "loss": 0.4405, "step": 300 }, { "epoch": 1.7751479289940828, "eval_loss": 0.5398803353309631, "eval_runtime": 18.7716, "eval_samples_per_second": 15.982, "eval_steps_per_second": 2.024, "step": 300 }, { "epoch": 1.834319526627219, "grad_norm": 0.9194473587313593, "learning_rate": 9.503871319271551e-07, "loss": 0.4825, "step": 310 }, { "epoch": 1.893491124260355, "grad_norm": 1.330707485363624, "learning_rate": 3.9423555131007925e-07, "loss": 0.5166, "step": 320 }, { "epoch": 1.952662721893491, "grad_norm": 0.3585144579596122, "learning_rate": 7.803859074854425e-08, "loss": 0.4212, "step": 330 }, { "epoch": 2.0, "step": 338, "total_flos": 93650261901312.0, "train_loss": 0.5521188734551153, "train_runtime": 1213.4234, "train_samples_per_second": 4.45, "train_steps_per_second": 0.279 } ], "logging_steps": 10, "max_steps": 338, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 93650261901312.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }