|
{ |
|
"best_metric": 0.5390731692314148, |
|
"best_model_checkpoint": "/ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d1_iter2_model/checkpoint-200", |
|
"epoch": 2.0, |
|
"eval_steps": 100, |
|
"global_step": 338, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05917159763313609, |
|
"grad_norm": 1.5490762637870956, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.39, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11834319526627218, |
|
"grad_norm": 2.180427642573537, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2536, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.17751479289940827, |
|
"grad_norm": 1.4938352528249466, |
|
"learning_rate": 4.98781004037916e-05, |
|
"loss": 0.6182, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.23668639053254437, |
|
"grad_norm": 0.8266774666049563, |
|
"learning_rate": 4.951359037609088e-05, |
|
"loss": 0.5871, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2958579881656805, |
|
"grad_norm": 0.44331898296236955, |
|
"learning_rate": 4.891002460691306e-05, |
|
"loss": 0.6622, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.35502958579881655, |
|
"grad_norm": 0.7348699943282603, |
|
"learning_rate": 4.807328905014201e-05, |
|
"loss": 0.5239, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.41420118343195267, |
|
"grad_norm": 1.0233758530765529, |
|
"learning_rate": 4.7011543523897996e-05, |
|
"loss": 0.6031, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.47337278106508873, |
|
"grad_norm": 0.44566662901156306, |
|
"learning_rate": 4.573514213625505e-05, |
|
"loss": 0.5342, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5325443786982249, |
|
"grad_norm": 0.32376173126197005, |
|
"learning_rate": 4.425653231231344e-05, |
|
"loss": 0.5295, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.591715976331361, |
|
"grad_norm": 2.5050276715521953, |
|
"learning_rate": 4.259013340731224e-05, |
|
"loss": 0.5923, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.591715976331361, |
|
"eval_loss": 0.5548861622810364, |
|
"eval_runtime": 20.5369, |
|
"eval_samples_per_second": 14.608, |
|
"eval_steps_per_second": 1.85, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.650887573964497, |
|
"grad_norm": 0.6939641269685894, |
|
"learning_rate": 4.075219608954278e-05, |
|
"loss": 0.5283, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7100591715976331, |
|
"grad_norm": 0.3223403825561066, |
|
"learning_rate": 3.876064386435646e-05, |
|
"loss": 0.5123, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 1.242304136081819, |
|
"learning_rate": 3.663489828471953e-05, |
|
"loss": 0.5148, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8284023668639053, |
|
"grad_norm": 0.6162370328893566, |
|
"learning_rate": 3.4395689552855955e-05, |
|
"loss": 0.5706, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8875739644970414, |
|
"grad_norm": 2.854595963693751, |
|
"learning_rate": 3.206485435998498e-05, |
|
"loss": 0.5443, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9467455621301775, |
|
"grad_norm": 0.836450705626818, |
|
"learning_rate": 2.9665122935613727e-05, |
|
"loss": 0.5345, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.0059171597633136, |
|
"grad_norm": 0.6854327873281724, |
|
"learning_rate": 2.7219897383073373e-05, |
|
"loss": 0.4759, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.0650887573964498, |
|
"grad_norm": 0.3289222105005388, |
|
"learning_rate": 2.475302346296336e-05, |
|
"loss": 0.4207, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.1242603550295858, |
|
"grad_norm": 0.6402929677890956, |
|
"learning_rate": 2.2288558050064367e-05, |
|
"loss": 0.4453, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.183431952662722, |
|
"grad_norm": 0.4560254960950853, |
|
"learning_rate": 1.9850534531472546e-05, |
|
"loss": 0.485, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.183431952662722, |
|
"eval_loss": 0.5390731692314148, |
|
"eval_runtime": 19.171, |
|
"eval_samples_per_second": 15.649, |
|
"eval_steps_per_second": 1.982, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.242603550295858, |
|
"grad_norm": 0.4429967217295556, |
|
"learning_rate": 1.746272843378493e-05, |
|
"loss": 0.448, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.301775147928994, |
|
"grad_norm": 0.5801927088218751, |
|
"learning_rate": 1.5148425564932084e-05, |
|
"loss": 0.4257, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.3609467455621302, |
|
"grad_norm": 1.6461621420887593, |
|
"learning_rate": 1.2930194931731382e-05, |
|
"loss": 0.5102, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.4201183431952662, |
|
"grad_norm": 1.4629083533873057, |
|
"learning_rate": 1.0829668647661559e-05, |
|
"loss": 0.4632, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.4792899408284024, |
|
"grad_norm": 1.6289265138854685, |
|
"learning_rate": 8.867330977190877e-06, |
|
"loss": 0.5108, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 0.6682669143854097, |
|
"learning_rate": 7.062318573891716e-06, |
|
"loss": 0.4304, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.5976331360946747, |
|
"grad_norm": 0.5194393122476173, |
|
"learning_rate": 5.4322338604131715e-06, |
|
"loss": 0.4281, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.6568047337278107, |
|
"grad_norm": 1.0124557614072613, |
|
"learning_rate": 3.992973370223896e-06, |
|
"loss": 0.4529, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.7159763313609466, |
|
"grad_norm": 0.42873082380830013, |
|
"learning_rate": 2.75857272513132e-06, |
|
"loss": 0.4698, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.7751479289940828, |
|
"grad_norm": 0.9876326963962394, |
|
"learning_rate": 1.7410697603511383e-06, |
|
"loss": 0.4405, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.7751479289940828, |
|
"eval_loss": 0.5398803353309631, |
|
"eval_runtime": 18.7716, |
|
"eval_samples_per_second": 15.982, |
|
"eval_steps_per_second": 2.024, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.834319526627219, |
|
"grad_norm": 0.9194473587313593, |
|
"learning_rate": 9.503871319271551e-07, |
|
"loss": 0.4825, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.893491124260355, |
|
"grad_norm": 1.330707485363624, |
|
"learning_rate": 3.9423555131007925e-07, |
|
"loss": 0.5166, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.952662721893491, |
|
"grad_norm": 0.3585144579596122, |
|
"learning_rate": 7.803859074854425e-08, |
|
"loss": 0.4212, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 338, |
|
"total_flos": 93650261901312.0, |
|
"train_loss": 0.5521188734551153, |
|
"train_runtime": 1213.4234, |
|
"train_samples_per_second": 4.45, |
|
"train_steps_per_second": 0.279 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 338, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 93650261901312.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|