|
{ |
|
"best_metric": 0.6997424960136414, |
|
"best_model_checkpoint": "/ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d0_iter1_model/checkpoint-300", |
|
"epoch": 2.0, |
|
"eval_steps": 100, |
|
"global_step": 338, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05917159763313609, |
|
"grad_norm": 0.586446509459034, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.0824, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11834319526627218, |
|
"grad_norm": 2.2316997759806494, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0109, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.17751479289940827, |
|
"grad_norm": 0.4074702236613225, |
|
"learning_rate": 4.98781004037916e-05, |
|
"loss": 0.6595, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.23668639053254437, |
|
"grad_norm": 0.3126617987747972, |
|
"learning_rate": 4.951359037609088e-05, |
|
"loss": 0.7391, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2958579881656805, |
|
"grad_norm": 0.708271236864645, |
|
"learning_rate": 4.891002460691306e-05, |
|
"loss": 0.7051, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.35502958579881655, |
|
"grad_norm": 0.3826205284015388, |
|
"learning_rate": 4.807328905014201e-05, |
|
"loss": 0.7091, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.41420118343195267, |
|
"grad_norm": 0.4309917712753102, |
|
"learning_rate": 4.7011543523897996e-05, |
|
"loss": 0.6642, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.47337278106508873, |
|
"grad_norm": 0.46078253102774147, |
|
"learning_rate": 4.573514213625505e-05, |
|
"loss": 0.6352, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5325443786982249, |
|
"grad_norm": 0.350545318021008, |
|
"learning_rate": 4.425653231231344e-05, |
|
"loss": 0.695, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.591715976331361, |
|
"grad_norm": 1.3918929820024042, |
|
"learning_rate": 4.259013340731224e-05, |
|
"loss": 0.6716, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.591715976331361, |
|
"eval_loss": 0.703880250453949, |
|
"eval_runtime": 20.1945, |
|
"eval_samples_per_second": 14.856, |
|
"eval_steps_per_second": 1.882, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.650887573964497, |
|
"grad_norm": 0.46801886344571736, |
|
"learning_rate": 4.075219608954278e-05, |
|
"loss": 0.6649, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7100591715976331, |
|
"grad_norm": 0.7261378819214507, |
|
"learning_rate": 3.876064386435646e-05, |
|
"loss": 0.6485, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 0.6311105785325938, |
|
"learning_rate": 3.663489828471953e-05, |
|
"loss": 0.5978, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8284023668639053, |
|
"grad_norm": 1.0566991845506082, |
|
"learning_rate": 3.4395689552855955e-05, |
|
"loss": 0.6504, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8875739644970414, |
|
"grad_norm": 0.42416309958564125, |
|
"learning_rate": 3.206485435998498e-05, |
|
"loss": 0.6214, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9467455621301775, |
|
"grad_norm": 0.42527966548399343, |
|
"learning_rate": 2.9665122935613727e-05, |
|
"loss": 0.6144, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.0059171597633136, |
|
"grad_norm": 0.3979436989011932, |
|
"learning_rate": 2.7219897383073373e-05, |
|
"loss": 0.6493, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.0650887573964498, |
|
"grad_norm": 0.24203747971705217, |
|
"learning_rate": 2.475302346296336e-05, |
|
"loss": 0.6474, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.1242603550295858, |
|
"grad_norm": 0.319431105160605, |
|
"learning_rate": 2.2288558050064367e-05, |
|
"loss": 0.564, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.183431952662722, |
|
"grad_norm": 0.5080338290607097, |
|
"learning_rate": 1.9850534531472546e-05, |
|
"loss": 0.5798, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.183431952662722, |
|
"eval_loss": 0.7010006904602051, |
|
"eval_runtime": 18.8091, |
|
"eval_samples_per_second": 15.95, |
|
"eval_steps_per_second": 2.02, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.242603550295858, |
|
"grad_norm": 0.564376924960013, |
|
"learning_rate": 1.746272843378493e-05, |
|
"loss": 0.6323, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.301775147928994, |
|
"grad_norm": 0.498318513735455, |
|
"learning_rate": 1.5148425564932084e-05, |
|
"loss": 0.6106, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.3609467455621302, |
|
"grad_norm": 0.2827517725379645, |
|
"learning_rate": 1.2930194931731382e-05, |
|
"loss": 0.6374, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.4201183431952662, |
|
"grad_norm": 0.5094729039694312, |
|
"learning_rate": 1.0829668647661559e-05, |
|
"loss": 0.5758, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.4792899408284024, |
|
"grad_norm": 0.269479823527664, |
|
"learning_rate": 8.867330977190877e-06, |
|
"loss": 0.5988, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 0.3701018971236146, |
|
"learning_rate": 7.062318573891716e-06, |
|
"loss": 0.5956, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.5976331360946747, |
|
"grad_norm": 0.43248474882397636, |
|
"learning_rate": 5.4322338604131715e-06, |
|
"loss": 0.5467, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.6568047337278107, |
|
"grad_norm": 0.896259597910678, |
|
"learning_rate": 3.992973370223896e-06, |
|
"loss": 0.599, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.7159763313609466, |
|
"grad_norm": 0.28651524116933125, |
|
"learning_rate": 2.75857272513132e-06, |
|
"loss": 0.5505, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.7751479289940828, |
|
"grad_norm": 0.5096606249052688, |
|
"learning_rate": 1.7410697603511383e-06, |
|
"loss": 0.5292, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.7751479289940828, |
|
"eval_loss": 0.6997424960136414, |
|
"eval_runtime": 19.0297, |
|
"eval_samples_per_second": 15.765, |
|
"eval_steps_per_second": 1.997, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.834319526627219, |
|
"grad_norm": 0.6033783324728439, |
|
"learning_rate": 9.503871319271551e-07, |
|
"loss": 0.6355, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.893491124260355, |
|
"grad_norm": 0.5681857650004716, |
|
"learning_rate": 3.9423555131007925e-07, |
|
"loss": 0.6309, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.952662721893491, |
|
"grad_norm": 0.5632696843447448, |
|
"learning_rate": 7.803859074854425e-08, |
|
"loss": 0.5814, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 338, |
|
"total_flos": 123391664193536.0, |
|
"train_loss": 0.6494796699320776, |
|
"train_runtime": 1192.6701, |
|
"train_samples_per_second": 4.528, |
|
"train_steps_per_second": 0.283 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 338, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 123391664193536.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|