|
{ |
|
"best_metric": 0.5153154134750366, |
|
"best_model_checkpoint": "/ML-A100/team/mm/eamon/self_instruction/seed_ppl/qwen14B_models/qwen_14B_d2_iter3_model/checkpoint-200", |
|
"epoch": 2.0, |
|
"eval_steps": 100, |
|
"global_step": 338, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05917159763313609, |
|
"grad_norm": 0.9615801643973331, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.8771, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11834319526627218, |
|
"grad_norm": 3.7252569579401107, |
|
"learning_rate": 5e-05, |
|
"loss": 1.4266, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.17751479289940827, |
|
"grad_norm": 0.4587282095179799, |
|
"learning_rate": 4.98781004037916e-05, |
|
"loss": 0.7228, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.23668639053254437, |
|
"grad_norm": 0.9607481645216482, |
|
"learning_rate": 4.951359037609088e-05, |
|
"loss": 0.5857, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2958579881656805, |
|
"grad_norm": 1.277269346490998, |
|
"learning_rate": 4.891002460691306e-05, |
|
"loss": 0.558, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.35502958579881655, |
|
"grad_norm": 1.2676470043435204, |
|
"learning_rate": 4.807328905014201e-05, |
|
"loss": 0.6275, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.41420118343195267, |
|
"grad_norm": 0.43321439772469345, |
|
"learning_rate": 4.7011543523897996e-05, |
|
"loss": 0.5004, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.47337278106508873, |
|
"grad_norm": 0.8638153501814058, |
|
"learning_rate": 4.573514213625505e-05, |
|
"loss": 0.5087, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5325443786982249, |
|
"grad_norm": 3.2295994507917754, |
|
"learning_rate": 4.425653231231344e-05, |
|
"loss": 0.5459, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.591715976331361, |
|
"grad_norm": 1.0832832075215777, |
|
"learning_rate": 4.259013340731224e-05, |
|
"loss": 0.4759, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.591715976331361, |
|
"eval_loss": 0.5315915942192078, |
|
"eval_runtime": 20.3569, |
|
"eval_samples_per_second": 14.737, |
|
"eval_steps_per_second": 1.867, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.650887573964497, |
|
"grad_norm": 1.1344737325339866, |
|
"learning_rate": 4.075219608954278e-05, |
|
"loss": 0.4617, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7100591715976331, |
|
"grad_norm": 1.0747858639409258, |
|
"learning_rate": 3.876064386435646e-05, |
|
"loss": 0.4369, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 0.4410328402897703, |
|
"learning_rate": 3.663489828471953e-05, |
|
"loss": 0.4933, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8284023668639053, |
|
"grad_norm": 1.306125146854077, |
|
"learning_rate": 3.4395689552855955e-05, |
|
"loss": 0.4431, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8875739644970414, |
|
"grad_norm": 1.3505699464811827, |
|
"learning_rate": 3.206485435998498e-05, |
|
"loss": 0.4063, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9467455621301775, |
|
"grad_norm": 1.158959539769833, |
|
"learning_rate": 2.9665122935613727e-05, |
|
"loss": 0.4936, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.0059171597633136, |
|
"grad_norm": 0.686278368841292, |
|
"learning_rate": 2.7219897383073373e-05, |
|
"loss": 0.4633, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.0650887573964498, |
|
"grad_norm": 0.7329855489762843, |
|
"learning_rate": 2.475302346296336e-05, |
|
"loss": 0.3889, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.1242603550295858, |
|
"grad_norm": 1.511269029215083, |
|
"learning_rate": 2.2288558050064367e-05, |
|
"loss": 0.457, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.183431952662722, |
|
"grad_norm": 0.7905550148998332, |
|
"learning_rate": 1.9850534531472546e-05, |
|
"loss": 0.3967, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.183431952662722, |
|
"eval_loss": 0.5153154134750366, |
|
"eval_runtime": 18.9227, |
|
"eval_samples_per_second": 15.854, |
|
"eval_steps_per_second": 2.008, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.242603550295858, |
|
"grad_norm": 1.5247295502601703, |
|
"learning_rate": 1.746272843378493e-05, |
|
"loss": 0.3733, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.301775147928994, |
|
"grad_norm": 0.8404429082517917, |
|
"learning_rate": 1.5148425564932084e-05, |
|
"loss": 0.3933, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.3609467455621302, |
|
"grad_norm": 1.064902485845328, |
|
"learning_rate": 1.2930194931731382e-05, |
|
"loss": 0.3892, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.4201183431952662, |
|
"grad_norm": 0.8352671361637931, |
|
"learning_rate": 1.0829668647661559e-05, |
|
"loss": 0.3891, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.4792899408284024, |
|
"grad_norm": 0.8368737776004409, |
|
"learning_rate": 8.867330977190877e-06, |
|
"loss": 0.3597, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 0.4958411343194272, |
|
"learning_rate": 7.062318573891716e-06, |
|
"loss": 0.4077, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.5976331360946747, |
|
"grad_norm": 0.4641500517420258, |
|
"learning_rate": 5.4322338604131715e-06, |
|
"loss": 0.4247, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.6568047337278107, |
|
"grad_norm": 0.8611324943813823, |
|
"learning_rate": 3.992973370223896e-06, |
|
"loss": 0.3805, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.7159763313609466, |
|
"grad_norm": 1.2247521849952663, |
|
"learning_rate": 2.75857272513132e-06, |
|
"loss": 0.4078, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.7751479289940828, |
|
"grad_norm": 1.2501388237565647, |
|
"learning_rate": 1.7410697603511383e-06, |
|
"loss": 0.3547, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.7751479289940828, |
|
"eval_loss": 0.5177696347236633, |
|
"eval_runtime": 18.9794, |
|
"eval_samples_per_second": 15.807, |
|
"eval_steps_per_second": 2.002, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.834319526627219, |
|
"grad_norm": 0.749968665840715, |
|
"learning_rate": 9.503871319271551e-07, |
|
"loss": 0.35, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.893491124260355, |
|
"grad_norm": 0.9424148400717701, |
|
"learning_rate": 3.9423555131007925e-07, |
|
"loss": 0.3912, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.952662721893491, |
|
"grad_norm": 0.8186461379045846, |
|
"learning_rate": 7.803859074854425e-08, |
|
"loss": 0.4186, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 338, |
|
"total_flos": 84346414039040.0, |
|
"train_loss": 0.5209799306632499, |
|
"train_runtime": 1210.5031, |
|
"train_samples_per_second": 4.461, |
|
"train_steps_per_second": 0.279 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 338, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 84346414039040.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|