{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9790209790209791,
  "eval_steps": 9,
  "global_step": 35,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.027972027972027972,
      "grad_norm": 5.75,
      "learning_rate": 2e-05,
      "loss": 1.8618,
      "step": 1
    },
    {
      "epoch": 0.027972027972027972,
      "eval_loss": 1.856866478919983,
      "eval_runtime": 4.5304,
      "eval_samples_per_second": 3.532,
      "eval_steps_per_second": 1.766,
      "step": 1
    },
    {
      "epoch": 0.055944055944055944,
      "grad_norm": 5.5,
      "learning_rate": 4e-05,
      "loss": 1.8343,
      "step": 2
    },
    {
      "epoch": 0.08391608391608392,
      "grad_norm": 5.75,
      "learning_rate": 6e-05,
      "loss": 1.87,
      "step": 3
    },
    {
      "epoch": 0.11188811188811189,
      "grad_norm": 6.0625,
      "learning_rate": 8e-05,
      "loss": 1.6976,
      "step": 4
    },
    {
      "epoch": 0.13986013986013987,
      "grad_norm": 5.6875,
      "learning_rate": 0.0001,
      "loss": 1.0745,
      "step": 5
    },
    {
      "epoch": 0.16783216783216784,
      "grad_norm": 2.875,
      "learning_rate": 0.00012,
      "loss": 0.5387,
      "step": 6
    },
    {
      "epoch": 0.1958041958041958,
      "grad_norm": 2.90625,
      "learning_rate": 0.00014,
      "loss": 0.3055,
      "step": 7
    },
    {
      "epoch": 0.22377622377622378,
      "grad_norm": 0.51171875,
      "learning_rate": 0.00016,
      "loss": 0.0592,
      "step": 8
    },
    {
      "epoch": 0.2517482517482518,
      "grad_norm": 0.65625,
      "learning_rate": 0.00018,
      "loss": 0.0185,
      "step": 9
    },
    {
      "epoch": 0.2517482517482518,
      "eval_loss": 0.0595741793513298,
      "eval_runtime": 4.6321,
      "eval_samples_per_second": 3.454,
      "eval_steps_per_second": 1.727,
      "step": 9
    },
    {
      "epoch": 0.27972027972027974,
      "grad_norm": 1.65625,
      "learning_rate": 0.0002,
      "loss": 0.1101,
      "step": 10
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 6.375,
      "learning_rate": 0.00019997080140801932,
      "loss": 0.1327,
      "step": 11
    },
    {
      "epoch": 0.3356643356643357,
      "grad_norm": 0.21484375,
      "learning_rate": 0.00019988322268323268,
      "loss": 0.0082,
      "step": 12
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 0.1357421875,
      "learning_rate": 0.00019973731496914914,
      "loss": 0.0049,
      "step": 13
    },
    {
      "epoch": 0.3916083916083916,
      "grad_norm": 0.0693359375,
      "learning_rate": 0.00019953316347176488,
      "loss": 0.0024,
      "step": 14
    },
    {
      "epoch": 0.4195804195804196,
      "grad_norm": 0.49609375,
      "learning_rate": 0.0001992708874098054,
      "loss": 0.065,
      "step": 15
    },
    {
      "epoch": 0.44755244755244755,
      "grad_norm": 0.07568359375,
      "learning_rate": 0.0001989506399451051,
      "loss": 0.0029,
      "step": 16
    },
    {
      "epoch": 0.4755244755244755,
      "grad_norm": 0.134765625,
      "learning_rate": 0.0001985726080931651,
      "loss": 0.0053,
      "step": 17
    },
    {
      "epoch": 0.5034965034965035,
      "grad_norm": 0.1416015625,
      "learning_rate": 0.00019813701261394136,
      "loss": 0.0056,
      "step": 18
    },
    {
      "epoch": 0.5034965034965035,
      "eval_loss": 0.02016551047563553,
      "eval_runtime": 4.6628,
      "eval_samples_per_second": 3.431,
      "eval_steps_per_second": 1.716,
      "step": 18
    },
    {
      "epoch": 0.5314685314685315,
      "grad_norm": 0.0791015625,
      "learning_rate": 0.00019764410788292722,
      "loss": 0.003,
      "step": 19
    },
    {
      "epoch": 0.5594405594405595,
      "grad_norm": 0.032470703125,
      "learning_rate": 0.0001970941817426052,
      "loss": 0.0011,
      "step": 20
    },
    {
      "epoch": 0.5874125874125874,
      "grad_norm": 0.447265625,
      "learning_rate": 0.00019648755533435518,
      "loss": 0.0445,
      "step": 21
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 0.87890625,
      "learning_rate": 0.00019582458291091663,
      "loss": 0.0708,
      "step": 22
    },
    {
      "epoch": 0.6433566433566433,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00019510565162951537,
      "loss": 0.0114,
      "step": 23
    },
    {
      "epoch": 0.6713286713286714,
      "grad_norm": 0.609375,
      "learning_rate": 0.0001943311813257743,
      "loss": 0.0243,
      "step": 24
    },
    {
      "epoch": 0.6993006993006993,
      "grad_norm": 1.921875,
      "learning_rate": 0.0001935016242685415,
      "loss": 0.0655,
      "step": 25
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 0.515625,
      "learning_rate": 0.00019261746489577765,
      "loss": 0.0756,
      "step": 26
    },
    {
      "epoch": 0.7552447552447552,
      "grad_norm": 0.0201416015625,
      "learning_rate": 0.00019167921953165825,
      "loss": 0.0008,
      "step": 27
    },
    {
      "epoch": 0.7552447552447552,
      "eval_loss": 0.0004657781682908535,
      "eval_runtime": 4.6398,
      "eval_samples_per_second": 3.448,
      "eval_steps_per_second": 1.724,
      "step": 27
    },
    {
      "epoch": 0.7832167832167832,
      "grad_norm": 0.0186767578125,
      "learning_rate": 0.00019068743608505455,
      "loss": 0.0008,
      "step": 28
    },
    {
      "epoch": 0.8111888111888111,
      "grad_norm": 0.02392578125,
      "learning_rate": 0.00018964269372957038,
      "loss": 0.0008,
      "step": 29
    },
    {
      "epoch": 0.8391608391608392,
      "grad_norm": 0.134765625,
      "learning_rate": 0.000188545602565321,
      "loss": 0.0046,
      "step": 30
    },
    {
      "epoch": 0.8671328671328671,
      "grad_norm": 0.1416015625,
      "learning_rate": 0.0001873968032626518,
      "loss": 0.0049,
      "step": 31
    },
    {
      "epoch": 0.8951048951048951,
      "grad_norm": 0.1220703125,
      "learning_rate": 0.00018619696668800492,
      "loss": 0.0047,
      "step": 32
    },
    {
      "epoch": 0.9230769230769231,
      "grad_norm": 0.0240478515625,
      "learning_rate": 0.0001849467935121521,
      "loss": 0.0012,
      "step": 33
    },
    {
      "epoch": 0.951048951048951,
      "grad_norm": 0.032958984375,
      "learning_rate": 0.00018364701380102266,
      "loss": 0.0014,
      "step": 34
    },
    {
      "epoch": 0.9790209790209791,
      "grad_norm": 0.0247802734375,
      "learning_rate": 0.00018229838658936564,
      "loss": 0.001,
      "step": 35
    }
  ],
  "logging_steps": 1,
  "max_steps": 140,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 35,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.222073165152256e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}