|
{ |
|
"best_metric": 0.8423399925231934, |
|
"best_model_checkpoint": "./output/training_results/C018_random_sample_llama3-8b-base_instruct_20240504_182259/checkpoint-20", |
|
"epoch": 4.0, |
|
"eval_steps": 20, |
|
"global_step": 192, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.020833333333333332, |
|
"grad_norm": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.9378, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.10416666666666667, |
|
"grad_norm": 12.797572312512326, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.914, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.20833333333333334, |
|
"grad_norm": 6.055131806849878, |
|
"learning_rate": 3.75e-06, |
|
"loss": 0.9191, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 5.364360008317059, |
|
"learning_rate": 7.5e-06, |
|
"loss": 0.8384, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 5.029346041755506, |
|
"learning_rate": 1.125e-05, |
|
"loss": 0.8108, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"eval_loss": 0.8423399925231934, |
|
"eval_runtime": 1.9929, |
|
"eval_samples_per_second": 170.602, |
|
"eval_steps_per_second": 1.505, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.5208333333333334, |
|
"grad_norm": 4.414919525176414, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.8145, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 4.112077512908992, |
|
"learning_rate": 1.0857107196807194e-05, |
|
"loss": 0.8903, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.7291666666666666, |
|
"grad_norm": 3.941277162112511, |
|
"learning_rate": 7.785589881369409e-06, |
|
"loss": 0.8187, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 4.254470659708268, |
|
"learning_rate": 5.529292099652595e-06, |
|
"loss": 0.7995, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"eval_loss": 0.8555145263671875, |
|
"eval_runtime": 1.9676, |
|
"eval_samples_per_second": 172.799, |
|
"eval_steps_per_second": 1.525, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 3.9948611107854326, |
|
"learning_rate": 3.888024511896068e-06, |
|
"loss": 0.882, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.0416666666666667, |
|
"grad_norm": 3.9168522584614895, |
|
"learning_rate": 2.706555900111454e-06, |
|
"loss": 0.736, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.1458333333333333, |
|
"grad_norm": 2.943062356850262, |
|
"learning_rate": 1.865515934042282e-06, |
|
"loss": 0.4778, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 4.767880244063883, |
|
"learning_rate": 1.2739241815556468e-06, |
|
"loss": 0.4526, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 0.8815763592720032, |
|
"eval_runtime": 1.9656, |
|
"eval_samples_per_second": 172.976, |
|
"eval_steps_per_second": 1.526, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.3541666666666667, |
|
"grad_norm": 3.995356444110687, |
|
"learning_rate": 8.630954296648578e-07, |
|
"loss": 0.4192, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.4583333333333333, |
|
"grad_norm": 4.06684246492274, |
|
"learning_rate": 5.817031181133133e-07, |
|
"loss": 0.4476, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 4.659083580115326, |
|
"learning_rate": 3.918112984729563e-07, |
|
"loss": 0.445, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 3.841871360565151, |
|
"learning_rate": 2.6571123033559406e-07, |
|
"loss": 0.4663, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"eval_loss": 0.8521442413330078, |
|
"eval_runtime": 1.9598, |
|
"eval_samples_per_second": 173.485, |
|
"eval_steps_per_second": 1.531, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.7708333333333335, |
|
"grad_norm": 3.5797328107351154, |
|
"learning_rate": 1.8342171723792628e-07, |
|
"loss": 0.4356, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 3.4219289180238364, |
|
"learning_rate": 1.3073276582043678e-07, |
|
"loss": 0.4398, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.9791666666666665, |
|
"grad_norm": 3.7181063684193703, |
|
"learning_rate": 9.769031226024856e-08, |
|
"loss": 0.4823, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.0833333333333335, |
|
"grad_norm": 3.4287058824726855, |
|
"learning_rate": 7.74357826465857e-08, |
|
"loss": 0.3927, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.0833333333333335, |
|
"eval_loss": 0.8507040143013, |
|
"eval_runtime": 1.96, |
|
"eval_samples_per_second": 173.471, |
|
"eval_steps_per_second": 1.531, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"grad_norm": 3.4015335314434845, |
|
"learning_rate": 6.532831361687478e-08, |
|
"loss": 0.3733, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 2.2916666666666665, |
|
"grad_norm": 3.3103683305418437, |
|
"learning_rate": 5.828972369827512e-08, |
|
"loss": 0.4061, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.3958333333333335, |
|
"grad_norm": 3.482262897795111, |
|
"learning_rate": 5.4322954384342975e-08, |
|
"loss": 0.3621, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 4.326841270087755, |
|
"learning_rate": 5.2163845524645534e-08, |
|
"loss": 0.4017, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 0.8561407923698425, |
|
"eval_runtime": 1.9661, |
|
"eval_samples_per_second": 172.93, |
|
"eval_steps_per_second": 1.526, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.6041666666666665, |
|
"grad_norm": 3.5184015061674216, |
|
"learning_rate": 5.1033917145757624e-08, |
|
"loss": 0.3944, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 2.7083333333333335, |
|
"grad_norm": 3.67879442141003, |
|
"learning_rate": 5.046843690876512e-08, |
|
"loss": 0.3947, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 3.475723886508797, |
|
"learning_rate": 5.019958911899713e-08, |
|
"loss": 0.381, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.9166666666666665, |
|
"grad_norm": 3.2733636154827503, |
|
"learning_rate": 5.0079150140309806e-08, |
|
"loss": 0.368, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.9166666666666665, |
|
"eval_loss": 0.8607857823371887, |
|
"eval_runtime": 1.9683, |
|
"eval_samples_per_second": 172.741, |
|
"eval_steps_per_second": 1.524, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 3.0208333333333335, |
|
"grad_norm": 3.1617566806185375, |
|
"learning_rate": 5.0028831355203246e-08, |
|
"loss": 0.3777, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 3.228899698066603, |
|
"learning_rate": 5.00094821039914e-08, |
|
"loss": 0.3817, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.2291666666666665, |
|
"grad_norm": 3.322344354923687, |
|
"learning_rate": 5.000275150604354e-08, |
|
"loss": 0.3773, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 4.645153711557618, |
|
"learning_rate": 5.000068241292119e-08, |
|
"loss": 0.3677, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"eval_loss": 0.8646895885467529, |
|
"eval_runtime": 1.964, |
|
"eval_samples_per_second": 173.116, |
|
"eval_steps_per_second": 1.527, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"grad_norm": 3.243053987237776, |
|
"learning_rate": 5.000013819045227e-08, |
|
"loss": 0.3656, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 3.5416666666666665, |
|
"grad_norm": 3.582352877774696, |
|
"learning_rate": 5.000002132208559e-08, |
|
"loss": 0.3503, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 3.6458333333333335, |
|
"grad_norm": 3.2701278989275857, |
|
"learning_rate": 5.00000022411853e-08, |
|
"loss": 0.3707, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 3.338127751209054, |
|
"learning_rate": 5.000000013145176e-08, |
|
"loss": 0.3635, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"eval_loss": 0.8675721287727356, |
|
"eval_runtime": 1.9612, |
|
"eval_samples_per_second": 173.361, |
|
"eval_steps_per_second": 1.53, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.8541666666666665, |
|
"grad_norm": 3.6011084207595765, |
|
"learning_rate": 5.000000000284985e-08, |
|
"loss": 0.3833, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 3.9583333333333335, |
|
"grad_norm": 3.0931269944491455, |
|
"learning_rate": 5.000000000000758e-08, |
|
"loss": 0.3722, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 192, |
|
"total_flos": 5305238814720.0, |
|
"train_loss": 0.5165732034171621, |
|
"train_runtime": 1007.2477, |
|
"train_samples_per_second": 12.128, |
|
"train_steps_per_second": 0.191 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 192, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 20, |
|
"total_flos": 5305238814720.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|