{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.7692307692307693,
  "eval_steps": 10,
  "global_step": 250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006153846153846154,
      "eval_loss": 1.845949411392212,
      "eval_runtime": 1.9761,
      "eval_samples_per_second": 57.69,
      "eval_steps_per_second": 3.036,
      "step": 2
    },
    {
      "epoch": 0.03076923076923077,
      "grad_norm": 29.46457290649414,
      "learning_rate": 7.692307692307694e-07,
      "loss": 1.944,
      "step": 10
    },
    {
      "epoch": 0.03076923076923077,
      "eval_loss": 1.7965577840805054,
      "eval_runtime": 1.9506,
      "eval_samples_per_second": 58.444,
      "eval_steps_per_second": 3.076,
      "step": 10
    },
    {
      "epoch": 0.06153846153846154,
      "grad_norm": 14.49207878112793,
      "learning_rate": 1.5384615384615387e-06,
      "loss": 1.6999,
      "step": 20
    },
    {
      "epoch": 0.06153846153846154,
      "eval_loss": 1.5032916069030762,
      "eval_runtime": 1.9599,
      "eval_samples_per_second": 58.168,
      "eval_steps_per_second": 3.061,
      "step": 20
    },
    {
      "epoch": 0.09230769230769231,
      "grad_norm": 11.355466842651367,
      "learning_rate": 2.307692307692308e-06,
      "loss": 1.1499,
      "step": 30
    },
    {
      "epoch": 0.09230769230769231,
      "eval_loss": 0.8391174674034119,
      "eval_runtime": 1.9437,
      "eval_samples_per_second": 58.65,
      "eval_steps_per_second": 3.087,
      "step": 30
    },
    {
      "epoch": 0.12307692307692308,
      "grad_norm": 5.648468017578125,
      "learning_rate": 3.0769230769230774e-06,
      "loss": 0.7194,
      "step": 40
    },
    {
      "epoch": 0.12307692307692308,
      "eval_loss": 0.6716201305389404,
      "eval_runtime": 1.9517,
      "eval_samples_per_second": 58.411,
      "eval_steps_per_second": 3.074,
      "step": 40
    },
    {
      "epoch": 0.15384615384615385,
      "grad_norm": 5.752841949462891,
      "learning_rate": 3.846153846153847e-06,
      "loss": 0.6811,
      "step": 50
    },
    {
      "epoch": 0.15384615384615385,
      "eval_loss": 0.6289324164390564,
      "eval_runtime": 1.952,
      "eval_samples_per_second": 58.401,
      "eval_steps_per_second": 3.074,
      "step": 50
    },
    {
      "epoch": 0.18461538461538463,
      "grad_norm": 3.414722204208374,
      "learning_rate": 4.615384615384616e-06,
      "loss": 0.5924,
      "step": 60
    },
    {
      "epoch": 0.18461538461538463,
      "eval_loss": 0.6140013337135315,
      "eval_runtime": 1.94,
      "eval_samples_per_second": 58.762,
      "eval_steps_per_second": 3.093,
      "step": 60
    },
    {
      "epoch": 0.2153846153846154,
      "grad_norm": 5.556036949157715,
      "learning_rate": 5.384615384615385e-06,
      "loss": 0.6135,
      "step": 70
    },
    {
      "epoch": 0.2153846153846154,
      "eval_loss": 0.6054026484489441,
      "eval_runtime": 1.9659,
      "eval_samples_per_second": 57.988,
      "eval_steps_per_second": 3.052,
      "step": 70
    },
    {
      "epoch": 0.24615384615384617,
      "grad_norm": 4.436710834503174,
      "learning_rate": 6.153846153846155e-06,
      "loss": 0.5952,
      "step": 80
    },
    {
      "epoch": 0.24615384615384617,
      "eval_loss": 0.5986860990524292,
      "eval_runtime": 1.9428,
      "eval_samples_per_second": 58.677,
      "eval_steps_per_second": 3.088,
      "step": 80
    },
    {
      "epoch": 0.27692307692307694,
      "grad_norm": 3.496018409729004,
      "learning_rate": 6.923076923076923e-06,
      "loss": 0.5887,
      "step": 90
    },
    {
      "epoch": 0.27692307692307694,
      "eval_loss": 0.594973623752594,
      "eval_runtime": 1.9468,
      "eval_samples_per_second": 58.558,
      "eval_steps_per_second": 3.082,
      "step": 90
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 2.3540539741516113,
      "learning_rate": 7.692307692307694e-06,
      "loss": 0.5634,
      "step": 100
    },
    {
      "epoch": 0.3076923076923077,
      "eval_loss": 0.5954164266586304,
      "eval_runtime": 1.948,
      "eval_samples_per_second": 58.521,
      "eval_steps_per_second": 3.08,
      "step": 100
    },
    {
      "epoch": 0.3384615384615385,
      "grad_norm": 2.9880635738372803,
      "learning_rate": 8.461538461538462e-06,
      "loss": 0.5827,
      "step": 110
    },
    {
      "epoch": 0.3384615384615385,
      "eval_loss": 0.5955133438110352,
      "eval_runtime": 1.9432,
      "eval_samples_per_second": 58.667,
      "eval_steps_per_second": 3.088,
      "step": 110
    },
    {
      "epoch": 0.36923076923076925,
      "grad_norm": 1.9405996799468994,
      "learning_rate": 9.230769230769232e-06,
      "loss": 0.5726,
      "step": 120
    },
    {
      "epoch": 0.36923076923076925,
      "eval_loss": 0.5951128602027893,
      "eval_runtime": 1.9715,
      "eval_samples_per_second": 57.824,
      "eval_steps_per_second": 3.043,
      "step": 120
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.9948021173477173,
      "learning_rate": 1e-05,
      "loss": 0.6488,
      "step": 130
    },
    {
      "epoch": 0.4,
      "eval_loss": 0.5964463949203491,
      "eval_runtime": 2.1032,
      "eval_samples_per_second": 54.203,
      "eval_steps_per_second": 2.853,
      "step": 130
    },
    {
      "epoch": 0.4307692307692308,
      "grad_norm": 1.95350980758667,
      "learning_rate": 1.076923076923077e-05,
      "loss": 0.6218,
      "step": 140
    },
    {
      "epoch": 0.4307692307692308,
      "eval_loss": 0.5996471047401428,
      "eval_runtime": 1.9539,
      "eval_samples_per_second": 58.345,
      "eval_steps_per_second": 3.071,
      "step": 140
    },
    {
      "epoch": 0.46153846153846156,
      "grad_norm": 3.840015411376953,
      "learning_rate": 1.1538461538461538e-05,
      "loss": 0.5585,
      "step": 150
    },
    {
      "epoch": 0.46153846153846156,
      "eval_loss": 0.6000372767448425,
      "eval_runtime": 1.9457,
      "eval_samples_per_second": 58.592,
      "eval_steps_per_second": 3.084,
      "step": 150
    },
    {
      "epoch": 0.49230769230769234,
      "grad_norm": 2.345364809036255,
      "learning_rate": 1.230769230769231e-05,
      "loss": 0.5314,
      "step": 160
    },
    {
      "epoch": 0.49230769230769234,
      "eval_loss": 0.6025042533874512,
      "eval_runtime": 1.9398,
      "eval_samples_per_second": 58.768,
      "eval_steps_per_second": 3.093,
      "step": 160
    },
    {
      "epoch": 0.5230769230769231,
      "grad_norm": 3.162071704864502,
      "learning_rate": 1.3076923076923078e-05,
      "loss": 0.6901,
      "step": 170
    },
    {
      "epoch": 0.5230769230769231,
      "eval_loss": 0.598171055316925,
      "eval_runtime": 1.9419,
      "eval_samples_per_second": 58.705,
      "eval_steps_per_second": 3.09,
      "step": 170
    },
    {
      "epoch": 0.5538461538461539,
      "grad_norm": 3.9432108402252197,
      "learning_rate": 1.3846153846153847e-05,
      "loss": 0.6604,
      "step": 180
    },
    {
      "epoch": 0.5538461538461539,
      "eval_loss": 0.5974885821342468,
      "eval_runtime": 1.9664,
      "eval_samples_per_second": 57.975,
      "eval_steps_per_second": 3.051,
      "step": 180
    },
    {
      "epoch": 0.5846153846153846,
      "grad_norm": 1.904718279838562,
      "learning_rate": 1.4615384615384615e-05,
      "loss": 0.5806,
      "step": 190
    },
    {
      "epoch": 0.5846153846153846,
      "eval_loss": 0.6012160778045654,
      "eval_runtime": 2.082,
      "eval_samples_per_second": 54.755,
      "eval_steps_per_second": 2.882,
      "step": 190
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 2.2823173999786377,
      "learning_rate": 1.5384615384615387e-05,
      "loss": 0.588,
      "step": 200
    },
    {
      "epoch": 0.6153846153846154,
      "eval_loss": 0.6052933931350708,
      "eval_runtime": 1.9613,
      "eval_samples_per_second": 58.124,
      "eval_steps_per_second": 3.059,
      "step": 200
    },
    {
      "epoch": 0.6461538461538462,
      "grad_norm": 1.8020161390304565,
      "learning_rate": 1.6153846153846154e-05,
      "loss": 0.5826,
      "step": 210
    },
    {
      "epoch": 0.6461538461538462,
      "eval_loss": 0.6047356724739075,
      "eval_runtime": 1.9487,
      "eval_samples_per_second": 58.5,
      "eval_steps_per_second": 3.079,
      "step": 210
    },
    {
      "epoch": 0.676923076923077,
      "grad_norm": 1.8297995328903198,
      "learning_rate": 1.6923076923076924e-05,
      "loss": 0.5442,
      "step": 220
    },
    {
      "epoch": 0.676923076923077,
      "eval_loss": 0.6105689406394958,
      "eval_runtime": 2.0567,
      "eval_samples_per_second": 55.429,
      "eval_steps_per_second": 2.917,
      "step": 220
    },
    {
      "epoch": 0.7076923076923077,
      "grad_norm": 1.8141131401062012,
      "learning_rate": 1.7692307692307694e-05,
      "loss": 0.602,
      "step": 230
    },
    {
      "epoch": 0.7076923076923077,
      "eval_loss": 0.6149886250495911,
      "eval_runtime": 2.1951,
      "eval_samples_per_second": 51.933,
      "eval_steps_per_second": 2.733,
      "step": 230
    },
    {
      "epoch": 0.7384615384615385,
      "grad_norm": 2.093683958053589,
      "learning_rate": 1.8461538461538465e-05,
      "loss": 0.6103,
      "step": 240
    },
    {
      "epoch": 0.7384615384615385,
      "eval_loss": 0.621900200843811,
      "eval_runtime": 1.9465,
      "eval_samples_per_second": 58.567,
      "eval_steps_per_second": 3.082,
      "step": 240
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 1.840038776397705,
      "learning_rate": 1.923076923076923e-05,
      "loss": 0.5775,
      "step": 250
    },
    {
      "epoch": 0.7692307692307693,
      "eval_loss": 0.6232128739356995,
      "eval_runtime": 1.9883,
      "eval_samples_per_second": 57.334,
      "eval_steps_per_second": 3.018,
      "step": 250
    }
  ],
  "logging_steps": 10,
  "max_steps": 2600,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 8,
  "save_steps": 50,
  "total_flos": 2.170153002126541e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}