|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 10695, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 34.351593017578125, |
|
"learning_rate": 3.327102803738318e-05, |
|
"loss": 1.2122, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 10.322906494140625, |
|
"learning_rate": 4.816103896103896e-05, |
|
"loss": 0.8692, |
|
"step": 1424 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 23.850841522216797, |
|
"learning_rate": 4.446233766233767e-05, |
|
"loss": 0.747, |
|
"step": 2136 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 26.398853302001953, |
|
"learning_rate": 4.076363636363636e-05, |
|
"loss": 0.6957, |
|
"step": 2848 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 39.551414489746094, |
|
"learning_rate": 3.706493506493507e-05, |
|
"loss": 0.6323, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 20.38348388671875, |
|
"learning_rate": 3.3366233766233766e-05, |
|
"loss": 0.5724, |
|
"step": 4272 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 18.263744354248047, |
|
"learning_rate": 2.9667532467532467e-05, |
|
"loss": 0.5256, |
|
"step": 4984 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 7.960543632507324, |
|
"learning_rate": 2.596883116883117e-05, |
|
"loss": 0.522, |
|
"step": 5696 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 5.682173252105713, |
|
"learning_rate": 2.227012987012987e-05, |
|
"loss": 0.4766, |
|
"step": 6408 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 7.151096820831299, |
|
"learning_rate": 1.8571428571428572e-05, |
|
"loss": 0.466, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 48.665584564208984, |
|
"learning_rate": 1.4872727272727275e-05, |
|
"loss": 0.3953, |
|
"step": 7832 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 3.8058860301971436, |
|
"learning_rate": 1.1174025974025975e-05, |
|
"loss": 0.3673, |
|
"step": 8544 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 17.926410675048828, |
|
"learning_rate": 7.475324675324675e-06, |
|
"loss": 0.3625, |
|
"step": 9256 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 25.310287475585938, |
|
"learning_rate": 3.776623376623377e-06, |
|
"loss": 0.348, |
|
"step": 9968 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 18.61994743347168, |
|
"learning_rate": 7.792207792207792e-08, |
|
"loss": 0.3617, |
|
"step": 10680 |
|
} |
|
], |
|
"logging_steps": 712, |
|
"max_steps": 10695, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"total_flos": 6.702780767795675e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|