{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.987551867219917,
"eval_steps": 500,
"global_step": 360,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.08298755186721991,
"grad_norm": 0.4138890994781238,
"learning_rate": 5e-06,
"loss": 0.6819,
"step": 10
},
{
"epoch": 0.16597510373443983,
"grad_norm": 0.29279799441362253,
"learning_rate": 5e-06,
"loss": 0.6152,
"step": 20
},
{
"epoch": 0.24896265560165975,
"grad_norm": 0.22155490022279595,
"learning_rate": 5e-06,
"loss": 0.5911,
"step": 30
},
{
"epoch": 0.33195020746887965,
"grad_norm": 0.21703792973144043,
"learning_rate": 5e-06,
"loss": 0.5758,
"step": 40
},
{
"epoch": 0.4149377593360996,
"grad_norm": 0.19782125949582666,
"learning_rate": 5e-06,
"loss": 0.5658,
"step": 50
},
{
"epoch": 0.4979253112033195,
"grad_norm": 0.20429620764864578,
"learning_rate": 5e-06,
"loss": 0.5621,
"step": 60
},
{
"epoch": 0.5809128630705395,
"grad_norm": 0.1947179711144,
"learning_rate": 5e-06,
"loss": 0.5481,
"step": 70
},
{
"epoch": 0.6639004149377593,
"grad_norm": 0.20952965041956714,
"learning_rate": 5e-06,
"loss": 0.5491,
"step": 80
},
{
"epoch": 0.7468879668049793,
"grad_norm": 0.2092027679734135,
"learning_rate": 5e-06,
"loss": 0.5473,
"step": 90
},
{
"epoch": 0.8298755186721992,
"grad_norm": 0.1973704614234666,
"learning_rate": 5e-06,
"loss": 0.5412,
"step": 100
},
{
"epoch": 0.9128630705394191,
"grad_norm": 0.22215520376065145,
"learning_rate": 5e-06,
"loss": 0.5361,
"step": 110
},
{
"epoch": 0.995850622406639,
"grad_norm": 0.20002555613598916,
"learning_rate": 5e-06,
"loss": 0.5354,
"step": 120
},
{
"epoch": 0.995850622406639,
"eval_loss": 0.5283368229866028,
"eval_runtime": 121.5237,
"eval_samples_per_second": 26.703,
"eval_steps_per_second": 0.42,
"step": 120
},
{
"epoch": 1.0788381742738589,
"grad_norm": 0.23151869944663353,
"learning_rate": 5e-06,
"loss": 0.5334,
"step": 130
},
{
"epoch": 1.161825726141079,
"grad_norm": 0.20416069004838694,
"learning_rate": 5e-06,
"loss": 0.5122,
"step": 140
},
{
"epoch": 1.2448132780082988,
"grad_norm": 0.21256654137396935,
"learning_rate": 5e-06,
"loss": 0.509,
"step": 150
},
{
"epoch": 1.3278008298755186,
"grad_norm": 0.21018667523519946,
"learning_rate": 5e-06,
"loss": 0.5041,
"step": 160
},
{
"epoch": 1.4107883817427385,
"grad_norm": 0.219240042940767,
"learning_rate": 5e-06,
"loss": 0.4998,
"step": 170
},
{
"epoch": 1.4937759336099585,
"grad_norm": 0.22681455392212077,
"learning_rate": 5e-06,
"loss": 0.5037,
"step": 180
},
{
"epoch": 1.5767634854771784,
"grad_norm": 0.227133839723048,
"learning_rate": 5e-06,
"loss": 0.4977,
"step": 190
},
{
"epoch": 1.6597510373443982,
"grad_norm": 0.21040711904959797,
"learning_rate": 5e-06,
"loss": 0.4941,
"step": 200
},
{
"epoch": 1.7427385892116183,
"grad_norm": 0.23482785666403702,
"learning_rate": 5e-06,
"loss": 0.4945,
"step": 210
},
{
"epoch": 1.8257261410788381,
"grad_norm": 0.2035179907011211,
"learning_rate": 5e-06,
"loss": 0.4904,
"step": 220
},
{
"epoch": 1.908713692946058,
"grad_norm": 0.21720290177963564,
"learning_rate": 5e-06,
"loss": 0.491,
"step": 230
},
{
"epoch": 1.991701244813278,
"grad_norm": 0.2214820393037949,
"learning_rate": 5e-06,
"loss": 0.4901,
"step": 240
},
{
"epoch": 2.0,
"eval_loss": 0.5045989155769348,
"eval_runtime": 122.3281,
"eval_samples_per_second": 26.527,
"eval_steps_per_second": 0.417,
"step": 241
},
{
"epoch": 2.074688796680498,
"grad_norm": 0.25601226331965665,
"learning_rate": 5e-06,
"loss": 0.4945,
"step": 250
},
{
"epoch": 2.1576763485477177,
"grad_norm": 0.2537099080076595,
"learning_rate": 5e-06,
"loss": 0.4617,
"step": 260
},
{
"epoch": 2.240663900414938,
"grad_norm": 0.2445352596834903,
"learning_rate": 5e-06,
"loss": 0.4648,
"step": 270
},
{
"epoch": 2.323651452282158,
"grad_norm": 0.24195048816699535,
"learning_rate": 5e-06,
"loss": 0.4688,
"step": 280
},
{
"epoch": 2.4066390041493775,
"grad_norm": 0.3297443855710949,
"learning_rate": 5e-06,
"loss": 0.46,
"step": 290
},
{
"epoch": 2.4896265560165975,
"grad_norm": 0.2227067008121754,
"learning_rate": 5e-06,
"loss": 0.4679,
"step": 300
},
{
"epoch": 2.572614107883817,
"grad_norm": 0.24268677689146825,
"learning_rate": 5e-06,
"loss": 0.4642,
"step": 310
},
{
"epoch": 2.6556016597510372,
"grad_norm": 0.24131530500929413,
"learning_rate": 5e-06,
"loss": 0.4597,
"step": 320
},
{
"epoch": 2.7385892116182573,
"grad_norm": 0.22997089130920098,
"learning_rate": 5e-06,
"loss": 0.4617,
"step": 330
},
{
"epoch": 2.821576763485477,
"grad_norm": 0.23994756278793414,
"learning_rate": 5e-06,
"loss": 0.4597,
"step": 340
},
{
"epoch": 2.904564315352697,
"grad_norm": 0.23257285232469585,
"learning_rate": 5e-06,
"loss": 0.4545,
"step": 350
},
{
"epoch": 2.987551867219917,
"grad_norm": 0.22525776234601527,
"learning_rate": 5e-06,
"loss": 0.4618,
"step": 360
},
{
"epoch": 2.987551867219917,
"eval_loss": 0.49428611993789673,
"eval_runtime": 121.9138,
"eval_samples_per_second": 26.617,
"eval_steps_per_second": 0.418,
"step": 360
},
{
"epoch": 2.987551867219917,
"step": 360,
"total_flos": 602804028702720.0,
"train_loss": 0.5124640332327949,
"train_runtime": 20041.158,
"train_samples_per_second": 9.227,
"train_steps_per_second": 0.018
}
],
"logging_steps": 10,
"max_steps": 360,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 602804028702720.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}