ardaspear's picture
Training in progress, step 100, checkpoint
bbade1a verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.021847178983013818,
"eval_steps": 9,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00021847178983013817,
"eval_loss": 10.377206802368164,
"eval_runtime": 33.2267,
"eval_samples_per_second": 232.042,
"eval_steps_per_second": 29.013,
"step": 1
},
{
"epoch": 0.0006554153694904145,
"grad_norm": 0.27843907475471497,
"learning_rate": 3e-05,
"loss": 10.3759,
"step": 3
},
{
"epoch": 0.001310830738980829,
"grad_norm": 0.2833423614501953,
"learning_rate": 6e-05,
"loss": 10.3729,
"step": 6
},
{
"epoch": 0.0019662461084712438,
"grad_norm": 0.25739535689353943,
"learning_rate": 9e-05,
"loss": 10.3794,
"step": 9
},
{
"epoch": 0.0019662461084712438,
"eval_loss": 10.371369361877441,
"eval_runtime": 33.0873,
"eval_samples_per_second": 233.02,
"eval_steps_per_second": 29.135,
"step": 9
},
{
"epoch": 0.002621661477961658,
"grad_norm": 0.31889083981513977,
"learning_rate": 9.987820251299122e-05,
"loss": 10.3702,
"step": 12
},
{
"epoch": 0.003277076847452073,
"grad_norm": 0.4143044650554657,
"learning_rate": 9.924038765061042e-05,
"loss": 10.372,
"step": 15
},
{
"epoch": 0.0039324922169424875,
"grad_norm": 0.4828105866909027,
"learning_rate": 9.806308479691595e-05,
"loss": 10.3554,
"step": 18
},
{
"epoch": 0.0039324922169424875,
"eval_loss": 10.353870391845703,
"eval_runtime": 33.1999,
"eval_samples_per_second": 232.229,
"eval_steps_per_second": 29.036,
"step": 18
},
{
"epoch": 0.004587907586432902,
"grad_norm": 0.5287975668907166,
"learning_rate": 9.635919272833938e-05,
"loss": 10.3574,
"step": 21
},
{
"epoch": 0.005243322955923316,
"grad_norm": 0.6890485286712646,
"learning_rate": 9.414737964294636e-05,
"loss": 10.3417,
"step": 24
},
{
"epoch": 0.005898738325413731,
"grad_norm": 0.6008455753326416,
"learning_rate": 9.145187862775209e-05,
"loss": 10.3313,
"step": 27
},
{
"epoch": 0.005898738325413731,
"eval_loss": 10.32519817352295,
"eval_runtime": 33.0002,
"eval_samples_per_second": 233.635,
"eval_steps_per_second": 29.212,
"step": 27
},
{
"epoch": 0.006554153694904146,
"grad_norm": 0.6580014228820801,
"learning_rate": 8.83022221559489e-05,
"loss": 10.3249,
"step": 30
},
{
"epoch": 0.00720956906439456,
"grad_norm": 0.6910955309867859,
"learning_rate": 8.473291852294987e-05,
"loss": 10.3104,
"step": 33
},
{
"epoch": 0.007864984433884975,
"grad_norm": 0.5430018305778503,
"learning_rate": 8.07830737662829e-05,
"loss": 10.3037,
"step": 36
},
{
"epoch": 0.007864984433884975,
"eval_loss": 10.294708251953125,
"eval_runtime": 33.2434,
"eval_samples_per_second": 231.926,
"eval_steps_per_second": 28.998,
"step": 36
},
{
"epoch": 0.008520399803375388,
"grad_norm": 0.5753027200698853,
"learning_rate": 7.649596321166024e-05,
"loss": 10.2936,
"step": 39
},
{
"epoch": 0.009175815172865804,
"grad_norm": 0.5511729121208191,
"learning_rate": 7.191855733945387e-05,
"loss": 10.2914,
"step": 42
},
{
"epoch": 0.009831230542356219,
"grad_norm": 0.3970569670200348,
"learning_rate": 6.710100716628344e-05,
"loss": 10.2756,
"step": 45
},
{
"epoch": 0.009831230542356219,
"eval_loss": 10.277950286865234,
"eval_runtime": 33.0373,
"eval_samples_per_second": 233.372,
"eval_steps_per_second": 29.179,
"step": 45
},
{
"epoch": 0.010486645911846632,
"grad_norm": 0.40028998255729675,
"learning_rate": 6.209609477998338e-05,
"loss": 10.2778,
"step": 48
},
{
"epoch": 0.011142061281337047,
"grad_norm": 0.3963126838207245,
"learning_rate": 5.695865504800327e-05,
"loss": 10.2763,
"step": 51
},
{
"epoch": 0.011797476650827463,
"grad_norm": 0.32093650102615356,
"learning_rate": 5.174497483512506e-05,
"loss": 10.2762,
"step": 54
},
{
"epoch": 0.011797476650827463,
"eval_loss": 10.270526885986328,
"eval_runtime": 33.099,
"eval_samples_per_second": 232.937,
"eval_steps_per_second": 29.125,
"step": 54
},
{
"epoch": 0.012452892020317876,
"grad_norm": 0.3351193070411682,
"learning_rate": 4.6512176312793736e-05,
"loss": 10.2752,
"step": 57
},
{
"epoch": 0.013108307389808291,
"grad_norm": 0.2998097240924835,
"learning_rate": 4.131759111665349e-05,
"loss": 10.2659,
"step": 60
},
{
"epoch": 0.013763722759298706,
"grad_norm": 0.2995798885822296,
"learning_rate": 3.6218132209150045e-05,
"loss": 10.2697,
"step": 63
},
{
"epoch": 0.013763722759298706,
"eval_loss": 10.267303466796875,
"eval_runtime": 33.1882,
"eval_samples_per_second": 232.311,
"eval_steps_per_second": 29.046,
"step": 63
},
{
"epoch": 0.01441913812878912,
"grad_norm": 0.2436102032661438,
"learning_rate": 3.12696703292044e-05,
"loss": 10.266,
"step": 66
},
{
"epoch": 0.015074553498279535,
"grad_norm": 0.2320987582206726,
"learning_rate": 2.6526421860705473e-05,
"loss": 10.2794,
"step": 69
},
{
"epoch": 0.01572996886776995,
"grad_norm": 0.21924351155757904,
"learning_rate": 2.2040354826462668e-05,
"loss": 10.2635,
"step": 72
},
{
"epoch": 0.01572996886776995,
"eval_loss": 10.265777587890625,
"eval_runtime": 33.0027,
"eval_samples_per_second": 233.618,
"eval_steps_per_second": 29.21,
"step": 72
},
{
"epoch": 0.016385384237260363,
"grad_norm": 0.17793336510658264,
"learning_rate": 1.7860619515673033e-05,
"loss": 10.2665,
"step": 75
},
{
"epoch": 0.017040799606750777,
"grad_norm": 0.2533828020095825,
"learning_rate": 1.4033009983067452e-05,
"loss": 10.2756,
"step": 78
},
{
"epoch": 0.017696214976241194,
"grad_norm": 0.2704446017742157,
"learning_rate": 1.0599462319663905e-05,
"loss": 10.2756,
"step": 81
},
{
"epoch": 0.017696214976241194,
"eval_loss": 10.265067100524902,
"eval_runtime": 33.0919,
"eval_samples_per_second": 232.988,
"eval_steps_per_second": 29.131,
"step": 81
},
{
"epoch": 0.018351630345731607,
"grad_norm": 0.31666257977485657,
"learning_rate": 7.597595192178702e-06,
"loss": 10.2732,
"step": 84
},
{
"epoch": 0.01900704571522202,
"grad_norm": 0.20740436017513275,
"learning_rate": 5.060297685041659e-06,
"loss": 10.2702,
"step": 87
},
{
"epoch": 0.019662461084712438,
"grad_norm": 0.15666821599006653,
"learning_rate": 3.0153689607045845e-06,
"loss": 10.2646,
"step": 90
},
{
"epoch": 0.019662461084712438,
"eval_loss": 10.264784812927246,
"eval_runtime": 33.0395,
"eval_samples_per_second": 233.357,
"eval_steps_per_second": 29.177,
"step": 90
},
{
"epoch": 0.02031787645420285,
"grad_norm": 0.146971195936203,
"learning_rate": 1.4852136862001764e-06,
"loss": 10.2779,
"step": 93
},
{
"epoch": 0.020973291823693264,
"grad_norm": 0.2508053779602051,
"learning_rate": 4.865965629214819e-07,
"loss": 10.2598,
"step": 96
},
{
"epoch": 0.02162870719318368,
"grad_norm": 0.16878999769687653,
"learning_rate": 3.04586490452119e-08,
"loss": 10.2826,
"step": 99
},
{
"epoch": 0.02162870719318368,
"eval_loss": 10.264737129211426,
"eval_runtime": 33.0022,
"eval_samples_per_second": 233.62,
"eval_steps_per_second": 29.21,
"step": 99
}
],
"logging_steps": 3,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 9,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 22310132121600.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}