|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9993343687597072, |
|
"eval_steps": 500, |
|
"global_step": 1126, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.74387526512146, |
|
"learning_rate": 6.106194690265487e-06, |
|
"loss": 10.8673, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.843221426010132, |
|
"learning_rate": 1.2743362831858408e-05, |
|
"loss": 10.3682, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.8696110248565674, |
|
"learning_rate": 1.9380530973451328e-05, |
|
"loss": 9.237, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.3768067359924316, |
|
"learning_rate": 2.601769911504425e-05, |
|
"loss": 7.662, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.8718193769454956, |
|
"learning_rate": 2.9703849950641657e-05, |
|
"loss": 6.5011, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.3883230686187744, |
|
"learning_rate": 2.899308983218164e-05, |
|
"loss": 5.4884, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.0601733922958374, |
|
"learning_rate": 2.8252714708785782e-05, |
|
"loss": 4.8107, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.9832029938697815, |
|
"learning_rate": 2.7512339585389932e-05, |
|
"loss": 4.384, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.0105878114700317, |
|
"learning_rate": 2.677196446199408e-05, |
|
"loss": 4.1117, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.2003360986709595, |
|
"learning_rate": 2.6031589338598225e-05, |
|
"loss": 3.8207, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.0249788761138916, |
|
"learning_rate": 2.529121421520237e-05, |
|
"loss": 3.6499, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.2118958234786987, |
|
"learning_rate": 2.4550839091806515e-05, |
|
"loss": 3.5689, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.0994243621826172, |
|
"learning_rate": 2.3810463968410662e-05, |
|
"loss": 3.4153, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.3940232992172241, |
|
"learning_rate": 2.307008884501481e-05, |
|
"loss": 3.2788, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.3793669939041138, |
|
"learning_rate": 2.2329713721618955e-05, |
|
"loss": 3.0424, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.6049970388412476, |
|
"learning_rate": 2.15893385982231e-05, |
|
"loss": 3.0105, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.6481398344039917, |
|
"learning_rate": 2.0848963474827245e-05, |
|
"loss": 2.9522, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.9402527809143066, |
|
"learning_rate": 2.010858835143139e-05, |
|
"loss": 2.8484, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 3.9975693225860596, |
|
"learning_rate": 1.9368213228035538e-05, |
|
"loss": 2.7488, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 2.3999898433685303, |
|
"learning_rate": 1.8627838104639688e-05, |
|
"loss": 2.6719, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 2.681518077850342, |
|
"learning_rate": 1.788746298124383e-05, |
|
"loss": 2.5165, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 3.2228667736053467, |
|
"learning_rate": 1.7147087857847977e-05, |
|
"loss": 2.4539, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 2.7527260780334473, |
|
"learning_rate": 1.6406712734452124e-05, |
|
"loss": 2.4722, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 2.6381165981292725, |
|
"learning_rate": 1.5666337611056267e-05, |
|
"loss": 2.331, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 2.7754085063934326, |
|
"learning_rate": 1.4925962487660415e-05, |
|
"loss": 2.2549, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 2.6391360759735107, |
|
"learning_rate": 1.4185587364264562e-05, |
|
"loss": 2.214, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 2.563453197479248, |
|
"learning_rate": 1.3445212240868707e-05, |
|
"loss": 2.262, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 2.6172292232513428, |
|
"learning_rate": 1.2704837117472853e-05, |
|
"loss": 2.1302, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 2.8900041580200195, |
|
"learning_rate": 1.1964461994077e-05, |
|
"loss": 2.0746, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 2.4479780197143555, |
|
"learning_rate": 1.1224086870681145e-05, |
|
"loss": 1.9904, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 2.833683967590332, |
|
"learning_rate": 1.0483711747285293e-05, |
|
"loss": 2.1597, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.8591232299804688, |
|
"learning_rate": 9.743336623889438e-06, |
|
"loss": 2.0252, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.5961415767669678, |
|
"learning_rate": 9.002961500493583e-06, |
|
"loss": 2.0082, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 3.318519353866577, |
|
"learning_rate": 8.26258637709773e-06, |
|
"loss": 1.9316, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.528031826019287, |
|
"learning_rate": 7.522211253701876e-06, |
|
"loss": 2.0105, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.732753038406372, |
|
"learning_rate": 6.7818361303060216e-06, |
|
"loss": 1.9798, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 3.0377025604248047, |
|
"learning_rate": 6.041461006910168e-06, |
|
"loss": 1.928, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.2801241874694824, |
|
"learning_rate": 5.301085883514315e-06, |
|
"loss": 1.9888, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 2.3720247745513916, |
|
"learning_rate": 4.56071076011846e-06, |
|
"loss": 1.8992, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 2.735081195831299, |
|
"learning_rate": 3.820335636722606e-06, |
|
"loss": 1.8668, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.3813605308532715, |
|
"learning_rate": 3.0799605133267522e-06, |
|
"loss": 2.0131, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.1669371128082275, |
|
"learning_rate": 2.3395853899308984e-06, |
|
"loss": 1.9027, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 3.2110283374786377, |
|
"learning_rate": 1.5992102665350445e-06, |
|
"loss": 1.7928, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 2.6622347831726074, |
|
"learning_rate": 8.588351431391905e-07, |
|
"loss": 1.8259, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 3.6261448860168457, |
|
"learning_rate": 1.1846001974333663e-07, |
|
"loss": 1.8333, |
|
"step": 1125 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 1126, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"total_flos": 1.992942617100288e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|