|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.4253490870032223, |
|
"eval_steps": 50, |
|
"global_step": 99, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004296455424274973, |
|
"grad_norm": NaN, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004296455424274973, |
|
"eval_loss": NaN, |
|
"eval_runtime": 35.4972, |
|
"eval_samples_per_second": 11.043, |
|
"eval_steps_per_second": 1.38, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.008592910848549946, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.01288936627282492, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00015, |
|
"loss": 0.0, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.017185821697099892, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.021482277121374866, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00025, |
|
"loss": 0.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.02577873254564984, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003, |
|
"loss": 0.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.03007518796992481, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00035, |
|
"loss": 0.0, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.034371643394199784, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0004, |
|
"loss": 0.0, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.03866809881847476, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00045000000000000004, |
|
"loss": 0.0, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.04296455424274973, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0005, |
|
"loss": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.047261009667024706, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0004998442655654946, |
|
"loss": 0.0, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.05155746509129968, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0004993772562876909, |
|
"loss": 0.0, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.055853920515574654, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0004985995540019955, |
|
"loss": 0.0, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.06015037593984962, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0004975121276286136, |
|
"loss": 0.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0644468313641246, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0004961163319653958, |
|
"loss": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.06874328678839957, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0004944139059999286, |
|
"loss": 0.0, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.07303974221267455, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.000492406970742972, |
|
"loss": 0.0, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.07733619763694952, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0004900980265859448, |
|
"loss": 0.0, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.08163265306122448, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0004874899501857477, |
|
"loss": 0.0, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.08592910848549946, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00048458599088080736, |
|
"loss": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09022556390977443, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0004813897666428053, |
|
"loss": 0.0, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.09452201933404941, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00047790525956913543, |
|
"loss": 0.0, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.09881847475832438, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0004741368109217071, |
|
"loss": 0.0, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.10311493018259936, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00047008911571827283, |
|
"loss": 0.0, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.10741138560687433, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00046576721688302105, |
|
"loss": 0.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.11170784103114931, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0004611764989637205, |
|
"loss": 0.0, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.11600429645542427, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0004563226814232444, |
|
"loss": 0.0, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.12030075187969924, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0004512118115138315, |
|
"loss": 0.0, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.12459720730397422, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0004458502567429631, |
|
"loss": 0.0, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.1288936627282492, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00044024469694024196, |
|
"loss": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.13319011815252416, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00043440211593515554, |
|
"loss": 0.0, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.13748657357679914, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0004283297928560951, |
|
"loss": 0.0, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.14178302900107412, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0004220352930614672, |
|
"loss": 0.0, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.1460794844253491, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00041552645871420013, |
|
"loss": 0.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.15037593984962405, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00040881139901138467, |
|
"loss": 0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.15467239527389903, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00040189848008122475, |
|
"loss": 0.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.158968850698174, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00039479631455988334, |
|
"loss": 0.0, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.16326530612244897, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003875137508612103, |
|
"loss": 0.0, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.16756176154672395, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00038005986215272055, |
|
"loss": 0.0, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.17185821697099893, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003724439350515571, |
|
"loss": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1761546723952739, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003646754580545226, |
|
"loss": 0.0, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.18045112781954886, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.000356764109716594, |
|
"loss": 0.0, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.18474758324382384, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00034871974659264783, |
|
"loss": 0.0, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.18904403866809882, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003405523909574206, |
|
"loss": 0.0, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.1933404940923738, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003322722183190025, |
|
"loss": 0.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.19763694951664876, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003238895447414211, |
|
"loss": 0.0, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.20193340494092374, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0003154148139921102, |
|
"loss": 0.0, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.20622986036519872, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00030685858453027663, |
|
"loss": 0.0, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.21052631578947367, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0002982315163523742, |
|
"loss": 0.0, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.21482277121374865, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.000289544357711076, |
|
"loss": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.21482277121374865, |
|
"eval_loss": NaN, |
|
"eval_runtime": 1.13, |
|
"eval_samples_per_second": 346.907, |
|
"eval_steps_per_second": 43.363, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.21911922663802363, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0002808079317242896, |
|
"loss": 0.0, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.22341568206229862, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0002720331228909005, |
|
"loss": 0.0, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.22771213748657357, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00026323086353004075, |
|
"loss": 0.0, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.23200859291084855, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0002544121201607822, |
|
"loss": 0.0, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.23630504833512353, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00024558787983921783, |
|
"loss": 0.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.24060150375939848, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0002367691364699592, |
|
"loss": 0.0, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.24489795918367346, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00022796687710909964, |
|
"loss": 0.0, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.24919441460794844, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00021919206827571036, |
|
"loss": 0.0, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.2534908700322234, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00021045564228892402, |
|
"loss": 0.0, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.2577873254564984, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00020176848364762578, |
|
"loss": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2620837808807734, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019314141546972343, |
|
"loss": 0.0, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.2663802363050483, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00018458518600788986, |
|
"loss": 0.0, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.2706766917293233, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00017611045525857898, |
|
"loss": 0.0, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.2749731471535983, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001677277816809975, |
|
"loss": 0.0, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.27926960257787325, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00015944760904257942, |
|
"loss": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.28356605800214824, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001512802534073522, |
|
"loss": 0.0, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.2878625134264232, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00014323589028340596, |
|
"loss": 0.0, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.2921589688506982, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00013532454194547733, |
|
"loss": 0.0, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.2964554242749731, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00012755606494844294, |
|
"loss": 0.0, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.3007518796992481, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00011994013784727947, |
|
"loss": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.3050483351235231, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00011248624913878966, |
|
"loss": 0.0, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.30934479054779807, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001052036854401166, |
|
"loss": 0.0, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.31364124597207305, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.810151991877531e-05, |
|
"loss": 0.0, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.317937701396348, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.118860098861537e-05, |
|
"loss": 0.0, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.322234156820623, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.44735412857999e-05, |
|
"loss": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.32653061224489793, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.79647069385328e-05, |
|
"loss": 0.0, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.3308270676691729, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.167020714390501e-05, |
|
"loss": 0.0, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.3351235230934479, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.559788406484446e-05, |
|
"loss": 0.0, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.3394199785177229, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.975530305975807e-05, |
|
"loss": 0.0, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.34371643394199786, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.414974325703686e-05, |
|
"loss": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.34801288936627284, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.8788188486168616e-05, |
|
"loss": 0.0, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.3523093447905478, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.367731857675569e-05, |
|
"loss": 0.0, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.35660580021482274, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.882350103627952e-05, |
|
"loss": 0.0, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.3609022556390977, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.423278311697897e-05, |
|
"loss": 0.0, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.3651987110633727, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.9910884281727225e-05, |
|
"loss": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.3694951664876477, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.586318907829291e-05, |
|
"loss": 0.0, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.37379162191192267, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.209474043086457e-05, |
|
"loss": 0.0, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.37808807733619765, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.861023335719475e-05, |
|
"loss": 0.0, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.38238453276047263, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.5414009119192633e-05, |
|
"loss": 0.0, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.3866809881847476, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.25100498142523e-05, |
|
"loss": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.39097744360902253, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.901973414055187e-06, |
|
"loss": 0.0, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.3952738990332975, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.593029257027956e-06, |
|
"loss": 0.0, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.3995703544575725, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.5860940000714015e-06, |
|
"loss": 0.0, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.4038668098818475, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.8836680346041594e-06, |
|
"loss": 0.0, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.40816326530612246, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.487872371386424e-06, |
|
"loss": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.41245972073039744, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.4004459980045125e-06, |
|
"loss": 0.0, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.4167561761546724, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.22743712309054e-07, |
|
"loss": 0.0, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.42105263157894735, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.557344345054501e-07, |
|
"loss": 0.0, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.4253490870032223, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"step": 99 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 99, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 300, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.5109091185655808e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|