|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.3657457189463597,
  "eval_steps": 500,
  "global_step": 50000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 7.314914378927195e-06,
      "grad_norm": 2702.876220703125,
      "learning_rate": 3.6573769292663306e-10,
      "loss": 185.8854,
      "step": 1
    },
    {
      "epoch": 0.007314914378927195,
      "grad_norm": 25910.705078125,
      "learning_rate": 3.6573769292663306e-07,
      "loss": 48.434,
      "step": 1000
    },
    {
      "epoch": 0.01462982875785439,
      "grad_norm": 3.9086620807647705,
      "learning_rate": 7.314753858532661e-07,
      "loss": 17.9804,
      "step": 2000
    },
    {
      "epoch": 0.021944743136781585,
      "grad_norm": 17.556304931640625,
      "learning_rate": 1.097213078779899e-06,
      "loss": 10.5262,
      "step": 3000
    },
    {
      "epoch": 0.02925965751570878,
      "grad_norm": 4689.71240234375,
      "learning_rate": 1.4629507717065323e-06,
      "loss": 11.6651,
      "step": 4000
    },
    {
      "epoch": 0.03657457189463598,
      "grad_norm": 0.0065016308799386024,
      "learning_rate": 1.8286884646331652e-06,
      "loss": 9.4746,
      "step": 5000
    },
    {
      "epoch": 0.04388948627356317,
      "grad_norm": 0.060599055141210556,
      "learning_rate": 2.194426157559798e-06,
      "loss": 7.1303,
      "step": 6000
    },
    {
      "epoch": 0.051204400652490364,
      "grad_norm": 1028.9453125,
      "learning_rate": 2.560163850486431e-06,
      "loss": 8.917,
      "step": 7000
    },
    {
      "epoch": 0.05851931503141756,
      "grad_norm": 5.463726043701172,
      "learning_rate": 2.9259015434130645e-06,
      "loss": 8.9339,
      "step": 8000
    },
    {
      "epoch": 0.06583422941034475,
      "grad_norm": 0.3307730257511139,
      "learning_rate": 3.2916392363396975e-06,
      "loss": 8.746,
      "step": 9000
    },
    {
      "epoch": 0.07314914378927195,
      "grad_norm": 0.003347629914060235,
      "learning_rate": 3.6573769292663304e-06,
      "loss": 9.4711,
      "step": 10000
    },
    {
      "epoch": 0.08046405816819914,
      "grad_norm": 0.006155295763164759,
      "learning_rate": 4.023114622192964e-06,
      "loss": 7.647,
      "step": 11000
    },
    {
      "epoch": 0.08777897254712634,
      "grad_norm": 0.0019895241130143404,
      "learning_rate": 4.388852315119596e-06,
      "loss": 5.7327,
      "step": 12000
    },
    {
      "epoch": 0.09509388692605353,
      "grad_norm": 0.004527593031525612,
      "learning_rate": 4.75459000804623e-06,
      "loss": 5.5751,
      "step": 13000
    },
    {
      "epoch": 0.10240880130498073,
      "grad_norm": 0.009128883481025696,
      "learning_rate": 4.986629929451543e-06,
      "loss": 8.6148,
      "step": 14000
    },
    {
      "epoch": 0.10972371568390792,
      "grad_norm": 0.012683026492595673,
      "learning_rate": 4.9459914171462015e-06,
      "loss": 8.5558,
      "step": 15000
    },
    {
      "epoch": 0.11703863006283512,
      "grad_norm": 0.0028167981654405594,
      "learning_rate": 4.90535290484086e-06,
      "loss": 6.0433,
      "step": 16000
    },
    {
      "epoch": 0.1243535444417623,
      "grad_norm": 0.0011900264071300626,
      "learning_rate": 4.864714392535519e-06,
      "loss": 6.9084,
      "step": 17000
    },
    {
      "epoch": 0.1316684588206895,
      "grad_norm": 0.025524910539388657,
      "learning_rate": 4.824075880230177e-06,
      "loss": 6.7333,
      "step": 18000
    },
    {
      "epoch": 0.1389833731996167,
      "grad_norm": 0.027619725093245506,
      "learning_rate": 4.783437367924835e-06,
      "loss": 4.1436,
      "step": 19000
    },
    {
      "epoch": 0.1462982875785439,
      "grad_norm": 0.012698939070105553,
      "learning_rate": 4.742798855619494e-06,
      "loss": 5.397,
      "step": 20000
    },
    {
      "epoch": 0.15361320195747108,
      "grad_norm": 0.031142177060246468,
      "learning_rate": 4.702160343314152e-06,
      "loss": 5.5156,
      "step": 21000
    },
    {
      "epoch": 0.16092811633639828,
      "grad_norm": 0.0004189134924672544,
      "learning_rate": 4.66152183100881e-06,
      "loss": 4.8633,
      "step": 22000
    },
    {
      "epoch": 0.16824303071532548,
      "grad_norm": 0.0103899035602808,
      "learning_rate": 4.620883318703469e-06,
      "loss": 7.2146,
      "step": 23000
    },
    {
      "epoch": 0.17555794509425268,
      "grad_norm": 1158.8017578125,
      "learning_rate": 4.580244806398127e-06,
      "loss": 5.7667,
      "step": 24000
    },
    {
      "epoch": 0.18287285947317986,
      "grad_norm": 0.03399639576673508,
      "learning_rate": 4.5396062940927856e-06,
      "loss": 5.0472,
      "step": 25000
    },
    {
      "epoch": 0.19018777385210706,
      "grad_norm": 0.017644532024860382,
      "learning_rate": 4.4989677817874446e-06,
      "loss": 4.8188,
      "step": 26000
    },
    {
      "epoch": 0.19750268823103426,
      "grad_norm": 0.00010079160711029544,
      "learning_rate": 4.4583292694821035e-06,
      "loss": 5.7598,
      "step": 27000
    },
    {
      "epoch": 0.20481760260996146,
      "grad_norm": 0.009618501178920269,
      "learning_rate": 4.417690757176762e-06,
      "loss": 4.683,
      "step": 28000
    },
    {
      "epoch": 0.21213251698888866,
      "grad_norm": 0.018396975472569466,
      "learning_rate": 4.377052244871421e-06,
      "loss": 5.7816,
      "step": 29000
    },
    {
      "epoch": 0.21944743136781583,
      "grad_norm": 0.026549218222498894,
      "learning_rate": 4.336413732566079e-06,
      "loss": 5.5149,
      "step": 30000
    },
    {
      "epoch": 0.22676234574674303,
      "grad_norm": 0.01402178592979908,
      "learning_rate": 4.295775220260737e-06,
      "loss": 6.1021,
      "step": 31000
    },
    {
      "epoch": 0.23407726012567023,
      "grad_norm": 2950.190185546875,
      "learning_rate": 4.255136707955396e-06,
      "loss": 5.1742,
      "step": 32000
    },
    {
      "epoch": 0.24139217450459743,
      "grad_norm": 0.01243713591247797,
      "learning_rate": 4.214498195650054e-06,
      "loss": 4.8856,
      "step": 33000
    },
    {
      "epoch": 0.2487070888835246,
      "grad_norm": 0.00121857482008636,
      "learning_rate": 4.173859683344712e-06,
      "loss": 4.0296,
      "step": 34000
    },
    {
      "epoch": 0.25602200326245184,
      "grad_norm": 0.021528728306293488,
      "learning_rate": 4.133221171039371e-06,
      "loss": 3.7989,
      "step": 35000
    },
    {
      "epoch": 0.263336917641379,
      "grad_norm": 0.067794568836689,
      "learning_rate": 4.092582658734029e-06,
      "loss": 4.8373,
      "step": 36000
    },
    {
      "epoch": 0.2706518320203062,
      "grad_norm": 83.66200256347656,
      "learning_rate": 4.0519441464286876e-06,
      "loss": 3.2441,
      "step": 37000
    },
    {
      "epoch": 0.2779667463992334,
      "grad_norm": 0.00217541866004467,
      "learning_rate": 4.0113056341233466e-06,
      "loss": 3.5578,
      "step": 38000
    },
    {
      "epoch": 0.2852816607781606,
      "grad_norm": 0.0008728219545446336,
      "learning_rate": 3.970667121818005e-06,
      "loss": 2.6644,
      "step": 39000
    },
    {
      "epoch": 0.2925965751570878,
      "grad_norm": 0.027021408081054688,
      "learning_rate": 3.930028609512664e-06,
      "loss": 3.7778,
      "step": 40000
    },
    {
      "epoch": 0.299911489536015,
      "grad_norm": 0.001312136766500771,
      "learning_rate": 3.889390097207322e-06,
      "loss": 4.2509,
      "step": 41000
    },
    {
      "epoch": 0.30722640391494216,
      "grad_norm": 0.011145360767841339,
      "learning_rate": 3.84875158490198e-06,
      "loss": 4.6084,
      "step": 42000
    },
    {
      "epoch": 0.3145413182938694,
      "grad_norm": 289.0566711425781,
      "learning_rate": 3.8081130725966386e-06,
      "loss": 3.3176,
      "step": 43000
    },
    {
      "epoch": 0.32185623267279656,
      "grad_norm": 0.006871068850159645,
      "learning_rate": 3.767474560291297e-06,
      "loss": 3.0108,
      "step": 44000
    },
    {
      "epoch": 0.32917114705172373,
      "grad_norm": 0.002771923318505287,
      "learning_rate": 3.7268360479859557e-06,
      "loss": 3.5366,
      "step": 45000
    },
    {
      "epoch": 0.33648606143065096,
      "grad_norm": 0.0030192858539521694,
      "learning_rate": 3.686197535680614e-06,
      "loss": 3.3997,
      "step": 46000
    },
    {
      "epoch": 0.34380097580957814,
      "grad_norm": 3926.616943359375,
      "learning_rate": 3.6455590233752724e-06,
      "loss": 3.5042,
      "step": 47000
    },
    {
      "epoch": 0.35111589018850536,
      "grad_norm": 0.000544128124602139,
      "learning_rate": 3.604920511069931e-06,
      "loss": 2.9953,
      "step": 48000
    },
    {
      "epoch": 0.35843080456743254,
      "grad_norm": 14144.9677734375,
      "learning_rate": 3.564281998764589e-06,
      "loss": 2.9742,
      "step": 49000
    },
    {
      "epoch": 0.3657457189463597,
      "grad_norm": 0.0021343908738344908,
      "learning_rate": 3.5236434864592477e-06,
      "loss": 3.4145,
      "step": 50000
    }
  ],
  "logging_steps": 1000,
  "max_steps": 136707,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50000,
  "total_flos": 0.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
|
|