|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.6928406466512702, |
|
"eval_steps": 500, |
|
"global_step": 1500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004618937644341801, |
|
"grad_norm": 10.975909233093262, |
|
"learning_rate": 3.4642032332563515e-07, |
|
"loss": 2.4942, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.009237875288683603, |
|
"grad_norm": 9.186948776245117, |
|
"learning_rate": 9.237875288683603e-07, |
|
"loss": 2.7582, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.013856812933025405, |
|
"grad_norm": 10.907584190368652, |
|
"learning_rate": 1.443418013856813e-06, |
|
"loss": 2.6814, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.018475750577367205, |
|
"grad_norm": 9.531168937683105, |
|
"learning_rate": 2.0207852193995383e-06, |
|
"loss": 2.6982, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.023094688221709007, |
|
"grad_norm": 14.727725982666016, |
|
"learning_rate": 2.5981524249422633e-06, |
|
"loss": 2.0218, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02771362586605081, |
|
"grad_norm": 8.314309120178223, |
|
"learning_rate": 3.117782909930716e-06, |
|
"loss": 1.5595, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03233256351039261, |
|
"grad_norm": 4.944284915924072, |
|
"learning_rate": 3.6951501154734412e-06, |
|
"loss": 1.02, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.03695150115473441, |
|
"grad_norm": 10.882843971252441, |
|
"learning_rate": 4.272517321016166e-06, |
|
"loss": 1.0419, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04157043879907621, |
|
"grad_norm": 12.310320854187012, |
|
"learning_rate": 4.849884526558892e-06, |
|
"loss": 1.0801, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.046189376443418015, |
|
"grad_norm": 6.49992036819458, |
|
"learning_rate": 5.427251732101616e-06, |
|
"loss": 0.8444, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.050808314087759814, |
|
"grad_norm": 2.89493465423584, |
|
"learning_rate": 6.004618937644342e-06, |
|
"loss": 0.8884, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.05542725173210162, |
|
"grad_norm": 3.966763734817505, |
|
"learning_rate": 6.581986143187067e-06, |
|
"loss": 0.8672, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.06004618937644342, |
|
"grad_norm": 4.442293167114258, |
|
"learning_rate": 7.159353348729793e-06, |
|
"loss": 0.8037, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06466512702078522, |
|
"grad_norm": 2.506918430328369, |
|
"learning_rate": 7.736720554272519e-06, |
|
"loss": 0.724, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.06928406466512702, |
|
"grad_norm": 5.733686447143555, |
|
"learning_rate": 8.314087759815242e-06, |
|
"loss": 0.7692, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07390300230946882, |
|
"grad_norm": 4.161188125610352, |
|
"learning_rate": 8.891454965357968e-06, |
|
"loss": 0.7413, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.07852193995381063, |
|
"grad_norm": 3.9434962272644043, |
|
"learning_rate": 9.468822170900693e-06, |
|
"loss": 0.7386, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.08314087759815242, |
|
"grad_norm": 2.9100701808929443, |
|
"learning_rate": 1.0046189376443418e-05, |
|
"loss": 0.6942, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.08775981524249422, |
|
"grad_norm": 5.367318153381348, |
|
"learning_rate": 1.0623556581986144e-05, |
|
"loss": 0.8011, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.09237875288683603, |
|
"grad_norm": 3.1690614223480225, |
|
"learning_rate": 1.1200923787528869e-05, |
|
"loss": 0.6816, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09699769053117784, |
|
"grad_norm": 4.35976505279541, |
|
"learning_rate": 1.1778290993071595e-05, |
|
"loss": 0.7408, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.10161662817551963, |
|
"grad_norm": 3.330937623977661, |
|
"learning_rate": 1.235565819861432e-05, |
|
"loss": 0.7159, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.10623556581986143, |
|
"grad_norm": 5.761129379272461, |
|
"learning_rate": 1.2933025404157046e-05, |
|
"loss": 0.6838, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.11085450346420324, |
|
"grad_norm": 7.05668830871582, |
|
"learning_rate": 1.351039260969977e-05, |
|
"loss": 0.7135, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.11547344110854503, |
|
"grad_norm": 3.7135939598083496, |
|
"learning_rate": 1.4087759815242497e-05, |
|
"loss": 0.6385, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.12009237875288684, |
|
"grad_norm": 5.477907657623291, |
|
"learning_rate": 1.4665127020785218e-05, |
|
"loss": 0.6292, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.12471131639722864, |
|
"grad_norm": 6.577059268951416, |
|
"learning_rate": 1.5242494226327944e-05, |
|
"loss": 0.6921, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.12933025404157045, |
|
"grad_norm": 3.6328892707824707, |
|
"learning_rate": 1.581986143187067e-05, |
|
"loss": 0.6621, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.13394919168591224, |
|
"grad_norm": 4.084783554077148, |
|
"learning_rate": 1.6397228637413393e-05, |
|
"loss": 0.6667, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.13856812933025403, |
|
"grad_norm": 3.8719701766967773, |
|
"learning_rate": 1.697459584295612e-05, |
|
"loss": 0.6692, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.14318706697459585, |
|
"grad_norm": 7.860931873321533, |
|
"learning_rate": 1.7551963048498846e-05, |
|
"loss": 0.6251, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.14780600461893764, |
|
"grad_norm": 4.381837368011475, |
|
"learning_rate": 1.812933025404157e-05, |
|
"loss": 0.6297, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.15242494226327943, |
|
"grad_norm": 3.7145886421203613, |
|
"learning_rate": 1.8706697459584295e-05, |
|
"loss": 0.6483, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.15704387990762125, |
|
"grad_norm": 2.609006643295288, |
|
"learning_rate": 1.9284064665127023e-05, |
|
"loss": 0.6149, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.16166281755196305, |
|
"grad_norm": 4.774081230163574, |
|
"learning_rate": 1.9861431870669748e-05, |
|
"loss": 0.6034, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.16628175519630484, |
|
"grad_norm": 7.305100440979004, |
|
"learning_rate": 2.0438799076212473e-05, |
|
"loss": 0.6496, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.17090069284064666, |
|
"grad_norm": 5.507181644439697, |
|
"learning_rate": 2.1016166281755197e-05, |
|
"loss": 0.643, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.17551963048498845, |
|
"grad_norm": 4.033135890960693, |
|
"learning_rate": 2.1593533487297922e-05, |
|
"loss": 0.6186, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.18013856812933027, |
|
"grad_norm": 3.903007745742798, |
|
"learning_rate": 2.217090069284065e-05, |
|
"loss": 0.6041, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.18475750577367206, |
|
"grad_norm": 4.785562992095947, |
|
"learning_rate": 2.2748267898383374e-05, |
|
"loss": 0.5527, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.18937644341801385, |
|
"grad_norm": 3.4289231300354004, |
|
"learning_rate": 2.3325635103926096e-05, |
|
"loss": 0.5936, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.19399538106235567, |
|
"grad_norm": 2.384840965270996, |
|
"learning_rate": 2.3903002309468824e-05, |
|
"loss": 0.5421, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.19861431870669746, |
|
"grad_norm": 4.025755882263184, |
|
"learning_rate": 2.448036951501155e-05, |
|
"loss": 0.5839, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.20323325635103925, |
|
"grad_norm": 4.832013130187988, |
|
"learning_rate": 2.5057736720554276e-05, |
|
"loss": 0.5938, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.20785219399538107, |
|
"grad_norm": 3.66886305809021, |
|
"learning_rate": 2.5635103926096998e-05, |
|
"loss": 0.5607, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.21247113163972287, |
|
"grad_norm": 3.7285852432250977, |
|
"learning_rate": 2.6212471131639726e-05, |
|
"loss": 0.5457, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.21709006928406466, |
|
"grad_norm": 3.755711555480957, |
|
"learning_rate": 2.678983833718245e-05, |
|
"loss": 0.5721, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.22170900692840648, |
|
"grad_norm": 4.016116619110107, |
|
"learning_rate": 2.7367205542725178e-05, |
|
"loss": 0.59, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.22632794457274827, |
|
"grad_norm": 6.123377799987793, |
|
"learning_rate": 2.79445727482679e-05, |
|
"loss": 0.6236, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.23094688221709006, |
|
"grad_norm": 3.77093505859375, |
|
"learning_rate": 2.8521939953810624e-05, |
|
"loss": 0.6306, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.23556581986143188, |
|
"grad_norm": 5.101199626922607, |
|
"learning_rate": 2.9099307159353352e-05, |
|
"loss": 0.5806, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.24018475750577367, |
|
"grad_norm": 4.425400257110596, |
|
"learning_rate": 2.9676674364896073e-05, |
|
"loss": 0.5644, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.24480369515011546, |
|
"grad_norm": 3.661381244659424, |
|
"learning_rate": 3.02540415704388e-05, |
|
"loss": 0.538, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.24942263279445728, |
|
"grad_norm": 3.271655559539795, |
|
"learning_rate": 3.0831408775981526e-05, |
|
"loss": 0.5927, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.2540415704387991, |
|
"grad_norm": 4.603795051574707, |
|
"learning_rate": 3.140877598152425e-05, |
|
"loss": 0.5772, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.2586605080831409, |
|
"grad_norm": 3.379786968231201, |
|
"learning_rate": 3.1986143187066975e-05, |
|
"loss": 0.5577, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.2632794457274827, |
|
"grad_norm": 3.3702409267425537, |
|
"learning_rate": 3.25635103926097e-05, |
|
"loss": 0.6258, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.2678983833718245, |
|
"grad_norm": 3.1498706340789795, |
|
"learning_rate": 3.3140877598152424e-05, |
|
"loss": 0.5631, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.27251732101616627, |
|
"grad_norm": 4.846761703491211, |
|
"learning_rate": 3.3718244803695156e-05, |
|
"loss": 0.5944, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.27713625866050806, |
|
"grad_norm": 4.397818088531494, |
|
"learning_rate": 3.4295612009237874e-05, |
|
"loss": 0.5698, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.2817551963048499, |
|
"grad_norm": 2.2724201679229736, |
|
"learning_rate": 3.4872979214780605e-05, |
|
"loss": 0.5468, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.2863741339491917, |
|
"grad_norm": 4.73584508895874, |
|
"learning_rate": 3.545034642032333e-05, |
|
"loss": 0.5697, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.2909930715935335, |
|
"grad_norm": 3.557711124420166, |
|
"learning_rate": 3.6027713625866054e-05, |
|
"loss": 0.5195, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.2956120092378753, |
|
"grad_norm": 4.273311614990234, |
|
"learning_rate": 3.660508083140878e-05, |
|
"loss": 0.5561, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.3002309468822171, |
|
"grad_norm": 3.6489686965942383, |
|
"learning_rate": 3.7182448036951504e-05, |
|
"loss": 0.566, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.30484988452655887, |
|
"grad_norm": 3.4011149406433105, |
|
"learning_rate": 3.775981524249423e-05, |
|
"loss": 0.5468, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.3094688221709007, |
|
"grad_norm": 3.517822742462158, |
|
"learning_rate": 3.833718244803695e-05, |
|
"loss": 0.5834, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.3140877598152425, |
|
"grad_norm": 2.7577602863311768, |
|
"learning_rate": 3.891454965357968e-05, |
|
"loss": 0.5489, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.3187066974595843, |
|
"grad_norm": 2.856598138809204, |
|
"learning_rate": 3.94919168591224e-05, |
|
"loss": 0.4906, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.3233256351039261, |
|
"grad_norm": 4.136464595794678, |
|
"learning_rate": 4.0069284064665133e-05, |
|
"loss": 0.5705, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.3279445727482679, |
|
"grad_norm": 4.121008396148682, |
|
"learning_rate": 4.064665127020785e-05, |
|
"loss": 0.6241, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.3325635103926097, |
|
"grad_norm": 3.611814498901367, |
|
"learning_rate": 4.122401847575058e-05, |
|
"loss": 0.5034, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.3371824480369515, |
|
"grad_norm": 3.4624834060668945, |
|
"learning_rate": 4.18013856812933e-05, |
|
"loss": 0.5643, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.3418013856812933, |
|
"grad_norm": 3.3586976528167725, |
|
"learning_rate": 4.237875288683603e-05, |
|
"loss": 0.5439, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.3464203233256351, |
|
"grad_norm": 2.5129313468933105, |
|
"learning_rate": 4.2956120092378757e-05, |
|
"loss": 0.5062, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.3510392609699769, |
|
"grad_norm": 3.0052690505981445, |
|
"learning_rate": 4.353348729792148e-05, |
|
"loss": 0.5219, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.3556581986143187, |
|
"grad_norm": 3.7070388793945312, |
|
"learning_rate": 4.4110854503464206e-05, |
|
"loss": 0.5425, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.36027713625866054, |
|
"grad_norm": 2.8378756046295166, |
|
"learning_rate": 4.468822170900693e-05, |
|
"loss": 0.5208, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.3648960739030023, |
|
"grad_norm": 2.3988196849823, |
|
"learning_rate": 4.5265588914549655e-05, |
|
"loss": 0.5149, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.3695150115473441, |
|
"grad_norm": 2.2305569648742676, |
|
"learning_rate": 4.584295612009238e-05, |
|
"loss": 0.5028, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.3741339491916859, |
|
"grad_norm": 3.0817391872406006, |
|
"learning_rate": 4.6420323325635104e-05, |
|
"loss": 0.5419, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.3787528868360277, |
|
"grad_norm": 2.767381429672241, |
|
"learning_rate": 4.699769053117783e-05, |
|
"loss": 0.5036, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.3833718244803695, |
|
"grad_norm": 2.8563129901885986, |
|
"learning_rate": 4.757505773672056e-05, |
|
"loss": 0.5079, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.38799076212471134, |
|
"grad_norm": 4.459218978881836, |
|
"learning_rate": 4.815242494226328e-05, |
|
"loss": 0.4891, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.39260969976905313, |
|
"grad_norm": 2.825631856918335, |
|
"learning_rate": 4.872979214780601e-05, |
|
"loss": 0.4921, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.3972286374133949, |
|
"grad_norm": 2.835643768310547, |
|
"learning_rate": 4.9307159353348734e-05, |
|
"loss": 0.4736, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.4018475750577367, |
|
"grad_norm": 3.237922430038452, |
|
"learning_rate": 4.988452655889146e-05, |
|
"loss": 0.5254, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.4064665127020785, |
|
"grad_norm": 3.913771390914917, |
|
"learning_rate": 4.9999870022388165e-05, |
|
"loss": 0.5212, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.4110854503464203, |
|
"grad_norm": 4.357654094696045, |
|
"learning_rate": 4.999934199065641e-05, |
|
"loss": 0.6004, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.41570438799076215, |
|
"grad_norm": 2.1519246101379395, |
|
"learning_rate": 4.999840778977644e-05, |
|
"loss": 0.5106, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.42032332563510394, |
|
"grad_norm": 3.196580410003662, |
|
"learning_rate": 4.9997067434926386e-05, |
|
"loss": 0.4708, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.42494226327944573, |
|
"grad_norm": 3.0933518409729004, |
|
"learning_rate": 4.9995320947883265e-05, |
|
"loss": 0.5228, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.4295612009237875, |
|
"grad_norm": 2.5883092880249023, |
|
"learning_rate": 4.999316835702259e-05, |
|
"loss": 0.54, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.4341801385681293, |
|
"grad_norm": 2.738126277923584, |
|
"learning_rate": 4.9990609697317916e-05, |
|
"loss": 0.4965, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.4387990762124711, |
|
"grad_norm": 2.903364419937134, |
|
"learning_rate": 4.998764501034028e-05, |
|
"loss": 0.4732, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.44341801385681295, |
|
"grad_norm": 1.895765781402588, |
|
"learning_rate": 4.998427434425753e-05, |
|
"loss": 0.4857, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.44803695150115475, |
|
"grad_norm": 5.765718936920166, |
|
"learning_rate": 4.998049775383353e-05, |
|
"loss": 0.4901, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.45265588914549654, |
|
"grad_norm": 3.9257349967956543, |
|
"learning_rate": 4.997631530042727e-05, |
|
"loss": 0.493, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.45727482678983833, |
|
"grad_norm": 2.3068912029266357, |
|
"learning_rate": 4.997172705199189e-05, |
|
"loss": 0.5061, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.4618937644341801, |
|
"grad_norm": 2.966820001602173, |
|
"learning_rate": 4.996673308307355e-05, |
|
"loss": 0.4557, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.4665127020785219, |
|
"grad_norm": 3.704120397567749, |
|
"learning_rate": 4.996133347481021e-05, |
|
"loss": 0.5302, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.47113163972286376, |
|
"grad_norm": 2.066206216812134, |
|
"learning_rate": 4.9955528314930376e-05, |
|
"loss": 0.5035, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.47575057736720555, |
|
"grad_norm": 3.7750158309936523, |
|
"learning_rate": 4.9949317697751596e-05, |
|
"loss": 0.5327, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.48036951501154734, |
|
"grad_norm": 2.5127952098846436, |
|
"learning_rate": 4.9942701724178965e-05, |
|
"loss": 0.4929, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.48498845265588914, |
|
"grad_norm": 2.594442367553711, |
|
"learning_rate": 4.9935680501703485e-05, |
|
"loss": 0.4795, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.4896073903002309, |
|
"grad_norm": 4.830276966094971, |
|
"learning_rate": 4.992825414440032e-05, |
|
"loss": 0.5268, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.4942263279445728, |
|
"grad_norm": 2.404269218444824, |
|
"learning_rate": 4.9920422772926933e-05, |
|
"loss": 0.5199, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.49884526558891457, |
|
"grad_norm": 2.622856378555298, |
|
"learning_rate": 4.991218651452114e-05, |
|
"loss": 0.4928, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.5034642032332564, |
|
"grad_norm": 2.660313844680786, |
|
"learning_rate": 4.9903545502999014e-05, |
|
"loss": 0.516, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.5080831408775982, |
|
"grad_norm": 2.2060108184814453, |
|
"learning_rate": 4.9894499878752744e-05, |
|
"loss": 0.474, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5127020785219399, |
|
"grad_norm": 2.774624824523926, |
|
"learning_rate": 4.988504978874835e-05, |
|
"loss": 0.5053, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.5173210161662818, |
|
"grad_norm": 2.0086095333099365, |
|
"learning_rate": 4.987519538652326e-05, |
|
"loss": 0.4478, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.5219399538106235, |
|
"grad_norm": 2.7462918758392334, |
|
"learning_rate": 4.986493683218386e-05, |
|
"loss": 0.4872, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.5265588914549654, |
|
"grad_norm": 2.2208642959594727, |
|
"learning_rate": 4.985427429240286e-05, |
|
"loss": 0.4617, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.5311778290993071, |
|
"grad_norm": 3.3714964389801025, |
|
"learning_rate": 4.984320794041662e-05, |
|
"loss": 0.5293, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.535796766743649, |
|
"grad_norm": 2.732804298400879, |
|
"learning_rate": 4.98317379560223e-05, |
|
"loss": 0.4582, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.5404157043879908, |
|
"grad_norm": 3.4411492347717285, |
|
"learning_rate": 4.981986452557495e-05, |
|
"loss": 0.5359, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.5450346420323325, |
|
"grad_norm": 2.5346171855926514, |
|
"learning_rate": 4.9807587841984494e-05, |
|
"loss": 0.438, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.5496535796766744, |
|
"grad_norm": 2.068873882293701, |
|
"learning_rate": 4.9794908104712586e-05, |
|
"loss": 0.4541, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.5542725173210161, |
|
"grad_norm": 3.7652342319488525, |
|
"learning_rate": 4.978182551976939e-05, |
|
"loss": 0.5372, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.558891454965358, |
|
"grad_norm": 2.7133548259735107, |
|
"learning_rate": 4.976834029971017e-05, |
|
"loss": 0.4949, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.5635103926096998, |
|
"grad_norm": 2.7093558311462402, |
|
"learning_rate": 4.975445266363191e-05, |
|
"loss": 0.4484, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.5681293302540416, |
|
"grad_norm": 2.551117420196533, |
|
"learning_rate": 4.974016283716974e-05, |
|
"loss": 0.4865, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.5727482678983834, |
|
"grad_norm": 3.5680108070373535, |
|
"learning_rate": 4.9725471052493225e-05, |
|
"loss": 0.512, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.5773672055427251, |
|
"grad_norm": 3.1270763874053955, |
|
"learning_rate": 4.9710377548302636e-05, |
|
"loss": 0.4658, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.581986143187067, |
|
"grad_norm": 2.7450294494628906, |
|
"learning_rate": 4.9694882569825045e-05, |
|
"loss": 0.5726, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.5866050808314087, |
|
"grad_norm": 2.192270040512085, |
|
"learning_rate": 4.967898636881039e-05, |
|
"loss": 0.456, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.5912240184757506, |
|
"grad_norm": 3.5508313179016113, |
|
"learning_rate": 4.9662689203527304e-05, |
|
"loss": 0.4363, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.5958429561200924, |
|
"grad_norm": 3.789588212966919, |
|
"learning_rate": 4.964599133875899e-05, |
|
"loss": 0.3894, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.6004618937644342, |
|
"grad_norm": 2.185136556625366, |
|
"learning_rate": 4.9628893045798905e-05, |
|
"loss": 0.4388, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.605080831408776, |
|
"grad_norm": 3.9445693492889404, |
|
"learning_rate": 4.961139460244631e-05, |
|
"loss": 0.4761, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.6096997690531177, |
|
"grad_norm": 2.027426242828369, |
|
"learning_rate": 4.95934962930018e-05, |
|
"loss": 0.4726, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.6143187066974596, |
|
"grad_norm": 2.555227518081665, |
|
"learning_rate": 4.957519840826268e-05, |
|
"loss": 0.4546, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.6189376443418014, |
|
"grad_norm": 3.800353765487671, |
|
"learning_rate": 4.955650124551823e-05, |
|
"loss": 0.4657, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.6235565819861432, |
|
"grad_norm": 1.877545952796936, |
|
"learning_rate": 4.953740510854485e-05, |
|
"loss": 0.46, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.628175519630485, |
|
"grad_norm": 2.205172538757324, |
|
"learning_rate": 4.9517910307601204e-05, |
|
"loss": 0.4845, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.6327944572748267, |
|
"grad_norm": 4.207221031188965, |
|
"learning_rate": 4.949801715942306e-05, |
|
"loss": 0.4784, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.6374133949191686, |
|
"grad_norm": 2.3010756969451904, |
|
"learning_rate": 4.947772598721828e-05, |
|
"loss": 0.4679, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.6420323325635104, |
|
"grad_norm": 1.776963233947754, |
|
"learning_rate": 4.9457037120661455e-05, |
|
"loss": 0.4405, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.6466512702078522, |
|
"grad_norm": 2.8734822273254395, |
|
"learning_rate": 4.9435950895888604e-05, |
|
"loss": 0.474, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.651270207852194, |
|
"grad_norm": 3.049509286880493, |
|
"learning_rate": 4.9414467655491695e-05, |
|
"loss": 0.4334, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.6558891454965358, |
|
"grad_norm": 3.274083375930786, |
|
"learning_rate": 4.9392587748513105e-05, |
|
"loss": 0.4519, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.6605080831408776, |
|
"grad_norm": 2.149959087371826, |
|
"learning_rate": 4.937031153043991e-05, |
|
"loss": 0.4625, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.6651270207852193, |
|
"grad_norm": 2.3562982082366943, |
|
"learning_rate": 4.934763936319814e-05, |
|
"loss": 0.4487, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.6697459584295612, |
|
"grad_norm": 2.5950613021850586, |
|
"learning_rate": 4.932457161514689e-05, |
|
"loss": 0.4979, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.674364896073903, |
|
"grad_norm": 2.4045233726501465, |
|
"learning_rate": 4.9301108661072315e-05, |
|
"loss": 0.4442, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.6789838337182448, |
|
"grad_norm": 2.833594799041748, |
|
"learning_rate": 4.9277250882181575e-05, |
|
"loss": 0.4408, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.6836027713625866, |
|
"grad_norm": 1.7960981130599976, |
|
"learning_rate": 4.9252998666096625e-05, |
|
"loss": 0.4672, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.6882217090069284, |
|
"grad_norm": 2.261974811553955, |
|
"learning_rate": 4.922835240684792e-05, |
|
"loss": 0.3947, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.6928406466512702, |
|
"grad_norm": 2.966799020767212, |
|
"learning_rate": 4.9203312504867994e-05, |
|
"loss": 0.4546, |
|
"step": 1500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 8660, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.102925424714056e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|