diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12188 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 8668, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00011536686663590217, + "grad_norm": 0.39384475350379944, + "learning_rate": 2.306805074971165e-07, + "loss": 1.2335, + "step": 1 + }, + { + "epoch": 0.0005768343331795108, + "grad_norm": 0.3301478326320648, + "learning_rate": 1.1534025374855826e-06, + "loss": 1.1506, + "step": 5 + }, + { + "epoch": 0.0011536686663590216, + "grad_norm": 0.32749322056770325, + "learning_rate": 2.3068050749711653e-06, + "loss": 1.1258, + "step": 10 + }, + { + "epoch": 0.0017305029995385325, + "grad_norm": 0.3380628526210785, + "learning_rate": 3.4602076124567477e-06, + "loss": 1.2071, + "step": 15 + }, + { + "epoch": 0.0023073373327180432, + "grad_norm": 0.3165785074234009, + "learning_rate": 4.6136101499423305e-06, + "loss": 1.1473, + "step": 20 + }, + { + "epoch": 0.002884171665897554, + "grad_norm": 0.3524048924446106, + "learning_rate": 5.7670126874279126e-06, + "loss": 1.1617, + "step": 25 + }, + { + "epoch": 0.003461005999077065, + "grad_norm": 0.31146979331970215, + "learning_rate": 6.920415224913495e-06, + "loss": 1.1327, + "step": 30 + }, + { + "epoch": 0.0040378403322565756, + "grad_norm": 0.2818208634853363, + "learning_rate": 8.073817762399077e-06, + "loss": 1.1434, + "step": 35 + }, + { + "epoch": 0.0046146746654360865, + "grad_norm": 0.28524142503738403, + "learning_rate": 9.227220299884661e-06, + "loss": 1.0858, + "step": 40 + }, + { + "epoch": 0.005191508998615597, + "grad_norm": 0.24573445320129395, + "learning_rate": 1.0380622837370241e-05, + "loss": 1.0725, + "step": 45 + }, + { + "epoch": 0.005768343331795108, + "grad_norm": 0.263478547334671, + "learning_rate": 1.1534025374855825e-05, + "loss": 1.0934, + "step": 50 + }, + { + "epoch": 0.006345177664974619, + "grad_norm": 0.2519683539867401, + "learning_rate": 1.2687427912341407e-05, + "loss": 1.0627, + "step": 55 + }, + { + "epoch": 0.00692201199815413, + "grad_norm": 0.23009565472602844, + "learning_rate": 1.384083044982699e-05, + "loss": 1.0259, + "step": 60 + }, + { + "epoch": 0.007498846331333641, + "grad_norm": 0.2220510095357895, + "learning_rate": 1.4994232987312573e-05, + "loss": 1.0459, + "step": 65 + }, + { + "epoch": 0.008075680664513151, + "grad_norm": 0.20859792828559875, + "learning_rate": 1.6147635524798155e-05, + "loss": 1.0493, + "step": 70 + }, + { + "epoch": 0.008652514997692663, + "grad_norm": 0.24422504007816315, + "learning_rate": 1.7301038062283735e-05, + "loss": 1.0426, + "step": 75 + }, + { + "epoch": 0.009229349330872173, + "grad_norm": 0.26209887862205505, + "learning_rate": 1.8454440599769322e-05, + "loss": 1.1044, + "step": 80 + }, + { + "epoch": 0.009806183664051685, + "grad_norm": 0.22573940455913544, + "learning_rate": 1.9607843137254903e-05, + "loss": 1.0154, + "step": 85 + }, + { + "epoch": 0.010383017997231195, + "grad_norm": 0.23182636499404907, + "learning_rate": 2.0761245674740483e-05, + "loss": 0.9841, + "step": 90 + }, + { + "epoch": 0.010959852330410707, + "grad_norm": 0.20166198909282684, + "learning_rate": 2.191464821222607e-05, + "loss": 0.9969, + "step": 95 + }, + { + "epoch": 0.011536686663590217, + "grad_norm": 0.24127909541130066, + "learning_rate": 2.306805074971165e-05, + "loss": 1.0168, + "step": 100 + 
}, + { + "epoch": 0.012113520996769728, + "grad_norm": 0.23480503261089325, + "learning_rate": 2.422145328719723e-05, + "loss": 1.0291, + "step": 105 + }, + { + "epoch": 0.012690355329949238, + "grad_norm": 0.2245372086763382, + "learning_rate": 2.5374855824682814e-05, + "loss": 1.0141, + "step": 110 + }, + { + "epoch": 0.01326718966312875, + "grad_norm": 0.22040125727653503, + "learning_rate": 2.6528258362168395e-05, + "loss": 1.0327, + "step": 115 + }, + { + "epoch": 0.01384402399630826, + "grad_norm": 0.22710174322128296, + "learning_rate": 2.768166089965398e-05, + "loss": 1.055, + "step": 120 + }, + { + "epoch": 0.01442085832948777, + "grad_norm": 0.21789094805717468, + "learning_rate": 2.8835063437139565e-05, + "loss": 1.0143, + "step": 125 + }, + { + "epoch": 0.014997692662667282, + "grad_norm": 0.26034095883369446, + "learning_rate": 2.9988465974625146e-05, + "loss": 1.082, + "step": 130 + }, + { + "epoch": 0.015574526995846792, + "grad_norm": 0.24871432781219482, + "learning_rate": 3.1141868512110726e-05, + "loss": 0.9683, + "step": 135 + }, + { + "epoch": 0.016151361329026302, + "grad_norm": 0.24066007137298584, + "learning_rate": 3.229527104959631e-05, + "loss": 1.0045, + "step": 140 + }, + { + "epoch": 0.016728195662205816, + "grad_norm": 0.24926799535751343, + "learning_rate": 3.344867358708189e-05, + "loss": 1.0534, + "step": 145 + }, + { + "epoch": 0.017305029995385326, + "grad_norm": 0.2561403512954712, + "learning_rate": 3.460207612456747e-05, + "loss": 0.9992, + "step": 150 + }, + { + "epoch": 0.017881864328564836, + "grad_norm": 0.25197339057922363, + "learning_rate": 3.575547866205306e-05, + "loss": 1.0213, + "step": 155 + }, + { + "epoch": 0.018458698661744346, + "grad_norm": 0.257375031709671, + "learning_rate": 3.6908881199538644e-05, + "loss": 1.0, + "step": 160 + }, + { + "epoch": 0.01903553299492386, + "grad_norm": 0.26233741641044617, + "learning_rate": 3.806228373702422e-05, + "loss": 0.9835, + "step": 165 + }, + { + "epoch": 0.01961236732810337, + "grad_norm": 0.26200389862060547, + "learning_rate": 3.9215686274509805e-05, + "loss": 1.0475, + "step": 170 + }, + { + "epoch": 0.02018920166128288, + "grad_norm": 0.3026478886604309, + "learning_rate": 4.036908881199539e-05, + "loss": 1.0337, + "step": 175 + }, + { + "epoch": 0.02076603599446239, + "grad_norm": 0.24392971396446228, + "learning_rate": 4.1522491349480966e-05, + "loss": 1.0181, + "step": 180 + }, + { + "epoch": 0.0213428703276419, + "grad_norm": 0.2525658905506134, + "learning_rate": 4.2675893886966556e-05, + "loss": 0.9815, + "step": 185 + }, + { + "epoch": 0.021919704660821413, + "grad_norm": 0.2874011695384979, + "learning_rate": 4.382929642445214e-05, + "loss": 0.9717, + "step": 190 + }, + { + "epoch": 0.022496538994000923, + "grad_norm": 0.3149228096008301, + "learning_rate": 4.498269896193772e-05, + "loss": 1.0196, + "step": 195 + }, + { + "epoch": 0.023073373327180433, + "grad_norm": 0.24751496315002441, + "learning_rate": 4.61361014994233e-05, + "loss": 1.029, + "step": 200 + }, + { + "epoch": 0.023650207660359943, + "grad_norm": 0.26091670989990234, + "learning_rate": 4.7289504036908884e-05, + "loss": 0.9799, + "step": 205 + }, + { + "epoch": 0.024227041993539457, + "grad_norm": 0.24843664467334747, + "learning_rate": 4.844290657439446e-05, + "loss": 0.948, + "step": 210 + }, + { + "epoch": 0.024803876326718967, + "grad_norm": 0.26564517617225647, + "learning_rate": 4.9596309111880045e-05, + "loss": 1.0037, + "step": 215 + }, + { + "epoch": 0.025380710659898477, + "grad_norm": 
0.25427091121673584, + "learning_rate": 5.074971164936563e-05, + "loss": 0.9891, + "step": 220 + }, + { + "epoch": 0.025957544993077987, + "grad_norm": 0.24899506568908691, + "learning_rate": 5.190311418685121e-05, + "loss": 0.9797, + "step": 225 + }, + { + "epoch": 0.0265343793262575, + "grad_norm": 0.2365058809518814, + "learning_rate": 5.305651672433679e-05, + "loss": 0.9779, + "step": 230 + }, + { + "epoch": 0.02711121365943701, + "grad_norm": 0.25148388743400574, + "learning_rate": 5.4209919261822386e-05, + "loss": 0.9934, + "step": 235 + }, + { + "epoch": 0.02768804799261652, + "grad_norm": 0.24851441383361816, + "learning_rate": 5.536332179930796e-05, + "loss": 0.9609, + "step": 240 + }, + { + "epoch": 0.02826488232579603, + "grad_norm": 0.2596060633659363, + "learning_rate": 5.651672433679355e-05, + "loss": 0.955, + "step": 245 + }, + { + "epoch": 0.02884171665897554, + "grad_norm": 0.26578041911125183, + "learning_rate": 5.767012687427913e-05, + "loss": 1.0006, + "step": 250 + }, + { + "epoch": 0.029418550992155054, + "grad_norm": 0.25275397300720215, + "learning_rate": 5.882352941176471e-05, + "loss": 1.0443, + "step": 255 + }, + { + "epoch": 0.029995385325334564, + "grad_norm": 0.246384397149086, + "learning_rate": 5.997693194925029e-05, + "loss": 1.0123, + "step": 260 + }, + { + "epoch": 0.030572219658514074, + "grad_norm": 0.23048047721385956, + "learning_rate": 6.113033448673587e-05, + "loss": 0.9969, + "step": 265 + }, + { + "epoch": 0.031149053991693584, + "grad_norm": 0.24622942507266998, + "learning_rate": 6.228373702422145e-05, + "loss": 1.0324, + "step": 270 + }, + { + "epoch": 0.031725888324873094, + "grad_norm": 0.23391634225845337, + "learning_rate": 6.343713956170704e-05, + "loss": 0.9575, + "step": 275 + }, + { + "epoch": 0.032302722658052604, + "grad_norm": 0.2237214893102646, + "learning_rate": 6.459054209919262e-05, + "loss": 1.0388, + "step": 280 + }, + { + "epoch": 0.03287955699123212, + "grad_norm": 0.22574639320373535, + "learning_rate": 6.57439446366782e-05, + "loss": 1.0074, + "step": 285 + }, + { + "epoch": 0.03345639132441163, + "grad_norm": 0.23499815165996552, + "learning_rate": 6.689734717416379e-05, + "loss": 0.9567, + "step": 290 + }, + { + "epoch": 0.03403322565759114, + "grad_norm": 0.22160688042640686, + "learning_rate": 6.805074971164937e-05, + "loss": 1.0325, + "step": 295 + }, + { + "epoch": 0.03461005999077065, + "grad_norm": 0.2253323346376419, + "learning_rate": 6.920415224913494e-05, + "loss": 0.9682, + "step": 300 + }, + { + "epoch": 0.03518689432395016, + "grad_norm": 0.22069986164569855, + "learning_rate": 7.035755478662054e-05, + "loss": 1.0038, + "step": 305 + }, + { + "epoch": 0.03576372865712967, + "grad_norm": 0.23517417907714844, + "learning_rate": 7.151095732410612e-05, + "loss": 0.9954, + "step": 310 + }, + { + "epoch": 0.03634056299030918, + "grad_norm": 0.22023826837539673, + "learning_rate": 7.26643598615917e-05, + "loss": 1.0708, + "step": 315 + }, + { + "epoch": 0.03691739732348869, + "grad_norm": 0.22501811385154724, + "learning_rate": 7.381776239907729e-05, + "loss": 1.0403, + "step": 320 + }, + { + "epoch": 0.0374942316566682, + "grad_norm": 0.24813653528690338, + "learning_rate": 7.497116493656286e-05, + "loss": 1.0395, + "step": 325 + }, + { + "epoch": 0.03807106598984772, + "grad_norm": 0.213524729013443, + "learning_rate": 7.612456747404844e-05, + "loss": 1.0479, + "step": 330 + }, + { + "epoch": 0.03864790032302723, + "grad_norm": 0.2197512686252594, + "learning_rate": 7.727797001153403e-05, + "loss": 0.9709, + 
"step": 335 + }, + { + "epoch": 0.03922473465620674, + "grad_norm": 0.21706676483154297, + "learning_rate": 7.843137254901961e-05, + "loss": 0.9824, + "step": 340 + }, + { + "epoch": 0.03980156898938625, + "grad_norm": 0.2092558592557907, + "learning_rate": 7.95847750865052e-05, + "loss": 0.9749, + "step": 345 + }, + { + "epoch": 0.04037840332256576, + "grad_norm": 0.20605534315109253, + "learning_rate": 8.073817762399078e-05, + "loss": 0.9728, + "step": 350 + }, + { + "epoch": 0.04095523765574527, + "grad_norm": 0.20985351502895355, + "learning_rate": 8.189158016147636e-05, + "loss": 0.9399, + "step": 355 + }, + { + "epoch": 0.04153207198892478, + "grad_norm": 0.20424993336200714, + "learning_rate": 8.304498269896193e-05, + "loss": 0.9659, + "step": 360 + }, + { + "epoch": 0.04210890632210429, + "grad_norm": 0.2047097533941269, + "learning_rate": 8.419838523644751e-05, + "loss": 0.9825, + "step": 365 + }, + { + "epoch": 0.0426857406552838, + "grad_norm": 0.2090773731470108, + "learning_rate": 8.535178777393311e-05, + "loss": 0.9915, + "step": 370 + }, + { + "epoch": 0.043262574988463316, + "grad_norm": 0.20311830937862396, + "learning_rate": 8.65051903114187e-05, + "loss": 0.9481, + "step": 375 + }, + { + "epoch": 0.043839409321642826, + "grad_norm": 0.20276297628879547, + "learning_rate": 8.765859284890428e-05, + "loss": 1.0057, + "step": 380 + }, + { + "epoch": 0.044416243654822336, + "grad_norm": 0.20655770599842072, + "learning_rate": 8.881199538638986e-05, + "loss": 1.0034, + "step": 385 + }, + { + "epoch": 0.044993077988001846, + "grad_norm": 0.21446913480758667, + "learning_rate": 8.996539792387543e-05, + "loss": 0.958, + "step": 390 + }, + { + "epoch": 0.045569912321181356, + "grad_norm": 0.20291991531848907, + "learning_rate": 9.111880046136102e-05, + "loss": 0.9714, + "step": 395 + }, + { + "epoch": 0.046146746654360866, + "grad_norm": 0.19942238926887512, + "learning_rate": 9.22722029988466e-05, + "loss": 0.9791, + "step": 400 + }, + { + "epoch": 0.046723580987540377, + "grad_norm": 0.21126116812229156, + "learning_rate": 9.342560553633218e-05, + "loss": 0.9764, + "step": 405 + }, + { + "epoch": 0.04730041532071989, + "grad_norm": 0.2006087452173233, + "learning_rate": 9.457900807381777e-05, + "loss": 0.9389, + "step": 410 + }, + { + "epoch": 0.0478772496538994, + "grad_norm": 0.1997053176164627, + "learning_rate": 9.573241061130335e-05, + "loss": 0.9795, + "step": 415 + }, + { + "epoch": 0.048454083987078914, + "grad_norm": 0.19919082522392273, + "learning_rate": 9.688581314878892e-05, + "loss": 0.9865, + "step": 420 + }, + { + "epoch": 0.049030918320258424, + "grad_norm": 0.20420940220355988, + "learning_rate": 9.80392156862745e-05, + "loss": 1.0243, + "step": 425 + }, + { + "epoch": 0.049607752653437934, + "grad_norm": 0.20005351305007935, + "learning_rate": 9.919261822376009e-05, + "loss": 0.9298, + "step": 430 + }, + { + "epoch": 0.050184586986617444, + "grad_norm": 0.2057618349790573, + "learning_rate": 0.00010034602076124569, + "loss": 0.9673, + "step": 435 + }, + { + "epoch": 0.050761421319796954, + "grad_norm": 0.2002270221710205, + "learning_rate": 0.00010149942329873126, + "loss": 1.0362, + "step": 440 + }, + { + "epoch": 0.051338255652976464, + "grad_norm": 0.23044097423553467, + "learning_rate": 0.00010265282583621685, + "loss": 1.0132, + "step": 445 + }, + { + "epoch": 0.051915089986155974, + "grad_norm": 0.19683632254600525, + "learning_rate": 0.00010380622837370242, + "loss": 0.9883, + "step": 450 + }, + { + "epoch": 0.052491924319335484, + "grad_norm": 
0.18685606122016907, + "learning_rate": 0.00010495963091118801, + "loss": 1.0097, + "step": 455 + }, + { + "epoch": 0.053068758652515, + "grad_norm": 0.19922864437103271, + "learning_rate": 0.00010611303344867358, + "loss": 1.0129, + "step": 460 + }, + { + "epoch": 0.05364559298569451, + "grad_norm": 0.22423453629016876, + "learning_rate": 0.00010726643598615918, + "loss": 0.9616, + "step": 465 + }, + { + "epoch": 0.05422242731887402, + "grad_norm": 0.19105130434036255, + "learning_rate": 0.00010841983852364477, + "loss": 0.9873, + "step": 470 + }, + { + "epoch": 0.05479926165205353, + "grad_norm": 0.191370889544487, + "learning_rate": 0.00010957324106113034, + "loss": 0.9869, + "step": 475 + }, + { + "epoch": 0.05537609598523304, + "grad_norm": 0.1888877898454666, + "learning_rate": 0.00011072664359861593, + "loss": 0.9754, + "step": 480 + }, + { + "epoch": 0.05595293031841255, + "grad_norm": 0.19908085465431213, + "learning_rate": 0.0001118800461361015, + "loss": 0.9978, + "step": 485 + }, + { + "epoch": 0.05652976465159206, + "grad_norm": 0.1849226951599121, + "learning_rate": 0.0001130334486735871, + "loss": 1.0036, + "step": 490 + }, + { + "epoch": 0.05710659898477157, + "grad_norm": 0.18880179524421692, + "learning_rate": 0.00011418685121107266, + "loss": 0.9969, + "step": 495 + }, + { + "epoch": 0.05768343331795108, + "grad_norm": 0.18281018733978271, + "learning_rate": 0.00011534025374855826, + "loss": 1.0163, + "step": 500 + }, + { + "epoch": 0.0582602676511306, + "grad_norm": 0.18743227422237396, + "learning_rate": 0.00011649365628604383, + "loss": 0.9774, + "step": 505 + }, + { + "epoch": 0.05883710198431011, + "grad_norm": 0.18649840354919434, + "learning_rate": 0.00011764705882352942, + "loss": 0.9948, + "step": 510 + }, + { + "epoch": 0.05941393631748962, + "grad_norm": 0.18914422392845154, + "learning_rate": 0.00011880046136101499, + "loss": 0.9789, + "step": 515 + }, + { + "epoch": 0.05999077065066913, + "grad_norm": 0.20373612642288208, + "learning_rate": 0.00011995386389850058, + "loss": 0.9594, + "step": 520 + }, + { + "epoch": 0.06056760498384864, + "grad_norm": 0.1853344440460205, + "learning_rate": 0.00012110726643598615, + "loss": 0.9422, + "step": 525 + }, + { + "epoch": 0.06114443931702815, + "grad_norm": 0.17580455541610718, + "learning_rate": 0.00012226066897347174, + "loss": 0.962, + "step": 530 + }, + { + "epoch": 0.06172127365020766, + "grad_norm": 0.18509171903133392, + "learning_rate": 0.00012341407151095733, + "loss": 0.9269, + "step": 535 + }, + { + "epoch": 0.06229810798338717, + "grad_norm": 0.17391765117645264, + "learning_rate": 0.0001245674740484429, + "loss": 0.9379, + "step": 540 + }, + { + "epoch": 0.06287494231656668, + "grad_norm": 0.185151144862175, + "learning_rate": 0.0001257208765859285, + "loss": 0.9803, + "step": 545 + }, + { + "epoch": 0.06345177664974619, + "grad_norm": 0.18012270331382751, + "learning_rate": 0.00012687427912341407, + "loss": 0.9318, + "step": 550 + }, + { + "epoch": 0.0640286109829257, + "grad_norm": 0.19646641612052917, + "learning_rate": 0.00012802768166089967, + "loss": 1.0218, + "step": 555 + }, + { + "epoch": 0.06460544531610521, + "grad_norm": 0.1846383512020111, + "learning_rate": 0.00012918108419838524, + "loss": 1.025, + "step": 560 + }, + { + "epoch": 0.06518227964928472, + "grad_norm": 0.18073053658008575, + "learning_rate": 0.00013033448673587084, + "loss": 0.9418, + "step": 565 + }, + { + "epoch": 0.06575911398246424, + "grad_norm": 0.17771542072296143, + "learning_rate": 0.0001314878892733564, + "loss": 
0.939, + "step": 570 + }, + { + "epoch": 0.06633594831564375, + "grad_norm": 0.17932404577732086, + "learning_rate": 0.000132641291810842, + "loss": 1.0029, + "step": 575 + }, + { + "epoch": 0.06691278264882326, + "grad_norm": 0.1846706122159958, + "learning_rate": 0.00013379469434832757, + "loss": 0.9732, + "step": 580 + }, + { + "epoch": 0.06748961698200277, + "grad_norm": 0.18250420689582825, + "learning_rate": 0.00013494809688581317, + "loss": 0.9838, + "step": 585 + }, + { + "epoch": 0.06806645131518228, + "grad_norm": 0.18181759119033813, + "learning_rate": 0.00013610149942329874, + "loss": 0.9732, + "step": 590 + }, + { + "epoch": 0.0686432856483618, + "grad_norm": 0.19068962335586548, + "learning_rate": 0.0001372549019607843, + "loss": 0.9789, + "step": 595 + }, + { + "epoch": 0.0692201199815413, + "grad_norm": 0.17766667902469635, + "learning_rate": 0.00013840830449826988, + "loss": 0.9702, + "step": 600 + }, + { + "epoch": 0.06979695431472081, + "grad_norm": 0.18375654518604279, + "learning_rate": 0.00013956170703575548, + "loss": 0.9239, + "step": 605 + }, + { + "epoch": 0.07037378864790032, + "grad_norm": 0.17888925969600677, + "learning_rate": 0.00014071510957324108, + "loss": 0.9834, + "step": 610 + }, + { + "epoch": 0.07095062298107983, + "grad_norm": 0.1771126687526703, + "learning_rate": 0.00014186851211072665, + "loss": 0.9383, + "step": 615 + }, + { + "epoch": 0.07152745731425934, + "grad_norm": 0.19248628616333008, + "learning_rate": 0.00014302191464821224, + "loss": 0.931, + "step": 620 + }, + { + "epoch": 0.07210429164743885, + "grad_norm": 0.1845206469297409, + "learning_rate": 0.0001441753171856978, + "loss": 0.9919, + "step": 625 + }, + { + "epoch": 0.07268112598061836, + "grad_norm": 0.17463363707065582, + "learning_rate": 0.0001453287197231834, + "loss": 0.9698, + "step": 630 + }, + { + "epoch": 0.07325796031379787, + "grad_norm": 0.17882540822029114, + "learning_rate": 0.00014648212226066898, + "loss": 0.9985, + "step": 635 + }, + { + "epoch": 0.07383479464697738, + "grad_norm": 0.1773298680782318, + "learning_rate": 0.00014763552479815458, + "loss": 0.9572, + "step": 640 + }, + { + "epoch": 0.0744116289801569, + "grad_norm": 0.18714497983455658, + "learning_rate": 0.00014878892733564015, + "loss": 1.0213, + "step": 645 + }, + { + "epoch": 0.0749884633133364, + "grad_norm": 0.16878995299339294, + "learning_rate": 0.00014994232987312572, + "loss": 0.9478, + "step": 650 + }, + { + "epoch": 0.07556529764651591, + "grad_norm": 0.18111148476600647, + "learning_rate": 0.0001510957324106113, + "loss": 0.9727, + "step": 655 + }, + { + "epoch": 0.07614213197969544, + "grad_norm": 0.18033739924430847, + "learning_rate": 0.00015224913494809689, + "loss": 0.9953, + "step": 660 + }, + { + "epoch": 0.07671896631287495, + "grad_norm": 0.184474378824234, + "learning_rate": 0.00015340253748558246, + "loss": 1.0182, + "step": 665 + }, + { + "epoch": 0.07729580064605446, + "grad_norm": 0.17728130519390106, + "learning_rate": 0.00015455594002306805, + "loss": 0.921, + "step": 670 + }, + { + "epoch": 0.07787263497923397, + "grad_norm": 0.1744864135980606, + "learning_rate": 0.00015570934256055365, + "loss": 0.9696, + "step": 675 + }, + { + "epoch": 0.07844946931241348, + "grad_norm": 0.16884462535381317, + "learning_rate": 0.00015686274509803922, + "loss": 0.9756, + "step": 680 + }, + { + "epoch": 0.07902630364559299, + "grad_norm": 0.1904095709323883, + "learning_rate": 0.00015801614763552482, + "loss": 0.9662, + "step": 685 + }, + { + "epoch": 0.0796031379787725, + "grad_norm": 
0.1834830790758133, + "learning_rate": 0.0001591695501730104, + "loss": 0.9916, + "step": 690 + }, + { + "epoch": 0.08017997231195201, + "grad_norm": 0.17388418316841125, + "learning_rate": 0.00016032295271049598, + "loss": 0.9661, + "step": 695 + }, + { + "epoch": 0.08075680664513152, + "grad_norm": 0.17867860198020935, + "learning_rate": 0.00016147635524798155, + "loss": 0.9702, + "step": 700 + }, + { + "epoch": 0.08133364097831103, + "grad_norm": 0.184253990650177, + "learning_rate": 0.00016262975778546715, + "loss": 0.9628, + "step": 705 + }, + { + "epoch": 0.08191047531149054, + "grad_norm": 0.18207858502864838, + "learning_rate": 0.00016378316032295272, + "loss": 0.9882, + "step": 710 + }, + { + "epoch": 0.08248730964467005, + "grad_norm": 0.1673344224691391, + "learning_rate": 0.0001649365628604383, + "loss": 1.0004, + "step": 715 + }, + { + "epoch": 0.08306414397784956, + "grad_norm": 0.17834331095218658, + "learning_rate": 0.00016608996539792386, + "loss": 0.9624, + "step": 720 + }, + { + "epoch": 0.08364097831102907, + "grad_norm": 0.17010116577148438, + "learning_rate": 0.00016724336793540946, + "loss": 0.9419, + "step": 725 + }, + { + "epoch": 0.08421781264420858, + "grad_norm": 0.1887020319700241, + "learning_rate": 0.00016839677047289503, + "loss": 0.9735, + "step": 730 + }, + { + "epoch": 0.08479464697738809, + "grad_norm": 0.17254365980625153, + "learning_rate": 0.00016955017301038063, + "loss": 0.9959, + "step": 735 + }, + { + "epoch": 0.0853714813105676, + "grad_norm": 0.18531525135040283, + "learning_rate": 0.00017070357554786622, + "loss": 1.0063, + "step": 740 + }, + { + "epoch": 0.08594831564374712, + "grad_norm": 0.1854889988899231, + "learning_rate": 0.0001718569780853518, + "loss": 1.0276, + "step": 745 + }, + { + "epoch": 0.08652514997692663, + "grad_norm": 0.17261534929275513, + "learning_rate": 0.0001730103806228374, + "loss": 0.9459, + "step": 750 + }, + { + "epoch": 0.08710198431010614, + "grad_norm": 0.17594070732593536, + "learning_rate": 0.00017416378316032296, + "loss": 0.981, + "step": 755 + }, + { + "epoch": 0.08767881864328565, + "grad_norm": 0.17268770933151245, + "learning_rate": 0.00017531718569780856, + "loss": 0.9591, + "step": 760 + }, + { + "epoch": 0.08825565297646516, + "grad_norm": 0.1795402467250824, + "learning_rate": 0.00017647058823529413, + "loss": 1.0009, + "step": 765 + }, + { + "epoch": 0.08883248730964467, + "grad_norm": 0.17739154398441315, + "learning_rate": 0.00017762399077277973, + "loss": 1.0325, + "step": 770 + }, + { + "epoch": 0.08940932164282418, + "grad_norm": 0.1737346351146698, + "learning_rate": 0.0001787773933102653, + "loss": 0.966, + "step": 775 + }, + { + "epoch": 0.08998615597600369, + "grad_norm": 0.178639754652977, + "learning_rate": 0.00017993079584775087, + "loss": 0.9921, + "step": 780 + }, + { + "epoch": 0.0905629903091832, + "grad_norm": 0.18672531843185425, + "learning_rate": 0.00018108419838523644, + "loss": 0.9398, + "step": 785 + }, + { + "epoch": 0.09113982464236271, + "grad_norm": 0.17832833528518677, + "learning_rate": 0.00018223760092272203, + "loss": 1.0191, + "step": 790 + }, + { + "epoch": 0.09171665897554222, + "grad_norm": 0.17171098291873932, + "learning_rate": 0.0001833910034602076, + "loss": 0.9889, + "step": 795 + }, + { + "epoch": 0.09229349330872173, + "grad_norm": 0.17071138322353363, + "learning_rate": 0.0001845444059976932, + "loss": 0.9414, + "step": 800 + }, + { + "epoch": 0.09287032764190124, + "grad_norm": 0.17644046247005463, + "learning_rate": 0.0001856978085351788, + "loss": 
1.0259, + "step": 805 + }, + { + "epoch": 0.09344716197508075, + "grad_norm": 0.17984060943126678, + "learning_rate": 0.00018685121107266437, + "loss": 0.9826, + "step": 810 + }, + { + "epoch": 0.09402399630826026, + "grad_norm": 0.1776990294456482, + "learning_rate": 0.00018800461361014997, + "loss": 0.9663, + "step": 815 + }, + { + "epoch": 0.09460083064143977, + "grad_norm": 0.17558909952640533, + "learning_rate": 0.00018915801614763554, + "loss": 1.0345, + "step": 820 + }, + { + "epoch": 0.09517766497461928, + "grad_norm": 0.18706142902374268, + "learning_rate": 0.00019031141868512113, + "loss": 1.0199, + "step": 825 + }, + { + "epoch": 0.0957544993077988, + "grad_norm": 0.1777406483888626, + "learning_rate": 0.0001914648212226067, + "loss": 1.0523, + "step": 830 + }, + { + "epoch": 0.09633133364097832, + "grad_norm": 0.16959840059280396, + "learning_rate": 0.00019261822376009227, + "loss": 0.9452, + "step": 835 + }, + { + "epoch": 0.09690816797415783, + "grad_norm": 0.17502658069133759, + "learning_rate": 0.00019377162629757784, + "loss": 1.0417, + "step": 840 + }, + { + "epoch": 0.09748500230733734, + "grad_norm": 0.17616289854049683, + "learning_rate": 0.00019492502883506344, + "loss": 0.9871, + "step": 845 + }, + { + "epoch": 0.09806183664051685, + "grad_norm": 0.1845337152481079, + "learning_rate": 0.000196078431372549, + "loss": 0.9917, + "step": 850 + }, + { + "epoch": 0.09863867097369636, + "grad_norm": 0.1851508468389511, + "learning_rate": 0.0001972318339100346, + "loss": 1.0393, + "step": 855 + }, + { + "epoch": 0.09921550530687587, + "grad_norm": 0.1803300529718399, + "learning_rate": 0.00019838523644752018, + "loss": 0.975, + "step": 860 + }, + { + "epoch": 0.09979233964005538, + "grad_norm": 0.17024391889572144, + "learning_rate": 0.00019953863898500578, + "loss": 0.9605, + "step": 865 + }, + { + "epoch": 0.10036917397323489, + "grad_norm": 0.17882846295833588, + "learning_rate": 0.0001999999270186907, + "loss": 0.9835, + "step": 870 + }, + { + "epoch": 0.1009460083064144, + "grad_norm": 0.1729104071855545, + "learning_rate": 0.0001999994810221862, + "loss": 0.9661, + "step": 875 + }, + { + "epoch": 0.10152284263959391, + "grad_norm": 0.1663312315940857, + "learning_rate": 0.00019999862957615513, + "loss": 1.0276, + "step": 880 + }, + { + "epoch": 0.10209967697277342, + "grad_norm": 0.1741991490125656, + "learning_rate": 0.00019999737268404973, + "loss": 0.9306, + "step": 885 + }, + { + "epoch": 0.10267651130595293, + "grad_norm": 0.19253897666931152, + "learning_rate": 0.00019999571035096608, + "loss": 1.0522, + "step": 890 + }, + { + "epoch": 0.10325334563913244, + "grad_norm": 0.19019544124603271, + "learning_rate": 0.00019999364258364413, + "loss": 0.991, + "step": 895 + }, + { + "epoch": 0.10383017997231195, + "grad_norm": 0.17976698279380798, + "learning_rate": 0.00019999116939046764, + "loss": 0.9986, + "step": 900 + }, + { + "epoch": 0.10440701430549146, + "grad_norm": 0.18436984717845917, + "learning_rate": 0.0001999882907814643, + "loss": 0.9901, + "step": 905 + }, + { + "epoch": 0.10498384863867097, + "grad_norm": 0.17687635123729706, + "learning_rate": 0.0001999850067683054, + "loss": 1.0231, + "step": 910 + }, + { + "epoch": 0.10556068297185048, + "grad_norm": 0.18162792921066284, + "learning_rate": 0.00019998131736430604, + "loss": 0.9746, + "step": 915 + }, + { + "epoch": 0.10613751730503, + "grad_norm": 0.18052905797958374, + "learning_rate": 0.00019997722258442499, + "loss": 0.9929, + "step": 920 + }, + { + "epoch": 0.10671435163820951, + "grad_norm": 
0.1827809065580368, + "learning_rate": 0.00019997272244526456, + "loss": 1.0031, + "step": 925 + }, + { + "epoch": 0.10729118597138902, + "grad_norm": 0.1823299676179886, + "learning_rate": 0.00019996781696507069, + "loss": 0.969, + "step": 930 + }, + { + "epoch": 0.10786802030456853, + "grad_norm": 0.18722161650657654, + "learning_rate": 0.00019996250616373268, + "loss": 0.9922, + "step": 935 + }, + { + "epoch": 0.10844485463774804, + "grad_norm": 0.1760849803686142, + "learning_rate": 0.0001999567900627833, + "loss": 1.0221, + "step": 940 + }, + { + "epoch": 0.10902168897092755, + "grad_norm": 0.17676706612110138, + "learning_rate": 0.0001999506686853986, + "loss": 0.9472, + "step": 945 + }, + { + "epoch": 0.10959852330410706, + "grad_norm": 0.1758042424917221, + "learning_rate": 0.00019994414205639775, + "loss": 0.9059, + "step": 950 + }, + { + "epoch": 0.11017535763728657, + "grad_norm": 0.18398715555667877, + "learning_rate": 0.00019993721020224308, + "loss": 0.9649, + "step": 955 + }, + { + "epoch": 0.11075219197046608, + "grad_norm": 0.17959755659103394, + "learning_rate": 0.0001999298731510399, + "loss": 0.9904, + "step": 960 + }, + { + "epoch": 0.11132902630364559, + "grad_norm": 0.17280980944633484, + "learning_rate": 0.00019992213093253643, + "loss": 1.0314, + "step": 965 + }, + { + "epoch": 0.1119058606368251, + "grad_norm": 0.17635640501976013, + "learning_rate": 0.0001999139835781236, + "loss": 1.0148, + "step": 970 + }, + { + "epoch": 0.11248269497000461, + "grad_norm": 0.1721516102552414, + "learning_rate": 0.00019990543112083503, + "loss": 0.9573, + "step": 975 + }, + { + "epoch": 0.11305952930318412, + "grad_norm": 0.18502525985240936, + "learning_rate": 0.00019989647359534672, + "loss": 1.0328, + "step": 980 + }, + { + "epoch": 0.11363636363636363, + "grad_norm": 0.18054784834384918, + "learning_rate": 0.0001998871110379772, + "loss": 0.9956, + "step": 985 + }, + { + "epoch": 0.11421319796954314, + "grad_norm": 0.1870676875114441, + "learning_rate": 0.00019987734348668706, + "loss": 0.9665, + "step": 990 + }, + { + "epoch": 0.11479003230272265, + "grad_norm": 0.18523703515529633, + "learning_rate": 0.00019986717098107896, + "loss": 0.9926, + "step": 995 + }, + { + "epoch": 0.11536686663590216, + "grad_norm": 0.2257257103919983, + "learning_rate": 0.00019985659356239758, + "loss": 0.9635, + "step": 1000 + }, + { + "epoch": 0.11594370096908169, + "grad_norm": 0.1749376505613327, + "learning_rate": 0.00019984561127352914, + "loss": 0.9773, + "step": 1005 + }, + { + "epoch": 0.1165205353022612, + "grad_norm": 0.18685142695903778, + "learning_rate": 0.00019983422415900158, + "loss": 0.967, + "step": 1010 + }, + { + "epoch": 0.1170973696354407, + "grad_norm": 0.1762312352657318, + "learning_rate": 0.00019982243226498411, + "loss": 0.9666, + "step": 1015 + }, + { + "epoch": 0.11767420396862022, + "grad_norm": 0.18300583958625793, + "learning_rate": 0.00019981023563928716, + "loss": 0.9654, + "step": 1020 + }, + { + "epoch": 0.11825103830179973, + "grad_norm": 0.17595453560352325, + "learning_rate": 0.00019979763433136216, + "loss": 0.9668, + "step": 1025 + }, + { + "epoch": 0.11882787263497924, + "grad_norm": 0.17857778072357178, + "learning_rate": 0.00019978462839230133, + "loss": 0.9814, + "step": 1030 + }, + { + "epoch": 0.11940470696815875, + "grad_norm": 0.194159135222435, + "learning_rate": 0.0001997712178748374, + "loss": 0.9548, + "step": 1035 + }, + { + "epoch": 0.11998154130133826, + "grad_norm": 0.1872921586036682, + "learning_rate": 0.0001997574028333436, + "loss": 
0.9537, + "step": 1040 + }, + { + "epoch": 0.12055837563451777, + "grad_norm": 0.18175874650478363, + "learning_rate": 0.0001997431833238332, + "loss": 0.9708, + "step": 1045 + }, + { + "epoch": 0.12113520996769728, + "grad_norm": 0.17377467453479767, + "learning_rate": 0.00019972855940395947, + "loss": 1.0154, + "step": 1050 + }, + { + "epoch": 0.12171204430087679, + "grad_norm": 0.1744399070739746, + "learning_rate": 0.00019971353113301527, + "loss": 0.983, + "step": 1055 + }, + { + "epoch": 0.1222888786340563, + "grad_norm": 0.19033999741077423, + "learning_rate": 0.00019969809857193306, + "loss": 0.9915, + "step": 1060 + }, + { + "epoch": 0.12286571296723581, + "grad_norm": 0.18971246480941772, + "learning_rate": 0.0001996822617832843, + "loss": 0.974, + "step": 1065 + }, + { + "epoch": 0.12344254730041532, + "grad_norm": 0.18075977265834808, + "learning_rate": 0.0001996660208312796, + "loss": 0.9888, + "step": 1070 + }, + { + "epoch": 0.12401938163359483, + "grad_norm": 0.18304185569286346, + "learning_rate": 0.00019964937578176816, + "loss": 1.0237, + "step": 1075 + }, + { + "epoch": 0.12459621596677434, + "grad_norm": 0.18264040350914001, + "learning_rate": 0.00019963232670223752, + "loss": 1.023, + "step": 1080 + }, + { + "epoch": 0.12517305029995385, + "grad_norm": 0.17688079178333282, + "learning_rate": 0.00019961487366181355, + "loss": 0.9399, + "step": 1085 + }, + { + "epoch": 0.12574988463313336, + "grad_norm": 0.18298649787902832, + "learning_rate": 0.00019959701673125983, + "loss": 1.0495, + "step": 1090 + }, + { + "epoch": 0.12632671896631287, + "grad_norm": 0.1869022250175476, + "learning_rate": 0.00019957875598297759, + "loss": 0.9749, + "step": 1095 + }, + { + "epoch": 0.12690355329949238, + "grad_norm": 0.17531757056713104, + "learning_rate": 0.00019956009149100533, + "loss": 0.9841, + "step": 1100 + }, + { + "epoch": 0.1274803876326719, + "grad_norm": 0.18799515068531036, + "learning_rate": 0.00019954102333101856, + "loss": 0.985, + "step": 1105 + }, + { + "epoch": 0.1280572219658514, + "grad_norm": 0.19451211392879486, + "learning_rate": 0.0001995215515803294, + "loss": 1.0184, + "step": 1110 + }, + { + "epoch": 0.1286340562990309, + "grad_norm": 0.17992301285266876, + "learning_rate": 0.00019950167631788642, + "loss": 1.0029, + "step": 1115 + }, + { + "epoch": 0.12921089063221042, + "grad_norm": 0.18217399716377258, + "learning_rate": 0.00019948139762427416, + "loss": 0.9786, + "step": 1120 + }, + { + "epoch": 0.12978772496538993, + "grad_norm": 0.1724836379289627, + "learning_rate": 0.000199460715581713, + "loss": 0.9503, + "step": 1125 + }, + { + "epoch": 0.13036455929856944, + "grad_norm": 0.18973779678344727, + "learning_rate": 0.0001994396302740585, + "loss": 0.9589, + "step": 1130 + }, + { + "epoch": 0.13094139363174895, + "grad_norm": 0.1789364516735077, + "learning_rate": 0.00019941814178680144, + "loss": 1.0123, + "step": 1135 + }, + { + "epoch": 0.13151822796492849, + "grad_norm": 0.179600328207016, + "learning_rate": 0.00019939625020706724, + "loss": 0.9644, + "step": 1140 + }, + { + "epoch": 0.132095062298108, + "grad_norm": 0.20578639209270477, + "learning_rate": 0.00019937395562361564, + "loss": 0.9155, + "step": 1145 + }, + { + "epoch": 0.1326718966312875, + "grad_norm": 0.17455343902111053, + "learning_rate": 0.00019935125812684047, + "loss": 1.0081, + "step": 1150 + }, + { + "epoch": 0.13324873096446702, + "grad_norm": 0.18840928375720978, + "learning_rate": 0.00019932815780876904, + "loss": 0.9383, + "step": 1155 + }, + { + "epoch": 
0.13382556529764653, + "grad_norm": 0.18001633882522583, + "learning_rate": 0.00019930465476306197, + "loss": 0.997, + "step": 1160 + }, + { + "epoch": 0.13440239963082604, + "grad_norm": 0.18877308070659637, + "learning_rate": 0.00019928074908501272, + "loss": 1.0056, + "step": 1165 + }, + { + "epoch": 0.13497923396400555, + "grad_norm": 0.19247262179851532, + "learning_rate": 0.00019925644087154734, + "loss": 0.9396, + "step": 1170 + }, + { + "epoch": 0.13555606829718506, + "grad_norm": 0.1856631338596344, + "learning_rate": 0.00019923173022122378, + "loss": 0.9777, + "step": 1175 + }, + { + "epoch": 0.13613290263036457, + "grad_norm": 0.18794025480747223, + "learning_rate": 0.00019920661723423183, + "loss": 0.9611, + "step": 1180 + }, + { + "epoch": 0.13670973696354408, + "grad_norm": 0.18122021853923798, + "learning_rate": 0.00019918110201239247, + "loss": 0.9364, + "step": 1185 + }, + { + "epoch": 0.1372865712967236, + "grad_norm": 0.18005718290805817, + "learning_rate": 0.00019915518465915758, + "loss": 0.9338, + "step": 1190 + }, + { + "epoch": 0.1378634056299031, + "grad_norm": 0.1888824999332428, + "learning_rate": 0.00019912886527960954, + "loss": 0.9059, + "step": 1195 + }, + { + "epoch": 0.1384402399630826, + "grad_norm": 0.17332522571086884, + "learning_rate": 0.0001991021439804607, + "loss": 0.9822, + "step": 1200 + }, + { + "epoch": 0.13901707429626212, + "grad_norm": 0.18236401677131653, + "learning_rate": 0.00019907502087005297, + "loss": 1.0221, + "step": 1205 + }, + { + "epoch": 0.13959390862944163, + "grad_norm": 0.18748964369297028, + "learning_rate": 0.00019904749605835742, + "loss": 1.0282, + "step": 1210 + }, + { + "epoch": 0.14017074296262114, + "grad_norm": 0.18207697570323944, + "learning_rate": 0.00019901956965697387, + "loss": 1.0046, + "step": 1215 + }, + { + "epoch": 0.14074757729580065, + "grad_norm": 0.18568968772888184, + "learning_rate": 0.00019899124177913041, + "loss": 1.0182, + "step": 1220 + }, + { + "epoch": 0.14132441162898016, + "grad_norm": 0.17385929822921753, + "learning_rate": 0.00019896251253968288, + "loss": 0.956, + "step": 1225 + }, + { + "epoch": 0.14190124596215967, + "grad_norm": 0.18582086265087128, + "learning_rate": 0.0001989333820551144, + "loss": 1.0011, + "step": 1230 + }, + { + "epoch": 0.14247808029533918, + "grad_norm": 0.18691149353981018, + "learning_rate": 0.00019890385044353501, + "loss": 1.0031, + "step": 1235 + }, + { + "epoch": 0.1430549146285187, + "grad_norm": 0.17932343482971191, + "learning_rate": 0.00019887391782468113, + "loss": 1.0163, + "step": 1240 + }, + { + "epoch": 0.1436317489616982, + "grad_norm": 0.1789650321006775, + "learning_rate": 0.000198843584319915, + "loss": 0.9428, + "step": 1245 + }, + { + "epoch": 0.1442085832948777, + "grad_norm": 0.17853863537311554, + "learning_rate": 0.0001988128500522244, + "loss": 0.9619, + "step": 1250 + }, + { + "epoch": 0.14478541762805722, + "grad_norm": 0.18588118255138397, + "learning_rate": 0.00019878171514622187, + "loss": 0.9928, + "step": 1255 + }, + { + "epoch": 0.14536225196123673, + "grad_norm": 0.1837441772222519, + "learning_rate": 0.00019875017972814435, + "loss": 0.9711, + "step": 1260 + }, + { + "epoch": 0.14593908629441624, + "grad_norm": 0.1958654373884201, + "learning_rate": 0.00019871824392585276, + "loss": 0.9413, + "step": 1265 + }, + { + "epoch": 0.14651592062759575, + "grad_norm": 0.19134190678596497, + "learning_rate": 0.00019868590786883134, + "loss": 0.9717, + "step": 1270 + }, + { + "epoch": 0.14709275496077526, + "grad_norm": 
0.18663935363292694, + "learning_rate": 0.00019865317168818713, + "loss": 0.9806, + "step": 1275 + }, + { + "epoch": 0.14766958929395477, + "grad_norm": 0.18832144141197205, + "learning_rate": 0.0001986200355166495, + "loss": 0.9256, + "step": 1280 + }, + { + "epoch": 0.14824642362713428, + "grad_norm": 0.17965355515480042, + "learning_rate": 0.0001985864994885697, + "loss": 0.9852, + "step": 1285 + }, + { + "epoch": 0.1488232579603138, + "grad_norm": 0.18879751861095428, + "learning_rate": 0.00019855256373991993, + "loss": 0.9489, + "step": 1290 + }, + { + "epoch": 0.1494000922934933, + "grad_norm": 0.18717263638973236, + "learning_rate": 0.00019851822840829338, + "loss": 0.9698, + "step": 1295 + }, + { + "epoch": 0.1499769266266728, + "grad_norm": 0.19117680191993713, + "learning_rate": 0.0001984834936329031, + "loss": 0.9967, + "step": 1300 + }, + { + "epoch": 0.15055376095985232, + "grad_norm": 0.18893282115459442, + "learning_rate": 0.00019844835955458193, + "loss": 0.9391, + "step": 1305 + }, + { + "epoch": 0.15113059529303183, + "grad_norm": 0.1852642297744751, + "learning_rate": 0.00019841282631578145, + "loss": 0.9871, + "step": 1310 + }, + { + "epoch": 0.15170742962621137, + "grad_norm": 0.1851365864276886, + "learning_rate": 0.00019837689406057183, + "loss": 0.9459, + "step": 1315 + }, + { + "epoch": 0.15228426395939088, + "grad_norm": 0.216008722782135, + "learning_rate": 0.00019834056293464093, + "loss": 0.9901, + "step": 1320 + }, + { + "epoch": 0.15286109829257039, + "grad_norm": 0.17874599993228912, + "learning_rate": 0.00019830383308529393, + "loss": 0.984, + "step": 1325 + }, + { + "epoch": 0.1534379326257499, + "grad_norm": 0.18888545036315918, + "learning_rate": 0.00019826670466145262, + "loss": 0.9617, + "step": 1330 + }, + { + "epoch": 0.1540147669589294, + "grad_norm": 0.1813315898180008, + "learning_rate": 0.00019822917781365474, + "loss": 0.9944, + "step": 1335 + }, + { + "epoch": 0.15459160129210892, + "grad_norm": 0.18800750374794006, + "learning_rate": 0.00019819125269405352, + "loss": 0.9283, + "step": 1340 + }, + { + "epoch": 0.15516843562528843, + "grad_norm": 0.19481781125068665, + "learning_rate": 0.00019815292945641705, + "loss": 0.9559, + "step": 1345 + }, + { + "epoch": 0.15574526995846794, + "grad_norm": 0.17894810438156128, + "learning_rate": 0.0001981142082561274, + "loss": 0.9628, + "step": 1350 + }, + { + "epoch": 0.15632210429164745, + "grad_norm": 0.1818206012248993, + "learning_rate": 0.0001980750892501804, + "loss": 1.0073, + "step": 1355 + }, + { + "epoch": 0.15689893862482696, + "grad_norm": 0.19612440466880798, + "learning_rate": 0.0001980355725971847, + "loss": 0.9837, + "step": 1360 + }, + { + "epoch": 0.15747577295800647, + "grad_norm": 0.1835489571094513, + "learning_rate": 0.0001979956584573612, + "loss": 1.0062, + "step": 1365 + }, + { + "epoch": 0.15805260729118598, + "grad_norm": 0.18093307316303253, + "learning_rate": 0.00019795534699254238, + "loss": 0.9496, + "step": 1370 + }, + { + "epoch": 0.15862944162436549, + "grad_norm": 0.1910361796617508, + "learning_rate": 0.00019791463836617176, + "loss": 1.0064, + "step": 1375 + }, + { + "epoch": 0.159206275957545, + "grad_norm": 0.17584437131881714, + "learning_rate": 0.00019787353274330313, + "loss": 0.9604, + "step": 1380 + }, + { + "epoch": 0.1597831102907245, + "grad_norm": 0.193894624710083, + "learning_rate": 0.00019783203029059997, + "loss": 0.9816, + "step": 1385 + }, + { + "epoch": 0.16035994462390402, + "grad_norm": 0.18043072521686554, + "learning_rate": 
0.00019779013117633454, + "loss": 0.9106, + "step": 1390 + }, + { + "epoch": 0.16093677895708353, + "grad_norm": 0.18467725813388824, + "learning_rate": 0.00019774783557038755, + "loss": 0.9019, + "step": 1395 + }, + { + "epoch": 0.16151361329026304, + "grad_norm": 0.17407982051372528, + "learning_rate": 0.00019770514364424725, + "loss": 0.9465, + "step": 1400 + }, + { + "epoch": 0.16209044762344255, + "grad_norm": 0.18963223695755005, + "learning_rate": 0.00019766205557100868, + "loss": 0.9775, + "step": 1405 + }, + { + "epoch": 0.16266728195662206, + "grad_norm": 0.18725010752677917, + "learning_rate": 0.0001976185715253732, + "loss": 0.9709, + "step": 1410 + }, + { + "epoch": 0.16324411628980157, + "grad_norm": 0.17535746097564697, + "learning_rate": 0.0001975746916836475, + "loss": 0.9495, + "step": 1415 + }, + { + "epoch": 0.16382095062298108, + "grad_norm": 0.19157184660434723, + "learning_rate": 0.0001975304162237432, + "loss": 0.9545, + "step": 1420 + }, + { + "epoch": 0.1643977849561606, + "grad_norm": 0.20112945139408112, + "learning_rate": 0.00019748574532517586, + "loss": 0.9671, + "step": 1425 + }, + { + "epoch": 0.1649746192893401, + "grad_norm": 0.19094939529895782, + "learning_rate": 0.0001974406791690643, + "loss": 0.9768, + "step": 1430 + }, + { + "epoch": 0.1655514536225196, + "grad_norm": 0.175222247838974, + "learning_rate": 0.00019739521793813006, + "loss": 0.9699, + "step": 1435 + }, + { + "epoch": 0.16612828795569912, + "grad_norm": 0.18948869407176971, + "learning_rate": 0.00019734936181669638, + "loss": 1.0102, + "step": 1440 + }, + { + "epoch": 0.16670512228887863, + "grad_norm": 0.18475863337516785, + "learning_rate": 0.00019730311099068771, + "loss": 0.922, + "step": 1445 + }, + { + "epoch": 0.16728195662205814, + "grad_norm": 0.18690991401672363, + "learning_rate": 0.00019725646564762878, + "loss": 0.9693, + "step": 1450 + }, + { + "epoch": 0.16785879095523765, + "grad_norm": 0.1863308697938919, + "learning_rate": 0.00019720942597664385, + "loss": 0.9639, + "step": 1455 + }, + { + "epoch": 0.16843562528841716, + "grad_norm": 0.18741768598556519, + "learning_rate": 0.00019716199216845604, + "loss": 1.0382, + "step": 1460 + }, + { + "epoch": 0.16901245962159667, + "grad_norm": 0.18736031651496887, + "learning_rate": 0.00019711416441538652, + "loss": 1.0025, + "step": 1465 + }, + { + "epoch": 0.16958929395477618, + "grad_norm": 0.18398988246917725, + "learning_rate": 0.00019706594291135366, + "loss": 0.9566, + "step": 1470 + }, + { + "epoch": 0.1701661282879557, + "grad_norm": 0.18206271529197693, + "learning_rate": 0.0001970173278518724, + "loss": 0.9727, + "step": 1475 + }, + { + "epoch": 0.1707429626211352, + "grad_norm": 0.24337929487228394, + "learning_rate": 0.00019696831943405324, + "loss": 1.0105, + "step": 1480 + }, + { + "epoch": 0.1713197969543147, + "grad_norm": 0.20907312631607056, + "learning_rate": 0.0001969189178566016, + "loss": 0.9619, + "step": 1485 + }, + { + "epoch": 0.17189663128749424, + "grad_norm": 0.1832038313150406, + "learning_rate": 0.00019686912331981702, + "loss": 0.9998, + "step": 1490 + }, + { + "epoch": 0.17247346562067375, + "grad_norm": 0.1847505420446396, + "learning_rate": 0.00019681893602559224, + "loss": 0.9444, + "step": 1495 + }, + { + "epoch": 0.17305029995385326, + "grad_norm": 0.19578874111175537, + "learning_rate": 0.00019676835617741249, + "loss": 0.966, + "step": 1500 + }, + { + "epoch": 0.17362713428703277, + "grad_norm": 0.20430216193199158, + "learning_rate": 0.0001967173839803545, + "loss": 0.9983, + "step": 
1505 + }, + { + "epoch": 0.17420396862021229, + "grad_norm": 0.19105270504951477, + "learning_rate": 0.00019666601964108598, + "loss": 0.9622, + "step": 1510 + }, + { + "epoch": 0.1747808029533918, + "grad_norm": 0.19650229811668396, + "learning_rate": 0.00019661426336786445, + "loss": 0.924, + "step": 1515 + }, + { + "epoch": 0.1753576372865713, + "grad_norm": 0.18799568712711334, + "learning_rate": 0.00019656211537053654, + "loss": 0.9319, + "step": 1520 + }, + { + "epoch": 0.17593447161975082, + "grad_norm": 0.19247035682201385, + "learning_rate": 0.00019650957586053716, + "loss": 0.9913, + "step": 1525 + }, + { + "epoch": 0.17651130595293033, + "grad_norm": 0.18789616227149963, + "learning_rate": 0.00019645664505088864, + "loss": 0.8992, + "step": 1530 + }, + { + "epoch": 0.17708814028610984, + "grad_norm": 0.18446215987205505, + "learning_rate": 0.00019640332315619977, + "loss": 0.987, + "step": 1535 + }, + { + "epoch": 0.17766497461928935, + "grad_norm": 0.1788845956325531, + "learning_rate": 0.00019634961039266506, + "loss": 0.9455, + "step": 1540 + }, + { + "epoch": 0.17824180895246886, + "grad_norm": 0.17681817710399628, + "learning_rate": 0.0001962955069780638, + "loss": 1.0042, + "step": 1545 + }, + { + "epoch": 0.17881864328564837, + "grad_norm": 0.18479640781879425, + "learning_rate": 0.00019624101313175918, + "loss": 0.9973, + "step": 1550 + }, + { + "epoch": 0.17939547761882788, + "grad_norm": 0.17797234654426575, + "learning_rate": 0.00019618612907469732, + "loss": 0.959, + "step": 1555 + }, + { + "epoch": 0.17997231195200739, + "grad_norm": 0.1861361563205719, + "learning_rate": 0.00019613085502940658, + "loss": 0.9529, + "step": 1560 + }, + { + "epoch": 0.1805491462851869, + "grad_norm": 0.19123396277427673, + "learning_rate": 0.00019607519121999647, + "loss": 0.9506, + "step": 1565 + }, + { + "epoch": 0.1811259806183664, + "grad_norm": 0.19712139666080475, + "learning_rate": 0.00019601913787215683, + "loss": 1.0023, + "step": 1570 + }, + { + "epoch": 0.18170281495154592, + "grad_norm": 0.18968220055103302, + "learning_rate": 0.0001959626952131568, + "loss": 0.9354, + "step": 1575 + }, + { + "epoch": 0.18227964928472543, + "grad_norm": 0.1877613365650177, + "learning_rate": 0.00019590586347184417, + "loss": 0.9825, + "step": 1580 + }, + { + "epoch": 0.18285648361790494, + "grad_norm": 0.1915743201971054, + "learning_rate": 0.00019584864287864408, + "loss": 0.9456, + "step": 1585 + }, + { + "epoch": 0.18343331795108445, + "grad_norm": 0.19143231213092804, + "learning_rate": 0.0001957910336655584, + "loss": 0.9993, + "step": 1590 + }, + { + "epoch": 0.18401015228426396, + "grad_norm": 0.1909196525812149, + "learning_rate": 0.00019573303606616459, + "loss": 0.9774, + "step": 1595 + }, + { + "epoch": 0.18458698661744347, + "grad_norm": 0.19479255378246307, + "learning_rate": 0.00019567465031561487, + "loss": 0.9727, + "step": 1600 + }, + { + "epoch": 0.18516382095062298, + "grad_norm": 0.18205581605434418, + "learning_rate": 0.0001956158766506352, + "loss": 0.9786, + "step": 1605 + }, + { + "epoch": 0.18574065528380249, + "grad_norm": 0.19251278042793274, + "learning_rate": 0.00019555671530952445, + "loss": 0.9815, + "step": 1610 + }, + { + "epoch": 0.186317489616982, + "grad_norm": 0.1939883977174759, + "learning_rate": 0.00019549716653215318, + "loss": 0.9449, + "step": 1615 + }, + { + "epoch": 0.1868943239501615, + "grad_norm": 0.187269926071167, + "learning_rate": 0.00019543723055996282, + "loss": 0.9425, + "step": 1620 + }, + { + "epoch": 0.18747115828334102, + 
"grad_norm": 0.18750888109207153, + "learning_rate": 0.00019537690763596487, + "loss": 0.961, + "step": 1625 + }, + { + "epoch": 0.18804799261652053, + "grad_norm": 0.20002637803554535, + "learning_rate": 0.00019531619800473952, + "loss": 0.9399, + "step": 1630 + }, + { + "epoch": 0.18862482694970004, + "grad_norm": 0.19453556835651398, + "learning_rate": 0.00019525510191243498, + "loss": 0.9138, + "step": 1635 + }, + { + "epoch": 0.18920166128287955, + "grad_norm": 0.18952696025371552, + "learning_rate": 0.0001951936196067664, + "loss": 1.0256, + "step": 1640 + }, + { + "epoch": 0.18977849561605906, + "grad_norm": 0.22311453521251678, + "learning_rate": 0.00019513175133701474, + "loss": 0.9247, + "step": 1645 + }, + { + "epoch": 0.19035532994923857, + "grad_norm": 0.1846632957458496, + "learning_rate": 0.00019506949735402588, + "loss": 0.9555, + "step": 1650 + }, + { + "epoch": 0.19093216428241808, + "grad_norm": 0.1898278146982193, + "learning_rate": 0.00019500685791020968, + "loss": 0.9954, + "step": 1655 + }, + { + "epoch": 0.1915089986155976, + "grad_norm": 0.18607108294963837, + "learning_rate": 0.00019494383325953875, + "loss": 0.9065, + "step": 1660 + }, + { + "epoch": 0.19208583294877712, + "grad_norm": 0.19422227144241333, + "learning_rate": 0.00019488042365754758, + "loss": 0.9198, + "step": 1665 + }, + { + "epoch": 0.19266266728195663, + "grad_norm": 0.1838252693414688, + "learning_rate": 0.0001948166293613314, + "loss": 0.9501, + "step": 1670 + }, + { + "epoch": 0.19323950161513614, + "grad_norm": 0.1975485235452652, + "learning_rate": 0.00019475245062954523, + "loss": 0.9422, + "step": 1675 + }, + { + "epoch": 0.19381633594831565, + "grad_norm": 0.19868746399879456, + "learning_rate": 0.00019468788772240286, + "loss": 0.9407, + "step": 1680 + }, + { + "epoch": 0.19439317028149516, + "grad_norm": 0.19394518435001373, + "learning_rate": 0.00019462294090167554, + "loss": 0.9775, + "step": 1685 + }, + { + "epoch": 0.19497000461467467, + "grad_norm": 0.19594334065914154, + "learning_rate": 0.0001945576104306913, + "loss": 0.9393, + "step": 1690 + }, + { + "epoch": 0.19554683894785418, + "grad_norm": 0.18195070326328278, + "learning_rate": 0.00019449189657433358, + "loss": 0.9847, + "step": 1695 + }, + { + "epoch": 0.1961236732810337, + "grad_norm": 0.19553768634796143, + "learning_rate": 0.00019442579959904024, + "loss": 0.946, + "step": 1700 + }, + { + "epoch": 0.1967005076142132, + "grad_norm": 0.18642808496952057, + "learning_rate": 0.0001943593197728026, + "loss": 0.938, + "step": 1705 + }, + { + "epoch": 0.19727734194739271, + "grad_norm": 0.18931232392787933, + "learning_rate": 0.00019429245736516415, + "loss": 0.9308, + "step": 1710 + }, + { + "epoch": 0.19785417628057222, + "grad_norm": 0.1960645318031311, + "learning_rate": 0.00019422521264721962, + "loss": 0.9677, + "step": 1715 + }, + { + "epoch": 0.19843101061375173, + "grad_norm": 0.18322202563285828, + "learning_rate": 0.00019415758589161385, + "loss": 0.9631, + "step": 1720 + }, + { + "epoch": 0.19900784494693124, + "grad_norm": 0.19517329335212708, + "learning_rate": 0.0001940895773725406, + "loss": 0.9644, + "step": 1725 + }, + { + "epoch": 0.19958467928011075, + "grad_norm": 0.19266700744628906, + "learning_rate": 0.00019402118736574155, + "loss": 0.9747, + "step": 1730 + }, + { + "epoch": 0.20016151361329027, + "grad_norm": 0.18284273147583008, + "learning_rate": 0.00019395241614850504, + "loss": 0.922, + "step": 1735 + }, + { + "epoch": 0.20073834794646978, + "grad_norm": 0.19635812938213348, + "learning_rate": 
0.00019388326399966515, + "loss": 0.9793, + "step": 1740 + }, + { + "epoch": 0.20131518227964929, + "grad_norm": 0.19478964805603027, + "learning_rate": 0.00019381373119960033, + "loss": 0.9948, + "step": 1745 + }, + { + "epoch": 0.2018920166128288, + "grad_norm": 0.2036595195531845, + "learning_rate": 0.00019374381803023252, + "loss": 1.0269, + "step": 1750 + }, + { + "epoch": 0.2024688509460083, + "grad_norm": 0.20682990550994873, + "learning_rate": 0.00019367352477502576, + "loss": 0.9985, + "step": 1755 + }, + { + "epoch": 0.20304568527918782, + "grad_norm": 0.178926020860672, + "learning_rate": 0.0001936028517189852, + "loss": 0.9763, + "step": 1760 + }, + { + "epoch": 0.20362251961236733, + "grad_norm": 0.19505412876605988, + "learning_rate": 0.00019353179914865596, + "loss": 0.9484, + "step": 1765 + }, + { + "epoch": 0.20419935394554684, + "grad_norm": 0.20620250701904297, + "learning_rate": 0.00019346036735212177, + "loss": 0.9963, + "step": 1770 + }, + { + "epoch": 0.20477618827872635, + "grad_norm": 0.19574880599975586, + "learning_rate": 0.00019338855661900405, + "loss": 0.9976, + "step": 1775 + }, + { + "epoch": 0.20535302261190586, + "grad_norm": 0.19267630577087402, + "learning_rate": 0.00019331636724046058, + "loss": 0.9666, + "step": 1780 + }, + { + "epoch": 0.20592985694508537, + "grad_norm": 0.18831545114517212, + "learning_rate": 0.00019324379950918437, + "loss": 1.0032, + "step": 1785 + }, + { + "epoch": 0.20650669127826488, + "grad_norm": 0.19826582074165344, + "learning_rate": 0.00019317085371940246, + "loss": 0.9847, + "step": 1790 + }, + { + "epoch": 0.20708352561144439, + "grad_norm": 0.19098900258541107, + "learning_rate": 0.00019309753016687477, + "loss": 0.9777, + "step": 1795 + }, + { + "epoch": 0.2076603599446239, + "grad_norm": 0.19980227947235107, + "learning_rate": 0.00019302382914889284, + "loss": 0.9907, + "step": 1800 + }, + { + "epoch": 0.2082371942778034, + "grad_norm": 0.18145912885665894, + "learning_rate": 0.00019294975096427862, + "loss": 0.9465, + "step": 1805 + }, + { + "epoch": 0.20881402861098292, + "grad_norm": 0.18919304013252258, + "learning_rate": 0.00019287529591338333, + "loss": 0.9272, + "step": 1810 + }, + { + "epoch": 0.20939086294416243, + "grad_norm": 0.18271850049495697, + "learning_rate": 0.0001928004642980862, + "loss": 0.9525, + "step": 1815 + }, + { + "epoch": 0.20996769727734194, + "grad_norm": 0.21809083223342896, + "learning_rate": 0.00019272525642179323, + "loss": 0.972, + "step": 1820 + }, + { + "epoch": 0.21054453161052145, + "grad_norm": 0.19276146590709686, + "learning_rate": 0.00019264967258943595, + "loss": 0.9335, + "step": 1825 + }, + { + "epoch": 0.21112136594370096, + "grad_norm": 0.18455006182193756, + "learning_rate": 0.0001925737131074703, + "loss": 0.9558, + "step": 1830 + }, + { + "epoch": 0.2116982002768805, + "grad_norm": 0.2075817883014679, + "learning_rate": 0.00019249737828387522, + "loss": 1.0067, + "step": 1835 + }, + { + "epoch": 0.21227503461006, + "grad_norm": 0.18990999460220337, + "learning_rate": 0.00019242066842815146, + "loss": 0.9964, + "step": 1840 + }, + { + "epoch": 0.21285186894323951, + "grad_norm": 0.19079752266407013, + "learning_rate": 0.00019234358385132038, + "loss": 0.9827, + "step": 1845 + }, + { + "epoch": 0.21342870327641902, + "grad_norm": 0.187656432390213, + "learning_rate": 0.00019226612486592271, + "loss": 0.9619, + "step": 1850 + }, + { + "epoch": 0.21400553760959853, + "grad_norm": 0.19485455751419067, + "learning_rate": 0.00019218829178601713, + "loss": 0.9874, + "step": 
1855 + }, + { + "epoch": 0.21458237194277804, + "grad_norm": 0.19166654348373413, + "learning_rate": 0.00019211008492717914, + "loss": 1.0142, + "step": 1860 + }, + { + "epoch": 0.21515920627595755, + "grad_norm": 0.20272095501422882, + "learning_rate": 0.0001920315046064997, + "loss": 0.9981, + "step": 1865 + }, + { + "epoch": 0.21573604060913706, + "grad_norm": 0.18274882435798645, + "learning_rate": 0.00019195255114258408, + "loss": 0.9761, + "step": 1870 + }, + { + "epoch": 0.21631287494231657, + "grad_norm": 0.18468588590621948, + "learning_rate": 0.00019187322485555031, + "loss": 0.9657, + "step": 1875 + }, + { + "epoch": 0.21688970927549608, + "grad_norm": 0.19172626733779907, + "learning_rate": 0.00019179352606702813, + "loss": 0.9432, + "step": 1880 + }, + { + "epoch": 0.2174665436086756, + "grad_norm": 0.1883264034986496, + "learning_rate": 0.00019171345510015758, + "loss": 0.9939, + "step": 1885 + }, + { + "epoch": 0.2180433779418551, + "grad_norm": 0.1864314079284668, + "learning_rate": 0.0001916330122795877, + "loss": 1.0028, + "step": 1890 + }, + { + "epoch": 0.21862021227503461, + "grad_norm": 0.19553621113300323, + "learning_rate": 0.00019155219793147522, + "loss": 0.9866, + "step": 1895 + }, + { + "epoch": 0.21919704660821412, + "grad_norm": 0.1937098354101181, + "learning_rate": 0.00019147101238348326, + "loss": 0.9515, + "step": 1900 + }, + { + "epoch": 0.21977388094139363, + "grad_norm": 0.1826750636100769, + "learning_rate": 0.00019138945596477994, + "loss": 0.9832, + "step": 1905 + }, + { + "epoch": 0.22035071527457314, + "grad_norm": 0.19543537497520447, + "learning_rate": 0.00019130752900603702, + "loss": 0.9677, + "step": 1910 + }, + { + "epoch": 0.22092754960775265, + "grad_norm": 0.18481284379959106, + "learning_rate": 0.00019122523183942879, + "loss": 0.9081, + "step": 1915 + }, + { + "epoch": 0.22150438394093216, + "grad_norm": 0.17375807464122772, + "learning_rate": 0.00019114256479863038, + "loss": 0.9674, + "step": 1920 + }, + { + "epoch": 0.22208121827411167, + "grad_norm": 0.19649936258792877, + "learning_rate": 0.00019105952821881668, + "loss": 0.9693, + "step": 1925 + }, + { + "epoch": 0.22265805260729118, + "grad_norm": 0.18931066989898682, + "learning_rate": 0.00019097612243666086, + "loss": 0.9665, + "step": 1930 + }, + { + "epoch": 0.2232348869404707, + "grad_norm": 0.18258830904960632, + "learning_rate": 0.00019089234779033306, + "loss": 0.9844, + "step": 1935 + }, + { + "epoch": 0.2238117212736502, + "grad_norm": 0.2044518142938614, + "learning_rate": 0.00019080820461949886, + "loss": 0.9293, + "step": 1940 + }, + { + "epoch": 0.22438855560682971, + "grad_norm": 0.1841280460357666, + "learning_rate": 0.00019072369326531824, + "loss": 0.9823, + "step": 1945 + }, + { + "epoch": 0.22496538994000922, + "grad_norm": 0.1927040070295334, + "learning_rate": 0.00019063881407044373, + "loss": 0.993, + "step": 1950 + }, + { + "epoch": 0.22554222427318874, + "grad_norm": 0.19881246984004974, + "learning_rate": 0.00019055356737901952, + "loss": 0.9679, + "step": 1955 + }, + { + "epoch": 0.22611905860636825, + "grad_norm": 0.18685629963874817, + "learning_rate": 0.00019046795353667965, + "loss": 0.961, + "step": 1960 + }, + { + "epoch": 0.22669589293954776, + "grad_norm": 0.1891525834798813, + "learning_rate": 0.00019038197289054684, + "loss": 0.9496, + "step": 1965 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.1977647989988327, + "learning_rate": 0.00019029562578923106, + "loss": 0.9804, + "step": 1970 + }, + { + "epoch": 0.22784956160590678, + 
"grad_norm": 0.19999738037586212, + "learning_rate": 0.000190208912582828, + "loss": 1.0139, + "step": 1975 + }, + { + "epoch": 0.22842639593908629, + "grad_norm": 0.18472820520401, + "learning_rate": 0.0001901218336229178, + "loss": 0.989, + "step": 1980 + }, + { + "epoch": 0.2290032302722658, + "grad_norm": 0.18772737681865692, + "learning_rate": 0.0001900343892625635, + "loss": 0.9869, + "step": 1985 + }, + { + "epoch": 0.2295800646054453, + "grad_norm": 0.21390481293201447, + "learning_rate": 0.00018994657985630972, + "loss": 0.9556, + "step": 1990 + }, + { + "epoch": 0.23015689893862482, + "grad_norm": 0.18885917961597443, + "learning_rate": 0.00018985840576018107, + "loss": 0.951, + "step": 1995 + }, + { + "epoch": 0.23073373327180433, + "grad_norm": 0.18941619992256165, + "learning_rate": 0.00018976986733168093, + "loss": 0.9348, + "step": 2000 + }, + { + "epoch": 0.23131056760498384, + "grad_norm": 0.18609663844108582, + "learning_rate": 0.00018968096492978976, + "loss": 0.9704, + "step": 2005 + }, + { + "epoch": 0.23188740193816337, + "grad_norm": 0.19215025007724762, + "learning_rate": 0.0001895916989149638, + "loss": 0.9188, + "step": 2010 + }, + { + "epoch": 0.23246423627134288, + "grad_norm": 0.18575677275657654, + "learning_rate": 0.00018950206964913355, + "loss": 0.9793, + "step": 2015 + }, + { + "epoch": 0.2330410706045224, + "grad_norm": 0.198299378156662, + "learning_rate": 0.00018941207749570237, + "loss": 0.9936, + "step": 2020 + }, + { + "epoch": 0.2336179049377019, + "grad_norm": 0.1827745884656906, + "learning_rate": 0.0001893217228195449, + "loss": 0.9306, + "step": 2025 + }, + { + "epoch": 0.2341947392708814, + "grad_norm": 0.19425687193870544, + "learning_rate": 0.00018923100598700561, + "loss": 1.0467, + "step": 2030 + }, + { + "epoch": 0.23477157360406092, + "grad_norm": 0.18473173677921295, + "learning_rate": 0.00018913992736589746, + "loss": 1.0026, + "step": 2035 + }, + { + "epoch": 0.23534840793724043, + "grad_norm": 0.1885528862476349, + "learning_rate": 0.0001890484873255001, + "loss": 0.9516, + "step": 2040 + }, + { + "epoch": 0.23592524227041994, + "grad_norm": 0.19046476483345032, + "learning_rate": 0.00018895668623655873, + "loss": 0.9957, + "step": 2045 + }, + { + "epoch": 0.23650207660359945, + "grad_norm": 0.1952226310968399, + "learning_rate": 0.0001888645244712824, + "loss": 0.98, + "step": 2050 + }, + { + "epoch": 0.23707891093677896, + "grad_norm": 0.1928633153438568, + "learning_rate": 0.00018877200240334236, + "loss": 1.0137, + "step": 2055 + }, + { + "epoch": 0.23765574526995847, + "grad_norm": 0.1882990151643753, + "learning_rate": 0.00018867912040787096, + "loss": 1.0047, + "step": 2060 + }, + { + "epoch": 0.23823257960313798, + "grad_norm": 0.20092810690402985, + "learning_rate": 0.00018858587886145975, + "loss": 0.9555, + "step": 2065 + }, + { + "epoch": 0.2388094139363175, + "grad_norm": 0.19786730408668518, + "learning_rate": 0.00018849227814215805, + "loss": 0.9894, + "step": 2070 + }, + { + "epoch": 0.239386248269497, + "grad_norm": 0.19983135163784027, + "learning_rate": 0.00018839831862947152, + "loss": 0.9391, + "step": 2075 + }, + { + "epoch": 0.23996308260267651, + "grad_norm": 0.19174017012119293, + "learning_rate": 0.00018830400070436057, + "loss": 0.9526, + "step": 2080 + }, + { + "epoch": 0.24053991693585602, + "grad_norm": 0.1841077357530594, + "learning_rate": 0.00018820932474923873, + "loss": 0.9744, + "step": 2085 + }, + { + "epoch": 0.24111675126903553, + "grad_norm": 0.19150149822235107, + "learning_rate": 
0.00018811429114797123, + "loss": 0.9676, + "step": 2090 + }, + { + "epoch": 0.24169358560221504, + "grad_norm": 0.20261262357234955, + "learning_rate": 0.00018801890028587333, + "loss": 0.9706, + "step": 2095 + }, + { + "epoch": 0.24227041993539455, + "grad_norm": 0.199992835521698, + "learning_rate": 0.0001879231525497089, + "loss": 1.0026, + "step": 2100 + }, + { + "epoch": 0.24284725426857406, + "grad_norm": 0.20790398120880127, + "learning_rate": 0.0001878270483276886, + "loss": 1.0086, + "step": 2105 + }, + { + "epoch": 0.24342408860175357, + "grad_norm": 0.1923636943101883, + "learning_rate": 0.00018773058800946858, + "loss": 0.932, + "step": 2110 + }, + { + "epoch": 0.24400092293493308, + "grad_norm": 0.18227846920490265, + "learning_rate": 0.00018763377198614887, + "loss": 0.9612, + "step": 2115 + }, + { + "epoch": 0.2445777572681126, + "grad_norm": 0.18197417259216309, + "learning_rate": 0.00018753660065027152, + "loss": 1.0405, + "step": 2120 + }, + { + "epoch": 0.2451545916012921, + "grad_norm": 0.1864314079284668, + "learning_rate": 0.00018743907439581933, + "loss": 0.9918, + "step": 2125 + }, + { + "epoch": 0.24573142593447161, + "grad_norm": 0.18296851217746735, + "learning_rate": 0.0001873411936182141, + "loss": 0.9839, + "step": 2130 + }, + { + "epoch": 0.24630826026765112, + "grad_norm": 0.20536720752716064, + "learning_rate": 0.000187242958714315, + "loss": 0.9955, + "step": 2135 + }, + { + "epoch": 0.24688509460083063, + "grad_norm": 0.18991383910179138, + "learning_rate": 0.00018714437008241709, + "loss": 0.9564, + "step": 2140 + }, + { + "epoch": 0.24746192893401014, + "grad_norm": 0.21389083564281464, + "learning_rate": 0.00018704542812224956, + "loss": 1.0121, + "step": 2145 + }, + { + "epoch": 0.24803876326718965, + "grad_norm": 0.19424885511398315, + "learning_rate": 0.00018694613323497422, + "loss": 0.9919, + "step": 2150 + }, + { + "epoch": 0.24861559760036916, + "grad_norm": 0.20211443305015564, + "learning_rate": 0.0001868464858231838, + "loss": 1.0288, + "step": 2155 + }, + { + "epoch": 0.24919243193354867, + "grad_norm": 0.1808236688375473, + "learning_rate": 0.0001867464862909004, + "loss": 0.8799, + "step": 2160 + }, + { + "epoch": 0.24976926626672818, + "grad_norm": 0.19431866705417633, + "learning_rate": 0.00018664613504357366, + "loss": 0.942, + "step": 2165 + }, + { + "epoch": 0.2503461005999077, + "grad_norm": 0.18865206837654114, + "learning_rate": 0.0001865454324880794, + "loss": 0.9667, + "step": 2170 + }, + { + "epoch": 0.2509229349330872, + "grad_norm": 0.1844114363193512, + "learning_rate": 0.00018644437903271778, + "loss": 0.9783, + "step": 2175 + }, + { + "epoch": 0.2514997692662667, + "grad_norm": 0.19228576123714447, + "learning_rate": 0.00018634297508721167, + "loss": 0.9547, + "step": 2180 + }, + { + "epoch": 0.2520766035994462, + "grad_norm": 0.19523252546787262, + "learning_rate": 0.00018624122106270506, + "loss": 0.944, + "step": 2185 + }, + { + "epoch": 0.25265343793262574, + "grad_norm": 0.18889620900154114, + "learning_rate": 0.00018613911737176125, + "loss": 0.9372, + "step": 2190 + }, + { + "epoch": 0.25323027226580525, + "grad_norm": 0.1964087188243866, + "learning_rate": 0.0001860366644283613, + "loss": 0.9885, + "step": 2195 + }, + { + "epoch": 0.25380710659898476, + "grad_norm": 0.19151298701763153, + "learning_rate": 0.00018593386264790243, + "loss": 1.023, + "step": 2200 + }, + { + "epoch": 0.25438394093216427, + "grad_norm": 0.181709885597229, + "learning_rate": 0.00018583071244719607, + "loss": 0.8938, + "step": 2205 + }, + 
{ + "epoch": 0.2549607752653438, + "grad_norm": 0.18949133157730103, + "learning_rate": 0.0001857272142444664, + "loss": 0.9712, + "step": 2210 + }, + { + "epoch": 0.2555376095985233, + "grad_norm": 0.19438017904758453, + "learning_rate": 0.0001856233684593486, + "loss": 0.9659, + "step": 2215 + }, + { + "epoch": 0.2561144439317028, + "grad_norm": 0.19530200958251953, + "learning_rate": 0.00018551917551288706, + "loss": 1.0126, + "step": 2220 + }, + { + "epoch": 0.2566912782648823, + "grad_norm": 0.19227471947669983, + "learning_rate": 0.0001854146358275338, + "loss": 0.9807, + "step": 2225 + }, + { + "epoch": 0.2572681125980618, + "grad_norm": 0.19348041713237762, + "learning_rate": 0.00018530974982714667, + "loss": 0.9755, + "step": 2230 + }, + { + "epoch": 0.2578449469312413, + "grad_norm": 0.2049867808818817, + "learning_rate": 0.0001852045179369877, + "loss": 0.9948, + "step": 2235 + }, + { + "epoch": 0.25842178126442084, + "grad_norm": 0.19374214112758636, + "learning_rate": 0.0001850989405837212, + "loss": 1.028, + "step": 2240 + }, + { + "epoch": 0.25899861559760035, + "grad_norm": 0.18960264325141907, + "learning_rate": 0.0001849930181954124, + "loss": 1.0199, + "step": 2245 + }, + { + "epoch": 0.25957544993077986, + "grad_norm": 0.19603078067302704, + "learning_rate": 0.00018488675120152532, + "loss": 0.9788, + "step": 2250 + }, + { + "epoch": 0.26015228426395937, + "grad_norm": 0.19418926537036896, + "learning_rate": 0.00018478014003292116, + "loss": 0.9406, + "step": 2255 + }, + { + "epoch": 0.2607291185971389, + "grad_norm": 0.183867946267128, + "learning_rate": 0.0001846731851218567, + "loss": 0.9217, + "step": 2260 + }, + { + "epoch": 0.2613059529303184, + "grad_norm": 0.19260871410369873, + "learning_rate": 0.00018456588690198236, + "loss": 0.9618, + "step": 2265 + }, + { + "epoch": 0.2618827872634979, + "grad_norm": 0.1969606876373291, + "learning_rate": 0.0001844582458083405, + "loss": 0.9594, + "step": 2270 + }, + { + "epoch": 0.26245962159667746, + "grad_norm": 0.1877615749835968, + "learning_rate": 0.0001843502622773637, + "loss": 0.952, + "step": 2275 + }, + { + "epoch": 0.26303645592985697, + "grad_norm": 0.18486037850379944, + "learning_rate": 0.00018424193674687297, + "loss": 0.9699, + "step": 2280 + }, + { + "epoch": 0.2636132902630365, + "grad_norm": 0.19585862755775452, + "learning_rate": 0.00018413326965607593, + "loss": 0.9894, + "step": 2285 + }, + { + "epoch": 0.264190124596216, + "grad_norm": 0.17970632016658783, + "learning_rate": 0.00018402426144556504, + "loss": 0.9834, + "step": 2290 + }, + { + "epoch": 0.2647669589293955, + "grad_norm": 0.19453051686286926, + "learning_rate": 0.0001839149125573159, + "loss": 0.9737, + "step": 2295 + }, + { + "epoch": 0.265343793262575, + "grad_norm": 0.20039866864681244, + "learning_rate": 0.00018380522343468532, + "loss": 1.0147, + "step": 2300 + }, + { + "epoch": 0.2659206275957545, + "grad_norm": 0.1953950971364975, + "learning_rate": 0.00018369519452240973, + "loss": 0.9651, + "step": 2305 + }, + { + "epoch": 0.26649746192893403, + "grad_norm": 0.1906932145357132, + "learning_rate": 0.00018358482626660303, + "loss": 0.9364, + "step": 2310 + }, + { + "epoch": 0.26707429626211354, + "grad_norm": 0.1904212385416031, + "learning_rate": 0.0001834741191147552, + "loss": 0.9498, + "step": 2315 + }, + { + "epoch": 0.26765113059529305, + "grad_norm": 0.18951915204524994, + "learning_rate": 0.00018336307351573018, + "loss": 1.0005, + "step": 2320 + }, + { + "epoch": 0.26822796492847256, + "grad_norm": 0.19409124553203583, + 
"learning_rate": 0.00018325168991976408, + "loss": 1.0019, + "step": 2325 + }, + { + "epoch": 0.26880479926165207, + "grad_norm": 0.20801284909248352, + "learning_rate": 0.00018313996877846361, + "loss": 0.9383, + "step": 2330 + }, + { + "epoch": 0.2693816335948316, + "grad_norm": 0.19471175968647003, + "learning_rate": 0.00018302791054480394, + "loss": 1.0274, + "step": 2335 + }, + { + "epoch": 0.2699584679280111, + "grad_norm": 0.20756390690803528, + "learning_rate": 0.00018291551567312694, + "loss": 0.9846, + "step": 2340 + }, + { + "epoch": 0.2705353022611906, + "grad_norm": 0.1832803636789322, + "learning_rate": 0.00018280278461913952, + "loss": 0.9467, + "step": 2345 + }, + { + "epoch": 0.2711121365943701, + "grad_norm": 0.19528664648532867, + "learning_rate": 0.00018268971783991152, + "loss": 0.9564, + "step": 2350 + }, + { + "epoch": 0.2716889709275496, + "grad_norm": 0.18837648630142212, + "learning_rate": 0.00018257631579387412, + "loss": 0.9511, + "step": 2355 + }, + { + "epoch": 0.27226580526072913, + "grad_norm": 0.20164744555950165, + "learning_rate": 0.0001824625789408177, + "loss": 0.9314, + "step": 2360 + }, + { + "epoch": 0.27284263959390864, + "grad_norm": 0.1876562237739563, + "learning_rate": 0.00018234850774189018, + "loss": 0.9518, + "step": 2365 + }, + { + "epoch": 0.27341947392708815, + "grad_norm": 0.18695729970932007, + "learning_rate": 0.00018223410265959516, + "loss": 0.9382, + "step": 2370 + }, + { + "epoch": 0.27399630826026766, + "grad_norm": 0.19776229560375214, + "learning_rate": 0.00018211936415778984, + "loss": 0.9793, + "step": 2375 + }, + { + "epoch": 0.2745731425934472, + "grad_norm": 0.2102830559015274, + "learning_rate": 0.0001820042927016834, + "loss": 0.9862, + "step": 2380 + }, + { + "epoch": 0.2751499769266267, + "grad_norm": 0.19167035818099976, + "learning_rate": 0.0001818888887578349, + "loss": 0.9348, + "step": 2385 + }, + { + "epoch": 0.2757268112598062, + "grad_norm": 0.20402806997299194, + "learning_rate": 0.00018177315279415153, + "loss": 0.9807, + "step": 2390 + }, + { + "epoch": 0.2763036455929857, + "grad_norm": 0.19756613671779633, + "learning_rate": 0.00018165708527988664, + "loss": 1.0354, + "step": 2395 + }, + { + "epoch": 0.2768804799261652, + "grad_norm": 0.19909250736236572, + "learning_rate": 0.00018154068668563782, + "loss": 1.0038, + "step": 2400 + }, + { + "epoch": 0.2774573142593447, + "grad_norm": 0.19040563702583313, + "learning_rate": 0.00018142395748334513, + "loss": 0.9219, + "step": 2405 + }, + { + "epoch": 0.27803414859252423, + "grad_norm": 0.18826067447662354, + "learning_rate": 0.000181306898146289, + "loss": 0.9073, + "step": 2410 + }, + { + "epoch": 0.27861098292570374, + "grad_norm": 0.2101050615310669, + "learning_rate": 0.00018118950914908843, + "loss": 0.9463, + "step": 2415 + }, + { + "epoch": 0.27918781725888325, + "grad_norm": 0.19028016924858093, + "learning_rate": 0.00018107179096769901, + "loss": 0.9478, + "step": 2420 + }, + { + "epoch": 0.27976465159206276, + "grad_norm": 0.21243633329868317, + "learning_rate": 0.00018095374407941104, + "loss": 0.9734, + "step": 2425 + }, + { + "epoch": 0.2803414859252423, + "grad_norm": 0.19807711243629456, + "learning_rate": 0.0001808353689628475, + "loss": 0.9057, + "step": 2430 + }, + { + "epoch": 0.2809183202584218, + "grad_norm": 0.19900068640708923, + "learning_rate": 0.0001807166660979623, + "loss": 0.9656, + "step": 2435 + }, + { + "epoch": 0.2814951545916013, + "grad_norm": 0.18442806601524353, + "learning_rate": 0.00018059763596603814, + "loss": 1.0024, + 
"step": 2440 + }, + { + "epoch": 0.2820719889247808, + "grad_norm": 0.18683601915836334, + "learning_rate": 0.0001804782790496846, + "loss": 0.9286, + "step": 2445 + }, + { + "epoch": 0.2826488232579603, + "grad_norm": 0.1832648068666458, + "learning_rate": 0.00018035859583283626, + "loss": 1.0318, + "step": 2450 + }, + { + "epoch": 0.2832256575911398, + "grad_norm": 0.1928461343050003, + "learning_rate": 0.00018023858680075061, + "loss": 0.9739, + "step": 2455 + }, + { + "epoch": 0.28380249192431933, + "grad_norm": 0.19916784763336182, + "learning_rate": 0.00018011825244000632, + "loss": 0.9622, + "step": 2460 + }, + { + "epoch": 0.28437932625749884, + "grad_norm": 0.1981884241104126, + "learning_rate": 0.00017999759323850098, + "loss": 0.9899, + "step": 2465 + }, + { + "epoch": 0.28495616059067835, + "grad_norm": 0.19374872744083405, + "learning_rate": 0.0001798766096854493, + "loss": 1.0103, + "step": 2470 + }, + { + "epoch": 0.28553299492385786, + "grad_norm": 0.18874534964561462, + "learning_rate": 0.00017975530227138105, + "loss": 0.9592, + "step": 2475 + }, + { + "epoch": 0.2861098292570374, + "grad_norm": 0.1857873797416687, + "learning_rate": 0.00017963367148813913, + "loss": 0.9453, + "step": 2480 + }, + { + "epoch": 0.2866866635902169, + "grad_norm": 0.18749921023845673, + "learning_rate": 0.0001795117178288775, + "loss": 0.9826, + "step": 2485 + }, + { + "epoch": 0.2872634979233964, + "grad_norm": 0.19447381794452667, + "learning_rate": 0.00017938944178805933, + "loss": 0.9433, + "step": 2490 + }, + { + "epoch": 0.2878403322565759, + "grad_norm": 0.18583492934703827, + "learning_rate": 0.00017926684386145478, + "loss": 0.8993, + "step": 2495 + }, + { + "epoch": 0.2884171665897554, + "grad_norm": 0.21102139353752136, + "learning_rate": 0.00017914392454613913, + "loss": 1.0086, + "step": 2500 + }, + { + "epoch": 0.2889940009229349, + "grad_norm": 0.19170348346233368, + "learning_rate": 0.00017902068434049077, + "loss": 0.9852, + "step": 2505 + }, + { + "epoch": 0.28957083525611443, + "grad_norm": 0.1980980634689331, + "learning_rate": 0.00017889712374418912, + "loss": 0.9699, + "step": 2510 + }, + { + "epoch": 0.29014766958929394, + "grad_norm": 0.20153307914733887, + "learning_rate": 0.00017877324325821264, + "loss": 0.951, + "step": 2515 + }, + { + "epoch": 0.29072450392247345, + "grad_norm": 0.1984160840511322, + "learning_rate": 0.00017864904338483676, + "loss": 0.9704, + "step": 2520 + }, + { + "epoch": 0.29130133825565296, + "grad_norm": 0.20981407165527344, + "learning_rate": 0.00017852452462763192, + "loss": 0.9606, + "step": 2525 + }, + { + "epoch": 0.2918781725888325, + "grad_norm": 0.1887744665145874, + "learning_rate": 0.00017839968749146142, + "loss": 0.9673, + "step": 2530 + }, + { + "epoch": 0.292455006922012, + "grad_norm": 0.2027631551027298, + "learning_rate": 0.0001782745324824795, + "loss": 0.9601, + "step": 2535 + }, + { + "epoch": 0.2930318412551915, + "grad_norm": 0.20790278911590576, + "learning_rate": 0.00017814906010812912, + "loss": 0.9639, + "step": 2540 + }, + { + "epoch": 0.293608675588371, + "grad_norm": 0.1964947134256363, + "learning_rate": 0.00017802327087714016, + "loss": 0.9848, + "step": 2545 + }, + { + "epoch": 0.2941855099215505, + "grad_norm": 0.19823837280273438, + "learning_rate": 0.00017789716529952704, + "loss": 1.0059, + "step": 2550 + }, + { + "epoch": 0.29476234425473, + "grad_norm": 0.20603398978710175, + "learning_rate": 0.00017777074388658693, + "loss": 0.9444, + "step": 2555 + }, + { + "epoch": 0.29533917858790953, + "grad_norm": 
0.20617981255054474, + "learning_rate": 0.00017764400715089744, + "loss": 0.9477, + "step": 2560 + }, + { + "epoch": 0.29591601292108904, + "grad_norm": 0.1940777450799942, + "learning_rate": 0.0001775169556063148, + "loss": 0.9033, + "step": 2565 + }, + { + "epoch": 0.29649284725426855, + "grad_norm": 0.2084151804447174, + "learning_rate": 0.00017738958976797157, + "loss": 0.9458, + "step": 2570 + }, + { + "epoch": 0.29706968158744806, + "grad_norm": 0.21188698709011078, + "learning_rate": 0.00017726191015227452, + "loss": 1.0301, + "step": 2575 + }, + { + "epoch": 0.2976465159206276, + "grad_norm": 0.2021496593952179, + "learning_rate": 0.00017713391727690284, + "loss": 0.972, + "step": 2580 + }, + { + "epoch": 0.2982233502538071, + "grad_norm": 0.19584889709949493, + "learning_rate": 0.0001770056116608057, + "loss": 0.9382, + "step": 2585 + }, + { + "epoch": 0.2988001845869866, + "grad_norm": 0.1912315934896469, + "learning_rate": 0.0001768769938242003, + "loss": 0.9805, + "step": 2590 + }, + { + "epoch": 0.2993770189201661, + "grad_norm": 0.2060055434703827, + "learning_rate": 0.0001767480642885698, + "loss": 1.0436, + "step": 2595 + }, + { + "epoch": 0.2999538532533456, + "grad_norm": 0.19867336750030518, + "learning_rate": 0.00017661882357666105, + "loss": 0.9456, + "step": 2600 + }, + { + "epoch": 0.3005306875865251, + "grad_norm": 0.19426298141479492, + "learning_rate": 0.00017648927221248264, + "loss": 1.0145, + "step": 2605 + }, + { + "epoch": 0.30110752191970463, + "grad_norm": 0.2108680009841919, + "learning_rate": 0.00017635941072130268, + "loss": 0.9791, + "step": 2610 + }, + { + "epoch": 0.30168435625288414, + "grad_norm": 0.1892360895872116, + "learning_rate": 0.00017622923962964672, + "loss": 1.0363, + "step": 2615 + }, + { + "epoch": 0.30226119058606365, + "grad_norm": 0.1932888627052307, + "learning_rate": 0.0001760987594652956, + "loss": 0.9866, + "step": 2620 + }, + { + "epoch": 0.3028380249192432, + "grad_norm": 0.19689255952835083, + "learning_rate": 0.00017596797075728322, + "loss": 0.9844, + "step": 2625 + }, + { + "epoch": 0.30341485925242273, + "grad_norm": 0.18802085518836975, + "learning_rate": 0.00017583687403589454, + "loss": 0.9317, + "step": 2630 + }, + { + "epoch": 0.30399169358560224, + "grad_norm": 0.19586369395256042, + "learning_rate": 0.0001757054698326634, + "loss": 0.9983, + "step": 2635 + }, + { + "epoch": 0.30456852791878175, + "grad_norm": 0.19697046279907227, + "learning_rate": 0.00017557375868037026, + "loss": 0.9707, + "step": 2640 + }, + { + "epoch": 0.30514536225196126, + "grad_norm": 0.19013574719429016, + "learning_rate": 0.0001754417411130401, + "loss": 0.9764, + "step": 2645 + }, + { + "epoch": 0.30572219658514077, + "grad_norm": 0.2056090384721756, + "learning_rate": 0.0001753094176659403, + "loss": 0.961, + "step": 2650 + }, + { + "epoch": 0.3062990309183203, + "grad_norm": 0.18955689668655396, + "learning_rate": 0.0001751767888755785, + "loss": 0.9444, + "step": 2655 + }, + { + "epoch": 0.3068758652514998, + "grad_norm": 0.20427672564983368, + "learning_rate": 0.00017504385527970028, + "loss": 1.0496, + "step": 2660 + }, + { + "epoch": 0.3074526995846793, + "grad_norm": 0.20869238674640656, + "learning_rate": 0.00017491061741728702, + "loss": 0.9981, + "step": 2665 + }, + { + "epoch": 0.3080295339178588, + "grad_norm": 0.2011020928621292, + "learning_rate": 0.00017477707582855384, + "loss": 0.9737, + "step": 2670 + }, + { + "epoch": 0.3086063682510383, + "grad_norm": 0.20109429955482483, + "learning_rate": 0.00017464323105494727, + 
"loss": 0.9614, + "step": 2675 + }, + { + "epoch": 0.30918320258421783, + "grad_norm": 0.1839284896850586, + "learning_rate": 0.00017450908363914316, + "loss": 1.0152, + "step": 2680 + }, + { + "epoch": 0.30976003691739734, + "grad_norm": 0.24291379749774933, + "learning_rate": 0.00017437463412504437, + "loss": 0.9756, + "step": 2685 + }, + { + "epoch": 0.31033687125057685, + "grad_norm": 0.20243053138256073, + "learning_rate": 0.00017423988305777864, + "loss": 0.9743, + "step": 2690 + }, + { + "epoch": 0.31091370558375636, + "grad_norm": 0.20824618637561798, + "learning_rate": 0.0001741048309836964, + "loss": 0.9752, + "step": 2695 + }, + { + "epoch": 0.31149053991693587, + "grad_norm": 0.19603955745697021, + "learning_rate": 0.00017396947845036844, + "loss": 0.9745, + "step": 2700 + }, + { + "epoch": 0.3120673742501154, + "grad_norm": 0.2077784240245819, + "learning_rate": 0.00017383382600658388, + "loss": 0.9846, + "step": 2705 + }, + { + "epoch": 0.3126442085832949, + "grad_norm": 0.21430762112140656, + "learning_rate": 0.0001736978742023477, + "loss": 1.0096, + "step": 2710 + }, + { + "epoch": 0.3132210429164744, + "grad_norm": 0.18606549501419067, + "learning_rate": 0.00017356162358887875, + "loss": 0.9688, + "step": 2715 + }, + { + "epoch": 0.3137978772496539, + "grad_norm": 0.1990072876214981, + "learning_rate": 0.00017342507471860733, + "loss": 0.9299, + "step": 2720 + }, + { + "epoch": 0.3143747115828334, + "grad_norm": 0.191040500998497, + "learning_rate": 0.0001732882281451731, + "loss": 0.9946, + "step": 2725 + }, + { + "epoch": 0.31495154591601293, + "grad_norm": 0.21565918624401093, + "learning_rate": 0.0001731510844234227, + "loss": 0.9862, + "step": 2730 + }, + { + "epoch": 0.31552838024919244, + "grad_norm": 0.19910766184329987, + "learning_rate": 0.0001730136441094076, + "loss": 0.9582, + "step": 2735 + }, + { + "epoch": 0.31610521458237195, + "grad_norm": 0.19215498864650726, + "learning_rate": 0.00017287590776038177, + "loss": 0.9837, + "step": 2740 + }, + { + "epoch": 0.31668204891555146, + "grad_norm": 0.19923163950443268, + "learning_rate": 0.0001727378759347995, + "loss": 0.9536, + "step": 2745 + }, + { + "epoch": 0.31725888324873097, + "grad_norm": 0.20635759830474854, + "learning_rate": 0.0001725995491923131, + "loss": 0.9745, + "step": 2750 + }, + { + "epoch": 0.3178357175819105, + "grad_norm": 0.1964532434940338, + "learning_rate": 0.00017246092809377058, + "loss": 1.0143, + "step": 2755 + }, + { + "epoch": 0.31841255191509, + "grad_norm": 0.21438036859035492, + "learning_rate": 0.0001723220132012134, + "loss": 0.9954, + "step": 2760 + }, + { + "epoch": 0.3189893862482695, + "grad_norm": 0.2038785070180893, + "learning_rate": 0.00017218280507787435, + "loss": 1.0069, + "step": 2765 + }, + { + "epoch": 0.319566220581449, + "grad_norm": 0.19554126262664795, + "learning_rate": 0.00017204330428817496, + "loss": 0.933, + "step": 2770 + }, + { + "epoch": 0.3201430549146285, + "grad_norm": 0.20744913816452026, + "learning_rate": 0.00017190351139772348, + "loss": 0.9584, + "step": 2775 + }, + { + "epoch": 0.32071988924780803, + "grad_norm": 0.20495720207691193, + "learning_rate": 0.00017176342697331246, + "loss": 0.9706, + "step": 2780 + }, + { + "epoch": 0.32129672358098754, + "grad_norm": 0.20170055329799652, + "learning_rate": 0.00017162305158291655, + "loss": 0.9542, + "step": 2785 + }, + { + "epoch": 0.32187355791416705, + "grad_norm": 0.18749262392520905, + "learning_rate": 0.00017148238579568995, + "loss": 0.8963, + "step": 2790 + }, + { + "epoch": 
0.32245039224734656, + "grad_norm": 0.2015407681465149, + "learning_rate": 0.00017134143018196447, + "loss": 0.9572, + "step": 2795 + }, + { + "epoch": 0.3230272265805261, + "grad_norm": 0.197077214717865, + "learning_rate": 0.00017120018531324689, + "loss": 1.0026, + "step": 2800 + }, + { + "epoch": 0.3236040609137056, + "grad_norm": 0.2059078812599182, + "learning_rate": 0.00017105865176221684, + "loss": 0.9523, + "step": 2805 + }, + { + "epoch": 0.3241808952468851, + "grad_norm": 0.19224663078784943, + "learning_rate": 0.00017091683010272447, + "loss": 0.9887, + "step": 2810 + }, + { + "epoch": 0.3247577295800646, + "grad_norm": 0.20333868265151978, + "learning_rate": 0.00017077472090978798, + "loss": 0.9438, + "step": 2815 + }, + { + "epoch": 0.3253345639132441, + "grad_norm": 0.2038094699382782, + "learning_rate": 0.00017063232475959133, + "loss": 0.9884, + "step": 2820 + }, + { + "epoch": 0.3259113982464236, + "grad_norm": 0.19815954566001892, + "learning_rate": 0.00017048964222948217, + "loss": 0.9506, + "step": 2825 + }, + { + "epoch": 0.32648823257960313, + "grad_norm": 0.19894501566886902, + "learning_rate": 0.00017034667389796904, + "loss": 0.9787, + "step": 2830 + }, + { + "epoch": 0.32706506691278264, + "grad_norm": 0.19535242021083832, + "learning_rate": 0.00017020342034471944, + "loss": 0.9805, + "step": 2835 + }, + { + "epoch": 0.32764190124596215, + "grad_norm": 0.2212100327014923, + "learning_rate": 0.00017005988215055718, + "loss": 0.9826, + "step": 2840 + }, + { + "epoch": 0.32821873557914166, + "grad_norm": 0.20346590876579285, + "learning_rate": 0.00016991605989746025, + "loss": 0.9496, + "step": 2845 + }, + { + "epoch": 0.3287955699123212, + "grad_norm": 0.2145700603723526, + "learning_rate": 0.00016977195416855828, + "loss": 0.977, + "step": 2850 + }, + { + "epoch": 0.3293724042455007, + "grad_norm": 0.18952029943466187, + "learning_rate": 0.00016962756554813037, + "loss": 0.8824, + "step": 2855 + }, + { + "epoch": 0.3299492385786802, + "grad_norm": 0.2040802389383316, + "learning_rate": 0.0001694828946216025, + "loss": 0.9683, + "step": 2860 + }, + { + "epoch": 0.3305260729118597, + "grad_norm": 0.19531820714473724, + "learning_rate": 0.00016933794197554524, + "loss": 1.005, + "step": 2865 + }, + { + "epoch": 0.3311029072450392, + "grad_norm": 0.20256836712360382, + "learning_rate": 0.00016919270819767152, + "loss": 0.9877, + "step": 2870 + }, + { + "epoch": 0.3316797415782187, + "grad_norm": 0.20027059316635132, + "learning_rate": 0.00016904719387683407, + "loss": 0.9758, + "step": 2875 + }, + { + "epoch": 0.33225657591139823, + "grad_norm": 0.1954347789287567, + "learning_rate": 0.00016890139960302304, + "loss": 0.9962, + "step": 2880 + }, + { + "epoch": 0.33283341024457774, + "grad_norm": 0.20312942564487457, + "learning_rate": 0.00016875532596736373, + "loss": 0.986, + "step": 2885 + }, + { + "epoch": 0.33341024457775725, + "grad_norm": 0.20359157025814056, + "learning_rate": 0.00016860897356211403, + "loss": 0.9411, + "step": 2890 + }, + { + "epoch": 0.33398707891093676, + "grad_norm": 0.19114482402801514, + "learning_rate": 0.00016846234298066218, + "loss": 0.9789, + "step": 2895 + }, + { + "epoch": 0.3345639132441163, + "grad_norm": 0.19829359650611877, + "learning_rate": 0.0001683154348175243, + "loss": 0.9444, + "step": 2900 + }, + { + "epoch": 0.3351407475772958, + "grad_norm": 0.18730616569519043, + "learning_rate": 0.00016816824966834183, + "loss": 0.9306, + "step": 2905 + }, + { + "epoch": 0.3357175819104753, + "grad_norm": 0.2252754271030426, + 
"learning_rate": 0.00016802078812987948, + "loss": 0.9732, + "step": 2910 + }, + { + "epoch": 0.3362944162436548, + "grad_norm": 0.20403608679771423, + "learning_rate": 0.0001678730508000224, + "loss": 0.9538, + "step": 2915 + }, + { + "epoch": 0.3368712505768343, + "grad_norm": 0.23965264856815338, + "learning_rate": 0.00016772503827777396, + "loss": 0.9988, + "step": 2920 + }, + { + "epoch": 0.3374480849100138, + "grad_norm": 0.20559944212436676, + "learning_rate": 0.00016757675116325343, + "loss": 0.9697, + "step": 2925 + }, + { + "epoch": 0.33802491924319333, + "grad_norm": 0.19775480031967163, + "learning_rate": 0.0001674281900576933, + "loss": 0.9915, + "step": 2930 + }, + { + "epoch": 0.33860175357637284, + "grad_norm": 0.20038728415966034, + "learning_rate": 0.00016727935556343698, + "loss": 0.9976, + "step": 2935 + }, + { + "epoch": 0.33917858790955235, + "grad_norm": 0.20284847915172577, + "learning_rate": 0.0001671302482839364, + "loss": 1.0131, + "step": 2940 + }, + { + "epoch": 0.33975542224273186, + "grad_norm": 0.18095138669013977, + "learning_rate": 0.00016698086882374939, + "loss": 0.9535, + "step": 2945 + }, + { + "epoch": 0.3403322565759114, + "grad_norm": 0.18949155509471893, + "learning_rate": 0.00016683121778853746, + "loss": 0.9928, + "step": 2950 + }, + { + "epoch": 0.3409090909090909, + "grad_norm": 0.18761120736598969, + "learning_rate": 0.00016668129578506315, + "loss": 0.968, + "step": 2955 + }, + { + "epoch": 0.3414859252422704, + "grad_norm": 0.19716186821460724, + "learning_rate": 0.00016653110342118764, + "loss": 1.0469, + "step": 2960 + }, + { + "epoch": 0.3420627595754499, + "grad_norm": 0.19194428622722626, + "learning_rate": 0.0001663806413058684, + "loss": 0.9824, + "step": 2965 + }, + { + "epoch": 0.3426395939086294, + "grad_norm": 0.19304610788822174, + "learning_rate": 0.00016622991004915645, + "loss": 0.9739, + "step": 2970 + }, + { + "epoch": 0.343216428241809, + "grad_norm": 0.21132609248161316, + "learning_rate": 0.00016607891026219418, + "loss": 1.0423, + "step": 2975 + }, + { + "epoch": 0.3437932625749885, + "grad_norm": 0.21153788268566132, + "learning_rate": 0.00016592764255721264, + "loss": 0.9585, + "step": 2980 + }, + { + "epoch": 0.344370096908168, + "grad_norm": 0.19157880544662476, + "learning_rate": 0.00016577610754752925, + "loss": 0.93, + "step": 2985 + }, + { + "epoch": 0.3449469312413475, + "grad_norm": 0.2042781114578247, + "learning_rate": 0.00016562430584754516, + "loss": 0.9758, + "step": 2990 + }, + { + "epoch": 0.345523765574527, + "grad_norm": 0.1973501592874527, + "learning_rate": 0.00016547223807274287, + "loss": 0.9822, + "step": 2995 + }, + { + "epoch": 0.34610059990770653, + "grad_norm": 0.20457808673381805, + "learning_rate": 0.00016531990483968357, + "loss": 0.9275, + "step": 3000 + }, + { + "epoch": 0.34667743424088604, + "grad_norm": 0.19708305597305298, + "learning_rate": 0.00016516730676600493, + "loss": 0.9457, + "step": 3005 + }, + { + "epoch": 0.34725426857406555, + "grad_norm": 0.1883085072040558, + "learning_rate": 0.00016501444447041824, + "loss": 0.9429, + "step": 3010 + }, + { + "epoch": 0.34783110290724506, + "grad_norm": 0.20550213754177094, + "learning_rate": 0.00016486131857270628, + "loss": 0.9335, + "step": 3015 + }, + { + "epoch": 0.34840793724042457, + "grad_norm": 0.20632968842983246, + "learning_rate": 0.00016470792969372039, + "loss": 0.9335, + "step": 3020 + }, + { + "epoch": 0.3489847715736041, + "grad_norm": 0.20688195526599884, + "learning_rate": 0.00016455427845537835, + "loss": 0.9474, + 
"step": 3025 + }, + { + "epoch": 0.3495616059067836, + "grad_norm": 0.19943097233772278, + "learning_rate": 0.0001644003654806616, + "loss": 1.0254, + "step": 3030 + }, + { + "epoch": 0.3501384402399631, + "grad_norm": 0.2087412327528, + "learning_rate": 0.00016424619139361282, + "loss": 1.0255, + "step": 3035 + }, + { + "epoch": 0.3507152745731426, + "grad_norm": 0.1902855485677719, + "learning_rate": 0.00016409175681933328, + "loss": 0.9629, + "step": 3040 + }, + { + "epoch": 0.3512921089063221, + "grad_norm": 0.19096340239048004, + "learning_rate": 0.00016393706238398056, + "loss": 0.9426, + "step": 3045 + }, + { + "epoch": 0.35186894323950163, + "grad_norm": 0.20610526204109192, + "learning_rate": 0.00016378210871476577, + "loss": 0.8949, + "step": 3050 + }, + { + "epoch": 0.35244577757268114, + "grad_norm": 0.20180396735668182, + "learning_rate": 0.00016362689643995105, + "loss": 0.9682, + "step": 3055 + }, + { + "epoch": 0.35302261190586065, + "grad_norm": 0.2666982412338257, + "learning_rate": 0.00016347142618884712, + "loss": 0.999, + "step": 3060 + }, + { + "epoch": 0.35359944623904016, + "grad_norm": 0.20752312242984772, + "learning_rate": 0.00016331569859181062, + "loss": 0.962, + "step": 3065 + }, + { + "epoch": 0.35417628057221967, + "grad_norm": 0.21684733033180237, + "learning_rate": 0.00016315971428024168, + "loss": 0.9512, + "step": 3070 + }, + { + "epoch": 0.3547531149053992, + "grad_norm": 0.20133435726165771, + "learning_rate": 0.0001630034738865812, + "loss": 0.9224, + "step": 3075 + }, + { + "epoch": 0.3553299492385787, + "grad_norm": 0.1901036500930786, + "learning_rate": 0.00016284697804430843, + "loss": 0.928, + "step": 3080 + }, + { + "epoch": 0.3559067835717582, + "grad_norm": 0.19342263042926788, + "learning_rate": 0.00016269022738793832, + "loss": 0.9801, + "step": 3085 + }, + { + "epoch": 0.3564836179049377, + "grad_norm": 0.18987055122852325, + "learning_rate": 0.00016253322255301887, + "loss": 0.9403, + "step": 3090 + }, + { + "epoch": 0.3570604522381172, + "grad_norm": 0.21176302433013916, + "learning_rate": 0.0001623759641761289, + "loss": 0.9854, + "step": 3095 + }, + { + "epoch": 0.35763728657129673, + "grad_norm": 0.198608860373497, + "learning_rate": 0.00016221845289487492, + "loss": 0.9506, + "step": 3100 + }, + { + "epoch": 0.35821412090447624, + "grad_norm": 0.1978042870759964, + "learning_rate": 0.00016206068934788905, + "loss": 0.9745, + "step": 3105 + }, + { + "epoch": 0.35879095523765575, + "grad_norm": 0.20335493981838226, + "learning_rate": 0.0001619026741748262, + "loss": 0.9575, + "step": 3110 + }, + { + "epoch": 0.35936778957083526, + "grad_norm": 0.2059904783964157, + "learning_rate": 0.00016174440801636138, + "loss": 0.9237, + "step": 3115 + }, + { + "epoch": 0.35994462390401477, + "grad_norm": 0.20635630190372467, + "learning_rate": 0.0001615858915141874, + "loss": 0.9222, + "step": 3120 + }, + { + "epoch": 0.3605214582371943, + "grad_norm": 0.2041216492652893, + "learning_rate": 0.00016142712531101196, + "loss": 0.9432, + "step": 3125 + }, + { + "epoch": 0.3610982925703738, + "grad_norm": 0.2060927152633667, + "learning_rate": 0.0001612681100505552, + "loss": 0.9993, + "step": 3130 + }, + { + "epoch": 0.3616751269035533, + "grad_norm": 0.21824884414672852, + "learning_rate": 0.00016110884637754713, + "loss": 0.9735, + "step": 3135 + }, + { + "epoch": 0.3622519612367328, + "grad_norm": 0.20615214109420776, + "learning_rate": 0.00016094933493772487, + "loss": 1.022, + "step": 3140 + }, + { + "epoch": 0.3628287955699123, + "grad_norm": 
0.19122281670570374, + "learning_rate": 0.00016078957637783017, + "loss": 0.9451, + "step": 3145 + }, + { + "epoch": 0.36340562990309183, + "grad_norm": 0.20327936112880707, + "learning_rate": 0.00016062957134560675, + "loss": 0.939, + "step": 3150 + }, + { + "epoch": 0.36398246423627134, + "grad_norm": 0.18740370869636536, + "learning_rate": 0.0001604693204897975, + "loss": 0.9801, + "step": 3155 + }, + { + "epoch": 0.36455929856945085, + "grad_norm": 0.21493248641490936, + "learning_rate": 0.00016030882446014234, + "loss": 1.0288, + "step": 3160 + }, + { + "epoch": 0.36513613290263036, + "grad_norm": 0.21232721209526062, + "learning_rate": 0.00016014808390737485, + "loss": 0.9975, + "step": 3165 + }, + { + "epoch": 0.36571296723580987, + "grad_norm": 0.1946122944355011, + "learning_rate": 0.00015998709948322027, + "loss": 0.9658, + "step": 3170 + }, + { + "epoch": 0.3662898015689894, + "grad_norm": 0.18985234200954437, + "learning_rate": 0.00015982587184039263, + "loss": 0.9608, + "step": 3175 + }, + { + "epoch": 0.3668666359021689, + "grad_norm": 0.21031589806079865, + "learning_rate": 0.00015966440163259202, + "loss": 0.9316, + "step": 3180 + }, + { + "epoch": 0.3674434702353484, + "grad_norm": 0.19520461559295654, + "learning_rate": 0.00015950268951450198, + "loss": 0.9502, + "step": 3185 + }, + { + "epoch": 0.3680203045685279, + "grad_norm": 0.20601095259189606, + "learning_rate": 0.00015934073614178696, + "loss": 0.9511, + "step": 3190 + }, + { + "epoch": 0.3685971389017074, + "grad_norm": 0.19720254838466644, + "learning_rate": 0.00015917854217108954, + "loss": 0.9694, + "step": 3195 + }, + { + "epoch": 0.36917397323488693, + "grad_norm": 0.19519665837287903, + "learning_rate": 0.00015901610826002787, + "loss": 0.9564, + "step": 3200 + }, + { + "epoch": 0.36975080756806644, + "grad_norm": 0.21351169049739838, + "learning_rate": 0.0001588534350671928, + "loss": 0.9541, + "step": 3205 + }, + { + "epoch": 0.37032764190124595, + "grad_norm": 0.214279443025589, + "learning_rate": 0.00015869052325214554, + "loss": 0.9811, + "step": 3210 + }, + { + "epoch": 0.37090447623442546, + "grad_norm": 0.2172657698392868, + "learning_rate": 0.00015852737347541465, + "loss": 0.9756, + "step": 3215 + }, + { + "epoch": 0.37148131056760497, + "grad_norm": 0.1879701018333435, + "learning_rate": 0.00015836398639849355, + "loss": 0.9628, + "step": 3220 + }, + { + "epoch": 0.3720581449007845, + "grad_norm": 0.2050919383764267, + "learning_rate": 0.00015820036268383785, + "loss": 0.9715, + "step": 3225 + }, + { + "epoch": 0.372634979233964, + "grad_norm": 0.18730495870113373, + "learning_rate": 0.00015803650299486252, + "loss": 0.9706, + "step": 3230 + }, + { + "epoch": 0.3732118135671435, + "grad_norm": 0.19974547624588013, + "learning_rate": 0.00015787240799593937, + "loss": 0.9911, + "step": 3235 + }, + { + "epoch": 0.373788647900323, + "grad_norm": 0.1979796588420868, + "learning_rate": 0.00015770807835239424, + "loss": 0.9154, + "step": 3240 + }, + { + "epoch": 0.3743654822335025, + "grad_norm": 0.2047133445739746, + "learning_rate": 0.00015754351473050435, + "loss": 0.9593, + "step": 3245 + }, + { + "epoch": 0.37494231656668203, + "grad_norm": 0.19649125635623932, + "learning_rate": 0.0001573787177974956, + "loss": 0.9831, + "step": 3250 + }, + { + "epoch": 0.37551915089986154, + "grad_norm": 0.2003108412027359, + "learning_rate": 0.00015721368822153986, + "loss": 0.876, + "step": 3255 + }, + { + "epoch": 0.37609598523304105, + "grad_norm": 0.21534393727779388, + "learning_rate": 
0.0001570484266717522, + "loss": 1.0213, + "step": 3260 + }, + { + "epoch": 0.37667281956622056, + "grad_norm": 0.19399525225162506, + "learning_rate": 0.00015688293381818823, + "loss": 0.9159, + "step": 3265 + }, + { + "epoch": 0.3772496538994001, + "grad_norm": 0.19937078654766083, + "learning_rate": 0.0001567172103318415, + "loss": 0.9598, + "step": 3270 + }, + { + "epoch": 0.3778264882325796, + "grad_norm": 0.1996837705373764, + "learning_rate": 0.00015655125688464062, + "loss": 0.9606, + "step": 3275 + }, + { + "epoch": 0.3784033225657591, + "grad_norm": 0.2030685842037201, + "learning_rate": 0.00015638507414944642, + "loss": 0.9623, + "step": 3280 + }, + { + "epoch": 0.3789801568989386, + "grad_norm": 0.20788177847862244, + "learning_rate": 0.0001562186628000496, + "loss": 1.0118, + "step": 3285 + }, + { + "epoch": 0.3795569912321181, + "grad_norm": 0.19508282840251923, + "learning_rate": 0.00015605202351116765, + "loss": 0.9457, + "step": 3290 + }, + { + "epoch": 0.3801338255652976, + "grad_norm": 0.19498229026794434, + "learning_rate": 0.00015588515695844234, + "loss": 0.9243, + "step": 3295 + }, + { + "epoch": 0.38071065989847713, + "grad_norm": 0.2012222409248352, + "learning_rate": 0.00015571806381843676, + "loss": 0.9631, + "step": 3300 + }, + { + "epoch": 0.38128749423165664, + "grad_norm": 0.1890476495027542, + "learning_rate": 0.00015555074476863282, + "loss": 0.931, + "step": 3305 + }, + { + "epoch": 0.38186432856483615, + "grad_norm": 0.19449810683727264, + "learning_rate": 0.00015538320048742835, + "loss": 0.945, + "step": 3310 + }, + { + "epoch": 0.38244116289801566, + "grad_norm": 0.19659358263015747, + "learning_rate": 0.00015521543165413428, + "loss": 0.9052, + "step": 3315 + }, + { + "epoch": 0.3830179972311952, + "grad_norm": 0.1930277943611145, + "learning_rate": 0.00015504743894897218, + "loss": 0.9379, + "step": 3320 + }, + { + "epoch": 0.38359483156437474, + "grad_norm": 0.1968124955892563, + "learning_rate": 0.00015487922305307118, + "loss": 0.9839, + "step": 3325 + }, + { + "epoch": 0.38417166589755425, + "grad_norm": 0.20579907298088074, + "learning_rate": 0.0001547107846484653, + "loss": 0.9653, + "step": 3330 + }, + { + "epoch": 0.38474850023073376, + "grad_norm": 0.193470761179924, + "learning_rate": 0.00015454212441809095, + "loss": 1.0168, + "step": 3335 + }, + { + "epoch": 0.38532533456391327, + "grad_norm": 0.20029513537883759, + "learning_rate": 0.00015437324304578363, + "loss": 1.0071, + "step": 3340 + }, + { + "epoch": 0.3859021688970928, + "grad_norm": 0.19662067294120789, + "learning_rate": 0.00015420414121627575, + "loss": 0.9741, + "step": 3345 + }, + { + "epoch": 0.3864790032302723, + "grad_norm": 0.21813516318798065, + "learning_rate": 0.00015403481961519334, + "loss": 1.012, + "step": 3350 + }, + { + "epoch": 0.3870558375634518, + "grad_norm": 0.19463011622428894, + "learning_rate": 0.00015386527892905365, + "loss": 0.9786, + "step": 3355 + }, + { + "epoch": 0.3876326718966313, + "grad_norm": 0.19774432480335236, + "learning_rate": 0.0001536955198452621, + "loss": 0.9776, + "step": 3360 + }, + { + "epoch": 0.3882095062298108, + "grad_norm": 0.2053624838590622, + "learning_rate": 0.0001535255430521097, + "loss": 0.9645, + "step": 3365 + }, + { + "epoch": 0.38878634056299033, + "grad_norm": 0.20018287003040314, + "learning_rate": 0.00015335534923877013, + "loss": 0.9489, + "step": 3370 + }, + { + "epoch": 0.38936317489616984, + "grad_norm": 0.1886557936668396, + "learning_rate": 0.000153184939095297, + "loss": 0.9604, + "step": 3375 + }, + { + 
"epoch": 0.38994000922934935, + "grad_norm": 0.19467659294605255, + "learning_rate": 0.00015301431331262095, + "loss": 0.9485, + "step": 3380 + }, + { + "epoch": 0.39051684356252886, + "grad_norm": 0.20318065583705902, + "learning_rate": 0.00015284347258254704, + "loss": 0.9876, + "step": 3385 + }, + { + "epoch": 0.39109367789570837, + "grad_norm": 0.1873011291027069, + "learning_rate": 0.0001526724175977518, + "loss": 0.9538, + "step": 3390 + }, + { + "epoch": 0.3916705122288879, + "grad_norm": 0.19449511170387268, + "learning_rate": 0.0001525011490517805, + "loss": 0.9369, + "step": 3395 + }, + { + "epoch": 0.3922473465620674, + "grad_norm": 0.19777143001556396, + "learning_rate": 0.00015232966763904416, + "loss": 0.9926, + "step": 3400 + }, + { + "epoch": 0.3928241808952469, + "grad_norm": 0.20093326270580292, + "learning_rate": 0.00015215797405481704, + "loss": 0.9737, + "step": 3405 + }, + { + "epoch": 0.3934010152284264, + "grad_norm": 0.20396889746189117, + "learning_rate": 0.00015198606899523352, + "loss": 0.9654, + "step": 3410 + }, + { + "epoch": 0.3939778495616059, + "grad_norm": 0.20018213987350464, + "learning_rate": 0.00015181395315728554, + "loss": 0.9835, + "step": 3415 + }, + { + "epoch": 0.39455468389478543, + "grad_norm": 0.18661239743232727, + "learning_rate": 0.00015164162723881947, + "loss": 0.9946, + "step": 3420 + }, + { + "epoch": 0.39513151822796494, + "grad_norm": 0.20598599314689636, + "learning_rate": 0.00015146909193853363, + "loss": 0.9495, + "step": 3425 + }, + { + "epoch": 0.39570835256114445, + "grad_norm": 0.2064884752035141, + "learning_rate": 0.0001512963479559752, + "loss": 0.9524, + "step": 3430 + }, + { + "epoch": 0.39628518689432396, + "grad_norm": 0.19914241135120392, + "learning_rate": 0.00015112339599153746, + "loss": 0.9643, + "step": 3435 + }, + { + "epoch": 0.39686202122750347, + "grad_norm": 0.20408232510089874, + "learning_rate": 0.00015095023674645698, + "loss": 0.9757, + "step": 3440 + }, + { + "epoch": 0.397438855560683, + "grad_norm": 0.1998940259218216, + "learning_rate": 0.00015077687092281074, + "loss": 0.9318, + "step": 3445 + }, + { + "epoch": 0.3980156898938625, + "grad_norm": 0.20163275301456451, + "learning_rate": 0.00015060329922351326, + "loss": 0.9465, + "step": 3450 + }, + { + "epoch": 0.398592524227042, + "grad_norm": 0.20353001356124878, + "learning_rate": 0.0001504295223523139, + "loss": 0.9631, + "step": 3455 + }, + { + "epoch": 0.3991693585602215, + "grad_norm": 0.20348602533340454, + "learning_rate": 0.00015025554101379379, + "loss": 0.9685, + "step": 3460 + }, + { + "epoch": 0.399746192893401, + "grad_norm": 0.19552728533744812, + "learning_rate": 0.0001500813559133631, + "loss": 0.9729, + "step": 3465 + }, + { + "epoch": 0.40032302722658053, + "grad_norm": 0.19121238589286804, + "learning_rate": 0.00014990696775725812, + "loss": 0.9514, + "step": 3470 + }, + { + "epoch": 0.40089986155976004, + "grad_norm": 0.18907782435417175, + "learning_rate": 0.0001497323772525385, + "loss": 0.9234, + "step": 3475 + }, + { + "epoch": 0.40147669589293955, + "grad_norm": 0.19860269129276276, + "learning_rate": 0.00014955758510708434, + "loss": 0.9192, + "step": 3480 + }, + { + "epoch": 0.40205353022611906, + "grad_norm": 0.19196364283561707, + "learning_rate": 0.00014938259202959317, + "loss": 0.9216, + "step": 3485 + }, + { + "epoch": 0.40263036455929857, + "grad_norm": 0.20269255340099335, + "learning_rate": 0.00014920739872957732, + "loss": 0.9733, + "step": 3490 + }, + { + "epoch": 0.4032071988924781, + "grad_norm": 
0.20808875560760498, + "learning_rate": 0.00014903200591736087, + "loss": 0.9984, + "step": 3495 + }, + { + "epoch": 0.4037840332256576, + "grad_norm": 0.20577386021614075, + "learning_rate": 0.00014885641430407686, + "loss": 0.9771, + "step": 3500 + }, + { + "epoch": 0.4043608675588371, + "grad_norm": 0.20498362183570862, + "learning_rate": 0.0001486806246016643, + "loss": 0.9883, + "step": 3505 + }, + { + "epoch": 0.4049377018920166, + "grad_norm": 0.1883656233549118, + "learning_rate": 0.00014850463752286543, + "loss": 0.9662, + "step": 3510 + }, + { + "epoch": 0.4055145362251961, + "grad_norm": 0.18973584473133087, + "learning_rate": 0.00014832845378122276, + "loss": 0.9403, + "step": 3515 + }, + { + "epoch": 0.40609137055837563, + "grad_norm": 0.20748434960842133, + "learning_rate": 0.00014815207409107608, + "loss": 0.945, + "step": 3520 + }, + { + "epoch": 0.40666820489155514, + "grad_norm": 0.19455088675022125, + "learning_rate": 0.00014797549916755975, + "loss": 0.9646, + "step": 3525 + }, + { + "epoch": 0.40724503922473465, + "grad_norm": 0.20486250519752502, + "learning_rate": 0.0001477987297265997, + "loss": 0.9901, + "step": 3530 + }, + { + "epoch": 0.40782187355791416, + "grad_norm": 0.20417264103889465, + "learning_rate": 0.0001476217664849105, + "loss": 0.9385, + "step": 3535 + }, + { + "epoch": 0.40839870789109367, + "grad_norm": 0.19334882497787476, + "learning_rate": 0.00014744461015999248, + "loss": 1.0049, + "step": 3540 + }, + { + "epoch": 0.4089755422242732, + "grad_norm": 0.18688935041427612, + "learning_rate": 0.00014726726147012889, + "loss": 0.9574, + "step": 3545 + }, + { + "epoch": 0.4095523765574527, + "grad_norm": 0.1947825849056244, + "learning_rate": 0.00014708972113438285, + "loss": 0.957, + "step": 3550 + }, + { + "epoch": 0.4101292108906322, + "grad_norm": 0.20580242574214935, + "learning_rate": 0.00014691198987259454, + "loss": 0.9648, + "step": 3555 + }, + { + "epoch": 0.4107060452238117, + "grad_norm": 0.1960141509771347, + "learning_rate": 0.00014673406840537824, + "loss": 0.9558, + "step": 3560 + }, + { + "epoch": 0.4112828795569912, + "grad_norm": 0.188209667801857, + "learning_rate": 0.00014655595745411955, + "loss": 0.9761, + "step": 3565 + }, + { + "epoch": 0.41185971389017073, + "grad_norm": 0.20472858846187592, + "learning_rate": 0.00014637765774097206, + "loss": 0.9597, + "step": 3570 + }, + { + "epoch": 0.41243654822335024, + "grad_norm": 0.20903244614601135, + "learning_rate": 0.000146199169988855, + "loss": 0.9749, + "step": 3575 + }, + { + "epoch": 0.41301338255652975, + "grad_norm": 0.19724762439727783, + "learning_rate": 0.00014602049492144984, + "loss": 0.9621, + "step": 3580 + }, + { + "epoch": 0.41359021688970926, + "grad_norm": 0.190316304564476, + "learning_rate": 0.00014584163326319754, + "loss": 0.9804, + "step": 3585 + }, + { + "epoch": 0.41416705122288877, + "grad_norm": 0.18492008745670319, + "learning_rate": 0.00014566258573929557, + "loss": 0.9791, + "step": 3590 + }, + { + "epoch": 0.4147438855560683, + "grad_norm": 0.19851504266262054, + "learning_rate": 0.0001454833530756951, + "loss": 0.9497, + "step": 3595 + }, + { + "epoch": 0.4153207198892478, + "grad_norm": 0.20571914315223694, + "learning_rate": 0.0001453039359990979, + "loss": 1.0079, + "step": 3600 + }, + { + "epoch": 0.4158975542224273, + "grad_norm": 0.22042030096054077, + "learning_rate": 0.00014512433523695332, + "loss": 0.9887, + "step": 3605 + }, + { + "epoch": 0.4164743885556068, + "grad_norm": 0.1839991807937622, + "learning_rate": 0.0001449445515174557, + 
"loss": 0.9404, + "step": 3610 + }, + { + "epoch": 0.4170512228887863, + "grad_norm": 0.19041991233825684, + "learning_rate": 0.000144764585569541, + "loss": 0.9374, + "step": 3615 + }, + { + "epoch": 0.41762805722196583, + "grad_norm": 0.19904352724552155, + "learning_rate": 0.00014458443812288415, + "loss": 0.9219, + "step": 3620 + }, + { + "epoch": 0.41820489155514534, + "grad_norm": 0.19090399146080017, + "learning_rate": 0.00014440410990789582, + "loss": 0.9379, + "step": 3625 + }, + { + "epoch": 0.41878172588832485, + "grad_norm": 0.18457092344760895, + "learning_rate": 0.00014422360165571976, + "loss": 0.9966, + "step": 3630 + }, + { + "epoch": 0.41935856022150436, + "grad_norm": 0.20009112358093262, + "learning_rate": 0.0001440429140982296, + "loss": 0.9714, + "step": 3635 + }, + { + "epoch": 0.41993539455468387, + "grad_norm": 0.19650514423847198, + "learning_rate": 0.000143862047968026, + "loss": 0.9502, + "step": 3640 + }, + { + "epoch": 0.4205122288878634, + "grad_norm": 0.20459064841270447, + "learning_rate": 0.00014368100399843366, + "loss": 0.9208, + "step": 3645 + }, + { + "epoch": 0.4210890632210429, + "grad_norm": 0.20869080722332, + "learning_rate": 0.00014349978292349825, + "loss": 0.9478, + "step": 3650 + }, + { + "epoch": 0.4216658975542224, + "grad_norm": 0.21228952705860138, + "learning_rate": 0.0001433183854779836, + "loss": 0.957, + "step": 3655 + }, + { + "epoch": 0.4222427318874019, + "grad_norm": 0.19315268099308014, + "learning_rate": 0.00014313681239736865, + "loss": 0.973, + "step": 3660 + }, + { + "epoch": 0.4228195662205814, + "grad_norm": 0.19466190040111542, + "learning_rate": 0.00014295506441784435, + "loss": 1.0031, + "step": 3665 + }, + { + "epoch": 0.423396400553761, + "grad_norm": 0.1872572898864746, + "learning_rate": 0.00014277314227631086, + "loss": 0.9398, + "step": 3670 + }, + { + "epoch": 0.4239732348869405, + "grad_norm": 0.18912255764007568, + "learning_rate": 0.00014259104671037452, + "loss": 0.9261, + "step": 3675 + }, + { + "epoch": 0.42455006922012, + "grad_norm": 0.18942441046237946, + "learning_rate": 0.00014240877845834472, + "loss": 0.9511, + "step": 3680 + }, + { + "epoch": 0.4251269035532995, + "grad_norm": 0.2267604023218155, + "learning_rate": 0.00014222633825923108, + "loss": 0.9919, + "step": 3685 + }, + { + "epoch": 0.42570373788647903, + "grad_norm": 0.19375835359096527, + "learning_rate": 0.00014204372685274039, + "loss": 0.9862, + "step": 3690 + }, + { + "epoch": 0.42628057221965854, + "grad_norm": 0.19178146123886108, + "learning_rate": 0.00014186094497927352, + "loss": 0.9777, + "step": 3695 + }, + { + "epoch": 0.42685740655283805, + "grad_norm": 0.198550745844841, + "learning_rate": 0.00014167799337992258, + "loss": 0.936, + "step": 3700 + }, + { + "epoch": 0.42743424088601756, + "grad_norm": 0.2064967155456543, + "learning_rate": 0.00014149487279646781, + "loss": 0.9877, + "step": 3705 + }, + { + "epoch": 0.42801107521919707, + "grad_norm": 0.20294541120529175, + "learning_rate": 0.00014131158397137462, + "loss": 0.9306, + "step": 3710 + }, + { + "epoch": 0.4285879095523766, + "grad_norm": 0.21081644296646118, + "learning_rate": 0.00014112812764779053, + "loss": 0.9482, + "step": 3715 + }, + { + "epoch": 0.4291647438855561, + "grad_norm": 0.20666174590587616, + "learning_rate": 0.00014094450456954218, + "loss": 0.9848, + "step": 3720 + }, + { + "epoch": 0.4297415782187356, + "grad_norm": 0.19854900240898132, + "learning_rate": 0.00014076071548113238, + "loss": 0.9593, + "step": 3725 + }, + { + "epoch": 
0.4303184125519151, + "grad_norm": 0.2024005800485611, + "learning_rate": 0.0001405767611277369, + "loss": 0.9753, + "step": 3730 + }, + { + "epoch": 0.4308952468850946, + "grad_norm": 0.19519449770450592, + "learning_rate": 0.00014039264225520175, + "loss": 0.9844, + "step": 3735 + }, + { + "epoch": 0.43147208121827413, + "grad_norm": 0.19514302909374237, + "learning_rate": 0.0001402083596100399, + "loss": 0.991, + "step": 3740 + }, + { + "epoch": 0.43204891555145364, + "grad_norm": 0.1931222528219223, + "learning_rate": 0.00014002391393942826, + "loss": 0.9693, + "step": 3745 + }, + { + "epoch": 0.43262574988463315, + "grad_norm": 0.203064426779747, + "learning_rate": 0.00013983930599120487, + "loss": 0.9822, + "step": 3750 + }, + { + "epoch": 0.43320258421781266, + "grad_norm": 0.19102463126182556, + "learning_rate": 0.0001396545365138657, + "loss": 0.9643, + "step": 3755 + }, + { + "epoch": 0.43377941855099217, + "grad_norm": 0.18826338648796082, + "learning_rate": 0.00013946960625656153, + "loss": 0.9706, + "step": 3760 + }, + { + "epoch": 0.4343562528841717, + "grad_norm": 0.20435942709445953, + "learning_rate": 0.00013928451596909516, + "loss": 0.9388, + "step": 3765 + }, + { + "epoch": 0.4349330872173512, + "grad_norm": 0.18556763231754303, + "learning_rate": 0.00013909926640191813, + "loss": 0.9218, + "step": 3770 + }, + { + "epoch": 0.4355099215505307, + "grad_norm": 0.1957630217075348, + "learning_rate": 0.0001389138583061279, + "loss": 1.0146, + "step": 3775 + }, + { + "epoch": 0.4360867558837102, + "grad_norm": 0.19505439698696136, + "learning_rate": 0.00013872829243346453, + "loss": 1.0006, + "step": 3780 + }, + { + "epoch": 0.4366635902168897, + "grad_norm": 0.20385412871837616, + "learning_rate": 0.00013854256953630797, + "loss": 0.997, + "step": 3785 + }, + { + "epoch": 0.43724042455006923, + "grad_norm": 0.1895400434732437, + "learning_rate": 0.00013835669036767466, + "loss": 1.0025, + "step": 3790 + }, + { + "epoch": 0.43781725888324874, + "grad_norm": 0.19983290135860443, + "learning_rate": 0.00013817065568121477, + "loss": 0.9643, + "step": 3795 + }, + { + "epoch": 0.43839409321642825, + "grad_norm": 0.20115922391414642, + "learning_rate": 0.00013798446623120893, + "loss": 0.9225, + "step": 3800 + }, + { + "epoch": 0.43897092754960776, + "grad_norm": 0.19573155045509338, + "learning_rate": 0.00013779812277256537, + "loss": 0.9408, + "step": 3805 + }, + { + "epoch": 0.43954776188278727, + "grad_norm": 0.21077951788902283, + "learning_rate": 0.0001376116260608166, + "loss": 0.9573, + "step": 3810 + }, + { + "epoch": 0.4401245962159668, + "grad_norm": 0.2025730162858963, + "learning_rate": 0.0001374249768521166, + "loss": 0.9674, + "step": 3815 + }, + { + "epoch": 0.4407014305491463, + "grad_norm": 0.1951024830341339, + "learning_rate": 0.0001372381759032377, + "loss": 0.9782, + "step": 3820 + }, + { + "epoch": 0.4412782648823258, + "grad_norm": 0.1892070323228836, + "learning_rate": 0.00013705122397156727, + "loss": 0.9297, + "step": 3825 + }, + { + "epoch": 0.4418550992155053, + "grad_norm": 0.2148219794034958, + "learning_rate": 0.00013686412181510504, + "loss": 0.9735, + "step": 3830 + }, + { + "epoch": 0.4424319335486848, + "grad_norm": 0.2071019560098648, + "learning_rate": 0.0001366768701924598, + "loss": 0.9767, + "step": 3835 + }, + { + "epoch": 0.44300876788186433, + "grad_norm": 0.19552481174468994, + "learning_rate": 0.0001364894698628462, + "loss": 0.9675, + "step": 3840 + }, + { + "epoch": 0.44358560221504384, + "grad_norm": 0.2387782335281372, + 
"learning_rate": 0.00013630192158608202, + "loss": 1.0083, + "step": 3845 + }, + { + "epoch": 0.44416243654822335, + "grad_norm": 0.19381476938724518, + "learning_rate": 0.00013611422612258477, + "loss": 0.9669, + "step": 3850 + }, + { + "epoch": 0.44473927088140286, + "grad_norm": 0.19321493804454803, + "learning_rate": 0.00013592638423336875, + "loss": 0.9885, + "step": 3855 + }, + { + "epoch": 0.44531610521458237, + "grad_norm": 0.20854459702968597, + "learning_rate": 0.00013573839668004202, + "loss": 1.0008, + "step": 3860 + }, + { + "epoch": 0.4458929395477619, + "grad_norm": 0.2074405997991562, + "learning_rate": 0.00013555026422480313, + "loss": 0.9222, + "step": 3865 + }, + { + "epoch": 0.4464697738809414, + "grad_norm": 0.194743350148201, + "learning_rate": 0.00013536198763043823, + "loss": 0.9856, + "step": 3870 + }, + { + "epoch": 0.4470466082141209, + "grad_norm": 0.20063389837741852, + "learning_rate": 0.00013517356766031777, + "loss": 1.0056, + "step": 3875 + }, + { + "epoch": 0.4476234425473004, + "grad_norm": 0.1934989094734192, + "learning_rate": 0.00013498500507839363, + "loss": 0.9928, + "step": 3880 + }, + { + "epoch": 0.4482002768804799, + "grad_norm": 0.20212046802043915, + "learning_rate": 0.00013479630064919593, + "loss": 0.8963, + "step": 3885 + }, + { + "epoch": 0.44877711121365943, + "grad_norm": 0.21239322423934937, + "learning_rate": 0.00013460745513782976, + "loss": 0.9812, + "step": 3890 + }, + { + "epoch": 0.44935394554683894, + "grad_norm": 0.2325180619955063, + "learning_rate": 0.0001344184693099724, + "loss": 0.9476, + "step": 3895 + }, + { + "epoch": 0.44993077988001845, + "grad_norm": 0.19209115207195282, + "learning_rate": 0.00013422934393186994, + "loss": 0.9289, + "step": 3900 + }, + { + "epoch": 0.45050761421319796, + "grad_norm": 0.19307377934455872, + "learning_rate": 0.0001340400797703343, + "loss": 0.983, + "step": 3905 + }, + { + "epoch": 0.45108444854637747, + "grad_norm": 0.19258156418800354, + "learning_rate": 0.00013385067759274014, + "loss": 0.986, + "step": 3910 + }, + { + "epoch": 0.451661282879557, + "grad_norm": 0.1946646273136139, + "learning_rate": 0.00013366113816702164, + "loss": 0.9962, + "step": 3915 + }, + { + "epoch": 0.4522381172127365, + "grad_norm": 0.20190544426441193, + "learning_rate": 0.0001334714622616695, + "loss": 0.9591, + "step": 3920 + }, + { + "epoch": 0.452814951545916, + "grad_norm": 0.1982085108757019, + "learning_rate": 0.0001332816506457278, + "loss": 0.9545, + "step": 3925 + }, + { + "epoch": 0.4533917858790955, + "grad_norm": 0.18541787564754486, + "learning_rate": 0.0001330917040887908, + "loss": 0.9652, + "step": 3930 + }, + { + "epoch": 0.453968620212275, + "grad_norm": 0.19790363311767578, + "learning_rate": 0.00013290162336099996, + "loss": 0.923, + "step": 3935 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.21102024614810944, + "learning_rate": 0.00013271140923304064, + "loss": 0.9701, + "step": 3940 + }, + { + "epoch": 0.45512228887863404, + "grad_norm": 0.1868615597486496, + "learning_rate": 0.00013252106247613914, + "loss": 0.9216, + "step": 3945 + }, + { + "epoch": 0.45569912321181355, + "grad_norm": 0.19408555328845978, + "learning_rate": 0.00013233058386205948, + "loss": 0.9397, + "step": 3950 + }, + { + "epoch": 0.45627595754499306, + "grad_norm": 0.19673417508602142, + "learning_rate": 0.00013213997416310034, + "loss": 0.9116, + "step": 3955 + }, + { + "epoch": 0.45685279187817257, + "grad_norm": 0.2141309231519699, + "learning_rate": 0.00013194923415209183, + "loss": 0.9748, + 
"step": 3960 + }, + { + "epoch": 0.4574296262113521, + "grad_norm": 0.19085197150707245, + "learning_rate": 0.00013175836460239243, + "loss": 1.0119, + "step": 3965 + }, + { + "epoch": 0.4580064605445316, + "grad_norm": 0.19188842177391052, + "learning_rate": 0.00013156736628788584, + "loss": 0.9487, + "step": 3970 + }, + { + "epoch": 0.4585832948777111, + "grad_norm": 0.19430477917194366, + "learning_rate": 0.00013137623998297785, + "loss": 0.9753, + "step": 3975 + }, + { + "epoch": 0.4591601292108906, + "grad_norm": 0.1962517946958542, + "learning_rate": 0.00013118498646259323, + "loss": 0.9655, + "step": 3980 + }, + { + "epoch": 0.4597369635440701, + "grad_norm": 0.20422694087028503, + "learning_rate": 0.0001309936065021724, + "loss": 1.0045, + "step": 3985 + }, + { + "epoch": 0.46031379787724963, + "grad_norm": 0.19590309262275696, + "learning_rate": 0.0001308021008776686, + "loss": 0.9858, + "step": 3990 + }, + { + "epoch": 0.46089063221042914, + "grad_norm": 0.19435778260231018, + "learning_rate": 0.00013061047036554444, + "loss": 0.9605, + "step": 3995 + }, + { + "epoch": 0.46146746654360865, + "grad_norm": 0.2044801414012909, + "learning_rate": 0.00013041871574276905, + "loss": 0.9802, + "step": 4000 + }, + { + "epoch": 0.46204430087678816, + "grad_norm": 0.21514268219470978, + "learning_rate": 0.00013022683778681458, + "loss": 0.9468, + "step": 4005 + }, + { + "epoch": 0.46262113520996767, + "grad_norm": 0.19823044538497925, + "learning_rate": 0.00013003483727565344, + "loss": 0.9975, + "step": 4010 + }, + { + "epoch": 0.4631979695431472, + "grad_norm": 0.1900324672460556, + "learning_rate": 0.00012984271498775473, + "loss": 0.9505, + "step": 4015 + }, + { + "epoch": 0.46377480387632675, + "grad_norm": 0.20325466990470886, + "learning_rate": 0.00012965047170208145, + "loss": 0.958, + "step": 4020 + }, + { + "epoch": 0.46435163820950626, + "grad_norm": 0.18988734483718872, + "learning_rate": 0.00012945810819808715, + "loss": 0.9684, + "step": 4025 + }, + { + "epoch": 0.46492847254268577, + "grad_norm": 0.19148334860801697, + "learning_rate": 0.00012926562525571273, + "loss": 0.953, + "step": 4030 + }, + { + "epoch": 0.4655053068758653, + "grad_norm": 0.19238923490047455, + "learning_rate": 0.00012907302365538348, + "loss": 0.9194, + "step": 4035 + }, + { + "epoch": 0.4660821412090448, + "grad_norm": 0.1843547224998474, + "learning_rate": 0.0001288803041780057, + "loss": 0.9375, + "step": 4040 + }, + { + "epoch": 0.4666589755422243, + "grad_norm": 0.20036543905735016, + "learning_rate": 0.0001286874676049637, + "loss": 0.9753, + "step": 4045 + }, + { + "epoch": 0.4672358098754038, + "grad_norm": 0.19511838257312775, + "learning_rate": 0.00012849451471811643, + "loss": 0.9217, + "step": 4050 + }, + { + "epoch": 0.4678126442085833, + "grad_norm": 0.2069857269525528, + "learning_rate": 0.00012830144629979456, + "loss": 0.9803, + "step": 4055 + }, + { + "epoch": 0.4683894785417628, + "grad_norm": 0.19766615331172943, + "learning_rate": 0.00012810826313279717, + "loss": 0.9806, + "step": 4060 + }, + { + "epoch": 0.46896631287494234, + "grad_norm": 0.19592691957950592, + "learning_rate": 0.00012791496600038854, + "loss": 0.9848, + "step": 4065 + }, + { + "epoch": 0.46954314720812185, + "grad_norm": 0.19588027894496918, + "learning_rate": 0.00012772155568629499, + "loss": 0.9274, + "step": 4070 + }, + { + "epoch": 0.47011998154130136, + "grad_norm": 0.2110724300146103, + "learning_rate": 0.00012752803297470187, + "loss": 0.9989, + "step": 4075 + }, + { + "epoch": 0.47069681587448087, + 
"grad_norm": 0.2043410688638687, + "learning_rate": 0.00012733439865025012, + "loss": 0.9706, + "step": 4080 + }, + { + "epoch": 0.4712736502076604, + "grad_norm": 0.22159966826438904, + "learning_rate": 0.0001271406534980333, + "loss": 0.9409, + "step": 4085 + }, + { + "epoch": 0.4718504845408399, + "grad_norm": 0.19468598067760468, + "learning_rate": 0.0001269467983035943, + "loss": 0.9749, + "step": 4090 + }, + { + "epoch": 0.4724273188740194, + "grad_norm": 0.1822662055492401, + "learning_rate": 0.00012675283385292212, + "loss": 0.9994, + "step": 4095 + }, + { + "epoch": 0.4730041532071989, + "grad_norm": 0.19376279413700104, + "learning_rate": 0.00012655876093244878, + "loss": 0.9757, + "step": 4100 + }, + { + "epoch": 0.4735809875403784, + "grad_norm": 0.18976148962974548, + "learning_rate": 0.00012636458032904617, + "loss": 1.0159, + "step": 4105 + }, + { + "epoch": 0.4741578218735579, + "grad_norm": 0.20439419150352478, + "learning_rate": 0.00012617029283002265, + "loss": 1.0269, + "step": 4110 + }, + { + "epoch": 0.47473465620673744, + "grad_norm": 0.18871116638183594, + "learning_rate": 0.00012597589922312008, + "loss": 0.944, + "step": 4115 + }, + { + "epoch": 0.47531149053991695, + "grad_norm": 0.19103854894638062, + "learning_rate": 0.00012578140029651053, + "loss": 0.9384, + "step": 4120 + }, + { + "epoch": 0.47588832487309646, + "grad_norm": 0.1954331398010254, + "learning_rate": 0.00012558679683879301, + "loss": 0.9838, + "step": 4125 + }, + { + "epoch": 0.47646515920627597, + "grad_norm": 0.20008032023906708, + "learning_rate": 0.0001253920896389905, + "loss": 1.012, + "step": 4130 + }, + { + "epoch": 0.4770419935394555, + "grad_norm": 0.18915073573589325, + "learning_rate": 0.00012519727948654642, + "loss": 0.965, + "step": 4135 + }, + { + "epoch": 0.477618827872635, + "grad_norm": 0.19700497388839722, + "learning_rate": 0.00012500236717132178, + "loss": 1.0043, + "step": 4140 + }, + { + "epoch": 0.4781956622058145, + "grad_norm": 0.23750852048397064, + "learning_rate": 0.0001248073534835917, + "loss": 0.955, + "step": 4145 + }, + { + "epoch": 0.478772496538994, + "grad_norm": 0.1965513378381729, + "learning_rate": 0.0001246122392140424, + "loss": 0.9257, + "step": 4150 + }, + { + "epoch": 0.4793493308721735, + "grad_norm": 0.20664075016975403, + "learning_rate": 0.00012441702515376786, + "loss": 0.9276, + "step": 4155 + }, + { + "epoch": 0.47992616520535303, + "grad_norm": 0.20796158909797668, + "learning_rate": 0.0001242217120942666, + "loss": 0.9813, + "step": 4160 + }, + { + "epoch": 0.48050299953853254, + "grad_norm": 0.19869232177734375, + "learning_rate": 0.00012402630082743868, + "loss": 0.9262, + "step": 4165 + }, + { + "epoch": 0.48107983387171205, + "grad_norm": 0.1941554695367813, + "learning_rate": 0.00012383079214558227, + "loss": 0.9682, + "step": 4170 + }, + { + "epoch": 0.48165666820489156, + "grad_norm": 0.19581513106822968, + "learning_rate": 0.00012363518684139043, + "loss": 0.952, + "step": 4175 + }, + { + "epoch": 0.48223350253807107, + "grad_norm": 0.18992619216442108, + "learning_rate": 0.00012343948570794815, + "loss": 0.9541, + "step": 4180 + }, + { + "epoch": 0.4828103368712506, + "grad_norm": 0.19012705981731415, + "learning_rate": 0.00012324368953872883, + "loss": 0.9461, + "step": 4185 + }, + { + "epoch": 0.4833871712044301, + "grad_norm": 0.24560825526714325, + "learning_rate": 0.00012304779912759118, + "loss": 0.9679, + "step": 4190 + }, + { + "epoch": 0.4839640055376096, + "grad_norm": 0.19769693911075592, + "learning_rate": 
0.00012285181526877615, + "loss": 0.926, + "step": 4195 + }, + { + "epoch": 0.4845408398707891, + "grad_norm": 0.20233696699142456, + "learning_rate": 0.00012265573875690344, + "loss": 0.9854, + "step": 4200 + }, + { + "epoch": 0.4851176742039686, + "grad_norm": 0.1947474330663681, + "learning_rate": 0.0001224595703869685, + "loss": 0.9606, + "step": 4205 + }, + { + "epoch": 0.48569450853714813, + "grad_norm": 0.1961849331855774, + "learning_rate": 0.0001222633109543392, + "loss": 1.0326, + "step": 4210 + }, + { + "epoch": 0.48627134287032764, + "grad_norm": 0.1963188499212265, + "learning_rate": 0.00012206696125475249, + "loss": 0.979, + "step": 4215 + }, + { + "epoch": 0.48684817720350715, + "grad_norm": 0.19063878059387207, + "learning_rate": 0.00012187052208431158, + "loss": 0.9483, + "step": 4220 + }, + { + "epoch": 0.48742501153668666, + "grad_norm": 0.2054065316915512, + "learning_rate": 0.0001216739942394822, + "loss": 0.9725, + "step": 4225 + }, + { + "epoch": 0.48800184586986617, + "grad_norm": 0.19815371930599213, + "learning_rate": 0.00012147737851708973, + "loss": 0.9445, + "step": 4230 + }, + { + "epoch": 0.4885786802030457, + "grad_norm": 0.19612999260425568, + "learning_rate": 0.00012128067571431583, + "loss": 0.9167, + "step": 4235 + }, + { + "epoch": 0.4891555145362252, + "grad_norm": 0.18682846426963806, + "learning_rate": 0.00012108388662869519, + "loss": 0.9596, + "step": 4240 + }, + { + "epoch": 0.4897323488694047, + "grad_norm": 0.19077306985855103, + "learning_rate": 0.0001208870120581124, + "loss": 0.9491, + "step": 4245 + }, + { + "epoch": 0.4903091832025842, + "grad_norm": 0.1911584734916687, + "learning_rate": 0.00012069005280079862, + "loss": 0.9399, + "step": 4250 + }, + { + "epoch": 0.4908860175357637, + "grad_norm": 0.18746261298656464, + "learning_rate": 0.00012049300965532832, + "loss": 0.9954, + "step": 4255 + }, + { + "epoch": 0.49146285186894323, + "grad_norm": 0.19653597474098206, + "learning_rate": 0.00012029588342061621, + "loss": 0.9635, + "step": 4260 + }, + { + "epoch": 0.49203968620212274, + "grad_norm": 0.1984453648328781, + "learning_rate": 0.00012009867489591377, + "loss": 0.901, + "step": 4265 + }, + { + "epoch": 0.49261652053530225, + "grad_norm": 0.20906962454319, + "learning_rate": 0.00011990138488080622, + "loss": 0.9282, + "step": 4270 + }, + { + "epoch": 0.49319335486848176, + "grad_norm": 0.2029707431793213, + "learning_rate": 0.00011970401417520913, + "loss": 0.9731, + "step": 4275 + }, + { + "epoch": 0.49377018920166127, + "grad_norm": 0.19050882756710052, + "learning_rate": 0.00011950656357936525, + "loss": 0.9431, + "step": 4280 + }, + { + "epoch": 0.4943470235348408, + "grad_norm": 0.19275051355361938, + "learning_rate": 0.00011930903389384123, + "loss": 0.9576, + "step": 4285 + }, + { + "epoch": 0.4949238578680203, + "grad_norm": 0.21345672011375427, + "learning_rate": 0.00011911142591952437, + "loss": 0.9696, + "step": 4290 + }, + { + "epoch": 0.4955006922011998, + "grad_norm": 0.1972283273935318, + "learning_rate": 0.0001189137404576195, + "loss": 0.9669, + "step": 4295 + }, + { + "epoch": 0.4960775265343793, + "grad_norm": 0.19907104969024658, + "learning_rate": 0.00011871597830964551, + "loss": 0.9477, + "step": 4300 + }, + { + "epoch": 0.4966543608675588, + "grad_norm": 0.1972326934337616, + "learning_rate": 0.00011851814027743223, + "loss": 0.9962, + "step": 4305 + }, + { + "epoch": 0.49723119520073833, + "grad_norm": 0.18552403151988983, + "learning_rate": 0.00011832022716311722, + "loss": 0.9556, + "step": 4310 + }, + { + 
"epoch": 0.49780802953391784, + "grad_norm": 0.18974804878234863, + "learning_rate": 0.00011812223976914243, + "loss": 0.9446, + "step": 4315 + }, + { + "epoch": 0.49838486386709735, + "grad_norm": 0.2123657763004303, + "learning_rate": 0.00011792417889825094, + "loss": 0.9444, + "step": 4320 + }, + { + "epoch": 0.49896169820027686, + "grad_norm": 0.1999022513628006, + "learning_rate": 0.00011772604535348382, + "loss": 0.9767, + "step": 4325 + }, + { + "epoch": 0.49953853253345637, + "grad_norm": 0.19423572719097137, + "learning_rate": 0.00011752783993817675, + "loss": 0.9548, + "step": 4330 + }, + { + "epoch": 0.5001153668666359, + "grad_norm": 0.19109071791172028, + "learning_rate": 0.00011732956345595682, + "loss": 0.9343, + "step": 4335 + }, + { + "epoch": 0.5006922011998154, + "grad_norm": 0.18442274630069733, + "learning_rate": 0.00011713121671073924, + "loss": 0.9759, + "step": 4340 + }, + { + "epoch": 0.501269035532995, + "grad_norm": 0.20228290557861328, + "learning_rate": 0.00011693280050672417, + "loss": 0.9676, + "step": 4345 + }, + { + "epoch": 0.5018458698661744, + "grad_norm": 0.19034543633460999, + "learning_rate": 0.00011673431564839327, + "loss": 0.87, + "step": 4350 + }, + { + "epoch": 0.502422704199354, + "grad_norm": 0.2001582235097885, + "learning_rate": 0.0001165357629405067, + "loss": 0.9489, + "step": 4355 + }, + { + "epoch": 0.5029995385325334, + "grad_norm": 0.18779706954956055, + "learning_rate": 0.00011633714318809962, + "loss": 0.9755, + "step": 4360 + }, + { + "epoch": 0.503576372865713, + "grad_norm": 0.19904857873916626, + "learning_rate": 0.00011613845719647909, + "loss": 0.9586, + "step": 4365 + }, + { + "epoch": 0.5041532071988925, + "grad_norm": 0.20691759884357452, + "learning_rate": 0.00011593970577122067, + "loss": 0.9716, + "step": 4370 + }, + { + "epoch": 0.504730041532072, + "grad_norm": 0.19286218285560608, + "learning_rate": 0.00011574088971816523, + "loss": 0.9424, + "step": 4375 + }, + { + "epoch": 0.5053068758652515, + "grad_norm": 0.1918749362230301, + "learning_rate": 0.00011554200984341577, + "loss": 0.9339, + "step": 4380 + }, + { + "epoch": 0.505883710198431, + "grad_norm": 0.20204107463359833, + "learning_rate": 0.00011534306695333395, + "loss": 1.0028, + "step": 4385 + }, + { + "epoch": 0.5064605445316105, + "grad_norm": 0.21450471878051758, + "learning_rate": 0.00011514406185453692, + "loss": 0.95, + "step": 4390 + }, + { + "epoch": 0.5070373788647901, + "grad_norm": 0.22814354300498962, + "learning_rate": 0.00011494499535389418, + "loss": 0.9179, + "step": 4395 + }, + { + "epoch": 0.5076142131979695, + "grad_norm": 0.19863393902778625, + "learning_rate": 0.00011474586825852405, + "loss": 0.9664, + "step": 4400 + }, + { + "epoch": 0.5081910475311491, + "grad_norm": 0.21064569056034088, + "learning_rate": 0.00011454668137579059, + "loss": 0.9269, + "step": 4405 + }, + { + "epoch": 0.5087678818643285, + "grad_norm": 0.19144834578037262, + "learning_rate": 0.00011434743551330028, + "loss": 0.9448, + "step": 4410 + }, + { + "epoch": 0.5093447161975081, + "grad_norm": 0.19969388842582703, + "learning_rate": 0.00011414813147889868, + "loss": 0.9967, + "step": 4415 + }, + { + "epoch": 0.5099215505306876, + "grad_norm": 0.1932876855134964, + "learning_rate": 0.00011394877008066731, + "loss": 0.9726, + "step": 4420 + }, + { + "epoch": 0.5104983848638671, + "grad_norm": 0.2045474648475647, + "learning_rate": 0.00011374935212692018, + "loss": 0.9356, + "step": 4425 + }, + { + "epoch": 0.5110752191970466, + "grad_norm": 0.2248217910528183, + 
"learning_rate": 0.00011354987842620061, + "loss": 0.9842, + "step": 4430 + }, + { + "epoch": 0.5116520535302261, + "grad_norm": 0.18337203562259674, + "learning_rate": 0.000113350349787278, + "loss": 1.0268, + "step": 4435 + }, + { + "epoch": 0.5122288878634056, + "grad_norm": 0.21334248781204224, + "learning_rate": 0.00011315076701914449, + "loss": 0.952, + "step": 4440 + }, + { + "epoch": 0.5128057221965852, + "grad_norm": 0.20142047107219696, + "learning_rate": 0.00011295113093101162, + "loss": 0.9348, + "step": 4445 + }, + { + "epoch": 0.5133825565297646, + "grad_norm": 0.19259627163410187, + "learning_rate": 0.0001127514423323072, + "loss": 0.9867, + "step": 4450 + }, + { + "epoch": 0.5139593908629442, + "grad_norm": 0.2004094421863556, + "learning_rate": 0.00011255170203267186, + "loss": 0.9208, + "step": 4455 + }, + { + "epoch": 0.5145362251961236, + "grad_norm": 0.18530438840389252, + "learning_rate": 0.000112351910841956, + "loss": 0.9743, + "step": 4460 + }, + { + "epoch": 0.5151130595293032, + "grad_norm": 0.20678523182868958, + "learning_rate": 0.00011215206957021618, + "loss": 0.9476, + "step": 4465 + }, + { + "epoch": 0.5156898938624827, + "grad_norm": 0.20912997424602509, + "learning_rate": 0.00011195217902771212, + "loss": 0.9338, + "step": 4470 + }, + { + "epoch": 0.5162667281956622, + "grad_norm": 0.1984136551618576, + "learning_rate": 0.0001117522400249033, + "loss": 0.9563, + "step": 4475 + }, + { + "epoch": 0.5168435625288417, + "grad_norm": 0.21053320169448853, + "learning_rate": 0.00011155225337244562, + "loss": 0.9753, + "step": 4480 + }, + { + "epoch": 0.5174203968620212, + "grad_norm": 0.19541747868061066, + "learning_rate": 0.00011135221988118825, + "loss": 0.9495, + "step": 4485 + }, + { + "epoch": 0.5179972311952007, + "grad_norm": 0.1899503469467163, + "learning_rate": 0.00011115214036217026, + "loss": 0.9259, + "step": 4490 + }, + { + "epoch": 0.5185740655283803, + "grad_norm": 0.1909545511007309, + "learning_rate": 0.0001109520156266173, + "loss": 0.9308, + "step": 4495 + }, + { + "epoch": 0.5191508998615597, + "grad_norm": 0.18733327090740204, + "learning_rate": 0.00011075184648593838, + "loss": 0.989, + "step": 4500 + }, + { + "epoch": 0.5197277341947393, + "grad_norm": 0.1994892954826355, + "learning_rate": 0.00011055163375172257, + "loss": 0.9611, + "step": 4505 + }, + { + "epoch": 0.5203045685279187, + "grad_norm": 0.1912652552127838, + "learning_rate": 0.00011035137823573561, + "loss": 0.9785, + "step": 4510 + }, + { + "epoch": 0.5208814028610983, + "grad_norm": 0.20350737869739532, + "learning_rate": 0.0001101510807499168, + "loss": 0.9672, + "step": 4515 + }, + { + "epoch": 0.5214582371942778, + "grad_norm": 0.2018175572156906, + "learning_rate": 0.00010995074210637557, + "loss": 0.9362, + "step": 4520 + }, + { + "epoch": 0.5220350715274573, + "grad_norm": 0.20151209831237793, + "learning_rate": 0.00010975036311738818, + "loss": 0.9485, + "step": 4525 + }, + { + "epoch": 0.5226119058606368, + "grad_norm": 0.21562981605529785, + "learning_rate": 0.00010954994459539452, + "loss": 0.9553, + "step": 4530 + }, + { + "epoch": 0.5231887401938163, + "grad_norm": 0.19124732911586761, + "learning_rate": 0.00010934948735299475, + "loss": 0.9422, + "step": 4535 + }, + { + "epoch": 0.5237655745269958, + "grad_norm": 0.20208267867565155, + "learning_rate": 0.00010914899220294607, + "loss": 0.9729, + "step": 4540 + }, + { + "epoch": 0.5243424088601754, + "grad_norm": 0.20815403759479523, + "learning_rate": 0.00010894845995815928, + "loss": 0.98, + "step": 4545 + 
}, + { + "epoch": 0.5249192431933549, + "grad_norm": 0.19528432190418243, + "learning_rate": 0.00010874789143169568, + "loss": 0.9301, + "step": 4550 + }, + { + "epoch": 0.5254960775265344, + "grad_norm": 0.20784156024456024, + "learning_rate": 0.00010854728743676362, + "loss": 0.9553, + "step": 4555 + }, + { + "epoch": 0.5260729118597139, + "grad_norm": 0.20081031322479248, + "learning_rate": 0.00010834664878671525, + "loss": 0.943, + "step": 4560 + }, + { + "epoch": 0.5266497461928934, + "grad_norm": 0.19570979475975037, + "learning_rate": 0.00010814597629504324, + "loss": 0.9876, + "step": 4565 + }, + { + "epoch": 0.527226580526073, + "grad_norm": 0.1984855979681015, + "learning_rate": 0.00010794527077537755, + "loss": 1.0065, + "step": 4570 + }, + { + "epoch": 0.5278034148592524, + "grad_norm": 0.18974007666110992, + "learning_rate": 0.00010774453304148192, + "loss": 0.881, + "step": 4575 + }, + { + "epoch": 0.528380249192432, + "grad_norm": 0.21458855271339417, + "learning_rate": 0.00010754376390725074, + "loss": 0.922, + "step": 4580 + }, + { + "epoch": 0.5289570835256114, + "grad_norm": 0.20850373804569244, + "learning_rate": 0.00010734296418670582, + "loss": 0.9884, + "step": 4585 + }, + { + "epoch": 0.529533917858791, + "grad_norm": 0.20355555415153503, + "learning_rate": 0.00010714213469399283, + "loss": 0.9743, + "step": 4590 + }, + { + "epoch": 0.5301107521919705, + "grad_norm": 0.20082198083400726, + "learning_rate": 0.00010694127624337826, + "loss": 0.9368, + "step": 4595 + }, + { + "epoch": 0.53068758652515, + "grad_norm": 0.1965116560459137, + "learning_rate": 0.00010674038964924597, + "loss": 0.9374, + "step": 4600 + }, + { + "epoch": 0.5312644208583295, + "grad_norm": 0.1953085958957672, + "learning_rate": 0.00010653947572609393, + "loss": 0.9168, + "step": 4605 + }, + { + "epoch": 0.531841255191509, + "grad_norm": 0.18610374629497528, + "learning_rate": 0.0001063385352885309, + "loss": 0.9191, + "step": 4610 + }, + { + "epoch": 0.5324180895246885, + "grad_norm": 0.18412019312381744, + "learning_rate": 0.00010613756915127319, + "loss": 0.9549, + "step": 4615 + }, + { + "epoch": 0.5329949238578681, + "grad_norm": 0.20428583025932312, + "learning_rate": 0.00010593657812914129, + "loss": 0.9849, + "step": 4620 + }, + { + "epoch": 0.5335717581910475, + "grad_norm": 0.1892022341489792, + "learning_rate": 0.00010573556303705652, + "loss": 0.9892, + "step": 4625 + }, + { + "epoch": 0.5341485925242271, + "grad_norm": 0.19151116907596588, + "learning_rate": 0.00010553452469003789, + "loss": 0.9291, + "step": 4630 + }, + { + "epoch": 0.5347254268574065, + "grad_norm": 0.20716647803783417, + "learning_rate": 0.00010533346390319867, + "loss": 0.9391, + "step": 4635 + }, + { + "epoch": 0.5353022611905861, + "grad_norm": 0.19385290145874023, + "learning_rate": 0.00010513238149174304, + "loss": 1.0, + "step": 4640 + }, + { + "epoch": 0.5358790955237656, + "grad_norm": 0.20355677604675293, + "learning_rate": 0.00010493127827096298, + "loss": 0.9311, + "step": 4645 + }, + { + "epoch": 0.5364559298569451, + "grad_norm": 0.2063637375831604, + "learning_rate": 0.00010473015505623477, + "loss": 0.9521, + "step": 4650 + }, + { + "epoch": 0.5370327641901246, + "grad_norm": 0.201882466673851, + "learning_rate": 0.00010452901266301574, + "loss": 0.9302, + "step": 4655 + }, + { + "epoch": 0.5376095985233041, + "grad_norm": 0.19823043048381805, + "learning_rate": 0.000104327851906841, + "loss": 1.0078, + "step": 4660 + }, + { + "epoch": 0.5381864328564836, + "grad_norm": 0.19284914433956146, + 
"learning_rate": 0.00010412667360332013, + "loss": 0.9246, + "step": 4665 + }, + { + "epoch": 0.5387632671896632, + "grad_norm": 0.19816389679908752, + "learning_rate": 0.00010392547856813384, + "loss": 0.9565, + "step": 4670 + }, + { + "epoch": 0.5393401015228426, + "grad_norm": 0.2038826197385788, + "learning_rate": 0.00010372426761703067, + "loss": 0.9511, + "step": 4675 + }, + { + "epoch": 0.5399169358560222, + "grad_norm": 0.2072780877351761, + "learning_rate": 0.00010352304156582376, + "loss": 0.9883, + "step": 4680 + }, + { + "epoch": 0.5404937701892016, + "grad_norm": 0.2100173830986023, + "learning_rate": 0.0001033218012303873, + "loss": 0.9626, + "step": 4685 + }, + { + "epoch": 0.5410706045223812, + "grad_norm": 0.1984483003616333, + "learning_rate": 0.00010312054742665362, + "loss": 0.9235, + "step": 4690 + }, + { + "epoch": 0.5416474388555607, + "grad_norm": 0.19214801490306854, + "learning_rate": 0.0001029192809706095, + "loss": 0.9675, + "step": 4695 + }, + { + "epoch": 0.5422242731887402, + "grad_norm": 0.19624173641204834, + "learning_rate": 0.00010271800267829308, + "loss": 0.9274, + "step": 4700 + }, + { + "epoch": 0.5428011075219197, + "grad_norm": 0.19869272410869598, + "learning_rate": 0.00010251671336579048, + "loss": 0.9557, + "step": 4705 + }, + { + "epoch": 0.5433779418550992, + "grad_norm": 0.191603422164917, + "learning_rate": 0.00010231541384923248, + "loss": 0.9239, + "step": 4710 + }, + { + "epoch": 0.5439547761882787, + "grad_norm": 0.18842673301696777, + "learning_rate": 0.0001021141049447913, + "loss": 0.9652, + "step": 4715 + }, + { + "epoch": 0.5445316105214583, + "grad_norm": 0.1997225433588028, + "learning_rate": 0.00010191278746867714, + "loss": 0.964, + "step": 4720 + }, + { + "epoch": 0.5451084448546377, + "grad_norm": 0.22433891892433167, + "learning_rate": 0.00010171146223713496, + "loss": 0.9204, + "step": 4725 + }, + { + "epoch": 0.5456852791878173, + "grad_norm": 0.20622776448726654, + "learning_rate": 0.00010151013006644128, + "loss": 0.9701, + "step": 4730 + }, + { + "epoch": 0.5462621135209967, + "grad_norm": 0.19883517920970917, + "learning_rate": 0.00010130879177290061, + "loss": 0.9816, + "step": 4735 + }, + { + "epoch": 0.5468389478541763, + "grad_norm": 0.1920338273048401, + "learning_rate": 0.00010110744817284232, + "loss": 0.9579, + "step": 4740 + }, + { + "epoch": 0.5474157821873558, + "grad_norm": 0.18235036730766296, + "learning_rate": 0.00010090610008261738, + "loss": 0.9488, + "step": 4745 + }, + { + "epoch": 0.5479926165205353, + "grad_norm": 0.2129899263381958, + "learning_rate": 0.00010070474831859486, + "loss": 1.0436, + "step": 4750 + }, + { + "epoch": 0.5485694508537148, + "grad_norm": 0.19054663181304932, + "learning_rate": 0.0001005033936971588, + "loss": 0.9736, + "step": 4755 + }, + { + "epoch": 0.5491462851868943, + "grad_norm": 0.199018195271492, + "learning_rate": 0.00010030203703470477, + "loss": 0.9589, + "step": 4760 + }, + { + "epoch": 0.5497231195200738, + "grad_norm": 0.20436997711658478, + "learning_rate": 0.00010010067914763668, + "loss": 0.9825, + "step": 4765 + }, + { + "epoch": 0.5502999538532534, + "grad_norm": 0.19828738272190094, + "learning_rate": 9.989932085236334e-05, + "loss": 0.9739, + "step": 4770 + }, + { + "epoch": 0.5508767881864328, + "grad_norm": 0.19354400038719177, + "learning_rate": 9.969796296529525e-05, + "loss": 0.968, + "step": 4775 + }, + { + "epoch": 0.5514536225196124, + "grad_norm": 0.2065267264842987, + "learning_rate": 9.949660630284122e-05, + "loss": 0.9612, + "step": 4780 + }, 
+ { + "epoch": 0.5520304568527918, + "grad_norm": 0.18790316581726074, + "learning_rate": 9.929525168140516e-05, + "loss": 0.9412, + "step": 4785 + }, + { + "epoch": 0.5526072911859714, + "grad_norm": 0.19820088148117065, + "learning_rate": 9.909389991738263e-05, + "loss": 0.9354, + "step": 4790 + }, + { + "epoch": 0.5531841255191509, + "grad_norm": 0.1902572065591812, + "learning_rate": 9.889255182715769e-05, + "loss": 0.9176, + "step": 4795 + }, + { + "epoch": 0.5537609598523304, + "grad_norm": 0.18741828203201294, + "learning_rate": 9.869120822709946e-05, + "loss": 0.9258, + "step": 4800 + }, + { + "epoch": 0.5543377941855099, + "grad_norm": 0.18927207589149475, + "learning_rate": 9.848986993355877e-05, + "loss": 0.9675, + "step": 4805 + }, + { + "epoch": 0.5549146285186894, + "grad_norm": 0.20099502801895142, + "learning_rate": 9.828853776286505e-05, + "loss": 0.937, + "step": 4810 + }, + { + "epoch": 0.5554914628518689, + "grad_norm": 0.19836010038852692, + "learning_rate": 9.808721253132289e-05, + "loss": 0.9674, + "step": 4815 + }, + { + "epoch": 0.5560682971850485, + "grad_norm": 0.18979863822460175, + "learning_rate": 9.78858950552087e-05, + "loss": 0.9916, + "step": 4820 + }, + { + "epoch": 0.5566451315182279, + "grad_norm": 0.191952183842659, + "learning_rate": 9.768458615076751e-05, + "loss": 0.9519, + "step": 4825 + }, + { + "epoch": 0.5572219658514075, + "grad_norm": 0.19669634103775024, + "learning_rate": 9.748328663420952e-05, + "loss": 0.9389, + "step": 4830 + }, + { + "epoch": 0.5577988001845869, + "grad_norm": 0.20401979982852936, + "learning_rate": 9.728199732170696e-05, + "loss": 0.9875, + "step": 4835 + }, + { + "epoch": 0.5583756345177665, + "grad_norm": 0.2189038097858429, + "learning_rate": 9.708071902939054e-05, + "loss": 0.9388, + "step": 4840 + }, + { + "epoch": 0.558952468850946, + "grad_norm": 0.2018914520740509, + "learning_rate": 9.687945257334641e-05, + "loss": 1.0281, + "step": 4845 + }, + { + "epoch": 0.5595293031841255, + "grad_norm": 0.20172595977783203, + "learning_rate": 9.667819876961272e-05, + "loss": 0.9957, + "step": 4850 + }, + { + "epoch": 0.560106137517305, + "grad_norm": 0.2063581645488739, + "learning_rate": 9.647695843417628e-05, + "loss": 0.9723, + "step": 4855 + }, + { + "epoch": 0.5606829718504845, + "grad_norm": 0.2113044112920761, + "learning_rate": 9.627573238296933e-05, + "loss": 0.9577, + "step": 4860 + }, + { + "epoch": 0.561259806183664, + "grad_norm": 0.20113973319530487, + "learning_rate": 9.60745214318662e-05, + "loss": 0.9239, + "step": 4865 + }, + { + "epoch": 0.5618366405168436, + "grad_norm": 0.20177756249904633, + "learning_rate": 9.58733263966799e-05, + "loss": 1.0015, + "step": 4870 + }, + { + "epoch": 0.562413474850023, + "grad_norm": 0.19315795600414276, + "learning_rate": 9.567214809315903e-05, + "loss": 0.9231, + "step": 4875 + }, + { + "epoch": 0.5629903091832026, + "grad_norm": 0.19903239607810974, + "learning_rate": 9.547098733698428e-05, + "loss": 0.9608, + "step": 4880 + }, + { + "epoch": 0.563567143516382, + "grad_norm": 0.19510619342327118, + "learning_rate": 9.526984494376524e-05, + "loss": 1.0001, + "step": 4885 + }, + { + "epoch": 0.5641439778495616, + "grad_norm": 0.1909225881099701, + "learning_rate": 9.5068721729037e-05, + "loss": 0.9288, + "step": 4890 + }, + { + "epoch": 0.5647208121827412, + "grad_norm": 0.19778591394424438, + "learning_rate": 9.486761850825694e-05, + "loss": 0.9509, + "step": 4895 + }, + { + "epoch": 0.5652976465159206, + "grad_norm": 0.19866321980953217, + "learning_rate": 
9.466653609680137e-05, + "loss": 0.9815, + "step": 4900 + }, + { + "epoch": 0.5658744808491002, + "grad_norm": 0.19185493886470795, + "learning_rate": 9.446547530996214e-05, + "loss": 0.9667, + "step": 4905 + }, + { + "epoch": 0.5664513151822796, + "grad_norm": 0.22153504192829132, + "learning_rate": 9.426443696294351e-05, + "loss": 0.9367, + "step": 4910 + }, + { + "epoch": 0.5670281495154592, + "grad_norm": 0.18705500662326813, + "learning_rate": 9.406342187085875e-05, + "loss": 0.9668, + "step": 4915 + }, + { + "epoch": 0.5676049838486387, + "grad_norm": 0.19349828362464905, + "learning_rate": 9.386243084872682e-05, + "loss": 0.892, + "step": 4920 + }, + { + "epoch": 0.5681818181818182, + "grad_norm": 0.21577003598213196, + "learning_rate": 9.36614647114691e-05, + "loss": 0.9722, + "step": 4925 + }, + { + "epoch": 0.5687586525149977, + "grad_norm": 0.2050502598285675, + "learning_rate": 9.34605242739061e-05, + "loss": 0.9489, + "step": 4930 + }, + { + "epoch": 0.5693354868481773, + "grad_norm": 0.19747060537338257, + "learning_rate": 9.325961035075405e-05, + "loss": 0.9862, + "step": 4935 + }, + { + "epoch": 0.5699123211813567, + "grad_norm": 0.2020748257637024, + "learning_rate": 9.305872375662176e-05, + "loss": 1.017, + "step": 4940 + }, + { + "epoch": 0.5704891555145363, + "grad_norm": 0.19945184886455536, + "learning_rate": 9.285786530600718e-05, + "loss": 0.9749, + "step": 4945 + }, + { + "epoch": 0.5710659898477157, + "grad_norm": 0.20474183559417725, + "learning_rate": 9.26570358132942e-05, + "loss": 0.93, + "step": 4950 + }, + { + "epoch": 0.5716428241808953, + "grad_norm": 0.1998247355222702, + "learning_rate": 9.245623609274928e-05, + "loss": 0.9278, + "step": 4955 + }, + { + "epoch": 0.5722196585140747, + "grad_norm": 0.19674921035766602, + "learning_rate": 9.225546695851815e-05, + "loss": 0.9899, + "step": 4960 + }, + { + "epoch": 0.5727964928472543, + "grad_norm": 0.19790132343769073, + "learning_rate": 9.20547292246225e-05, + "loss": 0.9343, + "step": 4965 + }, + { + "epoch": 0.5733733271804338, + "grad_norm": 0.20167304575443268, + "learning_rate": 9.185402370495677e-05, + "loss": 0.9547, + "step": 4970 + }, + { + "epoch": 0.5739501615136133, + "grad_norm": 0.19420599937438965, + "learning_rate": 9.165335121328477e-05, + "loss": 0.9824, + "step": 4975 + }, + { + "epoch": 0.5745269958467928, + "grad_norm": 0.18731547892093658, + "learning_rate": 9.14527125632364e-05, + "loss": 0.9304, + "step": 4980 + }, + { + "epoch": 0.5751038301799724, + "grad_norm": 0.19297674298286438, + "learning_rate": 9.125210856830433e-05, + "loss": 0.9895, + "step": 4985 + }, + { + "epoch": 0.5756806645131518, + "grad_norm": 0.19416366517543793, + "learning_rate": 9.105154004184071e-05, + "loss": 0.9606, + "step": 4990 + }, + { + "epoch": 0.5762574988463314, + "grad_norm": 0.19022150337696075, + "learning_rate": 9.085100779705398e-05, + "loss": 0.9269, + "step": 4995 + }, + { + "epoch": 0.5768343331795108, + "grad_norm": 0.19337041676044464, + "learning_rate": 9.065051264700527e-05, + "loss": 0.9502, + "step": 5000 + }, + { + "epoch": 0.5774111675126904, + "grad_norm": 0.19673341512680054, + "learning_rate": 9.045005540460552e-05, + "loss": 0.9453, + "step": 5005 + }, + { + "epoch": 0.5779880018458698, + "grad_norm": 0.19977766275405884, + "learning_rate": 9.024963688261186e-05, + "loss": 0.9208, + "step": 5010 + }, + { + "epoch": 0.5785648361790494, + "grad_norm": 0.19135598838329315, + "learning_rate": 9.004925789362446e-05, + "loss": 0.961, + "step": 5015 + }, + { + "epoch": 
0.5791416705122289, + "grad_norm": 0.1971130669116974, + "learning_rate": 8.984891925008321e-05, + "loss": 0.945, + "step": 5020 + }, + { + "epoch": 0.5797185048454084, + "grad_norm": 0.19552946090698242, + "learning_rate": 8.964862176426443e-05, + "loss": 0.9618, + "step": 5025 + }, + { + "epoch": 0.5802953391785879, + "grad_norm": 0.18302129209041595, + "learning_rate": 8.944836624827748e-05, + "loss": 0.956, + "step": 5030 + }, + { + "epoch": 0.5808721735117675, + "grad_norm": 0.2028164565563202, + "learning_rate": 8.924815351406163e-05, + "loss": 1.0094, + "step": 5035 + }, + { + "epoch": 0.5814490078449469, + "grad_norm": 0.19891835749149323, + "learning_rate": 8.904798437338272e-05, + "loss": 0.9727, + "step": 5040 + }, + { + "epoch": 0.5820258421781265, + "grad_norm": 0.1948157548904419, + "learning_rate": 8.884785963782975e-05, + "loss": 1.0068, + "step": 5045 + }, + { + "epoch": 0.5826026765113059, + "grad_norm": 0.19020549952983856, + "learning_rate": 8.864778011881175e-05, + "loss": 0.9164, + "step": 5050 + }, + { + "epoch": 0.5831795108444855, + "grad_norm": 0.20140686631202698, + "learning_rate": 8.84477466275544e-05, + "loss": 0.949, + "step": 5055 + }, + { + "epoch": 0.583756345177665, + "grad_norm": 0.203651562333107, + "learning_rate": 8.824775997509675e-05, + "loss": 0.9788, + "step": 5060 + }, + { + "epoch": 0.5843331795108445, + "grad_norm": 0.20070117712020874, + "learning_rate": 8.80478209722879e-05, + "loss": 0.9566, + "step": 5065 + }, + { + "epoch": 0.584910013844024, + "grad_norm": 0.2062043994665146, + "learning_rate": 8.784793042978384e-05, + "loss": 0.9331, + "step": 5070 + }, + { + "epoch": 0.5854868481772035, + "grad_norm": 0.19828034937381744, + "learning_rate": 8.764808915804401e-05, + "loss": 0.9926, + "step": 5075 + }, + { + "epoch": 0.586063682510383, + "grad_norm": 0.2180069088935852, + "learning_rate": 8.744829796732812e-05, + "loss": 1.0008, + "step": 5080 + }, + { + "epoch": 0.5866405168435626, + "grad_norm": 0.2032460868358612, + "learning_rate": 8.724855766769282e-05, + "loss": 1.017, + "step": 5085 + }, + { + "epoch": 0.587217351176742, + "grad_norm": 0.21724148094654083, + "learning_rate": 8.70488690689884e-05, + "loss": 0.9026, + "step": 5090 + }, + { + "epoch": 0.5877941855099216, + "grad_norm": 0.21417102217674255, + "learning_rate": 8.684923298085555e-05, + "loss": 0.9439, + "step": 5095 + }, + { + "epoch": 0.588371019843101, + "grad_norm": 0.19951827824115753, + "learning_rate": 8.6649650212722e-05, + "loss": 0.9587, + "step": 5100 + }, + { + "epoch": 0.5889478541762806, + "grad_norm": 0.20338566601276398, + "learning_rate": 8.645012157379941e-05, + "loss": 1.0392, + "step": 5105 + }, + { + "epoch": 0.58952468850946, + "grad_norm": 0.18534879386425018, + "learning_rate": 8.625064787307986e-05, + "loss": 0.975, + "step": 5110 + }, + { + "epoch": 0.5901015228426396, + "grad_norm": 0.20185095071792603, + "learning_rate": 8.605122991933271e-05, + "loss": 0.9446, + "step": 5115 + }, + { + "epoch": 0.5906783571758191, + "grad_norm": 0.19836807250976562, + "learning_rate": 8.585186852110134e-05, + "loss": 0.9678, + "step": 5120 + }, + { + "epoch": 0.5912551915089986, + "grad_norm": 0.20136135816574097, + "learning_rate": 8.565256448669976e-05, + "loss": 0.9662, + "step": 5125 + }, + { + "epoch": 0.5918320258421781, + "grad_norm": 0.1838986873626709, + "learning_rate": 8.545331862420944e-05, + "loss": 0.9133, + "step": 5130 + }, + { + "epoch": 0.5924088601753577, + "grad_norm": 0.19178135693073273, + "learning_rate": 8.525413174147598e-05, + "loss": 
0.9451, + "step": 5135 + }, + { + "epoch": 0.5929856945085371, + "grad_norm": 0.20643360912799835, + "learning_rate": 8.505500464610584e-05, + "loss": 0.9355, + "step": 5140 + }, + { + "epoch": 0.5935625288417167, + "grad_norm": 0.20259740948677063, + "learning_rate": 8.485593814546307e-05, + "loss": 0.9678, + "step": 5145 + }, + { + "epoch": 0.5941393631748961, + "grad_norm": 0.19384346902370453, + "learning_rate": 8.465693304666606e-05, + "loss": 0.966, + "step": 5150 + }, + { + "epoch": 0.5947161975080757, + "grad_norm": 0.2989238202571869, + "learning_rate": 8.445799015658427e-05, + "loss": 0.9356, + "step": 5155 + }, + { + "epoch": 0.5952930318412551, + "grad_norm": 0.212250217795372, + "learning_rate": 8.425911028183479e-05, + "loss": 0.9423, + "step": 5160 + }, + { + "epoch": 0.5958698661744347, + "grad_norm": 0.19762969017028809, + "learning_rate": 8.406029422877937e-05, + "loss": 0.9727, + "step": 5165 + }, + { + "epoch": 0.5964467005076142, + "grad_norm": 0.18385392427444458, + "learning_rate": 8.386154280352094e-05, + "loss": 0.9671, + "step": 5170 + }, + { + "epoch": 0.5970235348407937, + "grad_norm": 0.21260400116443634, + "learning_rate": 8.366285681190039e-05, + "loss": 0.9678, + "step": 5175 + }, + { + "epoch": 0.5976003691739732, + "grad_norm": 0.20906022191047668, + "learning_rate": 8.34642370594933e-05, + "loss": 0.9252, + "step": 5180 + }, + { + "epoch": 0.5981772035071528, + "grad_norm": 0.18937109410762787, + "learning_rate": 8.326568435160677e-05, + "loss": 1.0374, + "step": 5185 + }, + { + "epoch": 0.5987540378403322, + "grad_norm": 0.19254235923290253, + "learning_rate": 8.306719949327588e-05, + "loss": 0.9621, + "step": 5190 + }, + { + "epoch": 0.5993308721735118, + "grad_norm": 0.1981174647808075, + "learning_rate": 8.286878328926077e-05, + "loss": 0.9276, + "step": 5195 + }, + { + "epoch": 0.5999077065066912, + "grad_norm": 0.2028452455997467, + "learning_rate": 8.26704365440432e-05, + "loss": 0.9824, + "step": 5200 + }, + { + "epoch": 0.6004845408398708, + "grad_norm": 0.2027309387922287, + "learning_rate": 8.247216006182326e-05, + "loss": 0.9204, + "step": 5205 + }, + { + "epoch": 0.6010613751730502, + "grad_norm": 0.19740572571754456, + "learning_rate": 8.227395464651618e-05, + "loss": 0.9799, + "step": 5210 + }, + { + "epoch": 0.6016382095062298, + "grad_norm": 0.19563211500644684, + "learning_rate": 8.20758211017491e-05, + "loss": 0.9518, + "step": 5215 + }, + { + "epoch": 0.6022150438394093, + "grad_norm": 0.19438865780830383, + "learning_rate": 8.187776023085762e-05, + "loss": 1.0022, + "step": 5220 + }, + { + "epoch": 0.6027918781725888, + "grad_norm": 0.1977875679731369, + "learning_rate": 8.167977283688282e-05, + "loss": 0.997, + "step": 5225 + }, + { + "epoch": 0.6033687125057683, + "grad_norm": 0.22014763951301575, + "learning_rate": 8.148185972256778e-05, + "loss": 0.9436, + "step": 5230 + }, + { + "epoch": 0.6039455468389479, + "grad_norm": 0.19123868644237518, + "learning_rate": 8.128402169035451e-05, + "loss": 0.971, + "step": 5235 + }, + { + "epoch": 0.6045223811721273, + "grad_norm": 0.18875616788864136, + "learning_rate": 8.108625954238051e-05, + "loss": 0.9399, + "step": 5240 + }, + { + "epoch": 0.6050992155053069, + "grad_norm": 0.20804201066493988, + "learning_rate": 8.088857408047562e-05, + "loss": 0.9634, + "step": 5245 + }, + { + "epoch": 0.6056760498384864, + "grad_norm": 0.1881391406059265, + "learning_rate": 8.06909661061588e-05, + "loss": 0.9885, + "step": 5250 + }, + { + "epoch": 0.6062528841716659, + "grad_norm": 
0.19537098705768585, + "learning_rate": 8.049343642063477e-05, + "loss": 0.952, + "step": 5255 + }, + { + "epoch": 0.6068297185048455, + "grad_norm": 0.18685182929039001, + "learning_rate": 8.029598582479088e-05, + "loss": 0.9603, + "step": 5260 + }, + { + "epoch": 0.6074065528380249, + "grad_norm": 0.2082066535949707, + "learning_rate": 8.00986151191938e-05, + "loss": 0.8972, + "step": 5265 + }, + { + "epoch": 0.6079833871712045, + "grad_norm": 0.20010879635810852, + "learning_rate": 7.990132510408625e-05, + "loss": 0.934, + "step": 5270 + }, + { + "epoch": 0.6085602215043839, + "grad_norm": 0.19527243077754974, + "learning_rate": 7.970411657938381e-05, + "loss": 0.9687, + "step": 5275 + }, + { + "epoch": 0.6091370558375635, + "grad_norm": 0.2064102441072464, + "learning_rate": 7.95069903446717e-05, + "loss": 0.9407, + "step": 5280 + }, + { + "epoch": 0.609713890170743, + "grad_norm": 0.19495844841003418, + "learning_rate": 7.930994719920142e-05, + "loss": 0.9628, + "step": 5285 + }, + { + "epoch": 0.6102907245039225, + "grad_norm": 0.21486780047416687, + "learning_rate": 7.911298794188761e-05, + "loss": 0.9869, + "step": 5290 + }, + { + "epoch": 0.610867558837102, + "grad_norm": 0.19653169810771942, + "learning_rate": 7.891611337130482e-05, + "loss": 0.9998, + "step": 5295 + }, + { + "epoch": 0.6114443931702815, + "grad_norm": 0.21410493552684784, + "learning_rate": 7.871932428568418e-05, + "loss": 0.9142, + "step": 5300 + }, + { + "epoch": 0.612021227503461, + "grad_norm": 0.18754735589027405, + "learning_rate": 7.852262148291028e-05, + "loss": 1.0069, + "step": 5305 + }, + { + "epoch": 0.6125980618366406, + "grad_norm": 0.19620005786418915, + "learning_rate": 7.832600576051779e-05, + "loss": 0.9078, + "step": 5310 + }, + { + "epoch": 0.61317489616982, + "grad_norm": 0.20171235501766205, + "learning_rate": 7.812947791568845e-05, + "loss": 0.9696, + "step": 5315 + }, + { + "epoch": 0.6137517305029996, + "grad_norm": 0.20567715167999268, + "learning_rate": 7.793303874524752e-05, + "loss": 0.9431, + "step": 5320 + }, + { + "epoch": 0.614328564836179, + "grad_norm": 0.19829270243644714, + "learning_rate": 7.773668904566085e-05, + "loss": 0.9473, + "step": 5325 + }, + { + "epoch": 0.6149053991693586, + "grad_norm": 0.1850794106721878, + "learning_rate": 7.75404296130315e-05, + "loss": 0.9487, + "step": 5330 + }, + { + "epoch": 0.6154822335025381, + "grad_norm": 0.19462130963802338, + "learning_rate": 7.734426124309656e-05, + "loss": 0.9599, + "step": 5335 + }, + { + "epoch": 0.6160590678357176, + "grad_norm": 0.20998431742191315, + "learning_rate": 7.714818473122385e-05, + "loss": 0.9605, + "step": 5340 + }, + { + "epoch": 0.6166359021688971, + "grad_norm": 0.18975423276424408, + "learning_rate": 7.695220087240885e-05, + "loss": 0.9829, + "step": 5345 + }, + { + "epoch": 0.6172127365020766, + "grad_norm": 0.19463218748569489, + "learning_rate": 7.675631046127123e-05, + "loss": 0.9586, + "step": 5350 + }, + { + "epoch": 0.6177895708352561, + "grad_norm": 0.19710615277290344, + "learning_rate": 7.656051429205188e-05, + "loss": 0.9812, + "step": 5355 + }, + { + "epoch": 0.6183664051684357, + "grad_norm": 0.19291462004184723, + "learning_rate": 7.636481315860958e-05, + "loss": 0.9615, + "step": 5360 + }, + { + "epoch": 0.6189432395016151, + "grad_norm": 0.19099998474121094, + "learning_rate": 7.616920785441777e-05, + "loss": 0.9017, + "step": 5365 + }, + { + "epoch": 0.6195200738347947, + "grad_norm": 0.1884920597076416, + "learning_rate": 7.597369917256132e-05, + "loss": 0.9232, + "step": 5370 
+ }, + { + "epoch": 0.6200969081679741, + "grad_norm": 0.20266391336917877, + "learning_rate": 7.577828790573345e-05, + "loss": 0.973, + "step": 5375 + }, + { + "epoch": 0.6206737425011537, + "grad_norm": 0.19208677113056183, + "learning_rate": 7.55829748462322e-05, + "loss": 0.966, + "step": 5380 + }, + { + "epoch": 0.6212505768343332, + "grad_norm": 0.1974397599697113, + "learning_rate": 7.538776078595762e-05, + "loss": 0.9816, + "step": 5385 + }, + { + "epoch": 0.6218274111675127, + "grad_norm": 0.1980104148387909, + "learning_rate": 7.519264651640829e-05, + "loss": 0.9531, + "step": 5390 + }, + { + "epoch": 0.6224042455006922, + "grad_norm": 0.19935230910778046, + "learning_rate": 7.499763282867823e-05, + "loss": 0.9754, + "step": 5395 + }, + { + "epoch": 0.6229810798338717, + "grad_norm": 0.19416543841362, + "learning_rate": 7.480272051345358e-05, + "loss": 0.9571, + "step": 5400 + }, + { + "epoch": 0.6235579141670512, + "grad_norm": 0.21511389315128326, + "learning_rate": 7.460791036100952e-05, + "loss": 0.9454, + "step": 5405 + }, + { + "epoch": 0.6241347485002308, + "grad_norm": 0.1961875706911087, + "learning_rate": 7.4413203161207e-05, + "loss": 0.9851, + "step": 5410 + }, + { + "epoch": 0.6247115828334102, + "grad_norm": 0.20345531404018402, + "learning_rate": 7.421859970348949e-05, + "loss": 0.9334, + "step": 5415 + }, + { + "epoch": 0.6252884171665898, + "grad_norm": 0.18896692991256714, + "learning_rate": 7.402410077687993e-05, + "loss": 0.9288, + "step": 5420 + }, + { + "epoch": 0.6258652514997692, + "grad_norm": 0.21268202364444733, + "learning_rate": 7.382970716997736e-05, + "loss": 0.9789, + "step": 5425 + }, + { + "epoch": 0.6264420858329488, + "grad_norm": 0.20382317900657654, + "learning_rate": 7.363541967095387e-05, + "loss": 0.9449, + "step": 5430 + }, + { + "epoch": 0.6270189201661283, + "grad_norm": 0.1883535534143448, + "learning_rate": 7.344123906755124e-05, + "loss": 0.9609, + "step": 5435 + }, + { + "epoch": 0.6275957544993078, + "grad_norm": 0.2033308893442154, + "learning_rate": 7.324716614707793e-05, + "loss": 0.9712, + "step": 5440 + }, + { + "epoch": 0.6281725888324873, + "grad_norm": 0.1854039430618286, + "learning_rate": 7.305320169640575e-05, + "loss": 0.9199, + "step": 5445 + }, + { + "epoch": 0.6287494231656668, + "grad_norm": 0.19366517663002014, + "learning_rate": 7.285934650196672e-05, + "loss": 0.9421, + "step": 5450 + }, + { + "epoch": 0.6293262574988463, + "grad_norm": 0.1920800656080246, + "learning_rate": 7.266560134974989e-05, + "loss": 0.9357, + "step": 5455 + }, + { + "epoch": 0.6299030918320259, + "grad_norm": 0.1931942254304886, + "learning_rate": 7.247196702529815e-05, + "loss": 0.9787, + "step": 5460 + }, + { + "epoch": 0.6304799261652053, + "grad_norm": 0.20151135325431824, + "learning_rate": 7.227844431370502e-05, + "loss": 1.0103, + "step": 5465 + }, + { + "epoch": 0.6310567604983849, + "grad_norm": 0.19090351462364197, + "learning_rate": 7.208503399961149e-05, + "loss": 0.9166, + "step": 5470 + }, + { + "epoch": 0.6316335948315643, + "grad_norm": 0.19098447263240814, + "learning_rate": 7.189173686720287e-05, + "loss": 0.9626, + "step": 5475 + }, + { + "epoch": 0.6322104291647439, + "grad_norm": 0.20782425999641418, + "learning_rate": 7.169855370020547e-05, + "loss": 1.0002, + "step": 5480 + }, + { + "epoch": 0.6327872634979234, + "grad_norm": 0.20794013142585754, + "learning_rate": 7.15054852818836e-05, + "loss": 0.9396, + "step": 5485 + }, + { + "epoch": 0.6333640978311029, + "grad_norm": 0.2069510966539383, + "learning_rate": 
7.131253239503635e-05, + "loss": 0.9651, + "step": 5490 + }, + { + "epoch": 0.6339409321642824, + "grad_norm": 0.19113782048225403, + "learning_rate": 7.111969582199431e-05, + "loss": 0.9243, + "step": 5495 + }, + { + "epoch": 0.6345177664974619, + "grad_norm": 0.19338031113147736, + "learning_rate": 7.092697634461654e-05, + "loss": 0.9379, + "step": 5500 + }, + { + "epoch": 0.6350946008306414, + "grad_norm": 0.214319109916687, + "learning_rate": 7.073437474428732e-05, + "loss": 0.99, + "step": 5505 + }, + { + "epoch": 0.635671435163821, + "grad_norm": 0.19014938175678253, + "learning_rate": 7.05418918019129e-05, + "loss": 0.9966, + "step": 5510 + }, + { + "epoch": 0.6362482694970004, + "grad_norm": 0.20483070611953735, + "learning_rate": 7.034952829791858e-05, + "loss": 0.9126, + "step": 5515 + }, + { + "epoch": 0.63682510383018, + "grad_norm": 0.19414497911930084, + "learning_rate": 7.01572850122453e-05, + "loss": 0.959, + "step": 5520 + }, + { + "epoch": 0.6374019381633594, + "grad_norm": 0.20513789355754852, + "learning_rate": 6.996516272434658e-05, + "loss": 0.9301, + "step": 5525 + }, + { + "epoch": 0.637978772496539, + "grad_norm": 0.19759726524353027, + "learning_rate": 6.97731622131854e-05, + "loss": 0.9727, + "step": 5530 + }, + { + "epoch": 0.6385556068297185, + "grad_norm": 0.20151971280574799, + "learning_rate": 6.9581284257231e-05, + "loss": 0.9625, + "step": 5535 + }, + { + "epoch": 0.639132441162898, + "grad_norm": 0.2085186392068863, + "learning_rate": 6.938952963445559e-05, + "loss": 0.9663, + "step": 5540 + }, + { + "epoch": 0.6397092754960775, + "grad_norm": 0.19066348671913147, + "learning_rate": 6.919789912233146e-05, + "loss": 0.9397, + "step": 5545 + }, + { + "epoch": 0.640286109829257, + "grad_norm": 0.2034011334180832, + "learning_rate": 6.900639349782762e-05, + "loss": 0.975, + "step": 5550 + }, + { + "epoch": 0.6408629441624365, + "grad_norm": 0.19911380112171173, + "learning_rate": 6.88150135374068e-05, + "loss": 1.0096, + "step": 5555 + }, + { + "epoch": 0.6414397784956161, + "grad_norm": 0.19491539895534515, + "learning_rate": 6.862376001702213e-05, + "loss": 0.9999, + "step": 5560 + }, + { + "epoch": 0.6420166128287955, + "grad_norm": 0.18677499890327454, + "learning_rate": 6.843263371211414e-05, + "loss": 0.9005, + "step": 5565 + }, + { + "epoch": 0.6425934471619751, + "grad_norm": 0.19133366644382477, + "learning_rate": 6.824163539760759e-05, + "loss": 0.9202, + "step": 5570 + }, + { + "epoch": 0.6431702814951545, + "grad_norm": 0.19170129299163818, + "learning_rate": 6.805076584790818e-05, + "loss": 0.955, + "step": 5575 + }, + { + "epoch": 0.6437471158283341, + "grad_norm": 0.18898151814937592, + "learning_rate": 6.786002583689968e-05, + "loss": 0.9515, + "step": 5580 + }, + { + "epoch": 0.6443239501615136, + "grad_norm": 0.20018716156482697, + "learning_rate": 6.766941613794053e-05, + "loss": 0.9429, + "step": 5585 + }, + { + "epoch": 0.6449007844946931, + "grad_norm": 0.20243898034095764, + "learning_rate": 6.747893752386088e-05, + "loss": 0.9879, + "step": 5590 + }, + { + "epoch": 0.6454776188278727, + "grad_norm": 0.18688704073429108, + "learning_rate": 6.728859076695938e-05, + "loss": 0.9039, + "step": 5595 + }, + { + "epoch": 0.6460544531610521, + "grad_norm": 0.2041657567024231, + "learning_rate": 6.709837663900007e-05, + "loss": 0.9449, + "step": 5600 + }, + { + "epoch": 0.6466312874942317, + "grad_norm": 0.17731288075447083, + "learning_rate": 6.690829591120922e-05, + "loss": 0.8981, + "step": 5605 + }, + { + "epoch": 0.6472081218274112, + 
"grad_norm": 0.19162966310977936, + "learning_rate": 6.671834935427222e-05, + "loss": 0.9003, + "step": 5610 + }, + { + "epoch": 0.6477849561605907, + "grad_norm": 0.19198764860630035, + "learning_rate": 6.652853773833052e-05, + "loss": 0.9338, + "step": 5615 + }, + { + "epoch": 0.6483617904937702, + "grad_norm": 0.1949075311422348, + "learning_rate": 6.633886183297838e-05, + "loss": 0.9595, + "step": 5620 + }, + { + "epoch": 0.6489386248269498, + "grad_norm": 0.18270482122898102, + "learning_rate": 6.614932240725989e-05, + "loss": 0.9107, + "step": 5625 + }, + { + "epoch": 0.6495154591601292, + "grad_norm": 0.19540977478027344, + "learning_rate": 6.595992022966571e-05, + "loss": 0.9186, + "step": 5630 + }, + { + "epoch": 0.6500922934933088, + "grad_norm": 0.19692561030387878, + "learning_rate": 6.577065606813011e-05, + "loss": 0.9674, + "step": 5635 + }, + { + "epoch": 0.6506691278264882, + "grad_norm": 0.19444316625595093, + "learning_rate": 6.558153069002764e-05, + "loss": 0.998, + "step": 5640 + }, + { + "epoch": 0.6512459621596678, + "grad_norm": 0.19267070293426514, + "learning_rate": 6.539254486217026e-05, + "loss": 0.9694, + "step": 5645 + }, + { + "epoch": 0.6518227964928472, + "grad_norm": 0.1884683221578598, + "learning_rate": 6.520369935080411e-05, + "loss": 0.9626, + "step": 5650 + }, + { + "epoch": 0.6523996308260268, + "grad_norm": 0.19955401122570038, + "learning_rate": 6.501499492160636e-05, + "loss": 0.9644, + "step": 5655 + }, + { + "epoch": 0.6529764651592063, + "grad_norm": 0.18771077692508698, + "learning_rate": 6.482643233968224e-05, + "loss": 0.9485, + "step": 5660 + }, + { + "epoch": 0.6535532994923858, + "grad_norm": 0.2017538845539093, + "learning_rate": 6.463801236956184e-05, + "loss": 0.9341, + "step": 5665 + }, + { + "epoch": 0.6541301338255653, + "grad_norm": 0.2100268006324768, + "learning_rate": 6.44497357751969e-05, + "loss": 0.9625, + "step": 5670 + }, + { + "epoch": 0.6547069681587449, + "grad_norm": 0.1906213015317917, + "learning_rate": 6.426160331995801e-05, + "loss": 0.9099, + "step": 5675 + }, + { + "epoch": 0.6552838024919243, + "grad_norm": 0.18438196182250977, + "learning_rate": 6.407361576663124e-05, + "loss": 0.8879, + "step": 5680 + }, + { + "epoch": 0.6558606368251039, + "grad_norm": 0.18689702451229095, + "learning_rate": 6.388577387741524e-05, + "loss": 0.9613, + "step": 5685 + }, + { + "epoch": 0.6564374711582833, + "grad_norm": 0.19837065041065216, + "learning_rate": 6.369807841391798e-05, + "loss": 0.9303, + "step": 5690 + }, + { + "epoch": 0.6570143054914629, + "grad_norm": 0.19491511583328247, + "learning_rate": 6.351053013715383e-05, + "loss": 0.9777, + "step": 5695 + }, + { + "epoch": 0.6575911398246423, + "grad_norm": 0.19005633890628815, + "learning_rate": 6.332312980754025e-05, + "loss": 0.9305, + "step": 5700 + }, + { + "epoch": 0.6581679741578219, + "grad_norm": 0.19769403338432312, + "learning_rate": 6.313587818489497e-05, + "loss": 0.9505, + "step": 5705 + }, + { + "epoch": 0.6587448084910014, + "grad_norm": 0.22214584052562714, + "learning_rate": 6.294877602843275e-05, + "loss": 0.9718, + "step": 5710 + }, + { + "epoch": 0.6593216428241809, + "grad_norm": 0.18468452990055084, + "learning_rate": 6.276182409676234e-05, + "loss": 0.9605, + "step": 5715 + }, + { + "epoch": 0.6598984771573604, + "grad_norm": 0.19918876886367798, + "learning_rate": 6.25750231478834e-05, + "loss": 0.9539, + "step": 5720 + }, + { + "epoch": 0.66047531149054, + "grad_norm": 0.18475128710269928, + "learning_rate": 6.238837393918341e-05, + "loss": 
0.9419, + "step": 5725 + }, + { + "epoch": 0.6610521458237194, + "grad_norm": 0.20297956466674805, + "learning_rate": 6.220187722743466e-05, + "loss": 0.96, + "step": 5730 + }, + { + "epoch": 0.661628980156899, + "grad_norm": 0.19551438093185425, + "learning_rate": 6.201553376879108e-05, + "loss": 0.95, + "step": 5735 + }, + { + "epoch": 0.6622058144900784, + "grad_norm": 0.20556505024433136, + "learning_rate": 6.182934431878526e-05, + "loss": 0.9811, + "step": 5740 + }, + { + "epoch": 0.662782648823258, + "grad_norm": 0.19358351826667786, + "learning_rate": 6.164330963232535e-05, + "loss": 0.9539, + "step": 5745 + }, + { + "epoch": 0.6633594831564374, + "grad_norm": 0.18913935124874115, + "learning_rate": 6.145743046369205e-05, + "loss": 0.9677, + "step": 5750 + }, + { + "epoch": 0.663936317489617, + "grad_norm": 0.1924053579568863, + "learning_rate": 6.127170756653546e-05, + "loss": 0.9302, + "step": 5755 + }, + { + "epoch": 0.6645131518227965, + "grad_norm": 0.1916627734899521, + "learning_rate": 6.108614169387215e-05, + "loss": 0.9588, + "step": 5760 + }, + { + "epoch": 0.665089986155976, + "grad_norm": 0.21800926327705383, + "learning_rate": 6.090073359808188e-05, + "loss": 0.9329, + "step": 5765 + }, + { + "epoch": 0.6656668204891555, + "grad_norm": 0.19843755662441254, + "learning_rate": 6.071548403090488e-05, + "loss": 0.9722, + "step": 5770 + }, + { + "epoch": 0.666243654822335, + "grad_norm": 0.1899397373199463, + "learning_rate": 6.053039374343849e-05, + "loss": 0.9167, + "step": 5775 + }, + { + "epoch": 0.6668204891555145, + "grad_norm": 0.19542363286018372, + "learning_rate": 6.0345463486134325e-05, + "loss": 0.9741, + "step": 5780 + }, + { + "epoch": 0.6673973234886941, + "grad_norm": 0.1900932788848877, + "learning_rate": 6.0160694008795114e-05, + "loss": 0.9137, + "step": 5785 + }, + { + "epoch": 0.6679741578218735, + "grad_norm": 0.192066490650177, + "learning_rate": 5.9976086060571765e-05, + "loss": 0.947, + "step": 5790 + }, + { + "epoch": 0.6685509921550531, + "grad_norm": 0.1989891529083252, + "learning_rate": 5.979164038996015e-05, + "loss": 0.9692, + "step": 5795 + }, + { + "epoch": 0.6691278264882325, + "grad_norm": 0.18487831950187683, + "learning_rate": 5.960735774479826e-05, + "loss": 0.9288, + "step": 5800 + }, + { + "epoch": 0.6697046608214121, + "grad_norm": 0.18360459804534912, + "learning_rate": 5.942323887226311e-05, + "loss": 0.966, + "step": 5805 + }, + { + "epoch": 0.6702814951545916, + "grad_norm": 0.1812918782234192, + "learning_rate": 5.923928451886767e-05, + "loss": 0.907, + "step": 5810 + }, + { + "epoch": 0.6708583294877711, + "grad_norm": 0.19430682063102722, + "learning_rate": 5.905549543045783e-05, + "loss": 0.962, + "step": 5815 + }, + { + "epoch": 0.6714351638209506, + "grad_norm": 0.1931554228067398, + "learning_rate": 5.887187235220948e-05, + "loss": 0.957, + "step": 5820 + }, + { + "epoch": 0.6720119981541302, + "grad_norm": 0.19421172142028809, + "learning_rate": 5.868841602862541e-05, + "loss": 0.9318, + "step": 5825 + }, + { + "epoch": 0.6725888324873096, + "grad_norm": 0.18080562353134155, + "learning_rate": 5.8505127203532216e-05, + "loss": 0.9359, + "step": 5830 + }, + { + "epoch": 0.6731656668204892, + "grad_norm": 0.19878889620304108, + "learning_rate": 5.8322006620077426e-05, + "loss": 0.9495, + "step": 5835 + }, + { + "epoch": 0.6737425011536686, + "grad_norm": 0.18784356117248535, + "learning_rate": 5.8139055020726494e-05, + "loss": 0.9684, + "step": 5840 + }, + { + "epoch": 0.6743193354868482, + "grad_norm": 0.17742076516151428, 
+ "learning_rate": 5.7956273147259645e-05, + "loss": 0.9648, + "step": 5845 + }, + { + "epoch": 0.6748961698200276, + "grad_norm": 0.19306622445583344, + "learning_rate": 5.77736617407689e-05, + "loss": 0.9388, + "step": 5850 + }, + { + "epoch": 0.6754730041532072, + "grad_norm": 0.20553186535835266, + "learning_rate": 5.7591221541655285e-05, + "loss": 0.9764, + "step": 5855 + }, + { + "epoch": 0.6760498384863867, + "grad_norm": 0.20302440226078033, + "learning_rate": 5.74089532896255e-05, + "loss": 0.9554, + "step": 5860 + }, + { + "epoch": 0.6766266728195662, + "grad_norm": 0.19851718842983246, + "learning_rate": 5.722685772368912e-05, + "loss": 0.9692, + "step": 5865 + }, + { + "epoch": 0.6772035071527457, + "grad_norm": 0.20243926346302032, + "learning_rate": 5.704493558215567e-05, + "loss": 0.967, + "step": 5870 + }, + { + "epoch": 0.6777803414859253, + "grad_norm": 0.19658678770065308, + "learning_rate": 5.6863187602631354e-05, + "loss": 0.9167, + "step": 5875 + }, + { + "epoch": 0.6783571758191047, + "grad_norm": 0.18363118171691895, + "learning_rate": 5.668161452201639e-05, + "loss": 0.9346, + "step": 5880 + }, + { + "epoch": 0.6789340101522843, + "grad_norm": 0.2019900381565094, + "learning_rate": 5.650021707650173e-05, + "loss": 0.9602, + "step": 5885 + }, + { + "epoch": 0.6795108444854637, + "grad_norm": 0.18240347504615784, + "learning_rate": 5.6318996001566384e-05, + "loss": 0.9663, + "step": 5890 + }, + { + "epoch": 0.6800876788186433, + "grad_norm": 0.19030514359474182, + "learning_rate": 5.613795203197401e-05, + "loss": 0.9382, + "step": 5895 + }, + { + "epoch": 0.6806645131518227, + "grad_norm": 0.18463625013828278, + "learning_rate": 5.5957085901770424e-05, + "loss": 0.9487, + "step": 5900 + }, + { + "epoch": 0.6812413474850023, + "grad_norm": 0.18482211232185364, + "learning_rate": 5.577639834428026e-05, + "loss": 0.96, + "step": 5905 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 0.19854898750782013, + "learning_rate": 5.559589009210421e-05, + "loss": 0.9632, + "step": 5910 + }, + { + "epoch": 0.6823950161513613, + "grad_norm": 0.19982431828975677, + "learning_rate": 5.5415561877115876e-05, + "loss": 0.9312, + "step": 5915 + }, + { + "epoch": 0.6829718504845408, + "grad_norm": 0.19143183529376984, + "learning_rate": 5.523541443045904e-05, + "loss": 0.9736, + "step": 5920 + }, + { + "epoch": 0.6835486848177204, + "grad_norm": 0.20476052165031433, + "learning_rate": 5.505544848254432e-05, + "loss": 0.9658, + "step": 5925 + }, + { + "epoch": 0.6841255191508998, + "grad_norm": 0.19165083765983582, + "learning_rate": 5.4875664763046705e-05, + "loss": 0.9917, + "step": 5930 + }, + { + "epoch": 0.6847023534840794, + "grad_norm": 0.19160091876983643, + "learning_rate": 5.4696064000902146e-05, + "loss": 0.952, + "step": 5935 + }, + { + "epoch": 0.6852791878172588, + "grad_norm": 0.22697453200817108, + "learning_rate": 5.451664692430493e-05, + "loss": 0.959, + "step": 5940 + }, + { + "epoch": 0.6858560221504384, + "grad_norm": 0.21347078680992126, + "learning_rate": 5.433741426070442e-05, + "loss": 0.9489, + "step": 5945 + }, + { + "epoch": 0.686432856483618, + "grad_norm": 0.19163408875465393, + "learning_rate": 5.415836673680253e-05, + "loss": 0.9566, + "step": 5950 + }, + { + "epoch": 0.6870096908167974, + "grad_norm": 0.19686076045036316, + "learning_rate": 5.3979505078550184e-05, + "loss": 0.9389, + "step": 5955 + }, + { + "epoch": 0.687586525149977, + "grad_norm": 0.1912279576063156, + "learning_rate": 5.380083001114503e-05, + "loss": 0.9249, + "step": 5960 + }, + { 
+ "epoch": 0.6881633594831564, + "grad_norm": 0.20188121497631073, + "learning_rate": 5.362234225902794e-05, + "loss": 0.9683, + "step": 5965 + }, + { + "epoch": 0.688740193816336, + "grad_norm": 0.20273169875144958, + "learning_rate": 5.3444042545880514e-05, + "loss": 0.9125, + "step": 5970 + }, + { + "epoch": 0.6893170281495155, + "grad_norm": 0.2052476704120636, + "learning_rate": 5.3265931594621756e-05, + "loss": 0.9327, + "step": 5975 + }, + { + "epoch": 0.689893862482695, + "grad_norm": 0.19370770454406738, + "learning_rate": 5.3088010127405496e-05, + "loss": 0.9664, + "step": 5980 + }, + { + "epoch": 0.6904706968158745, + "grad_norm": 0.19764676690101624, + "learning_rate": 5.29102788656172e-05, + "loss": 0.9241, + "step": 5985 + }, + { + "epoch": 0.691047531149054, + "grad_norm": 0.2198079228401184, + "learning_rate": 5.273273852987113e-05, + "loss": 0.9722, + "step": 5990 + }, + { + "epoch": 0.6916243654822335, + "grad_norm": 0.1917632520198822, + "learning_rate": 5.255538984000753e-05, + "loss": 0.9572, + "step": 5995 + }, + { + "epoch": 0.6922011998154131, + "grad_norm": 0.19673052430152893, + "learning_rate": 5.237823351508953e-05, + "loss": 0.9546, + "step": 6000 + }, + { + "epoch": 0.6927780341485925, + "grad_norm": 0.18753381073474884, + "learning_rate": 5.2201270273400296e-05, + "loss": 0.9448, + "step": 6005 + }, + { + "epoch": 0.6933548684817721, + "grad_norm": 0.18277138471603394, + "learning_rate": 5.202450083244026e-05, + "loss": 0.9748, + "step": 6010 + }, + { + "epoch": 0.6939317028149515, + "grad_norm": 0.1970137506723404, + "learning_rate": 5.184792590892397e-05, + "loss": 0.9961, + "step": 6015 + }, + { + "epoch": 0.6945085371481311, + "grad_norm": 0.19395023584365845, + "learning_rate": 5.167154621877728e-05, + "loss": 0.9396, + "step": 6020 + }, + { + "epoch": 0.6950853714813106, + "grad_norm": 0.1935676783323288, + "learning_rate": 5.14953624771346e-05, + "loss": 0.9277, + "step": 6025 + }, + { + "epoch": 0.6956622058144901, + "grad_norm": 0.21159899234771729, + "learning_rate": 5.131937539833571e-05, + "loss": 0.9511, + "step": 6030 + }, + { + "epoch": 0.6962390401476696, + "grad_norm": 0.1984950304031372, + "learning_rate": 5.1143585695923166e-05, + "loss": 0.9859, + "step": 6035 + }, + { + "epoch": 0.6968158744808491, + "grad_norm": 0.19086718559265137, + "learning_rate": 5.09679940826391e-05, + "loss": 0.9942, + "step": 6040 + }, + { + "epoch": 0.6973927088140286, + "grad_norm": 0.197454035282135, + "learning_rate": 5.079260127042267e-05, + "loss": 0.9919, + "step": 6045 + }, + { + "epoch": 0.6979695431472082, + "grad_norm": 0.20797502994537354, + "learning_rate": 5.061740797040684e-05, + "loss": 0.9213, + "step": 6050 + }, + { + "epoch": 0.6985463774803876, + "grad_norm": 0.18882066011428833, + "learning_rate": 5.044241489291569e-05, + "loss": 0.9691, + "step": 6055 + }, + { + "epoch": 0.6991232118135672, + "grad_norm": 0.19341005384922028, + "learning_rate": 5.0267622747461487e-05, + "loss": 0.9267, + "step": 6060 + }, + { + "epoch": 0.6997000461467466, + "grad_norm": 0.20529086887836456, + "learning_rate": 5.009303224274191e-05, + "loss": 0.9937, + "step": 6065 + }, + { + "epoch": 0.7002768804799262, + "grad_norm": 0.1887139081954956, + "learning_rate": 4.991864408663692e-05, + "loss": 0.9477, + "step": 6070 + }, + { + "epoch": 0.7008537148131057, + "grad_norm": 0.19098982214927673, + "learning_rate": 4.974445898620622e-05, + "loss": 0.9689, + "step": 6075 + }, + { + "epoch": 0.7014305491462852, + "grad_norm": 0.19821666181087494, + "learning_rate": 
4.957047764768612e-05, + "loss": 0.9333, + "step": 6080 + }, + { + "epoch": 0.7020073834794647, + "grad_norm": 0.1881197690963745, + "learning_rate": 4.939670077648676e-05, + "loss": 0.9504, + "step": 6085 + }, + { + "epoch": 0.7025842178126442, + "grad_norm": 0.19426091015338898, + "learning_rate": 4.922312907718929e-05, + "loss": 1.0074, + "step": 6090 + }, + { + "epoch": 0.7031610521458237, + "grad_norm": 0.18885859847068787, + "learning_rate": 4.9049763253543054e-05, + "loss": 0.9706, + "step": 6095 + }, + { + "epoch": 0.7037378864790033, + "grad_norm": 0.20289480686187744, + "learning_rate": 4.8876604008462554e-05, + "loss": 0.9547, + "step": 6100 + }, + { + "epoch": 0.7043147208121827, + "grad_norm": 0.2023826539516449, + "learning_rate": 4.870365204402483e-05, + "loss": 1.005, + "step": 6105 + }, + { + "epoch": 0.7048915551453623, + "grad_norm": 0.18636111915111542, + "learning_rate": 4.8530908061466404e-05, + "loss": 0.9311, + "step": 6110 + }, + { + "epoch": 0.7054683894785417, + "grad_norm": 0.18160183727741241, + "learning_rate": 4.835837276118058e-05, + "loss": 0.9579, + "step": 6115 + }, + { + "epoch": 0.7060452238117213, + "grad_norm": 0.19551381468772888, + "learning_rate": 4.8186046842714504e-05, + "loss": 0.8944, + "step": 6120 + }, + { + "epoch": 0.7066220581449008, + "grad_norm": 0.18871080875396729, + "learning_rate": 4.801393100476651e-05, + "loss": 1.0176, + "step": 6125 + }, + { + "epoch": 0.7071988924780803, + "grad_norm": 0.1864621639251709, + "learning_rate": 4.784202594518298e-05, + "loss": 0.945, + "step": 6130 + }, + { + "epoch": 0.7077757268112598, + "grad_norm": 0.1970556527376175, + "learning_rate": 4.767033236095585e-05, + "loss": 0.9587, + "step": 6135 + }, + { + "epoch": 0.7083525611444393, + "grad_norm": 0.19122757017612457, + "learning_rate": 4.749885094821951e-05, + "loss": 0.9473, + "step": 6140 + }, + { + "epoch": 0.7089293954776188, + "grad_norm": 0.1870802640914917, + "learning_rate": 4.732758240224818e-05, + "loss": 0.965, + "step": 6145 + }, + { + "epoch": 0.7095062298107984, + "grad_norm": 0.20964084565639496, + "learning_rate": 4.715652741745298e-05, + "loss": 0.9074, + "step": 6150 + }, + { + "epoch": 0.7100830641439778, + "grad_norm": 0.1921871155500412, + "learning_rate": 4.6985686687379103e-05, + "loss": 0.9391, + "step": 6155 + }, + { + "epoch": 0.7106598984771574, + "grad_norm": 0.205277681350708, + "learning_rate": 4.6815060904703046e-05, + "loss": 0.9321, + "step": 6160 + }, + { + "epoch": 0.7112367328103368, + "grad_norm": 0.18848411738872528, + "learning_rate": 4.664465076122991e-05, + "loss": 0.9028, + "step": 6165 + }, + { + "epoch": 0.7118135671435164, + "grad_norm": 0.18404066562652588, + "learning_rate": 4.647445694789032e-05, + "loss": 0.9967, + "step": 6170 + }, + { + "epoch": 0.7123904014766959, + "grad_norm": 0.18522439897060394, + "learning_rate": 4.630448015473794e-05, + "loss": 0.9022, + "step": 6175 + }, + { + "epoch": 0.7129672358098754, + "grad_norm": 0.1914357990026474, + "learning_rate": 4.613472107094641e-05, + "loss": 0.9313, + "step": 6180 + }, + { + "epoch": 0.7135440701430549, + "grad_norm": 0.18472833931446075, + "learning_rate": 4.596518038480667e-05, + "loss": 0.9544, + "step": 6185 + }, + { + "epoch": 0.7141209044762344, + "grad_norm": 0.1849919557571411, + "learning_rate": 4.579585878372428e-05, + "loss": 0.9521, + "step": 6190 + }, + { + "epoch": 0.7146977388094139, + "grad_norm": 0.2014801949262619, + "learning_rate": 4.562675695421634e-05, + "loss": 0.9397, + "step": 6195 + }, + { + "epoch": 
0.7152745731425935, + "grad_norm": 0.19745898246765137, + "learning_rate": 4.545787558190907e-05, + "loss": 0.9671, + "step": 6200 + }, + { + "epoch": 0.7158514074757729, + "grad_norm": 0.2034626305103302, + "learning_rate": 4.5289215351534666e-05, + "loss": 1.0031, + "step": 6205 + }, + { + "epoch": 0.7164282418089525, + "grad_norm": 0.19451607763767242, + "learning_rate": 4.512077694692888e-05, + "loss": 0.924, + "step": 6210 + }, + { + "epoch": 0.7170050761421319, + "grad_norm": 0.19638855755329132, + "learning_rate": 4.495256105102784e-05, + "loss": 0.9372, + "step": 6215 + }, + { + "epoch": 0.7175819104753115, + "grad_norm": 0.18954864144325256, + "learning_rate": 4.478456834586574e-05, + "loss": 0.9775, + "step": 6220 + }, + { + "epoch": 0.718158744808491, + "grad_norm": 0.1834801435470581, + "learning_rate": 4.4616799512571675e-05, + "loss": 0.946, + "step": 6225 + }, + { + "epoch": 0.7187355791416705, + "grad_norm": 0.2009892612695694, + "learning_rate": 4.4449255231367183e-05, + "loss": 0.9131, + "step": 6230 + }, + { + "epoch": 0.71931241347485, + "grad_norm": 0.1912214159965515, + "learning_rate": 4.428193618156322e-05, + "loss": 0.9248, + "step": 6235 + }, + { + "epoch": 0.7198892478080295, + "grad_norm": 0.19237951934337616, + "learning_rate": 4.411484304155771e-05, + "loss": 0.9305, + "step": 6240 + }, + { + "epoch": 0.720466082141209, + "grad_norm": 0.1901046186685562, + "learning_rate": 4.394797648883236e-05, + "loss": 0.9259, + "step": 6245 + }, + { + "epoch": 0.7210429164743886, + "grad_norm": 0.19405438005924225, + "learning_rate": 4.378133719995044e-05, + "loss": 0.9308, + "step": 6250 + }, + { + "epoch": 0.721619750807568, + "grad_norm": 0.19073431193828583, + "learning_rate": 4.36149258505536e-05, + "loss": 0.9325, + "step": 6255 + }, + { + "epoch": 0.7221965851407476, + "grad_norm": 0.18729381263256073, + "learning_rate": 4.344874311535944e-05, + "loss": 0.9674, + "step": 6260 + }, + { + "epoch": 0.722773419473927, + "grad_norm": 0.19104944169521332, + "learning_rate": 4.3282789668158476e-05, + "loss": 0.9328, + "step": 6265 + }, + { + "epoch": 0.7233502538071066, + "grad_norm": 0.20195722579956055, + "learning_rate": 4.31170661818118e-05, + "loss": 0.9785, + "step": 6270 + }, + { + "epoch": 0.7239270881402861, + "grad_norm": 0.18547381460666656, + "learning_rate": 4.295157332824785e-05, + "loss": 0.9534, + "step": 6275 + }, + { + "epoch": 0.7245039224734656, + "grad_norm": 0.18389153480529785, + "learning_rate": 4.27863117784602e-05, + "loss": 0.9408, + "step": 6280 + }, + { + "epoch": 0.7250807568066451, + "grad_norm": 0.1920400708913803, + "learning_rate": 4.262128220250441e-05, + "loss": 0.9413, + "step": 6285 + }, + { + "epoch": 0.7256575911398246, + "grad_norm": 0.18754833936691284, + "learning_rate": 4.245648526949567e-05, + "loss": 0.9961, + "step": 6290 + }, + { + "epoch": 0.7262344254730042, + "grad_norm": 0.19839036464691162, + "learning_rate": 4.229192164760576e-05, + "loss": 0.961, + "step": 6295 + }, + { + "epoch": 0.7268112598061837, + "grad_norm": 0.20525625348091125, + "learning_rate": 4.212759200406065e-05, + "loss": 0.9767, + "step": 6300 + }, + { + "epoch": 0.7273880941393632, + "grad_norm": 0.20254795253276825, + "learning_rate": 4.1963497005137516e-05, + "loss": 0.9062, + "step": 6305 + }, + { + "epoch": 0.7279649284725427, + "grad_norm": 0.20592284202575684, + "learning_rate": 4.179963731616221e-05, + "loss": 0.9707, + "step": 6310 + }, + { + "epoch": 0.7285417628057222, + "grad_norm": 0.1987779587507248, + "learning_rate": 
4.163601360150646e-05, + "loss": 0.9244, + "step": 6315 + }, + { + "epoch": 0.7291185971389017, + "grad_norm": 0.1847141832113266, + "learning_rate": 4.147262652458539e-05, + "loss": 0.9771, + "step": 6320 + }, + { + "epoch": 0.7296954314720813, + "grad_norm": 0.18493050336837769, + "learning_rate": 4.130947674785447e-05, + "loss": 0.9005, + "step": 6325 + }, + { + "epoch": 0.7302722658052607, + "grad_norm": 0.18646039068698883, + "learning_rate": 4.114656493280721e-05, + "loss": 0.938, + "step": 6330 + }, + { + "epoch": 0.7308491001384403, + "grad_norm": 0.2044810801744461, + "learning_rate": 4.098389173997218e-05, + "loss": 0.9636, + "step": 6335 + }, + { + "epoch": 0.7314259344716197, + "grad_norm": 0.20317162573337555, + "learning_rate": 4.082145782891046e-05, + "loss": 0.9499, + "step": 6340 + }, + { + "epoch": 0.7320027688047993, + "grad_norm": 0.19699914753437042, + "learning_rate": 4.065926385821307e-05, + "loss": 0.9391, + "step": 6345 + }, + { + "epoch": 0.7325796031379788, + "grad_norm": 0.18906693160533905, + "learning_rate": 4.049731048549804e-05, + "loss": 0.9166, + "step": 6350 + }, + { + "epoch": 0.7331564374711583, + "grad_norm": 0.20645971596240997, + "learning_rate": 4.033559836740801e-05, + "loss": 0.9581, + "step": 6355 + }, + { + "epoch": 0.7337332718043378, + "grad_norm": 0.18750208616256714, + "learning_rate": 4.017412815960735e-05, + "loss": 0.9418, + "step": 6360 + }, + { + "epoch": 0.7343101061375173, + "grad_norm": 0.18464316427707672, + "learning_rate": 4.001290051677975e-05, + "loss": 0.9937, + "step": 6365 + }, + { + "epoch": 0.7348869404706968, + "grad_norm": 0.19745340943336487, + "learning_rate": 3.985191609262519e-05, + "loss": 0.9188, + "step": 6370 + }, + { + "epoch": 0.7354637748038764, + "grad_norm": 0.2214672714471817, + "learning_rate": 3.969117553985772e-05, + "loss": 0.9521, + "step": 6375 + }, + { + "epoch": 0.7360406091370558, + "grad_norm": 0.1932508647441864, + "learning_rate": 3.9530679510202476e-05, + "loss": 0.9164, + "step": 6380 + }, + { + "epoch": 0.7366174434702354, + "grad_norm": 0.1878437101840973, + "learning_rate": 3.9370428654393296e-05, + "loss": 0.9119, + "step": 6385 + }, + { + "epoch": 0.7371942778034148, + "grad_norm": 0.19011224806308746, + "learning_rate": 3.921042362216983e-05, + "loss": 0.9232, + "step": 6390 + }, + { + "epoch": 0.7377711121365944, + "grad_norm": 0.17758631706237793, + "learning_rate": 3.905066506227515e-05, + "loss": 0.937, + "step": 6395 + }, + { + "epoch": 0.7383479464697739, + "grad_norm": 0.19590388238430023, + "learning_rate": 3.8891153622452904e-05, + "loss": 0.8993, + "step": 6400 + }, + { + "epoch": 0.7389247808029534, + "grad_norm": 0.1892813891172409, + "learning_rate": 3.873188994944483e-05, + "loss": 0.9759, + "step": 6405 + }, + { + "epoch": 0.7395016151361329, + "grad_norm": 0.19297149777412415, + "learning_rate": 3.857287468898806e-05, + "loss": 0.9918, + "step": 6410 + }, + { + "epoch": 0.7400784494693124, + "grad_norm": 0.1931808739900589, + "learning_rate": 3.8414108485812613e-05, + "loss": 0.9332, + "step": 6415 + }, + { + "epoch": 0.7406552838024919, + "grad_norm": 0.18963101506233215, + "learning_rate": 3.825559198363861e-05, + "loss": 0.9274, + "step": 6420 + }, + { + "epoch": 0.7412321181356715, + "grad_norm": 0.18943539261817932, + "learning_rate": 3.8097325825173826e-05, + "loss": 0.9751, + "step": 6425 + }, + { + "epoch": 0.7418089524688509, + "grad_norm": 0.1900118887424469, + "learning_rate": 3.793931065211096e-05, + "loss": 0.9527, + "step": 6430 + }, + { + "epoch": 
0.7423857868020305, + "grad_norm": 0.1904015690088272, + "learning_rate": 3.778154710512512e-05, + "loss": 0.9884, + "step": 6435 + }, + { + "epoch": 0.7429626211352099, + "grad_norm": 0.19013242423534393, + "learning_rate": 3.762403582387114e-05, + "loss": 0.9467, + "step": 6440 + }, + { + "epoch": 0.7435394554683895, + "grad_norm": 0.18729767203330994, + "learning_rate": 3.746677744698114e-05, + "loss": 0.9803, + "step": 6445 + }, + { + "epoch": 0.744116289801569, + "grad_norm": 0.20547720789909363, + "learning_rate": 3.730977261206171e-05, + "loss": 0.9168, + "step": 6450 + }, + { + "epoch": 0.7446931241347485, + "grad_norm": 0.20597174763679504, + "learning_rate": 3.715302195569159e-05, + "loss": 0.9868, + "step": 6455 + }, + { + "epoch": 0.745269958467928, + "grad_norm": 0.1934199035167694, + "learning_rate": 3.69965261134188e-05, + "loss": 1.036, + "step": 6460 + }, + { + "epoch": 0.7458467928011075, + "grad_norm": 0.19105862081050873, + "learning_rate": 3.684028571975836e-05, + "loss": 0.9528, + "step": 6465 + }, + { + "epoch": 0.746423627134287, + "grad_norm": 0.19618894159793854, + "learning_rate": 3.6684301408189406e-05, + "loss": 1.0373, + "step": 6470 + }, + { + "epoch": 0.7470004614674666, + "grad_norm": 0.18884247541427612, + "learning_rate": 3.652857381115293e-05, + "loss": 0.9981, + "step": 6475 + }, + { + "epoch": 0.747577295800646, + "grad_norm": 0.1874423772096634, + "learning_rate": 3.637310356004897e-05, + "loss": 0.964, + "step": 6480 + }, + { + "epoch": 0.7481541301338256, + "grad_norm": 0.19521760940551758, + "learning_rate": 3.6217891285234265e-05, + "loss": 0.9497, + "step": 6485 + }, + { + "epoch": 0.748730964467005, + "grad_norm": 0.19494710862636566, + "learning_rate": 3.6062937616019433e-05, + "loss": 0.9687, + "step": 6490 + }, + { + "epoch": 0.7493077988001846, + "grad_norm": 0.18842960894107819, + "learning_rate": 3.5908243180666734e-05, + "loss": 0.9522, + "step": 6495 + }, + { + "epoch": 0.7498846331333641, + "grad_norm": 0.19816794991493225, + "learning_rate": 3.5753808606387226e-05, + "loss": 0.9548, + "step": 6500 + }, + { + "epoch": 0.7504614674665436, + "grad_norm": 0.19967734813690186, + "learning_rate": 3.55996345193384e-05, + "loss": 1.0035, + "step": 6505 + }, + { + "epoch": 0.7510383017997231, + "grad_norm": 0.19166813790798187, + "learning_rate": 3.544572154462165e-05, + "loss": 0.9374, + "step": 6510 + }, + { + "epoch": 0.7516151361329027, + "grad_norm": 0.2017005980014801, + "learning_rate": 3.529207030627959e-05, + "loss": 1.0029, + "step": 6515 + }, + { + "epoch": 0.7521919704660821, + "grad_norm": 0.19286920130252838, + "learning_rate": 3.513868142729373e-05, + "loss": 0.9457, + "step": 6520 + }, + { + "epoch": 0.7527688047992617, + "grad_norm": 0.20132075250148773, + "learning_rate": 3.498555552958176e-05, + "loss": 0.9899, + "step": 6525 + }, + { + "epoch": 0.7533456391324411, + "grad_norm": 0.22670696675777435, + "learning_rate": 3.483269323399512e-05, + "loss": 0.9568, + "step": 6530 + }, + { + "epoch": 0.7539224734656207, + "grad_norm": 0.1912250965833664, + "learning_rate": 3.468009516031644e-05, + "loss": 0.9871, + "step": 6535 + }, + { + "epoch": 0.7544993077988001, + "grad_norm": 0.1899501532316208, + "learning_rate": 3.452776192725717e-05, + "loss": 0.9634, + "step": 6540 + }, + { + "epoch": 0.7550761421319797, + "grad_norm": 0.1889965683221817, + "learning_rate": 3.437569415245483e-05, + "loss": 0.9733, + "step": 6545 + }, + { + "epoch": 0.7556529764651592, + "grad_norm": 0.19462092220783234, + "learning_rate": 
3.422389245247075e-05, + "loss": 0.951, + "step": 6550 + }, + { + "epoch": 0.7562298107983387, + "grad_norm": 0.1985846608877182, + "learning_rate": 3.407235744278734e-05, + "loss": 0.9462, + "step": 6555 + }, + { + "epoch": 0.7568066451315182, + "grad_norm": 0.187138170003891, + "learning_rate": 3.3921089737805866e-05, + "loss": 0.9234, + "step": 6560 + }, + { + "epoch": 0.7573834794646978, + "grad_norm": 0.188985213637352, + "learning_rate": 3.3770089950843564e-05, + "loss": 0.9074, + "step": 6565 + }, + { + "epoch": 0.7579603137978772, + "grad_norm": 0.20506353676319122, + "learning_rate": 3.361935869413163e-05, + "loss": 0.9758, + "step": 6570 + }, + { + "epoch": 0.7585371481310568, + "grad_norm": 0.18956059217453003, + "learning_rate": 3.3468896578812344e-05, + "loss": 0.9461, + "step": 6575 + }, + { + "epoch": 0.7591139824642362, + "grad_norm": 0.2013210505247116, + "learning_rate": 3.331870421493688e-05, + "loss": 0.9575, + "step": 6580 + }, + { + "epoch": 0.7596908167974158, + "grad_norm": 0.19527879357337952, + "learning_rate": 3.316878221146253e-05, + "loss": 0.9456, + "step": 6585 + }, + { + "epoch": 0.7602676511305952, + "grad_norm": 0.1831914484500885, + "learning_rate": 3.301913117625065e-05, + "loss": 0.9144, + "step": 6590 + }, + { + "epoch": 0.7608444854637748, + "grad_norm": 0.18544113636016846, + "learning_rate": 3.286975171606362e-05, + "loss": 0.9525, + "step": 6595 + }, + { + "epoch": 0.7614213197969543, + "grad_norm": 0.20716486871242523, + "learning_rate": 3.272064443656303e-05, + "loss": 0.9571, + "step": 6600 + }, + { + "epoch": 0.7619981541301338, + "grad_norm": 0.18836161494255066, + "learning_rate": 3.257180994230671e-05, + "loss": 0.9857, + "step": 6605 + }, + { + "epoch": 0.7625749884633133, + "grad_norm": 0.19700607657432556, + "learning_rate": 3.2423248836746575e-05, + "loss": 0.9054, + "step": 6610 + }, + { + "epoch": 0.7631518227964929, + "grad_norm": 0.18617579340934753, + "learning_rate": 3.227496172222603e-05, + "loss": 0.9512, + "step": 6615 + }, + { + "epoch": 0.7637286571296723, + "grad_norm": 0.18982037901878357, + "learning_rate": 3.212694919997764e-05, + "loss": 0.9649, + "step": 6620 + }, + { + "epoch": 0.7643054914628519, + "grad_norm": 0.19119331240653992, + "learning_rate": 3.197921187012055e-05, + "loss": 0.9482, + "step": 6625 + }, + { + "epoch": 0.7648823257960313, + "grad_norm": 0.2058209329843521, + "learning_rate": 3.1831750331658196e-05, + "loss": 0.9741, + "step": 6630 + }, + { + "epoch": 0.7654591601292109, + "grad_norm": 0.19063866138458252, + "learning_rate": 3.168456518247575e-05, + "loss": 0.9556, + "step": 6635 + }, + { + "epoch": 0.7660359944623903, + "grad_norm": 0.1826866716146469, + "learning_rate": 3.153765701933784e-05, + "loss": 0.9362, + "step": 6640 + }, + { + "epoch": 0.7666128287955699, + "grad_norm": 0.20662148296833038, + "learning_rate": 3.1391026437885984e-05, + "loss": 0.9941, + "step": 6645 + }, + { + "epoch": 0.7671896631287495, + "grad_norm": 0.19351264834403992, + "learning_rate": 3.12446740326363e-05, + "loss": 0.9544, + "step": 6650 + }, + { + "epoch": 0.7677664974619289, + "grad_norm": 0.19532477855682373, + "learning_rate": 3.109860039697699e-05, + "loss": 1.0098, + "step": 6655 + }, + { + "epoch": 0.7683433317951085, + "grad_norm": 0.19506360590457916, + "learning_rate": 3.0952806123165945e-05, + "loss": 0.9201, + "step": 6660 + }, + { + "epoch": 0.768920166128288, + "grad_norm": 0.19203807413578033, + "learning_rate": 3.0807291802328494e-05, + "loss": 0.9102, + "step": 6665 + }, + { + "epoch": 
0.7694970004614675, + "grad_norm": 0.19363170862197876, + "learning_rate": 3.066205802445477e-05, + "loss": 0.9485, + "step": 6670 + }, + { + "epoch": 0.770073834794647, + "grad_norm": 0.19659185409545898, + "learning_rate": 3.0517105378397536e-05, + "loss": 0.9946, + "step": 6675 + }, + { + "epoch": 0.7706506691278265, + "grad_norm": 0.20665878057479858, + "learning_rate": 3.037243445186965e-05, + "loss": 0.9688, + "step": 6680 + }, + { + "epoch": 0.771227503461006, + "grad_norm": 0.1891080141067505, + "learning_rate": 3.0228045831441733e-05, + "loss": 0.9905, + "step": 6685 + }, + { + "epoch": 0.7718043377941856, + "grad_norm": 0.19424353539943695, + "learning_rate": 3.0083940102539763e-05, + "loss": 0.9682, + "step": 6690 + }, + { + "epoch": 0.772381172127365, + "grad_norm": 0.1884242296218872, + "learning_rate": 2.994011784944284e-05, + "loss": 1.0159, + "step": 6695 + }, + { + "epoch": 0.7729580064605446, + "grad_norm": 0.191155344247818, + "learning_rate": 2.9796579655280576e-05, + "loss": 0.9149, + "step": 6700 + }, + { + "epoch": 0.773534840793724, + "grad_norm": 0.20115402340888977, + "learning_rate": 2.9653326102030964e-05, + "loss": 0.9818, + "step": 6705 + }, + { + "epoch": 0.7741116751269036, + "grad_norm": 0.19099047780036926, + "learning_rate": 2.9510357770517825e-05, + "loss": 1.0363, + "step": 6710 + }, + { + "epoch": 0.774688509460083, + "grad_norm": 0.18843354284763336, + "learning_rate": 2.9367675240408654e-05, + "loss": 0.9309, + "step": 6715 + }, + { + "epoch": 0.7752653437932626, + "grad_norm": 0.19343096017837524, + "learning_rate": 2.9225279090212067e-05, + "loss": 0.9773, + "step": 6720 + }, + { + "epoch": 0.7758421781264421, + "grad_norm": 0.19770221412181854, + "learning_rate": 2.9083169897275552e-05, + "loss": 0.9453, + "step": 6725 + }, + { + "epoch": 0.7764190124596216, + "grad_norm": 0.1835956573486328, + "learning_rate": 2.894134823778315e-05, + "loss": 0.9375, + "step": 6730 + }, + { + "epoch": 0.7769958467928011, + "grad_norm": 0.18945716321468353, + "learning_rate": 2.8799814686753134e-05, + "loss": 0.9738, + "step": 6735 + }, + { + "epoch": 0.7775726811259807, + "grad_norm": 0.18679681420326233, + "learning_rate": 2.8658569818035542e-05, + "loss": 0.9195, + "step": 6740 + }, + { + "epoch": 0.7781495154591601, + "grad_norm": 0.19722817838191986, + "learning_rate": 2.851761420431006e-05, + "loss": 0.9791, + "step": 6745 + }, + { + "epoch": 0.7787263497923397, + "grad_norm": 0.1932711899280548, + "learning_rate": 2.8376948417083483e-05, + "loss": 0.9314, + "step": 6750 + }, + { + "epoch": 0.7793031841255191, + "grad_norm": 0.2086237072944641, + "learning_rate": 2.823657302668755e-05, + "loss": 0.9436, + "step": 6755 + }, + { + "epoch": 0.7798800184586987, + "grad_norm": 0.18653713166713715, + "learning_rate": 2.8096488602276528e-05, + "loss": 0.9361, + "step": 6760 + }, + { + "epoch": 0.7804568527918782, + "grad_norm": 0.1907120645046234, + "learning_rate": 2.7956695711825075e-05, + "loss": 0.9439, + "step": 6765 + }, + { + "epoch": 0.7810336871250577, + "grad_norm": 0.18436580896377563, + "learning_rate": 2.7817194922125666e-05, + "loss": 0.9436, + "step": 6770 + }, + { + "epoch": 0.7816105214582372, + "grad_norm": 0.18907365202903748, + "learning_rate": 2.7677986798786615e-05, + "loss": 0.9657, + "step": 6775 + }, + { + "epoch": 0.7821873557914167, + "grad_norm": 0.19230635464191437, + "learning_rate": 2.753907190622944e-05, + "loss": 0.9517, + "step": 6780 + }, + { + "epoch": 0.7827641901245962, + "grad_norm": 0.19662320613861084, + "learning_rate": 
2.7400450807686938e-05, + "loss": 0.9558, + "step": 6785 + }, + { + "epoch": 0.7833410244577758, + "grad_norm": 0.18257470428943634, + "learning_rate": 2.726212406520051e-05, + "loss": 0.9132, + "step": 6790 + }, + { + "epoch": 0.7839178587909552, + "grad_norm": 0.19373179972171783, + "learning_rate": 2.712409223961826e-05, + "loss": 0.9496, + "step": 6795 + }, + { + "epoch": 0.7844946931241348, + "grad_norm": 0.19530361890792847, + "learning_rate": 2.698635589059242e-05, + "loss": 0.9682, + "step": 6800 + }, + { + "epoch": 0.7850715274573142, + "grad_norm": 0.18772435188293457, + "learning_rate": 2.6848915576577317e-05, + "loss": 0.9552, + "step": 6805 + }, + { + "epoch": 0.7856483617904938, + "grad_norm": 0.19148732721805573, + "learning_rate": 2.6711771854826905e-05, + "loss": 0.9047, + "step": 6810 + }, + { + "epoch": 0.7862251961236733, + "grad_norm": 0.19749705493450165, + "learning_rate": 2.657492528139268e-05, + "loss": 0.9442, + "step": 6815 + }, + { + "epoch": 0.7868020304568528, + "grad_norm": 0.18885494768619537, + "learning_rate": 2.643837641112128e-05, + "loss": 0.9223, + "step": 6820 + }, + { + "epoch": 0.7873788647900323, + "grad_norm": 0.19046206772327423, + "learning_rate": 2.630212579765231e-05, + "loss": 0.9801, + "step": 6825 + }, + { + "epoch": 0.7879556991232118, + "grad_norm": 0.19383732974529266, + "learning_rate": 2.6166173993416154e-05, + "loss": 1.0104, + "step": 6830 + }, + { + "epoch": 0.7885325334563913, + "grad_norm": 0.20124277472496033, + "learning_rate": 2.603052154963158e-05, + "loss": 1.008, + "step": 6835 + }, + { + "epoch": 0.7891093677895709, + "grad_norm": 0.20482531189918518, + "learning_rate": 2.5895169016303623e-05, + "loss": 0.8936, + "step": 6840 + }, + { + "epoch": 0.7896862021227503, + "grad_norm": 0.1819208562374115, + "learning_rate": 2.576011694222139e-05, + "loss": 0.9452, + "step": 6845 + }, + { + "epoch": 0.7902630364559299, + "grad_norm": 0.18517275154590607, + "learning_rate": 2.5625365874955674e-05, + "loss": 0.8879, + "step": 6850 + }, + { + "epoch": 0.7908398707891093, + "grad_norm": 0.18698491156101227, + "learning_rate": 2.5490916360856853e-05, + "loss": 0.929, + "step": 6855 + }, + { + "epoch": 0.7914167051222889, + "grad_norm": 0.18061292171478271, + "learning_rate": 2.5356768945052745e-05, + "loss": 0.9385, + "step": 6860 + }, + { + "epoch": 0.7919935394554684, + "grad_norm": 0.1996062994003296, + "learning_rate": 2.522292417144617e-05, + "loss": 0.9852, + "step": 6865 + }, + { + "epoch": 0.7925703737886479, + "grad_norm": 0.1814109832048416, + "learning_rate": 2.5089382582712994e-05, + "loss": 0.9543, + "step": 6870 + }, + { + "epoch": 0.7931472081218274, + "grad_norm": 0.19972620904445648, + "learning_rate": 2.4956144720299712e-05, + "loss": 0.9139, + "step": 6875 + }, + { + "epoch": 0.7937240424550069, + "grad_norm": 0.19318068027496338, + "learning_rate": 2.482321112442151e-05, + "loss": 1.0012, + "step": 6880 + }, + { + "epoch": 0.7943008767881864, + "grad_norm": 0.18499110639095306, + "learning_rate": 2.4690582334059685e-05, + "loss": 0.9307, + "step": 6885 + }, + { + "epoch": 0.794877711121366, + "grad_norm": 0.1849159598350525, + "learning_rate": 2.455825888695994e-05, + "loss": 0.9354, + "step": 6890 + }, + { + "epoch": 0.7954545454545454, + "grad_norm": 0.1951223909854889, + "learning_rate": 2.4426241319629772e-05, + "loss": 0.9596, + "step": 6895 + }, + { + "epoch": 0.796031379787725, + "grad_norm": 0.19276456534862518, + "learning_rate": 2.4294530167336615e-05, + "loss": 0.9273, + "step": 6900 + }, + { + "epoch": 
0.7966082141209044, + "grad_norm": 0.1889081597328186, + "learning_rate": 2.4163125964105448e-05, + "loss": 0.979, + "step": 6905 + }, + { + "epoch": 0.797185048454084, + "grad_norm": 0.19000983238220215, + "learning_rate": 2.4032029242716826e-05, + "loss": 0.9286, + "step": 6910 + }, + { + "epoch": 0.7977618827872635, + "grad_norm": 0.19312219321727753, + "learning_rate": 2.390124053470443e-05, + "loss": 0.9655, + "step": 6915 + }, + { + "epoch": 0.798338717120443, + "grad_norm": 0.19184233248233795, + "learning_rate": 2.3770760370353294e-05, + "loss": 0.964, + "step": 6920 + }, + { + "epoch": 0.7989155514536225, + "grad_norm": 0.19048276543617249, + "learning_rate": 2.364058927869732e-05, + "loss": 0.9578, + "step": 6925 + }, + { + "epoch": 0.799492385786802, + "grad_norm": 0.19102397561073303, + "learning_rate": 2.3510727787517382e-05, + "loss": 0.9342, + "step": 6930 + }, + { + "epoch": 0.8000692201199815, + "grad_norm": 0.2003382444381714, + "learning_rate": 2.3381176423338956e-05, + "loss": 0.9429, + "step": 6935 + }, + { + "epoch": 0.8006460544531611, + "grad_norm": 0.1823520064353943, + "learning_rate": 2.325193571143024e-05, + "loss": 0.9617, + "step": 6940 + }, + { + "epoch": 0.8012228887863405, + "grad_norm": 0.18638677895069122, + "learning_rate": 2.31230061757997e-05, + "loss": 0.9731, + "step": 6945 + }, + { + "epoch": 0.8017997231195201, + "grad_norm": 0.19570128619670868, + "learning_rate": 2.299438833919432e-05, + "loss": 0.9684, + "step": 6950 + }, + { + "epoch": 0.8023765574526995, + "grad_norm": 0.18794956803321838, + "learning_rate": 2.286608272309716e-05, + "loss": 0.9115, + "step": 6955 + }, + { + "epoch": 0.8029533917858791, + "grad_norm": 0.20243585109710693, + "learning_rate": 2.2738089847725497e-05, + "loss": 0.9751, + "step": 6960 + }, + { + "epoch": 0.8035302261190586, + "grad_norm": 0.18690462410449982, + "learning_rate": 2.2610410232028467e-05, + "loss": 0.9299, + "step": 6965 + }, + { + "epoch": 0.8041070604522381, + "grad_norm": 0.1870652735233307, + "learning_rate": 2.2483044393685215e-05, + "loss": 0.9366, + "step": 6970 + }, + { + "epoch": 0.8046838947854176, + "grad_norm": 0.2046535462141037, + "learning_rate": 2.235599284910258e-05, + "loss": 0.9911, + "step": 6975 + }, + { + "epoch": 0.8052607291185971, + "grad_norm": 0.18992750346660614, + "learning_rate": 2.2229256113413087e-05, + "loss": 0.9751, + "step": 6980 + }, + { + "epoch": 0.8058375634517766, + "grad_norm": 0.19411560893058777, + "learning_rate": 2.210283470047296e-05, + "loss": 0.937, + "step": 6985 + }, + { + "epoch": 0.8064143977849562, + "grad_norm": 0.1944170445203781, + "learning_rate": 2.1976729122859864e-05, + "loss": 0.9326, + "step": 6990 + }, + { + "epoch": 0.8069912321181357, + "grad_norm": 0.18680644035339355, + "learning_rate": 2.185093989187087e-05, + "loss": 0.936, + "step": 6995 + }, + { + "epoch": 0.8075680664513152, + "grad_norm": 0.18400752544403076, + "learning_rate": 2.1725467517520526e-05, + "loss": 0.9861, + "step": 7000 + }, + { + "epoch": 0.8081449007844947, + "grad_norm": 0.19621169567108154, + "learning_rate": 2.1600312508538602e-05, + "loss": 0.9284, + "step": 7005 + }, + { + "epoch": 0.8087217351176742, + "grad_norm": 0.18958927690982819, + "learning_rate": 2.1475475372368094e-05, + "loss": 0.9258, + "step": 7010 + }, + { + "epoch": 0.8092985694508538, + "grad_norm": 0.20696218311786652, + "learning_rate": 2.1350956615163254e-05, + "loss": 0.9426, + "step": 7015 + }, + { + "epoch": 0.8098754037840332, + "grad_norm": 0.20199459791183472, + "learning_rate": 
2.1226756741787356e-05, + "loss": 0.9625, + "step": 7020 + }, + { + "epoch": 0.8104522381172128, + "grad_norm": 0.1961347460746765, + "learning_rate": 2.1102876255810887e-05, + "loss": 0.9664, + "step": 7025 + }, + { + "epoch": 0.8110290724503922, + "grad_norm": 0.19894124567508698, + "learning_rate": 2.0979315659509223e-05, + "loss": 0.9508, + "step": 7030 + }, + { + "epoch": 0.8116059067835718, + "grad_norm": 0.1998283416032791, + "learning_rate": 2.085607545386088e-05, + "loss": 0.9117, + "step": 7035 + }, + { + "epoch": 0.8121827411167513, + "grad_norm": 0.18765446543693542, + "learning_rate": 2.0733156138545252e-05, + "loss": 0.9908, + "step": 7040 + }, + { + "epoch": 0.8127595754499308, + "grad_norm": 0.18497121334075928, + "learning_rate": 2.0610558211940702e-05, + "loss": 0.9453, + "step": 7045 + }, + { + "epoch": 0.8133364097831103, + "grad_norm": 0.20545774698257446, + "learning_rate": 2.0488282171122498e-05, + "loss": 1.0136, + "step": 7050 + }, + { + "epoch": 0.8139132441162898, + "grad_norm": 0.18799108266830444, + "learning_rate": 2.036632851186091e-05, + "loss": 0.9834, + "step": 7055 + }, + { + "epoch": 0.8144900784494693, + "grad_norm": 0.17781130969524384, + "learning_rate": 2.0244697728618966e-05, + "loss": 0.906, + "step": 7060 + }, + { + "epoch": 0.8150669127826489, + "grad_norm": 0.1909274309873581, + "learning_rate": 2.0123390314550717e-05, + "loss": 0.9443, + "step": 7065 + }, + { + "epoch": 0.8156437471158283, + "grad_norm": 0.1991771012544632, + "learning_rate": 2.000240676149904e-05, + "loss": 0.9451, + "step": 7070 + }, + { + "epoch": 0.8162205814490079, + "grad_norm": 0.18566997349262238, + "learning_rate": 1.9881747559993703e-05, + "loss": 0.9015, + "step": 7075 + }, + { + "epoch": 0.8167974157821873, + "grad_norm": 0.19024524092674255, + "learning_rate": 1.976141319924939e-05, + "loss": 0.9458, + "step": 7080 + }, + { + "epoch": 0.8173742501153669, + "grad_norm": 0.19629044830799103, + "learning_rate": 1.964140416716379e-05, + "loss": 0.9608, + "step": 7085 + }, + { + "epoch": 0.8179510844485464, + "grad_norm": 0.185940220952034, + "learning_rate": 1.9521720950315403e-05, + "loss": 0.8914, + "step": 7090 + }, + { + "epoch": 0.8185279187817259, + "grad_norm": 0.18228422105312347, + "learning_rate": 1.940236403396186e-05, + "loss": 0.9645, + "step": 7095 + }, + { + "epoch": 0.8191047531149054, + "grad_norm": 0.18275770545005798, + "learning_rate": 1.9283333902037694e-05, + "loss": 0.9554, + "step": 7100 + }, + { + "epoch": 0.819681587448085, + "grad_norm": 0.19047953188419342, + "learning_rate": 1.9164631037152513e-05, + "loss": 0.9352, + "step": 7105 + }, + { + "epoch": 0.8202584217812644, + "grad_norm": 0.211176335811615, + "learning_rate": 1.9046255920588985e-05, + "loss": 0.9209, + "step": 7110 + }, + { + "epoch": 0.820835256114444, + "grad_norm": 0.19080542027950287, + "learning_rate": 1.8928209032301013e-05, + "loss": 0.9404, + "step": 7115 + }, + { + "epoch": 0.8214120904476234, + "grad_norm": 0.20228640735149384, + "learning_rate": 1.8810490850911577e-05, + "loss": 0.9741, + "step": 7120 + }, + { + "epoch": 0.821988924780803, + "grad_norm": 0.19433507323265076, + "learning_rate": 1.8693101853711004e-05, + "loss": 0.9596, + "step": 7125 + }, + { + "epoch": 0.8225657591139824, + "grad_norm": 0.20206230878829956, + "learning_rate": 1.857604251665487e-05, + "loss": 0.9728, + "step": 7130 + }, + { + "epoch": 0.823142593447162, + "grad_norm": 0.1768370121717453, + "learning_rate": 1.845931331436219e-05, + "loss": 0.9523, + "step": 7135 + }, + { + "epoch": 
0.8237194277803415, + "grad_norm": 0.19058267772197723, + "learning_rate": 1.8342914720113404e-05, + "loss": 0.982, + "step": 7140 + }, + { + "epoch": 0.824296262113521, + "grad_norm": 0.1983305811882019, + "learning_rate": 1.822684720584852e-05, + "loss": 1.0006, + "step": 7145 + }, + { + "epoch": 0.8248730964467005, + "grad_norm": 0.19151514768600464, + "learning_rate": 1.8111111242165124e-05, + "loss": 1.0013, + "step": 7150 + }, + { + "epoch": 0.82544993077988, + "grad_norm": 0.1844472587108612, + "learning_rate": 1.7995707298316632e-05, + "loss": 0.9621, + "step": 7155 + }, + { + "epoch": 0.8260267651130595, + "grad_norm": 0.1923513561487198, + "learning_rate": 1.788063584221017e-05, + "loss": 0.8956, + "step": 7160 + }, + { + "epoch": 0.8266035994462391, + "grad_norm": 0.1910182386636734, + "learning_rate": 1.776589734040487e-05, + "loss": 0.9845, + "step": 7165 + }, + { + "epoch": 0.8271804337794185, + "grad_norm": 0.1840658187866211, + "learning_rate": 1.7651492258109835e-05, + "loss": 0.9264, + "step": 7170 + }, + { + "epoch": 0.8277572681125981, + "grad_norm": 0.21792177855968475, + "learning_rate": 1.7537421059182314e-05, + "loss": 0.9866, + "step": 7175 + }, + { + "epoch": 0.8283341024457775, + "grad_norm": 0.19659163057804108, + "learning_rate": 1.74236842061259e-05, + "loss": 0.9331, + "step": 7180 + }, + { + "epoch": 0.8289109367789571, + "grad_norm": 0.1791463941335678, + "learning_rate": 1.7310282160088465e-05, + "loss": 0.9154, + "step": 7185 + }, + { + "epoch": 0.8294877711121366, + "grad_norm": 0.19773773849010468, + "learning_rate": 1.7197215380860497e-05, + "loss": 0.9588, + "step": 7190 + }, + { + "epoch": 0.8300646054453161, + "grad_norm": 0.2065158188343048, + "learning_rate": 1.7084484326873062e-05, + "loss": 0.9579, + "step": 7195 + }, + { + "epoch": 0.8306414397784956, + "grad_norm": 0.19158004224300385, + "learning_rate": 1.6972089455196115e-05, + "loss": 0.9358, + "step": 7200 + }, + { + "epoch": 0.8312182741116751, + "grad_norm": 0.2142983227968216, + "learning_rate": 1.6860031221536398e-05, + "loss": 0.9572, + "step": 7205 + }, + { + "epoch": 0.8317951084448546, + "grad_norm": 0.1989503800868988, + "learning_rate": 1.674831008023594e-05, + "loss": 0.9698, + "step": 7210 + }, + { + "epoch": 0.8323719427780342, + "grad_norm": 0.19422230124473572, + "learning_rate": 1.6636926484269855e-05, + "loss": 0.936, + "step": 7215 + }, + { + "epoch": 0.8329487771112136, + "grad_norm": 0.22429165244102478, + "learning_rate": 1.6525880885244815e-05, + "loss": 0.9515, + "step": 7220 + }, + { + "epoch": 0.8335256114443932, + "grad_norm": 0.19105808436870575, + "learning_rate": 1.641517373339696e-05, + "loss": 0.9732, + "step": 7225 + }, + { + "epoch": 0.8341024457775726, + "grad_norm": 0.20830240845680237, + "learning_rate": 1.6304805477590312e-05, + "loss": 0.9794, + "step": 7230 + }, + { + "epoch": 0.8346792801107522, + "grad_norm": 0.20146532356739044, + "learning_rate": 1.6194776565314672e-05, + "loss": 0.9492, + "step": 7235 + }, + { + "epoch": 0.8352561144439317, + "grad_norm": 0.18800169229507446, + "learning_rate": 1.6085087442684122e-05, + "loss": 0.92, + "step": 7240 + }, + { + "epoch": 0.8358329487771112, + "grad_norm": 0.19800886511802673, + "learning_rate": 1.597573855443497e-05, + "loss": 0.9912, + "step": 7245 + }, + { + "epoch": 0.8364097831102907, + "grad_norm": 0.18179026246070862, + "learning_rate": 1.5866730343924085e-05, + "loss": 0.9121, + "step": 7250 + }, + { + "epoch": 0.8369866174434702, + "grad_norm": 0.19229067862033844, + "learning_rate": 
1.575806325312702e-05, + "loss": 0.9529, + "step": 7255 + }, + { + "epoch": 0.8375634517766497, + "grad_norm": 0.1789880245923996, + "learning_rate": 1.5649737722636315e-05, + "loss": 0.926, + "step": 7260 + }, + { + "epoch": 0.8381402861098293, + "grad_norm": 0.1741928905248642, + "learning_rate": 1.554175419165951e-05, + "loss": 0.9237, + "step": 7265 + }, + { + "epoch": 0.8387171204430087, + "grad_norm": 0.20456916093826294, + "learning_rate": 1.5434113098017667e-05, + "loss": 0.9821, + "step": 7270 + }, + { + "epoch": 0.8392939547761883, + "grad_norm": 0.1989426463842392, + "learning_rate": 1.5326814878143304e-05, + "loss": 0.9187, + "step": 7275 + }, + { + "epoch": 0.8398707891093677, + "grad_norm": 0.18545162677764893, + "learning_rate": 1.5219859967078854e-05, + "loss": 0.945, + "step": 7280 + }, + { + "epoch": 0.8404476234425473, + "grad_norm": 0.17884254455566406, + "learning_rate": 1.5113248798474689e-05, + "loss": 0.9181, + "step": 7285 + }, + { + "epoch": 0.8410244577757268, + "grad_norm": 0.18553291261196136, + "learning_rate": 1.5006981804587595e-05, + "loss": 0.9737, + "step": 7290 + }, + { + "epoch": 0.8416012921089063, + "grad_norm": 0.1832725703716278, + "learning_rate": 1.4901059416278806e-05, + "loss": 0.9121, + "step": 7295 + }, + { + "epoch": 0.8421781264420858, + "grad_norm": 0.18451160192489624, + "learning_rate": 1.4795482063012367e-05, + "loss": 0.9595, + "step": 7300 + }, + { + "epoch": 0.8427549607752653, + "grad_norm": 0.1967657506465912, + "learning_rate": 1.4690250172853348e-05, + "loss": 0.9664, + "step": 7305 + }, + { + "epoch": 0.8433317951084448, + "grad_norm": 0.1973174512386322, + "learning_rate": 1.4585364172466231e-05, + "loss": 0.9763, + "step": 7310 + }, + { + "epoch": 0.8439086294416244, + "grad_norm": 0.1908571720123291, + "learning_rate": 1.4480824487112943e-05, + "loss": 0.9536, + "step": 7315 + }, + { + "epoch": 0.8444854637748038, + "grad_norm": 0.17997972667217255, + "learning_rate": 1.437663154065142e-05, + "loss": 0.977, + "step": 7320 + }, + { + "epoch": 0.8450622981079834, + "grad_norm": 0.19456464052200317, + "learning_rate": 1.4272785755533601e-05, + "loss": 0.96, + "step": 7325 + }, + { + "epoch": 0.8456391324411628, + "grad_norm": 0.18535476922988892, + "learning_rate": 1.4169287552803923e-05, + "loss": 0.9358, + "step": 7330 + }, + { + "epoch": 0.8462159667743424, + "grad_norm": 0.19435110688209534, + "learning_rate": 1.4066137352097575e-05, + "loss": 0.9578, + "step": 7335 + }, + { + "epoch": 0.846792801107522, + "grad_norm": 0.196323961019516, + "learning_rate": 1.396333557163868e-05, + "loss": 0.9587, + "step": 7340 + }, + { + "epoch": 0.8473696354407014, + "grad_norm": 0.18497958779335022, + "learning_rate": 1.3860882628238781e-05, + "loss": 0.8837, + "step": 7345 + }, + { + "epoch": 0.847946469773881, + "grad_norm": 0.1857413649559021, + "learning_rate": 1.3758778937294947e-05, + "loss": 0.9882, + "step": 7350 + }, + { + "epoch": 0.8485233041070604, + "grad_norm": 0.18033544719219208, + "learning_rate": 1.365702491278833e-05, + "loss": 0.9649, + "step": 7355 + }, + { + "epoch": 0.84910013844024, + "grad_norm": 0.17830757796764374, + "learning_rate": 1.3555620967282235e-05, + "loss": 0.9306, + "step": 7360 + }, + { + "epoch": 0.8496769727734195, + "grad_norm": 0.18326257169246674, + "learning_rate": 1.3454567511920634e-05, + "loss": 0.9583, + "step": 7365 + }, + { + "epoch": 0.850253807106599, + "grad_norm": 0.1927419751882553, + "learning_rate": 1.3353864956426366e-05, + "loss": 0.9199, + "step": 7370 + }, + { + "epoch": 
0.8508306414397785, + "grad_norm": 0.20454275608062744, + "learning_rate": 1.3253513709099652e-05, + "loss": 0.9859, + "step": 7375 + }, + { + "epoch": 0.8514074757729581, + "grad_norm": 0.1943024843931198, + "learning_rate": 1.3153514176816195e-05, + "loss": 0.9491, + "step": 7380 + }, + { + "epoch": 0.8519843101061375, + "grad_norm": 0.196246936917305, + "learning_rate": 1.305386676502578e-05, + "loss": 0.9904, + "step": 7385 + }, + { + "epoch": 0.8525611444393171, + "grad_norm": 0.1911761462688446, + "learning_rate": 1.2954571877750443e-05, + "loss": 0.9533, + "step": 7390 + }, + { + "epoch": 0.8531379787724965, + "grad_norm": 0.193502277135849, + "learning_rate": 1.2855629917582935e-05, + "loss": 0.9714, + "step": 7395 + }, + { + "epoch": 0.8537148131056761, + "grad_norm": 0.193160742521286, + "learning_rate": 1.2757041285685011e-05, + "loss": 0.9481, + "step": 7400 + }, + { + "epoch": 0.8542916474388556, + "grad_norm": 0.19016680121421814, + "learning_rate": 1.2658806381785926e-05, + "loss": 0.9405, + "step": 7405 + }, + { + "epoch": 0.8548684817720351, + "grad_norm": 0.19033581018447876, + "learning_rate": 1.2560925604180673e-05, + "loss": 0.9857, + "step": 7410 + }, + { + "epoch": 0.8554453161052146, + "grad_norm": 0.17935208976268768, + "learning_rate": 1.2463399349728488e-05, + "loss": 0.9586, + "step": 7415 + }, + { + "epoch": 0.8560221504383941, + "grad_norm": 0.18908259272575378, + "learning_rate": 1.2366228013851156e-05, + "loss": 0.9478, + "step": 7420 + }, + { + "epoch": 0.8565989847715736, + "grad_norm": 0.18578480184078217, + "learning_rate": 1.2269411990531421e-05, + "loss": 0.9384, + "step": 7425 + }, + { + "epoch": 0.8571758191047532, + "grad_norm": 0.18580469489097595, + "learning_rate": 1.2172951672311427e-05, + "loss": 0.9289, + "step": 7430 + }, + { + "epoch": 0.8577526534379326, + "grad_norm": 0.20112477242946625, + "learning_rate": 1.207684745029114e-05, + "loss": 0.9331, + "step": 7435 + }, + { + "epoch": 0.8583294877711122, + "grad_norm": 0.19187471270561218, + "learning_rate": 1.1981099714126654e-05, + "loss": 0.9518, + "step": 7440 + }, + { + "epoch": 0.8589063221042916, + "grad_norm": 0.1830911487340927, + "learning_rate": 1.1885708852028777e-05, + "loss": 0.9235, + "step": 7445 + }, + { + "epoch": 0.8594831564374712, + "grad_norm": 0.1817176192998886, + "learning_rate": 1.1790675250761263e-05, + "loss": 0.9221, + "step": 7450 + }, + { + "epoch": 0.8600599907706507, + "grad_norm": 0.19020894169807434, + "learning_rate": 1.1695999295639459e-05, + "loss": 0.9953, + "step": 7455 + }, + { + "epoch": 0.8606368251038302, + "grad_norm": 0.191711887717247, + "learning_rate": 1.1601681370528484e-05, + "loss": 0.9635, + "step": 7460 + }, + { + "epoch": 0.8612136594370097, + "grad_norm": 0.19259481132030487, + "learning_rate": 1.150772185784198e-05, + "loss": 0.9135, + "step": 7465 + }, + { + "epoch": 0.8617904937701892, + "grad_norm": 0.1911952793598175, + "learning_rate": 1.1414121138540279e-05, + "loss": 0.9416, + "step": 7470 + }, + { + "epoch": 0.8623673281033687, + "grad_norm": 0.2062252312898636, + "learning_rate": 1.1320879592129052e-05, + "loss": 0.9167, + "step": 7475 + }, + { + "epoch": 0.8629441624365483, + "grad_norm": 0.1907137632369995, + "learning_rate": 1.1227997596657636e-05, + "loss": 0.9375, + "step": 7480 + }, + { + "epoch": 0.8635209967697277, + "grad_norm": 0.19318221509456635, + "learning_rate": 1.1135475528717642e-05, + "loss": 0.936, + "step": 7485 + }, + { + "epoch": 0.8640978311029073, + "grad_norm": 0.18050383031368256, + "learning_rate": 
1.1043313763441277e-05, + "loss": 0.9388, + "step": 7490 + }, + { + "epoch": 0.8646746654360867, + "grad_norm": 0.18808601796627045, + "learning_rate": 1.0951512674499898e-05, + "loss": 0.9033, + "step": 7495 + }, + { + "epoch": 0.8652514997692663, + "grad_norm": 0.18681325018405914, + "learning_rate": 1.0860072634102569e-05, + "loss": 0.9511, + "step": 7500 + }, + { + "epoch": 0.8658283341024458, + "grad_norm": 0.18992042541503906, + "learning_rate": 1.0768994012994371e-05, + "loss": 0.9316, + "step": 7505 + }, + { + "epoch": 0.8664051684356253, + "grad_norm": 0.19558371603488922, + "learning_rate": 1.0678277180455109e-05, + "loss": 0.9203, + "step": 7510 + }, + { + "epoch": 0.8669820027688048, + "grad_norm": 0.18919962644577026, + "learning_rate": 1.0587922504297642e-05, + "loss": 0.939, + "step": 7515 + }, + { + "epoch": 0.8675588371019843, + "grad_norm": 0.18517306447029114, + "learning_rate": 1.049793035086647e-05, + "loss": 0.9253, + "step": 7520 + }, + { + "epoch": 0.8681356714351638, + "grad_norm": 0.1969662606716156, + "learning_rate": 1.040830108503622e-05, + "loss": 0.9644, + "step": 7525 + }, + { + "epoch": 0.8687125057683434, + "grad_norm": 0.18453893065452576, + "learning_rate": 1.031903507021027e-05, + "loss": 0.9617, + "step": 7530 + }, + { + "epoch": 0.8692893401015228, + "grad_norm": 0.18227744102478027, + "learning_rate": 1.0230132668319082e-05, + "loss": 0.9707, + "step": 7535 + }, + { + "epoch": 0.8698661744347024, + "grad_norm": 0.19960415363311768, + "learning_rate": 1.014159423981893e-05, + "loss": 0.9484, + "step": 7540 + }, + { + "epoch": 0.8704430087678818, + "grad_norm": 0.1987922042608261, + "learning_rate": 1.0053420143690284e-05, + "loss": 0.931, + "step": 7545 + }, + { + "epoch": 0.8710198431010614, + "grad_norm": 0.1911085695028305, + "learning_rate": 9.965610737436515e-06, + "loss": 0.9723, + "step": 7550 + }, + { + "epoch": 0.8715966774342409, + "grad_norm": 0.1827116310596466, + "learning_rate": 9.87816637708221e-06, + "loss": 0.9181, + "step": 7555 + }, + { + "epoch": 0.8721735117674204, + "grad_norm": 0.19509339332580566, + "learning_rate": 9.791087417172019e-06, + "loss": 0.971, + "step": 7560 + }, + { + "epoch": 0.8727503461005999, + "grad_norm": 0.18567214906215668, + "learning_rate": 9.704374210768952e-06, + "loss": 0.9254, + "step": 7565 + }, + { + "epoch": 0.8733271804337794, + "grad_norm": 0.20758579671382904, + "learning_rate": 9.618027109453176e-06, + "loss": 0.9661, + "step": 7570 + }, + { + "epoch": 0.8739040147669589, + "grad_norm": 0.20255382359027863, + "learning_rate": 9.532046463320365e-06, + "loss": 0.9802, + "step": 7575 + }, + { + "epoch": 0.8744808491001385, + "grad_norm": 0.18845497071743011, + "learning_rate": 9.446432620980517e-06, + "loss": 0.94, + "step": 7580 + }, + { + "epoch": 0.8750576834333179, + "grad_norm": 0.19557009637355804, + "learning_rate": 9.361185929556282e-06, + "loss": 0.9853, + "step": 7585 + }, + { + "epoch": 0.8756345177664975, + "grad_norm": 0.1927575021982193, + "learning_rate": 9.276306734681805e-06, + "loss": 0.966, + "step": 7590 + }, + { + "epoch": 0.8762113520996769, + "grad_norm": 0.18559756875038147, + "learning_rate": 9.191795380501134e-06, + "loss": 0.9768, + "step": 7595 + }, + { + "epoch": 0.8767881864328565, + "grad_norm": 0.18745888769626617, + "learning_rate": 9.107652209666973e-06, + "loss": 0.9522, + "step": 7600 + }, + { + "epoch": 0.877365020766036, + "grad_norm": 0.1900642067193985, + "learning_rate": 9.023877563339134e-06, + "loss": 0.8757, + "step": 7605 + }, + { + "epoch": 
0.8779418550992155, + "grad_norm": 0.1792660653591156, + "learning_rate": 8.940471781183335e-06, + "loss": 0.9486, + "step": 7610 + }, + { + "epoch": 0.878518689432395, + "grad_norm": 0.19872300326824188, + "learning_rate": 8.857435201369645e-06, + "loss": 0.955, + "step": 7615 + }, + { + "epoch": 0.8790955237655745, + "grad_norm": 0.2203512042760849, + "learning_rate": 8.774768160571257e-06, + "loss": 0.9289, + "step": 7620 + }, + { + "epoch": 0.879672358098754, + "grad_norm": 0.19183123111724854, + "learning_rate": 8.692470993962987e-06, + "loss": 0.9636, + "step": 7625 + }, + { + "epoch": 0.8802491924319336, + "grad_norm": 0.1879664957523346, + "learning_rate": 8.610544035220103e-06, + "loss": 0.9431, + "step": 7630 + }, + { + "epoch": 0.880826026765113, + "grad_norm": 0.19013233482837677, + "learning_rate": 8.528987616516748e-06, + "loss": 0.8901, + "step": 7635 + }, + { + "epoch": 0.8814028610982926, + "grad_norm": 0.18684031069278717, + "learning_rate": 8.44780206852478e-06, + "loss": 0.9107, + "step": 7640 + }, + { + "epoch": 0.881979695431472, + "grad_norm": 0.24778002500534058, + "learning_rate": 8.366987720412322e-06, + "loss": 0.9398, + "step": 7645 + }, + { + "epoch": 0.8825565297646516, + "grad_norm": 0.20799873769283295, + "learning_rate": 8.286544899842441e-06, + "loss": 0.9893, + "step": 7650 + }, + { + "epoch": 0.883133364097831, + "grad_norm": 0.18839897215366364, + "learning_rate": 8.206473932971903e-06, + "loss": 1.0032, + "step": 7655 + }, + { + "epoch": 0.8837101984310106, + "grad_norm": 0.1880505084991455, + "learning_rate": 8.126775144449705e-06, + "loss": 0.9634, + "step": 7660 + }, + { + "epoch": 0.8842870327641901, + "grad_norm": 0.1813763827085495, + "learning_rate": 8.04744885741593e-06, + "loss": 0.9001, + "step": 7665 + }, + { + "epoch": 0.8848638670973696, + "grad_norm": 0.18489223718643188, + "learning_rate": 7.968495393500285e-06, + "loss": 0.9576, + "step": 7670 + }, + { + "epoch": 0.8854407014305491, + "grad_norm": 0.19136987626552582, + "learning_rate": 7.889915072820874e-06, + "loss": 0.9586, + "step": 7675 + }, + { + "epoch": 0.8860175357637287, + "grad_norm": 0.18882869184017181, + "learning_rate": 7.811708213982883e-06, + "loss": 0.938, + "step": 7680 + }, + { + "epoch": 0.8865943700969081, + "grad_norm": 0.19385488331317902, + "learning_rate": 7.733875134077307e-06, + "loss": 0.9481, + "step": 7685 + }, + { + "epoch": 0.8871712044300877, + "grad_norm": 0.2002614289522171, + "learning_rate": 7.656416148679612e-06, + "loss": 0.9657, + "step": 7690 + }, + { + "epoch": 0.8877480387632672, + "grad_norm": 0.19226016104221344, + "learning_rate": 7.579331571848569e-06, + "loss": 1.0032, + "step": 7695 + }, + { + "epoch": 0.8883248730964467, + "grad_norm": 0.192755326628685, + "learning_rate": 7.502621716124791e-06, + "loss": 0.9508, + "step": 7700 + }, + { + "epoch": 0.8889017074296263, + "grad_norm": 0.19249974191188812, + "learning_rate": 7.4262868925296995e-06, + "loss": 0.9289, + "step": 7705 + }, + { + "epoch": 0.8894785417628057, + "grad_norm": 0.19001740217208862, + "learning_rate": 7.35032741056404e-06, + "loss": 0.9426, + "step": 7710 + }, + { + "epoch": 0.8900553760959853, + "grad_norm": 0.20642143487930298, + "learning_rate": 7.274743578206788e-06, + "loss": 0.9962, + "step": 7715 + }, + { + "epoch": 0.8906322104291647, + "grad_norm": 0.1881352961063385, + "learning_rate": 7.199535701913806e-06, + "loss": 0.92, + "step": 7720 + }, + { + "epoch": 0.8912090447623443, + "grad_norm": 0.18823300302028656, + "learning_rate": 7.124704086616684e-06, + 
"loss": 0.9823, + "step": 7725 + }, + { + "epoch": 0.8917858790955238, + "grad_norm": 0.19650129973888397, + "learning_rate": 7.05024903572139e-06, + "loss": 0.9415, + "step": 7730 + }, + { + "epoch": 0.8923627134287033, + "grad_norm": 0.18661227822303772, + "learning_rate": 6.976170851107178e-06, + "loss": 0.986, + "step": 7735 + }, + { + "epoch": 0.8929395477618828, + "grad_norm": 0.1886250525712967, + "learning_rate": 6.902469833125236e-06, + "loss": 0.9679, + "step": 7740 + }, + { + "epoch": 0.8935163820950623, + "grad_norm": 0.1884276568889618, + "learning_rate": 6.8291462805975535e-06, + "loss": 0.9508, + "step": 7745 + }, + { + "epoch": 0.8940932164282418, + "grad_norm": 0.18795393407344818, + "learning_rate": 6.756200490815645e-06, + "loss": 0.9148, + "step": 7750 + }, + { + "epoch": 0.8946700507614214, + "grad_norm": 0.19203375279903412, + "learning_rate": 6.683632759539449e-06, + "loss": 0.9604, + "step": 7755 + }, + { + "epoch": 0.8952468850946008, + "grad_norm": 0.19676725566387177, + "learning_rate": 6.611443380995963e-06, + "loss": 0.964, + "step": 7760 + }, + { + "epoch": 0.8958237194277804, + "grad_norm": 0.17924650013446808, + "learning_rate": 6.5396326478782465e-06, + "loss": 0.8975, + "step": 7765 + }, + { + "epoch": 0.8964005537609598, + "grad_norm": 0.18147966265678406, + "learning_rate": 6.468200851344042e-06, + "loss": 0.9611, + "step": 7770 + }, + { + "epoch": 0.8969773880941394, + "grad_norm": 0.17754757404327393, + "learning_rate": 6.397148281014798e-06, + "loss": 0.9375, + "step": 7775 + }, + { + "epoch": 0.8975542224273189, + "grad_norm": 0.192587211728096, + "learning_rate": 6.326475224974249e-06, + "loss": 0.9408, + "step": 7780 + }, + { + "epoch": 0.8981310567604984, + "grad_norm": 0.1927655041217804, + "learning_rate": 6.256181969767505e-06, + "loss": 0.9238, + "step": 7785 + }, + { + "epoch": 0.8987078910936779, + "grad_norm": 0.18646924197673798, + "learning_rate": 6.186268800399675e-06, + "loss": 0.9693, + "step": 7790 + }, + { + "epoch": 0.8992847254268574, + "grad_norm": 0.18755947053432465, + "learning_rate": 6.116736000334888e-06, + "loss": 0.9422, + "step": 7795 + }, + { + "epoch": 0.8998615597600369, + "grad_norm": 0.19362567365169525, + "learning_rate": 6.047583851494965e-06, + "loss": 0.9084, + "step": 7800 + }, + { + "epoch": 0.9004383940932165, + "grad_norm": 0.1956445276737213, + "learning_rate": 5.978812634258468e-06, + "loss": 0.9196, + "step": 7805 + }, + { + "epoch": 0.9010152284263959, + "grad_norm": 0.20970386266708374, + "learning_rate": 5.910422627459411e-06, + "loss": 0.968, + "step": 7810 + }, + { + "epoch": 0.9015920627595755, + "grad_norm": 0.19140861928462982, + "learning_rate": 5.842414108386151e-06, + "loss": 0.9587, + "step": 7815 + }, + { + "epoch": 0.9021688970927549, + "grad_norm": 0.18535487353801727, + "learning_rate": 5.774787352780387e-06, + "loss": 0.9869, + "step": 7820 + }, + { + "epoch": 0.9027457314259345, + "grad_norm": 0.1941797137260437, + "learning_rate": 5.707542634835883e-06, + "loss": 0.983, + "step": 7825 + }, + { + "epoch": 0.903322565759114, + "grad_norm": 0.1803128570318222, + "learning_rate": 5.640680227197426e-06, + "loss": 0.9323, + "step": 7830 + }, + { + "epoch": 0.9038994000922935, + "grad_norm": 0.18415096402168274, + "learning_rate": 5.574200400959773e-06, + "loss": 0.9442, + "step": 7835 + }, + { + "epoch": 0.904476234425473, + "grad_norm": 0.19649024307727814, + "learning_rate": 5.5081034256664445e-06, + "loss": 1.0359, + "step": 7840 + }, + { + "epoch": 0.9050530687586525, + "grad_norm": 
0.19169114530086517, + "learning_rate": 5.442389569308703e-06, + "loss": 0.9368, + "step": 7845 + }, + { + "epoch": 0.905629903091832, + "grad_norm": 0.19056066870689392, + "learning_rate": 5.377059098324455e-06, + "loss": 0.9447, + "step": 7850 + }, + { + "epoch": 0.9062067374250116, + "grad_norm": 0.18303756415843964, + "learning_rate": 5.312112277597159e-06, + "loss": 0.8908, + "step": 7855 + }, + { + "epoch": 0.906783571758191, + "grad_norm": 0.20311231911182404, + "learning_rate": 5.247549370454763e-06, + "loss": 0.9742, + "step": 7860 + }, + { + "epoch": 0.9073604060913706, + "grad_norm": 0.19544216990470886, + "learning_rate": 5.183370638668616e-06, + "loss": 1.0013, + "step": 7865 + }, + { + "epoch": 0.90793724042455, + "grad_norm": 0.19310228526592255, + "learning_rate": 5.119576342452459e-06, + "loss": 0.9131, + "step": 7870 + }, + { + "epoch": 0.9085140747577296, + "grad_norm": 0.2016146332025528, + "learning_rate": 5.056166740461265e-06, + "loss": 0.9889, + "step": 7875 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.25115054845809937, + "learning_rate": 4.993142089790337e-06, + "loss": 0.9626, + "step": 7880 + }, + { + "epoch": 0.9096677434240886, + "grad_norm": 0.19337384402751923, + "learning_rate": 4.9305026459741224e-06, + "loss": 0.9716, + "step": 7885 + }, + { + "epoch": 0.9102445777572681, + "grad_norm": 0.1815733164548874, + "learning_rate": 4.8682486629852975e-06, + "loss": 0.9293, + "step": 7890 + }, + { + "epoch": 0.9108214120904476, + "grad_norm": 0.18898643553256989, + "learning_rate": 4.8063803932336114e-06, + "loss": 0.8889, + "step": 7895 + }, + { + "epoch": 0.9113982464236271, + "grad_norm": 0.1844266951084137, + "learning_rate": 4.74489808756502e-06, + "loss": 0.9805, + "step": 7900 + }, + { + "epoch": 0.9119750807568067, + "grad_norm": 0.18842408061027527, + "learning_rate": 4.683801995260484e-06, + "loss": 0.9662, + "step": 7905 + }, + { + "epoch": 0.9125519150899861, + "grad_norm": 0.20098374783992767, + "learning_rate": 4.623092364035153e-06, + "loss": 0.9969, + "step": 7910 + }, + { + "epoch": 0.9131287494231657, + "grad_norm": 0.19723452627658844, + "learning_rate": 4.562769440037174e-06, + "loss": 0.8958, + "step": 7915 + }, + { + "epoch": 0.9137055837563451, + "grad_norm": 0.20587897300720215, + "learning_rate": 4.502833467846857e-06, + "loss": 1.036, + "step": 7920 + }, + { + "epoch": 0.9142824180895247, + "grad_norm": 0.18344053626060486, + "learning_rate": 4.443284690475558e-06, + "loss": 0.9227, + "step": 7925 + }, + { + "epoch": 0.9148592524227042, + "grad_norm": 0.19742338359355927, + "learning_rate": 4.384123349364788e-06, + "loss": 0.9817, + "step": 7930 + }, + { + "epoch": 0.9154360867558837, + "grad_norm": 0.1915377378463745, + "learning_rate": 4.32534968438516e-06, + "loss": 0.9285, + "step": 7935 + }, + { + "epoch": 0.9160129210890632, + "grad_norm": 0.20171992480754852, + "learning_rate": 4.266963933835455e-06, + "loss": 0.9628, + "step": 7940 + }, + { + "epoch": 0.9165897554222427, + "grad_norm": 0.19588075578212738, + "learning_rate": 4.208966334441633e-06, + "loss": 0.944, + "step": 7945 + }, + { + "epoch": 0.9171665897554222, + "grad_norm": 0.1999482959508896, + "learning_rate": 4.151357121355947e-06, + "loss": 0.9673, + "step": 7950 + }, + { + "epoch": 0.9177434240886018, + "grad_norm": 0.1867521107196808, + "learning_rate": 4.0941365281558454e-06, + "loss": 0.9605, + "step": 7955 + }, + { + "epoch": 0.9183202584217812, + "grad_norm": 0.19164207577705383, + "learning_rate": 4.037304786843188e-06, + "loss": 0.9841, + "step": 
7960 + }, + { + "epoch": 0.9188970927549608, + "grad_norm": 0.18731051683425903, + "learning_rate": 3.980862127843199e-06, + "loss": 0.9518, + "step": 7965 + }, + { + "epoch": 0.9194739270881402, + "grad_norm": 0.18932907283306122, + "learning_rate": 3.924808780003531e-06, + "loss": 0.9197, + "step": 7970 + }, + { + "epoch": 0.9200507614213198, + "grad_norm": 0.19089601933956146, + "learning_rate": 3.86914497059343e-06, + "loss": 0.916, + "step": 7975 + }, + { + "epoch": 0.9206275957544993, + "grad_norm": 0.19187115132808685, + "learning_rate": 3.813870925302698e-06, + "loss": 0.9431, + "step": 7980 + }, + { + "epoch": 0.9212044300876788, + "grad_norm": 0.20279189944267273, + "learning_rate": 3.7589868682408434e-06, + "loss": 0.9663, + "step": 7985 + }, + { + "epoch": 0.9217812644208583, + "grad_norm": 0.18457676470279694, + "learning_rate": 3.7044930219362063e-06, + "loss": 0.9523, + "step": 7990 + }, + { + "epoch": 0.9223580987540378, + "grad_norm": 0.18185338377952576, + "learning_rate": 3.6503896073349587e-06, + "loss": 0.8865, + "step": 7995 + }, + { + "epoch": 0.9229349330872173, + "grad_norm": 0.19247382879257202, + "learning_rate": 3.5966768438002507e-06, + "loss": 0.9723, + "step": 8000 + }, + { + "epoch": 0.9235117674203969, + "grad_norm": 0.1936521977186203, + "learning_rate": 3.5433549491113884e-06, + "loss": 0.9622, + "step": 8005 + }, + { + "epoch": 0.9240886017535763, + "grad_norm": 0.19727076590061188, + "learning_rate": 3.4904241394628557e-06, + "loss": 0.921, + "step": 8010 + }, + { + "epoch": 0.9246654360867559, + "grad_norm": 0.19335485994815826, + "learning_rate": 3.4378846294634835e-06, + "loss": 0.9722, + "step": 8015 + }, + { + "epoch": 0.9252422704199353, + "grad_norm": 0.1931992471218109, + "learning_rate": 3.3857366321355722e-06, + "loss": 0.9835, + "step": 8020 + }, + { + "epoch": 0.9258191047531149, + "grad_norm": 0.17889727652072906, + "learning_rate": 3.3339803589140352e-06, + "loss": 0.9528, + "step": 8025 + }, + { + "epoch": 0.9263959390862944, + "grad_norm": 0.19378302991390228, + "learning_rate": 3.2826160196455123e-06, + "loss": 0.9556, + "step": 8030 + }, + { + "epoch": 0.9269727734194739, + "grad_norm": 0.18681880831718445, + "learning_rate": 3.23164382258756e-06, + "loss": 0.9854, + "step": 8035 + }, + { + "epoch": 0.9275496077526535, + "grad_norm": 0.19622889161109924, + "learning_rate": 3.181063974407772e-06, + "loss": 0.9782, + "step": 8040 + }, + { + "epoch": 0.928126442085833, + "grad_norm": 0.17983347177505493, + "learning_rate": 3.1308766801829926e-06, + "loss": 0.968, + "step": 8045 + }, + { + "epoch": 0.9287032764190125, + "grad_norm": 0.2013084590435028, + "learning_rate": 3.081082143398395e-06, + "loss": 0.9816, + "step": 8050 + }, + { + "epoch": 0.929280110752192, + "grad_norm": 0.2396339625120163, + "learning_rate": 3.0316805659467705e-06, + "loss": 0.9845, + "step": 8055 + }, + { + "epoch": 0.9298569450853715, + "grad_norm": 0.19670408964157104, + "learning_rate": 2.9826721481276077e-06, + "loss": 0.9738, + "step": 8060 + }, + { + "epoch": 0.930433779418551, + "grad_norm": 0.19861535727977753, + "learning_rate": 2.934057088646336e-06, + "loss": 0.9276, + "step": 8065 + }, + { + "epoch": 0.9310106137517306, + "grad_norm": 0.19099079072475433, + "learning_rate": 2.8858355846134944e-06, + "loss": 0.9658, + "step": 8070 + }, + { + "epoch": 0.93158744808491, + "grad_norm": 0.18421880900859833, + "learning_rate": 2.8380078315439653e-06, + "loss": 0.945, + "step": 8075 + }, + { + "epoch": 0.9321642824180896, + "grad_norm": 0.1922229677438736, + 
"learning_rate": 2.790574023356163e-06, + "loss": 0.9513, + "step": 8080 + }, + { + "epoch": 0.932741116751269, + "grad_norm": 0.18163804709911346, + "learning_rate": 2.7435343523712242e-06, + "loss": 0.93, + "step": 8085 + }, + { + "epoch": 0.9333179510844486, + "grad_norm": 0.2035042643547058, + "learning_rate": 2.6968890093122754e-06, + "loss": 0.955, + "step": 8090 + }, + { + "epoch": 0.933894785417628, + "grad_norm": 0.19784539937973022, + "learning_rate": 2.650638183303611e-06, + "loss": 0.9324, + "step": 8095 + }, + { + "epoch": 0.9344716197508076, + "grad_norm": 0.2163614183664322, + "learning_rate": 2.6047820618699592e-06, + "loss": 0.9646, + "step": 8100 + }, + { + "epoch": 0.9350484540839871, + "grad_norm": 0.19160954654216766, + "learning_rate": 2.5593208309357187e-06, + "loss": 0.9506, + "step": 8105 + }, + { + "epoch": 0.9356252884171666, + "grad_norm": 0.18938621878623962, + "learning_rate": 2.514254674824168e-06, + "loss": 0.9404, + "step": 8110 + }, + { + "epoch": 0.9362021227503461, + "grad_norm": 0.20650902390480042, + "learning_rate": 2.469583776256812e-06, + "loss": 0.9374, + "step": 8115 + }, + { + "epoch": 0.9367789570835257, + "grad_norm": 0.19179539382457733, + "learning_rate": 2.4253083163525038e-06, + "loss": 0.9592, + "step": 8120 + }, + { + "epoch": 0.9373557914167051, + "grad_norm": 0.18857675790786743, + "learning_rate": 2.3814284746268344e-06, + "loss": 0.9976, + "step": 8125 + }, + { + "epoch": 0.9379326257498847, + "grad_norm": 0.19117896258831024, + "learning_rate": 2.3379444289913342e-06, + "loss": 0.9419, + "step": 8130 + }, + { + "epoch": 0.9385094600830641, + "grad_norm": 0.2242845743894577, + "learning_rate": 2.2948563557527836e-06, + "loss": 0.9697, + "step": 8135 + }, + { + "epoch": 0.9390862944162437, + "grad_norm": 0.1854942888021469, + "learning_rate": 2.2521644296124466e-06, + "loss": 0.9324, + "step": 8140 + }, + { + "epoch": 0.9396631287494231, + "grad_norm": 0.19978578388690948, + "learning_rate": 2.209868823665473e-06, + "loss": 0.9658, + "step": 8145 + }, + { + "epoch": 0.9402399630826027, + "grad_norm": 0.19244587421417236, + "learning_rate": 2.1679697094000638e-06, + "loss": 0.9476, + "step": 8150 + }, + { + "epoch": 0.9408167974157822, + "grad_norm": 0.18521623313426971, + "learning_rate": 2.1264672566968736e-06, + "loss": 0.9274, + "step": 8155 + }, + { + "epoch": 0.9413936317489617, + "grad_norm": 0.18388035893440247, + "learning_rate": 2.0853616338282644e-06, + "loss": 0.9005, + "step": 8160 + }, + { + "epoch": 0.9419704660821412, + "grad_norm": 0.2041049599647522, + "learning_rate": 2.044653007457653e-06, + "loss": 0.9389, + "step": 8165 + }, + { + "epoch": 0.9425473004153208, + "grad_norm": 0.18562230467796326, + "learning_rate": 2.0043415426388324e-06, + "loss": 0.9862, + "step": 8170 + }, + { + "epoch": 0.9431241347485002, + "grad_norm": 0.192937970161438, + "learning_rate": 1.964427402815294e-06, + "loss": 0.9155, + "step": 8175 + }, + { + "epoch": 0.9437009690816798, + "grad_norm": 0.1922484189271927, + "learning_rate": 1.924910749819586e-06, + "loss": 0.9495, + "step": 8180 + }, + { + "epoch": 0.9442778034148592, + "grad_norm": 0.170320063829422, + "learning_rate": 1.8857917438725892e-06, + "loss": 0.9415, + "step": 8185 + }, + { + "epoch": 0.9448546377480388, + "grad_norm": 0.20970895886421204, + "learning_rate": 1.8470705435829849e-06, + "loss": 0.9352, + "step": 8190 + }, + { + "epoch": 0.9454314720812182, + "grad_norm": 0.18411558866500854, + "learning_rate": 1.8087473059464788e-06, + "loss": 0.9427, + "step": 8195 + }, + 
{ + "epoch": 0.9460083064143978, + "grad_norm": 0.19260326027870178, + "learning_rate": 1.770822186345289e-06, + "loss": 0.9852, + "step": 8200 + }, + { + "epoch": 0.9465851407475773, + "grad_norm": 0.18823884427547455, + "learning_rate": 1.7332953385474027e-06, + "loss": 0.9574, + "step": 8205 + }, + { + "epoch": 0.9471619750807568, + "grad_norm": 0.20108766853809357, + "learning_rate": 1.6961669147060765e-06, + "loss": 0.9593, + "step": 8210 + }, + { + "epoch": 0.9477388094139363, + "grad_norm": 0.18643486499786377, + "learning_rate": 1.6594370653590706e-06, + "loss": 0.9279, + "step": 8215 + }, + { + "epoch": 0.9483156437471159, + "grad_norm": 0.19154737889766693, + "learning_rate": 1.6231059394281934e-06, + "loss": 0.9204, + "step": 8220 + }, + { + "epoch": 0.9488924780802953, + "grad_norm": 0.20026449859142303, + "learning_rate": 1.587173684218557e-06, + "loss": 0.9065, + "step": 8225 + }, + { + "epoch": 0.9494693124134749, + "grad_norm": 0.20944979786872864, + "learning_rate": 1.55164044541809e-06, + "loss": 0.9708, + "step": 8230 + }, + { + "epoch": 0.9500461467466543, + "grad_norm": 0.21715082228183746, + "learning_rate": 1.5165063670968926e-06, + "loss": 0.9512, + "step": 8235 + }, + { + "epoch": 0.9506229810798339, + "grad_norm": 0.19158127903938293, + "learning_rate": 1.4817715917066488e-06, + "loss": 0.9409, + "step": 8240 + }, + { + "epoch": 0.9511998154130133, + "grad_norm": 0.18626423180103302, + "learning_rate": 1.4474362600800706e-06, + "loss": 0.9412, + "step": 8245 + }, + { + "epoch": 0.9517766497461929, + "grad_norm": 0.1935131996870041, + "learning_rate": 1.4135005114303435e-06, + "loss": 0.9377, + "step": 8250 + }, + { + "epoch": 0.9523534840793724, + "grad_norm": 0.19177401065826416, + "learning_rate": 1.379964483350482e-06, + "loss": 0.9414, + "step": 8255 + }, + { + "epoch": 0.9529303184125519, + "grad_norm": 0.1947384476661682, + "learning_rate": 1.3468283118128756e-06, + "loss": 1.0007, + "step": 8260 + }, + { + "epoch": 0.9535071527457314, + "grad_norm": 0.2043754905462265, + "learning_rate": 1.314092131168665e-06, + "loss": 0.9421, + "step": 8265 + }, + { + "epoch": 0.954083987078911, + "grad_norm": 0.19373486936092377, + "learning_rate": 1.2817560741472445e-06, + "loss": 1.0349, + "step": 8270 + }, + { + "epoch": 0.9546608214120904, + "grad_norm": 0.1884533017873764, + "learning_rate": 1.2498202718556617e-06, + "loss": 0.9743, + "step": 8275 + }, + { + "epoch": 0.95523765574527, + "grad_norm": 0.20388375222682953, + "learning_rate": 1.2182848537781622e-06, + "loss": 0.9565, + "step": 8280 + }, + { + "epoch": 0.9558144900784494, + "grad_norm": 0.18673211336135864, + "learning_rate": 1.187149947775612e-06, + "loss": 0.9801, + "step": 8285 + }, + { + "epoch": 0.956391324411629, + "grad_norm": 0.19092413783073425, + "learning_rate": 1.1564156800849879e-06, + "loss": 0.9634, + "step": 8290 + }, + { + "epoch": 0.9569681587448085, + "grad_norm": 0.19628967344760895, + "learning_rate": 1.1260821753188987e-06, + "loss": 0.9575, + "step": 8295 + }, + { + "epoch": 0.957544993077988, + "grad_norm": 0.19361461699008942, + "learning_rate": 1.0961495564650092e-06, + "loss": 0.9055, + "step": 8300 + }, + { + "epoch": 0.9581218274111675, + "grad_norm": 0.20304569602012634, + "learning_rate": 1.0666179448856174e-06, + "loss": 0.8844, + "step": 8305 + }, + { + "epoch": 0.958698661744347, + "grad_norm": 0.19096963107585907, + "learning_rate": 1.0374874603171326e-06, + "loss": 0.9868, + "step": 8310 + }, + { + "epoch": 0.9592754960775265, + "grad_norm": 0.1914781928062439, + 
"learning_rate": 1.0087582208695768e-06, + "loss": 0.8896, + "step": 8315 + }, + { + "epoch": 0.9598523304107061, + "grad_norm": 0.19837866723537445, + "learning_rate": 9.804303430261174e-07, + "loss": 0.9526, + "step": 8320 + }, + { + "epoch": 0.9604291647438855, + "grad_norm": 0.1940370500087738, + "learning_rate": 9.525039416425907e-07, + "loss": 0.947, + "step": 8325 + }, + { + "epoch": 0.9610059990770651, + "grad_norm": 0.1854429841041565, + "learning_rate": 9.249791299470567e-07, + "loss": 0.9243, + "step": 8330 + }, + { + "epoch": 0.9615828334102445, + "grad_norm": 0.20479361712932587, + "learning_rate": 8.978560195393115e-07, + "loss": 0.9391, + "step": 8335 + }, + { + "epoch": 0.9621596677434241, + "grad_norm": 0.19264739751815796, + "learning_rate": 8.711347203904541e-07, + "loss": 0.9304, + "step": 8340 + }, + { + "epoch": 0.9627365020766036, + "grad_norm": 0.18168406188488007, + "learning_rate": 8.448153408424087e-07, + "loss": 0.9627, + "step": 8345 + }, + { + "epoch": 0.9633133364097831, + "grad_norm": 0.1960950791835785, + "learning_rate": 8.188979876075475e-07, + "loss": 0.9653, + "step": 8350 + }, + { + "epoch": 0.9638901707429626, + "grad_norm": 0.22994817793369293, + "learning_rate": 7.933827657682025e-07, + "loss": 0.9642, + "step": 8355 + }, + { + "epoch": 0.9644670050761421, + "grad_norm": 0.19266270101070404, + "learning_rate": 7.682697787762317e-07, + "loss": 0.9601, + "step": 8360 + }, + { + "epoch": 0.9650438394093216, + "grad_norm": 0.20038749277591705, + "learning_rate": 7.435591284526866e-07, + "loss": 1.018, + "step": 8365 + }, + { + "epoch": 0.9656206737425012, + "grad_norm": 0.18728072941303253, + "learning_rate": 7.192509149872684e-07, + "loss": 0.9477, + "step": 8370 + }, + { + "epoch": 0.9661975080756806, + "grad_norm": 0.1958947330713272, + "learning_rate": 6.953452369380497e-07, + "loss": 0.9806, + "step": 8375 + }, + { + "epoch": 0.9667743424088602, + "grad_norm": 0.1987629234790802, + "learning_rate": 6.718421912309758e-07, + "loss": 0.8928, + "step": 8380 + }, + { + "epoch": 0.9673511767420396, + "grad_norm": 0.19647015631198883, + "learning_rate": 6.487418731595418e-07, + "loss": 0.9933, + "step": 8385 + }, + { + "epoch": 0.9679280110752192, + "grad_norm": 0.19900114834308624, + "learning_rate": 6.260443763843493e-07, + "loss": 0.951, + "step": 8390 + }, + { + "epoch": 0.9685048454083988, + "grad_norm": 0.19372044503688812, + "learning_rate": 6.037497929327839e-07, + "loss": 0.9665, + "step": 8395 + }, + { + "epoch": 0.9690816797415782, + "grad_norm": 0.20044179260730743, + "learning_rate": 5.818582131985939e-07, + "loss": 0.9543, + "step": 8400 + }, + { + "epoch": 0.9696585140747578, + "grad_norm": 0.19964653253555298, + "learning_rate": 5.603697259415341e-07, + "loss": 0.9529, + "step": 8405 + }, + { + "epoch": 0.9702353484079372, + "grad_norm": 0.19442757964134216, + "learning_rate": 5.392844182870449e-07, + "loss": 0.987, + "step": 8410 + }, + { + "epoch": 0.9708121827411168, + "grad_norm": 0.18775001168251038, + "learning_rate": 5.186023757258407e-07, + "loss": 0.9705, + "step": 8415 + }, + { + "epoch": 0.9713890170742963, + "grad_norm": 0.19862832129001617, + "learning_rate": 4.983236821135995e-07, + "loss": 0.9409, + "step": 8420 + }, + { + "epoch": 0.9719658514074758, + "grad_norm": 0.18501073122024536, + "learning_rate": 4.784484196706073e-07, + "loss": 0.9682, + "step": 8425 + }, + { + "epoch": 0.9725426857406553, + "grad_norm": 0.19959832727909088, + "learning_rate": 4.5897666898145896e-07, + "loss": 0.9556, + "step": 8430 + }, + { + 
"epoch": 0.9731195200738348, + "grad_norm": 0.18275192379951477, + "learning_rate": 4.3990850899467975e-07, + "loss": 0.9612, + "step": 8435 + }, + { + "epoch": 0.9736963544070143, + "grad_norm": 0.18589888513088226, + "learning_rate": 4.2124401702241524e-07, + "loss": 0.947, + "step": 8440 + }, + { + "epoch": 0.9742731887401939, + "grad_norm": 0.19903455674648285, + "learning_rate": 4.029832687401758e-07, + "loss": 0.9574, + "step": 8445 + }, + { + "epoch": 0.9748500230733733, + "grad_norm": 0.1937977373600006, + "learning_rate": 3.851263381864589e-07, + "loss": 0.9699, + "step": 8450 + }, + { + "epoch": 0.9754268574065529, + "grad_norm": 0.19150091707706451, + "learning_rate": 3.67673297762483e-07, + "loss": 0.9158, + "step": 8455 + }, + { + "epoch": 0.9760036917397323, + "grad_norm": 0.1840011477470398, + "learning_rate": 3.506242182318653e-07, + "loss": 0.9732, + "step": 8460 + }, + { + "epoch": 0.9765805260729119, + "grad_norm": 0.1933344304561615, + "learning_rate": 3.339791687203997e-07, + "loss": 0.9664, + "step": 8465 + }, + { + "epoch": 0.9771573604060914, + "grad_norm": 0.18675784766674042, + "learning_rate": 3.177382167156906e-07, + "loss": 0.9303, + "step": 8470 + }, + { + "epoch": 0.9777341947392709, + "grad_norm": 0.19082637131214142, + "learning_rate": 3.019014280669641e-07, + "loss": 0.9024, + "step": 8475 + }, + { + "epoch": 0.9783110290724504, + "grad_norm": 0.2020220011472702, + "learning_rate": 2.8646886698473484e-07, + "loss": 0.9963, + "step": 8480 + }, + { + "epoch": 0.9788878634056299, + "grad_norm": 0.20680001378059387, + "learning_rate": 2.7144059604055085e-07, + "loss": 0.9355, + "step": 8485 + }, + { + "epoch": 0.9794646977388094, + "grad_norm": 0.1897319108247757, + "learning_rate": 2.568166761668156e-07, + "loss": 0.9195, + "step": 8490 + }, + { + "epoch": 0.980041532071989, + "grad_norm": 0.19423364102840424, + "learning_rate": 2.4259716665641083e-07, + "loss": 0.9845, + "step": 8495 + }, + { + "epoch": 0.9806183664051684, + "grad_norm": 0.19203795492649078, + "learning_rate": 2.2878212516260766e-07, + "loss": 0.9134, + "step": 8500 + }, + { + "epoch": 0.981195200738348, + "grad_norm": 0.18443405628204346, + "learning_rate": 2.1537160769870002e-07, + "loss": 0.9501, + "step": 8505 + }, + { + "epoch": 0.9817720350715274, + "grad_norm": 0.2106209546327591, + "learning_rate": 2.0236566863784944e-07, + "loss": 1.0056, + "step": 8510 + }, + { + "epoch": 0.982348869404707, + "grad_norm": 0.19864287972450256, + "learning_rate": 1.8976436071284076e-07, + "loss": 0.9141, + "step": 8515 + }, + { + "epoch": 0.9829257037378865, + "grad_norm": 0.19259801506996155, + "learning_rate": 1.775677350159044e-07, + "loss": 0.9578, + "step": 8520 + }, + { + "epoch": 0.983502538071066, + "grad_norm": 0.1885799914598465, + "learning_rate": 1.657758409984278e-07, + "loss": 0.9788, + "step": 8525 + }, + { + "epoch": 0.9840793724042455, + "grad_norm": 0.18557386100292206, + "learning_rate": 1.5438872647086655e-07, + "loss": 0.9251, + "step": 8530 + }, + { + "epoch": 0.984656206737425, + "grad_norm": 0.1834561824798584, + "learning_rate": 1.4340643760244464e-07, + "loss": 0.9683, + "step": 8535 + }, + { + "epoch": 0.9852330410706045, + "grad_norm": 0.2051221877336502, + "learning_rate": 1.328290189210435e-07, + "loss": 0.9747, + "step": 8540 + }, + { + "epoch": 0.9858098754037841, + "grad_norm": 0.18745014071464539, + "learning_rate": 1.2265651331296869e-07, + "loss": 0.9445, + "step": 8545 + }, + { + "epoch": 0.9863867097369635, + "grad_norm": 0.188162162899971, + "learning_rate": 
1.1288896202281685e-07, + "loss": 0.9681, + "step": 8550 + }, + { + "epoch": 0.9869635440701431, + "grad_norm": 0.1787605881690979, + "learning_rate": 1.0352640465327578e-07, + "loss": 0.9767, + "step": 8555 + }, + { + "epoch": 0.9875403784033225, + "grad_norm": 0.19322550296783447, + "learning_rate": 9.456887916499125e-08, + "loss": 0.9264, + "step": 8560 + }, + { + "epoch": 0.9881172127365021, + "grad_norm": 0.210786372423172, + "learning_rate": 8.601642187640036e-08, + "loss": 0.9254, + "step": 8565 + }, + { + "epoch": 0.9886940470696816, + "grad_norm": 0.18473652005195618, + "learning_rate": 7.786906746358735e-08, + "loss": 0.9626, + "step": 8570 + }, + { + "epoch": 0.9892708814028611, + "grad_norm": 0.21195201575756073, + "learning_rate": 7.012684896011702e-08, + "loss": 0.9211, + "step": 8575 + }, + { + "epoch": 0.9898477157360406, + "grad_norm": 0.18607978522777557, + "learning_rate": 6.278979775694582e-08, + "loss": 0.9473, + "step": 8580 + }, + { + "epoch": 0.9904245500692201, + "grad_norm": 0.194194957613945, + "learning_rate": 5.585794360226659e-08, + "loss": 0.9632, + "step": 8585 + }, + { + "epoch": 0.9910013844023996, + "grad_norm": 0.199600949883461, + "learning_rate": 4.9331314601408495e-08, + "loss": 0.9848, + "step": 8590 + }, + { + "epoch": 0.9915782187355792, + "grad_norm": 0.20218248665332794, + "learning_rate": 4.320993721668165e-08, + "loss": 0.9596, + "step": 8595 + }, + { + "epoch": 0.9921550530687586, + "grad_norm": 0.19225312769412994, + "learning_rate": 3.7493836267310514e-08, + "loss": 0.9497, + "step": 8600 + }, + { + "epoch": 0.9927318874019382, + "grad_norm": 0.18986690044403076, + "learning_rate": 3.218303492932284e-08, + "loss": 0.9513, + "step": 8605 + }, + { + "epoch": 0.9933087217351176, + "grad_norm": 0.19648917019367218, + "learning_rate": 2.7277554735449794e-08, + "loss": 0.9769, + "step": 8610 + }, + { + "epoch": 0.9938855560682972, + "grad_norm": 0.24255739152431488, + "learning_rate": 2.2777415575037098e-08, + "loss": 0.9355, + "step": 8615 + }, + { + "epoch": 0.9944623904014767, + "grad_norm": 0.1974877566099167, + "learning_rate": 1.8682635693978433e-08, + "loss": 0.9339, + "step": 8620 + }, + { + "epoch": 0.9950392247346562, + "grad_norm": 0.1958783119916916, + "learning_rate": 1.499323169462663e-08, + "loss": 0.9779, + "step": 8625 + }, + { + "epoch": 0.9956160590678357, + "grad_norm": 0.18412433564662933, + "learning_rate": 1.1709218535715938e-08, + "loss": 0.9261, + "step": 8630 + }, + { + "epoch": 0.9961928934010152, + "grad_norm": 0.18094801902770996, + "learning_rate": 8.83060953235093e-09, + "loss": 0.904, + "step": 8635 + }, + { + "epoch": 0.9967697277341947, + "grad_norm": 0.19589665532112122, + "learning_rate": 6.357416355884382e-09, + "loss": 0.9594, + "step": 8640 + }, + { + "epoch": 0.9973465620673743, + "grad_norm": 0.19667023420333862, + "learning_rate": 4.289649033928367e-09, + "loss": 1.0295, + "step": 8645 + }, + { + "epoch": 0.9979233964005537, + "grad_norm": 0.1948215663433075, + "learning_rate": 2.627315950265441e-09, + "loss": 0.982, + "step": 8650 + }, + { + "epoch": 0.9985002307337333, + "grad_norm": 0.18232625722885132, + "learning_rate": 1.3704238448708496e-09, + "loss": 0.959, + "step": 8655 + }, + { + "epoch": 0.9990770650669127, + "grad_norm": 0.18606936931610107, + "learning_rate": 5.189778138237067e-10, + "loss": 0.9795, + "step": 8660 + }, + { + "epoch": 0.9996538994000923, + "grad_norm": 0.20517615973949432, + "learning_rate": 7.298130931809865e-11, + "loss": 0.9515, + "step": 8665 + }, + { + "epoch": 1.0, + 
"eval_loss": 0.9548913240432739, + "eval_runtime": 3071.7071, + "eval_samples_per_second": 4.997, + "eval_steps_per_second": 0.313, + "step": 8668 + }, + { + "epoch": 1.0, + "step": 8668, + "total_flos": 1.3416878770665554e+19, + "train_loss": 0.9644491172051397, + "train_runtime": 101367.857, + "train_samples_per_second": 1.368, + "train_steps_per_second": 0.086 + } + ], + "logging_steps": 5, + "max_steps": 8668, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3416878770665554e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}