diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.44441391904394445, + "epoch": 0.5413769559262596, "eval_steps": 500, - "global_step": 110000, + "global_step": 134000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -77007,6 +77007,16806 @@ "learning_rate": 3.436516483539781e-05, "loss": 35.0977, "step": 110000 + }, + { + "epoch": 0.4444543203093121, + "grad_norm": 787.899658203125, + "learning_rate": 3.4361928261912254e-05, + "loss": 56.4767, + "step": 110010 + }, + { + "epoch": 0.44449472157467973, + "grad_norm": 0.0, + "learning_rate": 3.4358691505911104e-05, + "loss": 44.5502, + "step": 110020 + }, + { + "epoch": 0.44453512284004737, + "grad_norm": 385.0424499511719, + "learning_rate": 3.4355454567457445e-05, + "loss": 70.914, + "step": 110030 + }, + { + "epoch": 0.44457552410541495, + "grad_norm": 967.9429931640625, + "learning_rate": 3.435221744661438e-05, + "loss": 62.6088, + "step": 110040 + }, + { + "epoch": 0.4446159253707826, + "grad_norm": 389.7696533203125, + "learning_rate": 3.434898014344501e-05, + "loss": 48.413, + "step": 110050 + }, + { + "epoch": 0.44465632663615023, + "grad_norm": 1513.93017578125, + "learning_rate": 3.434574265801247e-05, + "loss": 50.9386, + "step": 110060 + }, + { + "epoch": 0.4446967279015179, + "grad_norm": 461.18096923828125, + "learning_rate": 3.4342504990379866e-05, + "loss": 66.7544, + "step": 110070 + }, + { + "epoch": 0.4447371291668855, + "grad_norm": 593.7630615234375, + "learning_rate": 3.433926714061032e-05, + "loss": 60.6121, + "step": 110080 + }, + { + "epoch": 0.44477753043225315, + "grad_norm": 1564.3096923828125, + "learning_rate": 3.433602910876694e-05, + "loss": 70.2792, + "step": 110090 + }, + { + "epoch": 0.4448179316976208, + "grad_norm": 1443.179931640625, + "learning_rate": 3.433279089491288e-05, + "loss": 36.5048, + "step": 110100 + }, + { + "epoch": 0.4448583329629884, + "grad_norm": 822.4376220703125, + "learning_rate": 3.432955249911125e-05, + "loss": 78.1726, + "step": 110110 + }, + { + "epoch": 0.444898734228356, + "grad_norm": 1508.205078125, + "learning_rate": 3.432631392142519e-05, + "loss": 66.1732, + "step": 110120 + }, + { + "epoch": 0.44493913549372366, + "grad_norm": 251.31320190429688, + "learning_rate": 3.432307516191783e-05, + "loss": 34.4932, + "step": 110130 + }, + { + "epoch": 0.4449795367590913, + "grad_norm": 1120.1636962890625, + "learning_rate": 3.4319836220652335e-05, + "loss": 38.0345, + "step": 110140 + }, + { + "epoch": 0.44501993802445894, + "grad_norm": 439.1837158203125, + "learning_rate": 3.431659709769183e-05, + "loss": 56.9619, + "step": 110150 + }, + { + "epoch": 0.4450603392898266, + "grad_norm": 969.8883666992188, + "learning_rate": 3.431335779309947e-05, + "loss": 37.3155, + "step": 110160 + }, + { + "epoch": 0.44510074055519416, + "grad_norm": 813.2791137695312, + "learning_rate": 3.43101183069384e-05, + "loss": 82.1769, + "step": 110170 + }, + { + "epoch": 0.4451411418205618, + "grad_norm": 712.302001953125, + "learning_rate": 3.430687863927178e-05, + "loss": 53.5758, + "step": 110180 + }, + { + "epoch": 0.44518154308592944, + "grad_norm": 730.143310546875, + "learning_rate": 3.4303638790162774e-05, + "loss": 61.2881, + "step": 110190 + }, + { + "epoch": 0.4452219443512971, + "grad_norm": 945.42041015625, + "learning_rate": 3.430039875967454e-05, + "loss": 62.6643, + "step": 110200 + }, + { + "epoch": 0.4452623456166647, + "grad_norm": 887.8884887695312, + "learning_rate": 3.429715854787024e-05, + "loss": 52.4225, + "step": 110210 + }, + { + "epoch": 0.44530274688203236, + "grad_norm": 778.4572143554688, + "learning_rate": 3.429391815481305e-05, + "loss": 80.6638, + "step": 110220 + }, + { + "epoch": 0.4453431481474, + "grad_norm": 701.6489868164062, + "learning_rate": 3.429067758056613e-05, + "loss": 70.6435, + "step": 110230 + }, + { + "epoch": 0.4453835494127676, + "grad_norm": 1139.0123291015625, + "learning_rate": 3.428743682519269e-05, + "loss": 78.8899, + "step": 110240 + }, + { + "epoch": 0.4454239506781352, + "grad_norm": 0.0, + "learning_rate": 3.428419588875588e-05, + "loss": 51.6693, + "step": 110250 + }, + { + "epoch": 0.44546435194350287, + "grad_norm": 919.1795654296875, + "learning_rate": 3.428095477131888e-05, + "loss": 56.4408, + "step": 110260 + }, + { + "epoch": 0.4455047532088705, + "grad_norm": 953.95263671875, + "learning_rate": 3.427771347294489e-05, + "loss": 72.902, + "step": 110270 + }, + { + "epoch": 0.44554515447423815, + "grad_norm": 1921.18359375, + "learning_rate": 3.427447199369711e-05, + "loss": 43.3139, + "step": 110280 + }, + { + "epoch": 0.4455855557396058, + "grad_norm": 790.10791015625, + "learning_rate": 3.4271230333638716e-05, + "loss": 71.2949, + "step": 110290 + }, + { + "epoch": 0.44562595700497337, + "grad_norm": 1008.7927856445312, + "learning_rate": 3.426798849283291e-05, + "loss": 60.3911, + "step": 110300 + }, + { + "epoch": 0.445666358270341, + "grad_norm": 749.1685791015625, + "learning_rate": 3.4264746471342905e-05, + "loss": 78.0302, + "step": 110310 + }, + { + "epoch": 0.44570675953570865, + "grad_norm": 967.6669921875, + "learning_rate": 3.4261504269231904e-05, + "loss": 49.5635, + "step": 110320 + }, + { + "epoch": 0.4457471608010763, + "grad_norm": 1514.6182861328125, + "learning_rate": 3.4258261886563104e-05, + "loss": 76.0732, + "step": 110330 + }, + { + "epoch": 0.44578756206644393, + "grad_norm": 1716.936279296875, + "learning_rate": 3.425501932339971e-05, + "loss": 133.2402, + "step": 110340 + }, + { + "epoch": 0.44582796333181157, + "grad_norm": 959.5086669921875, + "learning_rate": 3.425177657980496e-05, + "loss": 71.294, + "step": 110350 + }, + { + "epoch": 0.44586836459717916, + "grad_norm": 812.0950317382812, + "learning_rate": 3.4248533655842066e-05, + "loss": 57.06, + "step": 110360 + }, + { + "epoch": 0.4459087658625468, + "grad_norm": 267.0820617675781, + "learning_rate": 3.4245290551574237e-05, + "loss": 44.4798, + "step": 110370 + }, + { + "epoch": 0.44594916712791444, + "grad_norm": 1544.18310546875, + "learning_rate": 3.4242047267064715e-05, + "loss": 57.8316, + "step": 110380 + }, + { + "epoch": 0.4459895683932821, + "grad_norm": 695.9027709960938, + "learning_rate": 3.4238803802376716e-05, + "loss": 56.366, + "step": 110390 + }, + { + "epoch": 0.4460299696586497, + "grad_norm": 685.2529296875, + "learning_rate": 3.423556015757349e-05, + "loss": 57.191, + "step": 110400 + }, + { + "epoch": 0.44607037092401736, + "grad_norm": 653.3543090820312, + "learning_rate": 3.423231633271826e-05, + "loss": 61.4118, + "step": 110410 + }, + { + "epoch": 0.446110772189385, + "grad_norm": 1396.4661865234375, + "learning_rate": 3.4229072327874274e-05, + "loss": 53.9518, + "step": 110420 + }, + { + "epoch": 0.4461511734547526, + "grad_norm": 742.2490234375, + "learning_rate": 3.422582814310476e-05, + "loss": 64.2797, + "step": 110430 + }, + { + "epoch": 0.4461915747201202, + "grad_norm": 811.0308837890625, + "learning_rate": 3.4222583778472996e-05, + "loss": 34.3383, + "step": 110440 + }, + { + "epoch": 0.44623197598548786, + "grad_norm": 784.7843017578125, + "learning_rate": 3.421933923404219e-05, + "loss": 83.5336, + "step": 110450 + }, + { + "epoch": 0.4462723772508555, + "grad_norm": 368.3476257324219, + "learning_rate": 3.421609450987563e-05, + "loss": 43.9047, + "step": 110460 + }, + { + "epoch": 0.44631277851622314, + "grad_norm": 507.54205322265625, + "learning_rate": 3.421284960603657e-05, + "loss": 40.9963, + "step": 110470 + }, + { + "epoch": 0.4463531797815908, + "grad_norm": 1140.799560546875, + "learning_rate": 3.4209604522588255e-05, + "loss": 46.2735, + "step": 110480 + }, + { + "epoch": 0.44639358104695837, + "grad_norm": 1338.29541015625, + "learning_rate": 3.4206359259593954e-05, + "loss": 37.1206, + "step": 110490 + }, + { + "epoch": 0.446433982312326, + "grad_norm": 665.2498168945312, + "learning_rate": 3.4203113817116957e-05, + "loss": 57.0931, + "step": 110500 + }, + { + "epoch": 0.44647438357769365, + "grad_norm": 734.5198974609375, + "learning_rate": 3.4199868195220505e-05, + "loss": 71.1633, + "step": 110510 + }, + { + "epoch": 0.4465147848430613, + "grad_norm": 679.3670043945312, + "learning_rate": 3.419662239396789e-05, + "loss": 52.7156, + "step": 110520 + }, + { + "epoch": 0.4465551861084289, + "grad_norm": 538.1387939453125, + "learning_rate": 3.419337641342239e-05, + "loss": 87.353, + "step": 110530 + }, + { + "epoch": 0.44659558737379657, + "grad_norm": 625.4715576171875, + "learning_rate": 3.419013025364727e-05, + "loss": 52.2098, + "step": 110540 + }, + { + "epoch": 0.4466359886391642, + "grad_norm": 1380.083740234375, + "learning_rate": 3.4186883914705835e-05, + "loss": 110.8045, + "step": 110550 + }, + { + "epoch": 0.4466763899045318, + "grad_norm": 664.3757934570312, + "learning_rate": 3.418363739666137e-05, + "loss": 33.5992, + "step": 110560 + }, + { + "epoch": 0.44671679116989943, + "grad_norm": 352.8798522949219, + "learning_rate": 3.418039069957717e-05, + "loss": 54.5655, + "step": 110570 + }, + { + "epoch": 0.44675719243526707, + "grad_norm": 196.41831970214844, + "learning_rate": 3.417714382351652e-05, + "loss": 46.0049, + "step": 110580 + }, + { + "epoch": 0.4467975937006347, + "grad_norm": 1419.017822265625, + "learning_rate": 3.417389676854274e-05, + "loss": 87.4692, + "step": 110590 + }, + { + "epoch": 0.44683799496600235, + "grad_norm": 515.550537109375, + "learning_rate": 3.417064953471911e-05, + "loss": 46.8774, + "step": 110600 + }, + { + "epoch": 0.44687839623137, + "grad_norm": 729.7348022460938, + "learning_rate": 3.416740212210894e-05, + "loss": 47.5663, + "step": 110610 + }, + { + "epoch": 0.4469187974967376, + "grad_norm": 618.8557739257812, + "learning_rate": 3.416415453077555e-05, + "loss": 56.8461, + "step": 110620 + }, + { + "epoch": 0.4469591987621052, + "grad_norm": 1495.1834716796875, + "learning_rate": 3.416090676078225e-05, + "loss": 71.6594, + "step": 110630 + }, + { + "epoch": 0.44699960002747285, + "grad_norm": 537.9067993164062, + "learning_rate": 3.415765881219236e-05, + "loss": 75.8224, + "step": 110640 + }, + { + "epoch": 0.4470400012928405, + "grad_norm": 1021.9192504882812, + "learning_rate": 3.4154410685069196e-05, + "loss": 71.1588, + "step": 110650 + }, + { + "epoch": 0.44708040255820813, + "grad_norm": 1147.8583984375, + "learning_rate": 3.4151162379476075e-05, + "loss": 99.0359, + "step": 110660 + }, + { + "epoch": 0.4471208038235758, + "grad_norm": 951.2720947265625, + "learning_rate": 3.414791389547635e-05, + "loss": 62.2096, + "step": 110670 + }, + { + "epoch": 0.44716120508894336, + "grad_norm": 292.7242736816406, + "learning_rate": 3.414466523313332e-05, + "loss": 40.3493, + "step": 110680 + }, + { + "epoch": 0.447201606354311, + "grad_norm": 1284.24755859375, + "learning_rate": 3.414141639251033e-05, + "loss": 49.9673, + "step": 110690 + }, + { + "epoch": 0.44724200761967864, + "grad_norm": 1174.4100341796875, + "learning_rate": 3.413816737367073e-05, + "loss": 76.3328, + "step": 110700 + }, + { + "epoch": 0.4472824088850463, + "grad_norm": 1334.654052734375, + "learning_rate": 3.4134918176677846e-05, + "loss": 68.6847, + "step": 110710 + }, + { + "epoch": 0.4473228101504139, + "grad_norm": 1329.33203125, + "learning_rate": 3.4131668801595027e-05, + "loss": 60.1202, + "step": 110720 + }, + { + "epoch": 0.44736321141578156, + "grad_norm": 472.8536071777344, + "learning_rate": 3.4128419248485635e-05, + "loss": 55.691, + "step": 110730 + }, + { + "epoch": 0.4474036126811492, + "grad_norm": 1049.84619140625, + "learning_rate": 3.4125169517413e-05, + "loss": 70.7531, + "step": 110740 + }, + { + "epoch": 0.4474440139465168, + "grad_norm": 527.7225952148438, + "learning_rate": 3.412191960844049e-05, + "loss": 48.9243, + "step": 110750 + }, + { + "epoch": 0.4474844152118844, + "grad_norm": 415.2084045410156, + "learning_rate": 3.411866952163146e-05, + "loss": 52.9608, + "step": 110760 + }, + { + "epoch": 0.44752481647725206, + "grad_norm": 1201.3553466796875, + "learning_rate": 3.4115419257049286e-05, + "loss": 71.9567, + "step": 110770 + }, + { + "epoch": 0.4475652177426197, + "grad_norm": 245.35345458984375, + "learning_rate": 3.4112168814757307e-05, + "loss": 69.7229, + "step": 110780 + }, + { + "epoch": 0.44760561900798734, + "grad_norm": 1845.09033203125, + "learning_rate": 3.41089181948189e-05, + "loss": 68.3285, + "step": 110790 + }, + { + "epoch": 0.447646020273355, + "grad_norm": 874.4844360351562, + "learning_rate": 3.410566739729746e-05, + "loss": 42.1089, + "step": 110800 + }, + { + "epoch": 0.44768642153872257, + "grad_norm": 466.1338806152344, + "learning_rate": 3.410241642225633e-05, + "loss": 53.4753, + "step": 110810 + }, + { + "epoch": 0.4477268228040902, + "grad_norm": 391.43475341796875, + "learning_rate": 3.409916526975892e-05, + "loss": 56.1511, + "step": 110820 + }, + { + "epoch": 0.44776722406945785, + "grad_norm": 1110.3125, + "learning_rate": 3.409591393986859e-05, + "loss": 56.4363, + "step": 110830 + }, + { + "epoch": 0.4478076253348255, + "grad_norm": 441.3172302246094, + "learning_rate": 3.409266243264874e-05, + "loss": 54.6422, + "step": 110840 + }, + { + "epoch": 0.4478480266001931, + "grad_norm": 868.8545532226562, + "learning_rate": 3.408941074816275e-05, + "loss": 56.9401, + "step": 110850 + }, + { + "epoch": 0.44788842786556077, + "grad_norm": 988.2617797851562, + "learning_rate": 3.408615888647402e-05, + "loss": 58.8403, + "step": 110860 + }, + { + "epoch": 0.4479288291309284, + "grad_norm": 1164.9776611328125, + "learning_rate": 3.408290684764594e-05, + "loss": 63.0026, + "step": 110870 + }, + { + "epoch": 0.447969230396296, + "grad_norm": 592.5801391601562, + "learning_rate": 3.407965463174192e-05, + "loss": 52.5244, + "step": 110880 + }, + { + "epoch": 0.44800963166166363, + "grad_norm": 647.7515258789062, + "learning_rate": 3.407640223882536e-05, + "loss": 29.65, + "step": 110890 + }, + { + "epoch": 0.44805003292703127, + "grad_norm": 954.7613525390625, + "learning_rate": 3.407314966895966e-05, + "loss": 56.6861, + "step": 110900 + }, + { + "epoch": 0.4480904341923989, + "grad_norm": 997.4801025390625, + "learning_rate": 3.406989692220824e-05, + "loss": 72.3521, + "step": 110910 + }, + { + "epoch": 0.44813083545776655, + "grad_norm": 308.1066589355469, + "learning_rate": 3.4066643998634505e-05, + "loss": 79.9528, + "step": 110920 + }, + { + "epoch": 0.4481712367231342, + "grad_norm": 1413.3211669921875, + "learning_rate": 3.406339089830188e-05, + "loss": 90.6204, + "step": 110930 + }, + { + "epoch": 0.4482116379885018, + "grad_norm": 761.3001098632812, + "learning_rate": 3.406013762127379e-05, + "loss": 59.6376, + "step": 110940 + }, + { + "epoch": 0.4482520392538694, + "grad_norm": 550.8245239257812, + "learning_rate": 3.405688416761364e-05, + "loss": 61.1043, + "step": 110950 + }, + { + "epoch": 0.44829244051923706, + "grad_norm": 1125.8817138671875, + "learning_rate": 3.4053630537384885e-05, + "loss": 54.3687, + "step": 110960 + }, + { + "epoch": 0.4483328417846047, + "grad_norm": 1198.2652587890625, + "learning_rate": 3.4050376730650935e-05, + "loss": 52.4602, + "step": 110970 + }, + { + "epoch": 0.44837324304997234, + "grad_norm": 293.5941162109375, + "learning_rate": 3.4047122747475224e-05, + "loss": 48.189, + "step": 110980 + }, + { + "epoch": 0.44841364431534, + "grad_norm": 1793.2451171875, + "learning_rate": 3.40438685879212e-05, + "loss": 58.882, + "step": 110990 + }, + { + "epoch": 0.44845404558070756, + "grad_norm": 405.7001037597656, + "learning_rate": 3.4040614252052305e-05, + "loss": 87.0919, + "step": 111000 + }, + { + "epoch": 0.4484944468460752, + "grad_norm": 609.4334716796875, + "learning_rate": 3.403735973993198e-05, + "loss": 88.5648, + "step": 111010 + }, + { + "epoch": 0.44853484811144284, + "grad_norm": 476.433837890625, + "learning_rate": 3.403410505162369e-05, + "loss": 72.3557, + "step": 111020 + }, + { + "epoch": 0.4485752493768105, + "grad_norm": 547.7504272460938, + "learning_rate": 3.403085018719085e-05, + "loss": 71.4732, + "step": 111030 + }, + { + "epoch": 0.4486156506421781, + "grad_norm": 618.4702758789062, + "learning_rate": 3.402759514669694e-05, + "loss": 44.4472, + "step": 111040 + }, + { + "epoch": 0.44865605190754576, + "grad_norm": 731.4344482421875, + "learning_rate": 3.4024339930205415e-05, + "loss": 53.5866, + "step": 111050 + }, + { + "epoch": 0.4486964531729134, + "grad_norm": 457.8855895996094, + "learning_rate": 3.402108453777974e-05, + "loss": 58.6352, + "step": 111060 + }, + { + "epoch": 0.448736854438281, + "grad_norm": 612.9066772460938, + "learning_rate": 3.401782896948338e-05, + "loss": 80.7433, + "step": 111070 + }, + { + "epoch": 0.4487772557036486, + "grad_norm": 590.0531616210938, + "learning_rate": 3.401457322537979e-05, + "loss": 49.937, + "step": 111080 + }, + { + "epoch": 0.44881765696901627, + "grad_norm": 579.5253295898438, + "learning_rate": 3.401131730553247e-05, + "loss": 53.8527, + "step": 111090 + }, + { + "epoch": 0.4488580582343839, + "grad_norm": 761.0218505859375, + "learning_rate": 3.400806121000487e-05, + "loss": 57.2909, + "step": 111100 + }, + { + "epoch": 0.44889845949975155, + "grad_norm": 938.6339721679688, + "learning_rate": 3.400480493886048e-05, + "loss": 54.5442, + "step": 111110 + }, + { + "epoch": 0.4489388607651192, + "grad_norm": 800.48486328125, + "learning_rate": 3.400154849216278e-05, + "loss": 50.155, + "step": 111120 + }, + { + "epoch": 0.44897926203048677, + "grad_norm": 1541.491943359375, + "learning_rate": 3.3998291869975266e-05, + "loss": 47.9149, + "step": 111130 + }, + { + "epoch": 0.4490196632958544, + "grad_norm": 631.569580078125, + "learning_rate": 3.399503507236141e-05, + "loss": 60.8179, + "step": 111140 + }, + { + "epoch": 0.44906006456122205, + "grad_norm": 233.70372009277344, + "learning_rate": 3.399177809938472e-05, + "loss": 33.1188, + "step": 111150 + }, + { + "epoch": 0.4491004658265897, + "grad_norm": 492.7761535644531, + "learning_rate": 3.398852095110868e-05, + "loss": 62.9832, + "step": 111160 + }, + { + "epoch": 0.44914086709195733, + "grad_norm": 691.3985595703125, + "learning_rate": 3.398526362759681e-05, + "loss": 58.8685, + "step": 111170 + }, + { + "epoch": 0.44918126835732497, + "grad_norm": 763.817626953125, + "learning_rate": 3.3982006128912584e-05, + "loss": 34.1555, + "step": 111180 + }, + { + "epoch": 0.4492216696226926, + "grad_norm": 794.317626953125, + "learning_rate": 3.3978748455119536e-05, + "loss": 71.4623, + "step": 111190 + }, + { + "epoch": 0.4492620708880602, + "grad_norm": 423.894287109375, + "learning_rate": 3.397549060628116e-05, + "loss": 56.8908, + "step": 111200 + }, + { + "epoch": 0.44930247215342783, + "grad_norm": 2169.064697265625, + "learning_rate": 3.3972232582460974e-05, + "loss": 52.3652, + "step": 111210 + }, + { + "epoch": 0.4493428734187955, + "grad_norm": 363.28289794921875, + "learning_rate": 3.3968974383722495e-05, + "loss": 40.4274, + "step": 111220 + }, + { + "epoch": 0.4493832746841631, + "grad_norm": 973.9365234375, + "learning_rate": 3.3965716010129236e-05, + "loss": 106.0774, + "step": 111230 + }, + { + "epoch": 0.44942367594953075, + "grad_norm": 3008.888671875, + "learning_rate": 3.396245746174473e-05, + "loss": 44.22, + "step": 111240 + }, + { + "epoch": 0.4494640772148984, + "grad_norm": 926.1411743164062, + "learning_rate": 3.39591987386325e-05, + "loss": 46.0268, + "step": 111250 + }, + { + "epoch": 0.449504478480266, + "grad_norm": 177.50221252441406, + "learning_rate": 3.3955939840856096e-05, + "loss": 44.916, + "step": 111260 + }, + { + "epoch": 0.4495448797456336, + "grad_norm": 779.60888671875, + "learning_rate": 3.395268076847902e-05, + "loss": 56.0308, + "step": 111270 + }, + { + "epoch": 0.44958528101100126, + "grad_norm": 323.57818603515625, + "learning_rate": 3.394942152156482e-05, + "loss": 40.499, + "step": 111280 + }, + { + "epoch": 0.4496256822763689, + "grad_norm": 513.7103881835938, + "learning_rate": 3.394616210017705e-05, + "loss": 44.3521, + "step": 111290 + }, + { + "epoch": 0.44966608354173654, + "grad_norm": 657.2388305664062, + "learning_rate": 3.3942902504379235e-05, + "loss": 72.6372, + "step": 111300 + }, + { + "epoch": 0.4497064848071042, + "grad_norm": 513.4599609375, + "learning_rate": 3.3939642734234936e-05, + "loss": 118.4347, + "step": 111310 + }, + { + "epoch": 0.44974688607247176, + "grad_norm": 719.9033203125, + "learning_rate": 3.39363827898077e-05, + "loss": 85.1415, + "step": 111320 + }, + { + "epoch": 0.4497872873378394, + "grad_norm": 1146.169677734375, + "learning_rate": 3.393312267116107e-05, + "loss": 60.6316, + "step": 111330 + }, + { + "epoch": 0.44982768860320704, + "grad_norm": 633.3748168945312, + "learning_rate": 3.392986237835863e-05, + "loss": 60.0308, + "step": 111340 + }, + { + "epoch": 0.4498680898685747, + "grad_norm": 436.7900390625, + "learning_rate": 3.3926601911463915e-05, + "loss": 54.0903, + "step": 111350 + }, + { + "epoch": 0.4499084911339423, + "grad_norm": 425.1293029785156, + "learning_rate": 3.392334127054051e-05, + "loss": 59.7452, + "step": 111360 + }, + { + "epoch": 0.44994889239930996, + "grad_norm": 1201.4161376953125, + "learning_rate": 3.392008045565197e-05, + "loss": 56.7192, + "step": 111370 + }, + { + "epoch": 0.4499892936646776, + "grad_norm": 287.7637939453125, + "learning_rate": 3.391681946686186e-05, + "loss": 67.3147, + "step": 111380 + }, + { + "epoch": 0.4500296949300452, + "grad_norm": 818.6224975585938, + "learning_rate": 3.3913558304233776e-05, + "loss": 53.3463, + "step": 111390 + }, + { + "epoch": 0.4500700961954128, + "grad_norm": 385.80987548828125, + "learning_rate": 3.3910296967831266e-05, + "loss": 66.1127, + "step": 111400 + }, + { + "epoch": 0.45011049746078047, + "grad_norm": 1808.7755126953125, + "learning_rate": 3.3907035457717944e-05, + "loss": 67.1233, + "step": 111410 + }, + { + "epoch": 0.4501508987261481, + "grad_norm": 512.4915771484375, + "learning_rate": 3.390377377395738e-05, + "loss": 52.8904, + "step": 111420 + }, + { + "epoch": 0.45019129999151575, + "grad_norm": 1636.7056884765625, + "learning_rate": 3.3900511916613155e-05, + "loss": 101.467, + "step": 111430 + }, + { + "epoch": 0.4502317012568834, + "grad_norm": 682.1113891601562, + "learning_rate": 3.389724988574887e-05, + "loss": 47.2666, + "step": 111440 + }, + { + "epoch": 0.45027210252225097, + "grad_norm": 401.15771484375, + "learning_rate": 3.389398768142812e-05, + "loss": 40.7815, + "step": 111450 + }, + { + "epoch": 0.4503125037876186, + "grad_norm": 1251.0804443359375, + "learning_rate": 3.389072530371451e-05, + "loss": 49.2346, + "step": 111460 + }, + { + "epoch": 0.45035290505298625, + "grad_norm": 642.7127075195312, + "learning_rate": 3.388746275267162e-05, + "loss": 63.2051, + "step": 111470 + }, + { + "epoch": 0.4503933063183539, + "grad_norm": 605.9111938476562, + "learning_rate": 3.388420002836307e-05, + "loss": 52.5846, + "step": 111480 + }, + { + "epoch": 0.45043370758372153, + "grad_norm": 1575.560302734375, + "learning_rate": 3.3880937130852466e-05, + "loss": 35.7747, + "step": 111490 + }, + { + "epoch": 0.45047410884908917, + "grad_norm": 645.1834716796875, + "learning_rate": 3.387767406020343e-05, + "loss": 55.3005, + "step": 111500 + }, + { + "epoch": 0.4505145101144568, + "grad_norm": 666.5519409179688, + "learning_rate": 3.3874410816479564e-05, + "loss": 48.5718, + "step": 111510 + }, + { + "epoch": 0.4505549113798244, + "grad_norm": 654.6434326171875, + "learning_rate": 3.387114739974448e-05, + "loss": 81.415, + "step": 111520 + }, + { + "epoch": 0.45059531264519204, + "grad_norm": 2661.026123046875, + "learning_rate": 3.3867883810061824e-05, + "loss": 84.8381, + "step": 111530 + }, + { + "epoch": 0.4506357139105597, + "grad_norm": 2447.82470703125, + "learning_rate": 3.38646200474952e-05, + "loss": 48.1841, + "step": 111540 + }, + { + "epoch": 0.4506761151759273, + "grad_norm": 708.4982299804688, + "learning_rate": 3.3861356112108247e-05, + "loss": 56.9643, + "step": 111550 + }, + { + "epoch": 0.45071651644129496, + "grad_norm": 489.48846435546875, + "learning_rate": 3.3858092003964594e-05, + "loss": 57.1001, + "step": 111560 + }, + { + "epoch": 0.4507569177066626, + "grad_norm": 538.01806640625, + "learning_rate": 3.385482772312787e-05, + "loss": 43.918, + "step": 111570 + }, + { + "epoch": 0.4507973189720302, + "grad_norm": 624.0175170898438, + "learning_rate": 3.3851563269661726e-05, + "loss": 95.1447, + "step": 111580 + }, + { + "epoch": 0.4508377202373978, + "grad_norm": 565.3558959960938, + "learning_rate": 3.38482986436298e-05, + "loss": 44.5088, + "step": 111590 + }, + { + "epoch": 0.45087812150276546, + "grad_norm": 746.841796875, + "learning_rate": 3.384503384509574e-05, + "loss": 44.0926, + "step": 111600 + }, + { + "epoch": 0.4509185227681331, + "grad_norm": 969.9041748046875, + "learning_rate": 3.384176887412318e-05, + "loss": 38.6924, + "step": 111610 + }, + { + "epoch": 0.45095892403350074, + "grad_norm": 971.7702026367188, + "learning_rate": 3.38385037307758e-05, + "loss": 56.2464, + "step": 111620 + }, + { + "epoch": 0.4509993252988684, + "grad_norm": 1089.8248291015625, + "learning_rate": 3.383523841511723e-05, + "loss": 89.415, + "step": 111630 + }, + { + "epoch": 0.45103972656423597, + "grad_norm": 535.305419921875, + "learning_rate": 3.3831972927211135e-05, + "loss": 41.9581, + "step": 111640 + }, + { + "epoch": 0.4510801278296036, + "grad_norm": 1719.795166015625, + "learning_rate": 3.382870726712119e-05, + "loss": 74.1624, + "step": 111650 + }, + { + "epoch": 0.45112052909497125, + "grad_norm": 1085.8828125, + "learning_rate": 3.382544143491104e-05, + "loss": 81.0191, + "step": 111660 + }, + { + "epoch": 0.4511609303603389, + "grad_norm": 409.8812561035156, + "learning_rate": 3.382217543064438e-05, + "loss": 40.052, + "step": 111670 + }, + { + "epoch": 0.4512013316257065, + "grad_norm": 1443.2547607421875, + "learning_rate": 3.381890925438486e-05, + "loss": 37.1698, + "step": 111680 + }, + { + "epoch": 0.45124173289107417, + "grad_norm": 1432.521728515625, + "learning_rate": 3.3815642906196156e-05, + "loss": 75.6335, + "step": 111690 + }, + { + "epoch": 0.4512821341564418, + "grad_norm": 1234.38720703125, + "learning_rate": 3.381237638614196e-05, + "loss": 48.0366, + "step": 111700 + }, + { + "epoch": 0.4513225354218094, + "grad_norm": 840.4757690429688, + "learning_rate": 3.380910969428596e-05, + "loss": 58.3024, + "step": 111710 + }, + { + "epoch": 0.45136293668717703, + "grad_norm": 591.4329833984375, + "learning_rate": 3.380584283069183e-05, + "loss": 42.8441, + "step": 111720 + }, + { + "epoch": 0.45140333795254467, + "grad_norm": 579.3867797851562, + "learning_rate": 3.380257579542325e-05, + "loss": 100.9832, + "step": 111730 + }, + { + "epoch": 0.4514437392179123, + "grad_norm": 1018.6928100585938, + "learning_rate": 3.379930858854392e-05, + "loss": 40.5557, + "step": 111740 + }, + { + "epoch": 0.45148414048327995, + "grad_norm": 475.0141906738281, + "learning_rate": 3.3796041210117546e-05, + "loss": 47.8023, + "step": 111750 + }, + { + "epoch": 0.4515245417486476, + "grad_norm": 1127.599853515625, + "learning_rate": 3.379277366020782e-05, + "loss": 56.3643, + "step": 111760 + }, + { + "epoch": 0.4515649430140152, + "grad_norm": 632.768310546875, + "learning_rate": 3.3789505938878443e-05, + "loss": 42.618, + "step": 111770 + }, + { + "epoch": 0.4516053442793828, + "grad_norm": 558.4849853515625, + "learning_rate": 3.378623804619313e-05, + "loss": 38.2455, + "step": 111780 + }, + { + "epoch": 0.45164574554475045, + "grad_norm": 1455.6197509765625, + "learning_rate": 3.378296998221557e-05, + "loss": 54.4919, + "step": 111790 + }, + { + "epoch": 0.4516861468101181, + "grad_norm": 650.8120727539062, + "learning_rate": 3.3779701747009504e-05, + "loss": 68.0886, + "step": 111800 + }, + { + "epoch": 0.45172654807548573, + "grad_norm": 613.3826904296875, + "learning_rate": 3.377643334063862e-05, + "loss": 47.3737, + "step": 111810 + }, + { + "epoch": 0.4517669493408534, + "grad_norm": 814.562744140625, + "learning_rate": 3.3773164763166655e-05, + "loss": 70.561, + "step": 111820 + }, + { + "epoch": 0.451807350606221, + "grad_norm": 785.9385986328125, + "learning_rate": 3.376989601465733e-05, + "loss": 38.8111, + "step": 111830 + }, + { + "epoch": 0.4518477518715886, + "grad_norm": 640.478759765625, + "learning_rate": 3.376662709517435e-05, + "loss": 55.49, + "step": 111840 + }, + { + "epoch": 0.45188815313695624, + "grad_norm": 590.5025024414062, + "learning_rate": 3.3763358004781475e-05, + "loss": 59.1937, + "step": 111850 + }, + { + "epoch": 0.4519285544023239, + "grad_norm": 681.9274291992188, + "learning_rate": 3.3760088743542424e-05, + "loss": 121.1419, + "step": 111860 + }, + { + "epoch": 0.4519689556676915, + "grad_norm": 1086.4017333984375, + "learning_rate": 3.375681931152093e-05, + "loss": 57.0334, + "step": 111870 + }, + { + "epoch": 0.45200935693305916, + "grad_norm": 614.7588500976562, + "learning_rate": 3.375354970878073e-05, + "loss": 51.712, + "step": 111880 + }, + { + "epoch": 0.4520497581984268, + "grad_norm": 1066.3997802734375, + "learning_rate": 3.375027993538559e-05, + "loss": 62.087, + "step": 111890 + }, + { + "epoch": 0.4520901594637944, + "grad_norm": 644.5421142578125, + "learning_rate": 3.374700999139923e-05, + "loss": 58.5303, + "step": 111900 + }, + { + "epoch": 0.452130560729162, + "grad_norm": 840.0023193359375, + "learning_rate": 3.37437398768854e-05, + "loss": 71.8506, + "step": 111910 + }, + { + "epoch": 0.45217096199452966, + "grad_norm": 1441.012451171875, + "learning_rate": 3.374046959190786e-05, + "loss": 92.4958, + "step": 111920 + }, + { + "epoch": 0.4522113632598973, + "grad_norm": 1386.08349609375, + "learning_rate": 3.3737199136530364e-05, + "loss": 51.7875, + "step": 111930 + }, + { + "epoch": 0.45225176452526494, + "grad_norm": 1856.324951171875, + "learning_rate": 3.373392851081668e-05, + "loss": 74.6996, + "step": 111940 + }, + { + "epoch": 0.4522921657906326, + "grad_norm": 371.86566162109375, + "learning_rate": 3.373065771483056e-05, + "loss": 53.6041, + "step": 111950 + }, + { + "epoch": 0.45233256705600017, + "grad_norm": 862.8280639648438, + "learning_rate": 3.372738674863577e-05, + "loss": 51.361, + "step": 111960 + }, + { + "epoch": 0.4523729683213678, + "grad_norm": 923.8464965820312, + "learning_rate": 3.372411561229609e-05, + "loss": 43.7514, + "step": 111970 + }, + { + "epoch": 0.45241336958673545, + "grad_norm": 716.6856079101562, + "learning_rate": 3.372084430587528e-05, + "loss": 39.6764, + "step": 111980 + }, + { + "epoch": 0.4524537708521031, + "grad_norm": 593.4553833007812, + "learning_rate": 3.371757282943712e-05, + "loss": 39.9165, + "step": 111990 + }, + { + "epoch": 0.4524941721174707, + "grad_norm": 496.314208984375, + "learning_rate": 3.3714301183045385e-05, + "loss": 78.6274, + "step": 112000 + }, + { + "epoch": 0.45253457338283837, + "grad_norm": 811.6598510742188, + "learning_rate": 3.3711029366763866e-05, + "loss": 34.8752, + "step": 112010 + }, + { + "epoch": 0.452574974648206, + "grad_norm": 1419.599609375, + "learning_rate": 3.370775738065634e-05, + "loss": 81.8719, + "step": 112020 + }, + { + "epoch": 0.4526153759135736, + "grad_norm": 1418.275146484375, + "learning_rate": 3.370448522478661e-05, + "loss": 65.6506, + "step": 112030 + }, + { + "epoch": 0.45265577717894123, + "grad_norm": 8803.1748046875, + "learning_rate": 3.370121289921845e-05, + "loss": 112.1481, + "step": 112040 + }, + { + "epoch": 0.45269617844430887, + "grad_norm": 328.6112060546875, + "learning_rate": 3.369794040401567e-05, + "loss": 38.9246, + "step": 112050 + }, + { + "epoch": 0.4527365797096765, + "grad_norm": 576.2307739257812, + "learning_rate": 3.3694667739242066e-05, + "loss": 54.7131, + "step": 112060 + }, + { + "epoch": 0.45277698097504415, + "grad_norm": 1210.724365234375, + "learning_rate": 3.369139490496144e-05, + "loss": 60.4555, + "step": 112070 + }, + { + "epoch": 0.4528173822404118, + "grad_norm": 1678.520751953125, + "learning_rate": 3.368812190123759e-05, + "loss": 53.7549, + "step": 112080 + }, + { + "epoch": 0.4528577835057794, + "grad_norm": 754.4735717773438, + "learning_rate": 3.3684848728134334e-05, + "loss": 55.5762, + "step": 112090 + }, + { + "epoch": 0.452898184771147, + "grad_norm": 294.9254455566406, + "learning_rate": 3.368157538571548e-05, + "loss": 62.4506, + "step": 112100 + }, + { + "epoch": 0.45293858603651466, + "grad_norm": 1608.3568115234375, + "learning_rate": 3.367830187404484e-05, + "loss": 55.6337, + "step": 112110 + }, + { + "epoch": 0.4529789873018823, + "grad_norm": 680.6564331054688, + "learning_rate": 3.367502819318624e-05, + "loss": 49.9141, + "step": 112120 + }, + { + "epoch": 0.45301938856724994, + "grad_norm": 1690.9805908203125, + "learning_rate": 3.36717543432035e-05, + "loss": 58.1772, + "step": 112130 + }, + { + "epoch": 0.4530597898326176, + "grad_norm": 564.469482421875, + "learning_rate": 3.366848032416045e-05, + "loss": 59.8349, + "step": 112140 + }, + { + "epoch": 0.4531001910979852, + "grad_norm": 725.934326171875, + "learning_rate": 3.3665206136120906e-05, + "loss": 72.5532, + "step": 112150 + }, + { + "epoch": 0.4531405923633528, + "grad_norm": 949.9281616210938, + "learning_rate": 3.3661931779148707e-05, + "loss": 65.5223, + "step": 112160 + }, + { + "epoch": 0.45318099362872044, + "grad_norm": 1123.207763671875, + "learning_rate": 3.365865725330769e-05, + "loss": 52.4678, + "step": 112170 + }, + { + "epoch": 0.4532213948940881, + "grad_norm": 264.70001220703125, + "learning_rate": 3.3655382558661685e-05, + "loss": 64.133, + "step": 112180 + }, + { + "epoch": 0.4532617961594557, + "grad_norm": 1888.4464111328125, + "learning_rate": 3.3652107695274555e-05, + "loss": 45.2613, + "step": 112190 + }, + { + "epoch": 0.45330219742482336, + "grad_norm": 539.1635131835938, + "learning_rate": 3.3648832663210124e-05, + "loss": 95.8625, + "step": 112200 + }, + { + "epoch": 0.453342598690191, + "grad_norm": 510.55816650390625, + "learning_rate": 3.3645557462532245e-05, + "loss": 39.8649, + "step": 112210 + }, + { + "epoch": 0.4533829999555586, + "grad_norm": 1129.6290283203125, + "learning_rate": 3.364228209330477e-05, + "loss": 54.8464, + "step": 112220 + }, + { + "epoch": 0.4534234012209262, + "grad_norm": 718.903564453125, + "learning_rate": 3.363900655559157e-05, + "loss": 70.0421, + "step": 112230 + }, + { + "epoch": 0.45346380248629387, + "grad_norm": 765.8641967773438, + "learning_rate": 3.363573084945648e-05, + "loss": 43.2255, + "step": 112240 + }, + { + "epoch": 0.4535042037516615, + "grad_norm": 305.63763427734375, + "learning_rate": 3.363245497496337e-05, + "loss": 55.8746, + "step": 112250 + }, + { + "epoch": 0.45354460501702915, + "grad_norm": 359.51715087890625, + "learning_rate": 3.362917893217611e-05, + "loss": 73.9199, + "step": 112260 + }, + { + "epoch": 0.4535850062823968, + "grad_norm": 516.9877319335938, + "learning_rate": 3.362590272115855e-05, + "loss": 67.4331, + "step": 112270 + }, + { + "epoch": 0.45362540754776437, + "grad_norm": 862.2160034179688, + "learning_rate": 3.3622626341974594e-05, + "loss": 57.4161, + "step": 112280 + }, + { + "epoch": 0.453665808813132, + "grad_norm": 1069.54150390625, + "learning_rate": 3.361934979468809e-05, + "loss": 57.9138, + "step": 112290 + }, + { + "epoch": 0.45370621007849965, + "grad_norm": 375.27825927734375, + "learning_rate": 3.3616073079362926e-05, + "loss": 51.6367, + "step": 112300 + }, + { + "epoch": 0.4537466113438673, + "grad_norm": 450.5061950683594, + "learning_rate": 3.361279619606299e-05, + "loss": 61.73, + "step": 112310 + }, + { + "epoch": 0.45378701260923493, + "grad_norm": 1464.5079345703125, + "learning_rate": 3.360951914485215e-05, + "loss": 52.9956, + "step": 112320 + }, + { + "epoch": 0.45382741387460257, + "grad_norm": 324.18194580078125, + "learning_rate": 3.3606241925794295e-05, + "loss": 47.1093, + "step": 112330 + }, + { + "epoch": 0.4538678151399702, + "grad_norm": 836.4578857421875, + "learning_rate": 3.360296453895333e-05, + "loss": 56.2767, + "step": 112340 + }, + { + "epoch": 0.4539082164053378, + "grad_norm": 1074.7415771484375, + "learning_rate": 3.3599686984393134e-05, + "loss": 50.7534, + "step": 112350 + }, + { + "epoch": 0.45394861767070543, + "grad_norm": 225.61444091796875, + "learning_rate": 3.359640926217763e-05, + "loss": 40.0397, + "step": 112360 + }, + { + "epoch": 0.4539890189360731, + "grad_norm": 1213.6234130859375, + "learning_rate": 3.359313137237069e-05, + "loss": 47.421, + "step": 112370 + }, + { + "epoch": 0.4540294202014407, + "grad_norm": 640.165283203125, + "learning_rate": 3.3589853315036225e-05, + "loss": 60.4271, + "step": 112380 + }, + { + "epoch": 0.45406982146680835, + "grad_norm": 354.8425598144531, + "learning_rate": 3.358657509023815e-05, + "loss": 58.7236, + "step": 112390 + }, + { + "epoch": 0.454110222732176, + "grad_norm": 1010.51123046875, + "learning_rate": 3.3583296698040384e-05, + "loss": 67.7665, + "step": 112400 + }, + { + "epoch": 0.4541506239975436, + "grad_norm": 313.4957580566406, + "learning_rate": 3.3580018138506824e-05, + "loss": 73.5613, + "step": 112410 + }, + { + "epoch": 0.4541910252629112, + "grad_norm": 623.6607055664062, + "learning_rate": 3.3576739411701394e-05, + "loss": 58.9651, + "step": 112420 + }, + { + "epoch": 0.45423142652827886, + "grad_norm": 870.5923461914062, + "learning_rate": 3.357346051768801e-05, + "loss": 64.04, + "step": 112430 + }, + { + "epoch": 0.4542718277936465, + "grad_norm": 499.6788330078125, + "learning_rate": 3.35701814565306e-05, + "loss": 59.2575, + "step": 112440 + }, + { + "epoch": 0.45431222905901414, + "grad_norm": 902.0311889648438, + "learning_rate": 3.356690222829309e-05, + "loss": 37.0818, + "step": 112450 + }, + { + "epoch": 0.4543526303243818, + "grad_norm": 713.7149047851562, + "learning_rate": 3.356362283303941e-05, + "loss": 52.5913, + "step": 112460 + }, + { + "epoch": 0.45439303158974936, + "grad_norm": 671.7296142578125, + "learning_rate": 3.3560343270833495e-05, + "loss": 53.5628, + "step": 112470 + }, + { + "epoch": 0.454433432855117, + "grad_norm": 1461.3150634765625, + "learning_rate": 3.355706354173928e-05, + "loss": 78.7695, + "step": 112480 + }, + { + "epoch": 0.45447383412048464, + "grad_norm": 1066.4896240234375, + "learning_rate": 3.3553783645820715e-05, + "loss": 74.929, + "step": 112490 + }, + { + "epoch": 0.4545142353858523, + "grad_norm": 257.38531494140625, + "learning_rate": 3.355050358314172e-05, + "loss": 39.4283, + "step": 112500 + }, + { + "epoch": 0.4545546366512199, + "grad_norm": 685.919677734375, + "learning_rate": 3.354722335376626e-05, + "loss": 51.0791, + "step": 112510 + }, + { + "epoch": 0.45459503791658756, + "grad_norm": 1068.9114990234375, + "learning_rate": 3.354394295775829e-05, + "loss": 44.3589, + "step": 112520 + }, + { + "epoch": 0.4546354391819552, + "grad_norm": 604.07568359375, + "learning_rate": 3.354066239518174e-05, + "loss": 65.293, + "step": 112530 + }, + { + "epoch": 0.4546758404473228, + "grad_norm": 1334.7886962890625, + "learning_rate": 3.353738166610058e-05, + "loss": 52.1476, + "step": 112540 + }, + { + "epoch": 0.4547162417126904, + "grad_norm": 517.1060791015625, + "learning_rate": 3.353410077057877e-05, + "loss": 80.8657, + "step": 112550 + }, + { + "epoch": 0.45475664297805807, + "grad_norm": 479.4462890625, + "learning_rate": 3.3530819708680286e-05, + "loss": 54.3628, + "step": 112560 + }, + { + "epoch": 0.4547970442434257, + "grad_norm": 579.92041015625, + "learning_rate": 3.352753848046907e-05, + "loss": 41.2108, + "step": 112570 + }, + { + "epoch": 0.45483744550879335, + "grad_norm": 557.235107421875, + "learning_rate": 3.3524257086009104e-05, + "loss": 33.0313, + "step": 112580 + }, + { + "epoch": 0.454877846774161, + "grad_norm": 929.7172241210938, + "learning_rate": 3.352097552536435e-05, + "loss": 40.3618, + "step": 112590 + }, + { + "epoch": 0.45491824803952857, + "grad_norm": 1180.1580810546875, + "learning_rate": 3.35176937985988e-05, + "loss": 73.8001, + "step": 112600 + }, + { + "epoch": 0.4549586493048962, + "grad_norm": 931.85009765625, + "learning_rate": 3.351441190577642e-05, + "loss": 45.0227, + "step": 112610 + }, + { + "epoch": 0.45499905057026385, + "grad_norm": 911.3134155273438, + "learning_rate": 3.3511129846961184e-05, + "loss": 59.2229, + "step": 112620 + }, + { + "epoch": 0.4550394518356315, + "grad_norm": 675.170654296875, + "learning_rate": 3.35078476222171e-05, + "loss": 80.4119, + "step": 112630 + }, + { + "epoch": 0.45507985310099913, + "grad_norm": 1176.3988037109375, + "learning_rate": 3.350456523160815e-05, + "loss": 47.3479, + "step": 112640 + }, + { + "epoch": 0.45512025436636677, + "grad_norm": 969.1943359375, + "learning_rate": 3.350128267519832e-05, + "loss": 81.494, + "step": 112650 + }, + { + "epoch": 0.4551606556317344, + "grad_norm": 291.8066711425781, + "learning_rate": 3.349799995305162e-05, + "loss": 41.662, + "step": 112660 + }, + { + "epoch": 0.455201056897102, + "grad_norm": 658.59765625, + "learning_rate": 3.3494717065232016e-05, + "loss": 61.2986, + "step": 112670 + }, + { + "epoch": 0.45524145816246964, + "grad_norm": 917.7549438476562, + "learning_rate": 3.349143401180354e-05, + "loss": 63.2453, + "step": 112680 + }, + { + "epoch": 0.4552818594278373, + "grad_norm": 453.6599426269531, + "learning_rate": 3.348815079283018e-05, + "loss": 59.7746, + "step": 112690 + }, + { + "epoch": 0.4553222606932049, + "grad_norm": 850.1251220703125, + "learning_rate": 3.3484867408375954e-05, + "loss": 57.4509, + "step": 112700 + }, + { + "epoch": 0.45536266195857256, + "grad_norm": 701.5401000976562, + "learning_rate": 3.348158385850487e-05, + "loss": 41.1296, + "step": 112710 + }, + { + "epoch": 0.4554030632239402, + "grad_norm": 819.9852294921875, + "learning_rate": 3.347830014328094e-05, + "loss": 69.8444, + "step": 112720 + }, + { + "epoch": 0.4554434644893078, + "grad_norm": 720.2810668945312, + "learning_rate": 3.347501626276819e-05, + "loss": 75.9919, + "step": 112730 + }, + { + "epoch": 0.4554838657546754, + "grad_norm": 495.089599609375, + "learning_rate": 3.3471732217030625e-05, + "loss": 40.4414, + "step": 112740 + }, + { + "epoch": 0.45552426702004306, + "grad_norm": 573.2144775390625, + "learning_rate": 3.346844800613229e-05, + "loss": 43.111, + "step": 112750 + }, + { + "epoch": 0.4555646682854107, + "grad_norm": 1056.5557861328125, + "learning_rate": 3.346516363013719e-05, + "loss": 92.2572, + "step": 112760 + }, + { + "epoch": 0.45560506955077834, + "grad_norm": 487.2084655761719, + "learning_rate": 3.346187908910938e-05, + "loss": 36.6314, + "step": 112770 + }, + { + "epoch": 0.455645470816146, + "grad_norm": 501.5868225097656, + "learning_rate": 3.345859438311287e-05, + "loss": 56.4939, + "step": 112780 + }, + { + "epoch": 0.45568587208151357, + "grad_norm": 792.0189819335938, + "learning_rate": 3.345530951221171e-05, + "loss": 94.0912, + "step": 112790 + }, + { + "epoch": 0.4557262733468812, + "grad_norm": 778.9989624023438, + "learning_rate": 3.3452024476469934e-05, + "loss": 40.7863, + "step": 112800 + }, + { + "epoch": 0.45576667461224885, + "grad_norm": 1507.0234375, + "learning_rate": 3.3448739275951595e-05, + "loss": 61.6816, + "step": 112810 + }, + { + "epoch": 0.4558070758776165, + "grad_norm": 432.25244140625, + "learning_rate": 3.344545391072073e-05, + "loss": 44.9511, + "step": 112820 + }, + { + "epoch": 0.4558474771429841, + "grad_norm": 655.8712158203125, + "learning_rate": 3.34421683808414e-05, + "loss": 48.9905, + "step": 112830 + }, + { + "epoch": 0.45588787840835177, + "grad_norm": 940.5873413085938, + "learning_rate": 3.343888268637765e-05, + "loss": 59.8554, + "step": 112840 + }, + { + "epoch": 0.4559282796737194, + "grad_norm": 1201.72412109375, + "learning_rate": 3.343559682739353e-05, + "loss": 85.8798, + "step": 112850 + }, + { + "epoch": 0.455968680939087, + "grad_norm": 822.5703735351562, + "learning_rate": 3.343231080395312e-05, + "loss": 54.6579, + "step": 112860 + }, + { + "epoch": 0.45600908220445463, + "grad_norm": 476.291748046875, + "learning_rate": 3.342902461612045e-05, + "loss": 40.1478, + "step": 112870 + }, + { + "epoch": 0.45604948346982227, + "grad_norm": 1140.7523193359375, + "learning_rate": 3.3425738263959615e-05, + "loss": 36.0184, + "step": 112880 + }, + { + "epoch": 0.4560898847351899, + "grad_norm": 627.5087280273438, + "learning_rate": 3.3422451747534684e-05, + "loss": 67.8078, + "step": 112890 + }, + { + "epoch": 0.45613028600055755, + "grad_norm": 525.3790283203125, + "learning_rate": 3.3419165066909705e-05, + "loss": 47.1049, + "step": 112900 + }, + { + "epoch": 0.4561706872659252, + "grad_norm": 361.6174011230469, + "learning_rate": 3.3415878222148776e-05, + "loss": 66.291, + "step": 112910 + }, + { + "epoch": 0.4562110885312928, + "grad_norm": 1267.40966796875, + "learning_rate": 3.341259121331597e-05, + "loss": 39.7781, + "step": 112920 + }, + { + "epoch": 0.4562514897966604, + "grad_norm": 643.3392333984375, + "learning_rate": 3.340930404047537e-05, + "loss": 37.3639, + "step": 112930 + }, + { + "epoch": 0.45629189106202805, + "grad_norm": 481.6955871582031, + "learning_rate": 3.3406016703691055e-05, + "loss": 55.8988, + "step": 112940 + }, + { + "epoch": 0.4563322923273957, + "grad_norm": 650.4725952148438, + "learning_rate": 3.340272920302711e-05, + "loss": 91.6434, + "step": 112950 + }, + { + "epoch": 0.45637269359276333, + "grad_norm": 674.4364013671875, + "learning_rate": 3.339944153854764e-05, + "loss": 46.4506, + "step": 112960 + }, + { + "epoch": 0.456413094858131, + "grad_norm": 542.6135864257812, + "learning_rate": 3.3396153710316736e-05, + "loss": 54.1574, + "step": 112970 + }, + { + "epoch": 0.4564534961234986, + "grad_norm": 1714.6644287109375, + "learning_rate": 3.339286571839848e-05, + "loss": 96.3534, + "step": 112980 + }, + { + "epoch": 0.4564938973888662, + "grad_norm": 525.607177734375, + "learning_rate": 3.338957756285699e-05, + "loss": 67.7952, + "step": 112990 + }, + { + "epoch": 0.45653429865423384, + "grad_norm": 1665.643798828125, + "learning_rate": 3.338628924375638e-05, + "loss": 69.8561, + "step": 113000 + }, + { + "epoch": 0.4565746999196015, + "grad_norm": 1218.552490234375, + "learning_rate": 3.338300076116073e-05, + "loss": 91.2745, + "step": 113010 + }, + { + "epoch": 0.4566151011849691, + "grad_norm": 1521.2366943359375, + "learning_rate": 3.337971211513417e-05, + "loss": 59.6287, + "step": 113020 + }, + { + "epoch": 0.45665550245033676, + "grad_norm": 898.8818969726562, + "learning_rate": 3.337642330574081e-05, + "loss": 54.526, + "step": 113030 + }, + { + "epoch": 0.4566959037157044, + "grad_norm": 390.90435791015625, + "learning_rate": 3.3373134333044756e-05, + "loss": 35.1512, + "step": 113040 + }, + { + "epoch": 0.456736304981072, + "grad_norm": 788.0700073242188, + "learning_rate": 3.336984519711015e-05, + "loss": 46.5823, + "step": 113050 + }, + { + "epoch": 0.4567767062464396, + "grad_norm": 979.811767578125, + "learning_rate": 3.336655589800109e-05, + "loss": 58.1711, + "step": 113060 + }, + { + "epoch": 0.45681710751180726, + "grad_norm": 1041.4178466796875, + "learning_rate": 3.336326643578172e-05, + "loss": 55.3556, + "step": 113070 + }, + { + "epoch": 0.4568575087771749, + "grad_norm": 594.2887573242188, + "learning_rate": 3.3359976810516164e-05, + "loss": 52.3926, + "step": 113080 + }, + { + "epoch": 0.45689791004254254, + "grad_norm": 731.5540771484375, + "learning_rate": 3.335668702226856e-05, + "loss": 107.6374, + "step": 113090 + }, + { + "epoch": 0.4569383113079102, + "grad_norm": 608.2393798828125, + "learning_rate": 3.3353397071103046e-05, + "loss": 57.6193, + "step": 113100 + }, + { + "epoch": 0.45697871257327777, + "grad_norm": 617.5512084960938, + "learning_rate": 3.3350106957083744e-05, + "loss": 72.52, + "step": 113110 + }, + { + "epoch": 0.4570191138386454, + "grad_norm": 658.0664672851562, + "learning_rate": 3.334681668027481e-05, + "loss": 94.5809, + "step": 113120 + }, + { + "epoch": 0.45705951510401305, + "grad_norm": 543.9719848632812, + "learning_rate": 3.334352624074039e-05, + "loss": 41.8438, + "step": 113130 + }, + { + "epoch": 0.4570999163693807, + "grad_norm": 696.774658203125, + "learning_rate": 3.334023563854463e-05, + "loss": 76.3532, + "step": 113140 + }, + { + "epoch": 0.4571403176347483, + "grad_norm": 2398.177734375, + "learning_rate": 3.333694487375168e-05, + "loss": 38.9571, + "step": 113150 + }, + { + "epoch": 0.45718071890011597, + "grad_norm": 455.6088562011719, + "learning_rate": 3.33336539464257e-05, + "loss": 32.256, + "step": 113160 + }, + { + "epoch": 0.4572211201654836, + "grad_norm": 1092.000732421875, + "learning_rate": 3.3330362856630845e-05, + "loss": 59.1251, + "step": 113170 + }, + { + "epoch": 0.4572615214308512, + "grad_norm": 1621.383544921875, + "learning_rate": 3.3327071604431275e-05, + "loss": 43.2729, + "step": 113180 + }, + { + "epoch": 0.45730192269621883, + "grad_norm": 680.7568969726562, + "learning_rate": 3.3323780189891166e-05, + "loss": 49.1451, + "step": 113190 + }, + { + "epoch": 0.45734232396158647, + "grad_norm": 280.8765563964844, + "learning_rate": 3.332048861307467e-05, + "loss": 54.7998, + "step": 113200 + }, + { + "epoch": 0.4573827252269541, + "grad_norm": 457.403564453125, + "learning_rate": 3.331719687404597e-05, + "loss": 91.7487, + "step": 113210 + }, + { + "epoch": 0.45742312649232175, + "grad_norm": 777.8742065429688, + "learning_rate": 3.331390497286922e-05, + "loss": 44.5807, + "step": 113220 + }, + { + "epoch": 0.4574635277576894, + "grad_norm": 1098.077880859375, + "learning_rate": 3.331061290960863e-05, + "loss": 90.1487, + "step": 113230 + }, + { + "epoch": 0.457503929023057, + "grad_norm": 882.6914672851562, + "learning_rate": 3.3307320684328354e-05, + "loss": 68.7481, + "step": 113240 + }, + { + "epoch": 0.4575443302884246, + "grad_norm": 1128.127197265625, + "learning_rate": 3.330402829709258e-05, + "loss": 65.0538, + "step": 113250 + }, + { + "epoch": 0.45758473155379226, + "grad_norm": 937.3806762695312, + "learning_rate": 3.3300735747965505e-05, + "loss": 57.1227, + "step": 113260 + }, + { + "epoch": 0.4576251328191599, + "grad_norm": 911.514404296875, + "learning_rate": 3.329744303701132e-05, + "loss": 85.5697, + "step": 113270 + }, + { + "epoch": 0.45766553408452754, + "grad_norm": 532.0978393554688, + "learning_rate": 3.3294150164294204e-05, + "loss": 41.7875, + "step": 113280 + }, + { + "epoch": 0.4577059353498952, + "grad_norm": 2276.76904296875, + "learning_rate": 3.329085712987836e-05, + "loss": 58.2938, + "step": 113290 + }, + { + "epoch": 0.4577463366152628, + "grad_norm": 633.0645751953125, + "learning_rate": 3.3287563933827995e-05, + "loss": 45.8421, + "step": 113300 + }, + { + "epoch": 0.4577867378806304, + "grad_norm": 884.1083984375, + "learning_rate": 3.328427057620729e-05, + "loss": 69.57, + "step": 113310 + }, + { + "epoch": 0.45782713914599804, + "grad_norm": 832.331298828125, + "learning_rate": 3.328097705708047e-05, + "loss": 36.6988, + "step": 113320 + }, + { + "epoch": 0.4578675404113657, + "grad_norm": 1298.6260986328125, + "learning_rate": 3.3277683376511744e-05, + "loss": 67.1316, + "step": 113330 + }, + { + "epoch": 0.4579079416767333, + "grad_norm": 690.5106811523438, + "learning_rate": 3.327438953456532e-05, + "loss": 43.1363, + "step": 113340 + }, + { + "epoch": 0.45794834294210096, + "grad_norm": 638.0509643554688, + "learning_rate": 3.327109553130541e-05, + "loss": 64.9828, + "step": 113350 + }, + { + "epoch": 0.4579887442074686, + "grad_norm": 712.4091186523438, + "learning_rate": 3.326780136679623e-05, + "loss": 51.8144, + "step": 113360 + }, + { + "epoch": 0.4580291454728362, + "grad_norm": 768.2869873046875, + "learning_rate": 3.326450704110201e-05, + "loss": 43.4623, + "step": 113370 + }, + { + "epoch": 0.4580695467382038, + "grad_norm": 667.932373046875, + "learning_rate": 3.3261212554286975e-05, + "loss": 60.1954, + "step": 113380 + }, + { + "epoch": 0.45810994800357147, + "grad_norm": 797.756591796875, + "learning_rate": 3.3257917906415336e-05, + "loss": 64.3577, + "step": 113390 + }, + { + "epoch": 0.4581503492689391, + "grad_norm": 417.9449462890625, + "learning_rate": 3.325462309755134e-05, + "loss": 52.2687, + "step": 113400 + }, + { + "epoch": 0.45819075053430675, + "grad_norm": 131.72840881347656, + "learning_rate": 3.325132812775922e-05, + "loss": 53.6158, + "step": 113410 + }, + { + "epoch": 0.4582311517996744, + "grad_norm": 1277.050537109375, + "learning_rate": 3.324803299710321e-05, + "loss": 79.7133, + "step": 113420 + }, + { + "epoch": 0.45827155306504197, + "grad_norm": 247.889404296875, + "learning_rate": 3.3244737705647554e-05, + "loss": 79.97, + "step": 113430 + }, + { + "epoch": 0.4583119543304096, + "grad_norm": 1504.002685546875, + "learning_rate": 3.324144225345649e-05, + "loss": 57.3291, + "step": 113440 + }, + { + "epoch": 0.45835235559577725, + "grad_norm": 3024.2744140625, + "learning_rate": 3.3238146640594256e-05, + "loss": 55.5882, + "step": 113450 + }, + { + "epoch": 0.4583927568611449, + "grad_norm": 1827.9388427734375, + "learning_rate": 3.323485086712513e-05, + "loss": 68.6746, + "step": 113460 + }, + { + "epoch": 0.45843315812651253, + "grad_norm": 1997.11279296875, + "learning_rate": 3.323155493311334e-05, + "loss": 60.3449, + "step": 113470 + }, + { + "epoch": 0.45847355939188017, + "grad_norm": 1310.3616943359375, + "learning_rate": 3.322825883862314e-05, + "loss": 58.0651, + "step": 113480 + }, + { + "epoch": 0.4585139606572478, + "grad_norm": 542.6761474609375, + "learning_rate": 3.32249625837188e-05, + "loss": 45.5163, + "step": 113490 + }, + { + "epoch": 0.4585543619226154, + "grad_norm": 109.23764038085938, + "learning_rate": 3.322166616846458e-05, + "loss": 46.8832, + "step": 113500 + }, + { + "epoch": 0.45859476318798303, + "grad_norm": 770.7872314453125, + "learning_rate": 3.321836959292475e-05, + "loss": 54.6829, + "step": 113510 + }, + { + "epoch": 0.4586351644533507, + "grad_norm": 1391.15185546875, + "learning_rate": 3.321507285716357e-05, + "loss": 76.5766, + "step": 113520 + }, + { + "epoch": 0.4586755657187183, + "grad_norm": 1156.2738037109375, + "learning_rate": 3.321177596124532e-05, + "loss": 44.4677, + "step": 113530 + }, + { + "epoch": 0.45871596698408595, + "grad_norm": 882.244140625, + "learning_rate": 3.3208478905234274e-05, + "loss": 64.2383, + "step": 113540 + }, + { + "epoch": 0.4587563682494536, + "grad_norm": 1060.12841796875, + "learning_rate": 3.32051816891947e-05, + "loss": 49.1175, + "step": 113550 + }, + { + "epoch": 0.4587967695148212, + "grad_norm": 1234.741455078125, + "learning_rate": 3.320188431319088e-05, + "loss": 41.2987, + "step": 113560 + }, + { + "epoch": 0.4588371707801888, + "grad_norm": 437.5050048828125, + "learning_rate": 3.31985867772871e-05, + "loss": 63.9297, + "step": 113570 + }, + { + "epoch": 0.45887757204555646, + "grad_norm": 839.0947875976562, + "learning_rate": 3.319528908154766e-05, + "loss": 55.2399, + "step": 113580 + }, + { + "epoch": 0.4589179733109241, + "grad_norm": 1059.20654296875, + "learning_rate": 3.319199122603683e-05, + "loss": 35.0855, + "step": 113590 + }, + { + "epoch": 0.45895837457629174, + "grad_norm": 1006.2760009765625, + "learning_rate": 3.318869321081892e-05, + "loss": 75.6239, + "step": 113600 + }, + { + "epoch": 0.4589987758416594, + "grad_norm": 1427.097412109375, + "learning_rate": 3.3185395035958224e-05, + "loss": 53.2945, + "step": 113610 + }, + { + "epoch": 0.459039177107027, + "grad_norm": 960.8377075195312, + "learning_rate": 3.318209670151904e-05, + "loss": 74.3849, + "step": 113620 + }, + { + "epoch": 0.4590795783723946, + "grad_norm": 780.0336303710938, + "learning_rate": 3.317879820756566e-05, + "loss": 59.6619, + "step": 113630 + }, + { + "epoch": 0.45911997963776224, + "grad_norm": 913.7399291992188, + "learning_rate": 3.31754995541624e-05, + "loss": 54.3696, + "step": 113640 + }, + { + "epoch": 0.4591603809031299, + "grad_norm": 884.738037109375, + "learning_rate": 3.3172200741373563e-05, + "loss": 87.1222, + "step": 113650 + }, + { + "epoch": 0.4592007821684975, + "grad_norm": 495.6034851074219, + "learning_rate": 3.3168901769263474e-05, + "loss": 52.4804, + "step": 113660 + }, + { + "epoch": 0.45924118343386516, + "grad_norm": 1088.6365966796875, + "learning_rate": 3.316560263789643e-05, + "loss": 46.7347, + "step": 113670 + }, + { + "epoch": 0.4592815846992328, + "grad_norm": 724.1251831054688, + "learning_rate": 3.3162303347336764e-05, + "loss": 53.7025, + "step": 113680 + }, + { + "epoch": 0.4593219859646004, + "grad_norm": 982.661376953125, + "learning_rate": 3.315900389764879e-05, + "loss": 47.1129, + "step": 113690 + }, + { + "epoch": 0.459362387229968, + "grad_norm": 662.5863037109375, + "learning_rate": 3.315570428889684e-05, + "loss": 35.4883, + "step": 113700 + }, + { + "epoch": 0.45940278849533567, + "grad_norm": 1490.281005859375, + "learning_rate": 3.315240452114523e-05, + "loss": 52.5399, + "step": 113710 + }, + { + "epoch": 0.4594431897607033, + "grad_norm": 1237.041748046875, + "learning_rate": 3.31491045944583e-05, + "loss": 59.784, + "step": 113720 + }, + { + "epoch": 0.45948359102607095, + "grad_norm": 700.3053588867188, + "learning_rate": 3.314580450890038e-05, + "loss": 70.0428, + "step": 113730 + }, + { + "epoch": 0.4595239922914386, + "grad_norm": 851.7976684570312, + "learning_rate": 3.3142504264535804e-05, + "loss": 50.4964, + "step": 113740 + }, + { + "epoch": 0.45956439355680617, + "grad_norm": 983.1741943359375, + "learning_rate": 3.313920386142892e-05, + "loss": 59.0461, + "step": 113750 + }, + { + "epoch": 0.4596047948221738, + "grad_norm": 1348.2669677734375, + "learning_rate": 3.313590329964406e-05, + "loss": 59.1032, + "step": 113760 + }, + { + "epoch": 0.45964519608754145, + "grad_norm": 487.41680908203125, + "learning_rate": 3.313260257924558e-05, + "loss": 87.3492, + "step": 113770 + }, + { + "epoch": 0.4596855973529091, + "grad_norm": 345.0389099121094, + "learning_rate": 3.312930170029783e-05, + "loss": 52.9883, + "step": 113780 + }, + { + "epoch": 0.45972599861827673, + "grad_norm": 438.7192077636719, + "learning_rate": 3.3126000662865156e-05, + "loss": 49.1873, + "step": 113790 + }, + { + "epoch": 0.45976639988364437, + "grad_norm": 493.15130615234375, + "learning_rate": 3.312269946701191e-05, + "loss": 50.4528, + "step": 113800 + }, + { + "epoch": 0.459806801149012, + "grad_norm": 1520.73974609375, + "learning_rate": 3.311939811280246e-05, + "loss": 42.8416, + "step": 113810 + }, + { + "epoch": 0.4598472024143796, + "grad_norm": 146.88551330566406, + "learning_rate": 3.311609660030117e-05, + "loss": 52.2549, + "step": 113820 + }, + { + "epoch": 0.45988760367974724, + "grad_norm": 610.1900634765625, + "learning_rate": 3.311279492957239e-05, + "loss": 81.4292, + "step": 113830 + }, + { + "epoch": 0.4599280049451149, + "grad_norm": 478.4176330566406, + "learning_rate": 3.31094931006805e-05, + "loss": 59.0037, + "step": 113840 + }, + { + "epoch": 0.4599684062104825, + "grad_norm": 698.8412475585938, + "learning_rate": 3.310619111368986e-05, + "loss": 60.8137, + "step": 113850 + }, + { + "epoch": 0.46000880747585016, + "grad_norm": 719.0433349609375, + "learning_rate": 3.310288896866486e-05, + "loss": 40.1776, + "step": 113860 + }, + { + "epoch": 0.4600492087412178, + "grad_norm": 807.5015258789062, + "learning_rate": 3.309958666566986e-05, + "loss": 87.1863, + "step": 113870 + }, + { + "epoch": 0.4600896100065854, + "grad_norm": 773.2158813476562, + "learning_rate": 3.309628420476926e-05, + "loss": 65.442, + "step": 113880 + }, + { + "epoch": 0.460130011271953, + "grad_norm": 1120.7901611328125, + "learning_rate": 3.309298158602742e-05, + "loss": 46.1566, + "step": 113890 + }, + { + "epoch": 0.46017041253732066, + "grad_norm": 2974.2939453125, + "learning_rate": 3.308967880950874e-05, + "loss": 86.9134, + "step": 113900 + }, + { + "epoch": 0.4602108138026883, + "grad_norm": 1336.4696044921875, + "learning_rate": 3.308637587527761e-05, + "loss": 79.6506, + "step": 113910 + }, + { + "epoch": 0.46025121506805594, + "grad_norm": 1928.9310302734375, + "learning_rate": 3.3083072783398416e-05, + "loss": 96.6911, + "step": 113920 + }, + { + "epoch": 0.4602916163334236, + "grad_norm": 538.5294799804688, + "learning_rate": 3.3079769533935556e-05, + "loss": 49.8037, + "step": 113930 + }, + { + "epoch": 0.4603320175987912, + "grad_norm": 809.7352294921875, + "learning_rate": 3.307646612695343e-05, + "loss": 61.5456, + "step": 113940 + }, + { + "epoch": 0.4603724188641588, + "grad_norm": 550.5420532226562, + "learning_rate": 3.307316256251644e-05, + "loss": 49.7282, + "step": 113950 + }, + { + "epoch": 0.46041282012952645, + "grad_norm": 1422.148193359375, + "learning_rate": 3.3069858840688994e-05, + "loss": 71.9855, + "step": 113960 + }, + { + "epoch": 0.4604532213948941, + "grad_norm": 426.9280090332031, + "learning_rate": 3.3066554961535485e-05, + "loss": 46.9851, + "step": 113970 + }, + { + "epoch": 0.4604936226602617, + "grad_norm": 844.1886596679688, + "learning_rate": 3.3063250925120334e-05, + "loss": 56.5634, + "step": 113980 + }, + { + "epoch": 0.46053402392562937, + "grad_norm": 590.6248779296875, + "learning_rate": 3.305994673150797e-05, + "loss": 58.149, + "step": 113990 + }, + { + "epoch": 0.460574425190997, + "grad_norm": 722.6728515625, + "learning_rate": 3.305664238076278e-05, + "loss": 48.5719, + "step": 114000 + }, + { + "epoch": 0.4606148264563646, + "grad_norm": 761.7833251953125, + "learning_rate": 3.30533378729492e-05, + "loss": 67.1143, + "step": 114010 + }, + { + "epoch": 0.46065522772173223, + "grad_norm": 258.24639892578125, + "learning_rate": 3.3050033208131656e-05, + "loss": 50.417, + "step": 114020 + }, + { + "epoch": 0.46069562898709987, + "grad_norm": 391.4142150878906, + "learning_rate": 3.304672838637457e-05, + "loss": 65.8437, + "step": 114030 + }, + { + "epoch": 0.4607360302524675, + "grad_norm": 432.7169189453125, + "learning_rate": 3.3043423407742375e-05, + "loss": 60.9631, + "step": 114040 + }, + { + "epoch": 0.46077643151783515, + "grad_norm": 893.4138793945312, + "learning_rate": 3.3040118272299495e-05, + "loss": 131.9281, + "step": 114050 + }, + { + "epoch": 0.4608168327832028, + "grad_norm": 1183.3709716796875, + "learning_rate": 3.303681298011037e-05, + "loss": 74.6201, + "step": 114060 + }, + { + "epoch": 0.4608572340485704, + "grad_norm": 327.8617248535156, + "learning_rate": 3.303350753123944e-05, + "loss": 38.2665, + "step": 114070 + }, + { + "epoch": 0.460897635313938, + "grad_norm": 642.1771850585938, + "learning_rate": 3.3030201925751145e-05, + "loss": 50.6443, + "step": 114080 + }, + { + "epoch": 0.46093803657930565, + "grad_norm": 1546.3465576171875, + "learning_rate": 3.302689616370993e-05, + "loss": 37.8828, + "step": 114090 + }, + { + "epoch": 0.4609784378446733, + "grad_norm": 634.73876953125, + "learning_rate": 3.302359024518024e-05, + "loss": 29.7485, + "step": 114100 + }, + { + "epoch": 0.46101883911004093, + "grad_norm": 658.0567626953125, + "learning_rate": 3.302028417022653e-05, + "loss": 45.2375, + "step": 114110 + }, + { + "epoch": 0.4610592403754086, + "grad_norm": 633.6270751953125, + "learning_rate": 3.301697793891324e-05, + "loss": 59.6125, + "step": 114120 + }, + { + "epoch": 0.4610996416407762, + "grad_norm": 1438.4256591796875, + "learning_rate": 3.301367155130485e-05, + "loss": 89.7295, + "step": 114130 + }, + { + "epoch": 0.4611400429061438, + "grad_norm": 1383.4329833984375, + "learning_rate": 3.3010365007465805e-05, + "loss": 48.6195, + "step": 114140 + }, + { + "epoch": 0.46118044417151144, + "grad_norm": 530.7550659179688, + "learning_rate": 3.300705830746057e-05, + "loss": 46.5861, + "step": 114150 + }, + { + "epoch": 0.4612208454368791, + "grad_norm": 633.077880859375, + "learning_rate": 3.300375145135361e-05, + "loss": 60.9828, + "step": 114160 + }, + { + "epoch": 0.4612612467022467, + "grad_norm": 376.91864013671875, + "learning_rate": 3.3000444439209396e-05, + "loss": 62.8593, + "step": 114170 + }, + { + "epoch": 0.46130164796761436, + "grad_norm": 721.732666015625, + "learning_rate": 3.299713727109239e-05, + "loss": 84.59, + "step": 114180 + }, + { + "epoch": 0.461342049232982, + "grad_norm": 473.7373352050781, + "learning_rate": 3.299382994706709e-05, + "loss": 94.0822, + "step": 114190 + }, + { + "epoch": 0.4613824504983496, + "grad_norm": 1528.2044677734375, + "learning_rate": 3.299052246719795e-05, + "loss": 81.7651, + "step": 114200 + }, + { + "epoch": 0.4614228517637172, + "grad_norm": 890.8794555664062, + "learning_rate": 3.298721483154946e-05, + "loss": 40.4416, + "step": 114210 + }, + { + "epoch": 0.46146325302908486, + "grad_norm": 541.0910034179688, + "learning_rate": 3.298390704018611e-05, + "loss": 43.575, + "step": 114220 + }, + { + "epoch": 0.4615036542944525, + "grad_norm": 915.8447875976562, + "learning_rate": 3.298059909317239e-05, + "loss": 70.035, + "step": 114230 + }, + { + "epoch": 0.46154405555982014, + "grad_norm": 1335.07275390625, + "learning_rate": 3.297729099057277e-05, + "loss": 58.3776, + "step": 114240 + }, + { + "epoch": 0.4615844568251878, + "grad_norm": 1571.07373046875, + "learning_rate": 3.2973982732451755e-05, + "loss": 72.0075, + "step": 114250 + }, + { + "epoch": 0.4616248580905554, + "grad_norm": 1263.757080078125, + "learning_rate": 3.297067431887384e-05, + "loss": 53.4101, + "step": 114260 + }, + { + "epoch": 0.461665259355923, + "grad_norm": 916.6146850585938, + "learning_rate": 3.296736574990353e-05, + "loss": 78.4652, + "step": 114270 + }, + { + "epoch": 0.46170566062129065, + "grad_norm": 365.46142578125, + "learning_rate": 3.296405702560532e-05, + "loss": 80.6376, + "step": 114280 + }, + { + "epoch": 0.4617460618866583, + "grad_norm": 944.3591918945312, + "learning_rate": 3.2960748146043716e-05, + "loss": 90.1059, + "step": 114290 + }, + { + "epoch": 0.46178646315202593, + "grad_norm": 679.8099365234375, + "learning_rate": 3.295743911128324e-05, + "loss": 48.7843, + "step": 114300 + }, + { + "epoch": 0.46182686441739357, + "grad_norm": 1261.843017578125, + "learning_rate": 3.295412992138838e-05, + "loss": 78.7052, + "step": 114310 + }, + { + "epoch": 0.4618672656827612, + "grad_norm": 481.5779724121094, + "learning_rate": 3.295082057642367e-05, + "loss": 41.2433, + "step": 114320 + }, + { + "epoch": 0.4619076669481288, + "grad_norm": 903.3895874023438, + "learning_rate": 3.294751107645361e-05, + "loss": 48.583, + "step": 114330 + }, + { + "epoch": 0.46194806821349643, + "grad_norm": 730.4287109375, + "learning_rate": 3.294420142154274e-05, + "loss": 78.3227, + "step": 114340 + }, + { + "epoch": 0.46198846947886407, + "grad_norm": 563.7612915039062, + "learning_rate": 3.2940891611755564e-05, + "loss": 53.6732, + "step": 114350 + }, + { + "epoch": 0.4620288707442317, + "grad_norm": 1336.9444580078125, + "learning_rate": 3.293758164715663e-05, + "loss": 55.2327, + "step": 114360 + }, + { + "epoch": 0.46206927200959935, + "grad_norm": 1316.6781005859375, + "learning_rate": 3.293427152781044e-05, + "loss": 39.9807, + "step": 114370 + }, + { + "epoch": 0.462109673274967, + "grad_norm": 1014.697265625, + "learning_rate": 3.2930961253781554e-05, + "loss": 81.932, + "step": 114380 + }, + { + "epoch": 0.4621500745403346, + "grad_norm": 2144.41064453125, + "learning_rate": 3.292765082513449e-05, + "loss": 80.4216, + "step": 114390 + }, + { + "epoch": 0.4621904758057022, + "grad_norm": 509.5421447753906, + "learning_rate": 3.29243402419338e-05, + "loss": 40.3778, + "step": 114400 + }, + { + "epoch": 0.46223087707106986, + "grad_norm": 432.00933837890625, + "learning_rate": 3.2921029504244004e-05, + "loss": 76.2071, + "step": 114410 + }, + { + "epoch": 0.4622712783364375, + "grad_norm": 722.176025390625, + "learning_rate": 3.2917718612129665e-05, + "loss": 41.5058, + "step": 114420 + }, + { + "epoch": 0.46231167960180514, + "grad_norm": 725.9029541015625, + "learning_rate": 3.291440756565533e-05, + "loss": 53.8693, + "step": 114430 + }, + { + "epoch": 0.4623520808671728, + "grad_norm": 281.3984375, + "learning_rate": 3.2911096364885544e-05, + "loss": 80.6305, + "step": 114440 + }, + { + "epoch": 0.4623924821325404, + "grad_norm": 811.3935546875, + "learning_rate": 3.290778500988485e-05, + "loss": 55.3654, + "step": 114450 + }, + { + "epoch": 0.462432883397908, + "grad_norm": 472.8229064941406, + "learning_rate": 3.2904473500717824e-05, + "loss": 59.1575, + "step": 114460 + }, + { + "epoch": 0.46247328466327564, + "grad_norm": 713.5052490234375, + "learning_rate": 3.290116183744902e-05, + "loss": 50.5053, + "step": 114470 + }, + { + "epoch": 0.4625136859286433, + "grad_norm": 1182.7664794921875, + "learning_rate": 3.2897850020143005e-05, + "loss": 59.1632, + "step": 114480 + }, + { + "epoch": 0.4625540871940109, + "grad_norm": 845.945556640625, + "learning_rate": 3.289453804886433e-05, + "loss": 85.3159, + "step": 114490 + }, + { + "epoch": 0.46259448845937856, + "grad_norm": 406.4460754394531, + "learning_rate": 3.289122592367757e-05, + "loss": 39.6031, + "step": 114500 + }, + { + "epoch": 0.4626348897247462, + "grad_norm": 883.4130249023438, + "learning_rate": 3.288791364464729e-05, + "loss": 44.7248, + "step": 114510 + }, + { + "epoch": 0.4626752909901138, + "grad_norm": 447.2448425292969, + "learning_rate": 3.2884601211838085e-05, + "loss": 62.0368, + "step": 114520 + }, + { + "epoch": 0.4627156922554814, + "grad_norm": 640.0363159179688, + "learning_rate": 3.288128862531452e-05, + "loss": 56.4858, + "step": 114530 + }, + { + "epoch": 0.46275609352084907, + "grad_norm": 891.1563720703125, + "learning_rate": 3.287797588514117e-05, + "loss": 70.3198, + "step": 114540 + }, + { + "epoch": 0.4627964947862167, + "grad_norm": 628.9306030273438, + "learning_rate": 3.287466299138262e-05, + "loss": 53.9219, + "step": 114550 + }, + { + "epoch": 0.46283689605158435, + "grad_norm": 382.36456298828125, + "learning_rate": 3.287134994410347e-05, + "loss": 42.8895, + "step": 114560 + }, + { + "epoch": 0.462877297316952, + "grad_norm": 1126.9713134765625, + "learning_rate": 3.28680367433683e-05, + "loss": 38.392, + "step": 114570 + }, + { + "epoch": 0.4629176985823196, + "grad_norm": 2824.01025390625, + "learning_rate": 3.28647233892417e-05, + "loss": 77.3813, + "step": 114580 + }, + { + "epoch": 0.4629580998476872, + "grad_norm": 413.0010986328125, + "learning_rate": 3.286140988178826e-05, + "loss": 50.3714, + "step": 114590 + }, + { + "epoch": 0.46299850111305485, + "grad_norm": 952.1806030273438, + "learning_rate": 3.28580962210726e-05, + "loss": 55.7656, + "step": 114600 + }, + { + "epoch": 0.4630389023784225, + "grad_norm": 491.023193359375, + "learning_rate": 3.2854782407159305e-05, + "loss": 47.8107, + "step": 114610 + }, + { + "epoch": 0.46307930364379013, + "grad_norm": 413.3840026855469, + "learning_rate": 3.285146844011298e-05, + "loss": 57.6984, + "step": 114620 + }, + { + "epoch": 0.46311970490915777, + "grad_norm": 593.4551391601562, + "learning_rate": 3.2848154319998235e-05, + "loss": 47.3308, + "step": 114630 + }, + { + "epoch": 0.4631601061745254, + "grad_norm": 543.7266845703125, + "learning_rate": 3.2844840046879686e-05, + "loss": 46.1537, + "step": 114640 + }, + { + "epoch": 0.463200507439893, + "grad_norm": 876.9476318359375, + "learning_rate": 3.2841525620821945e-05, + "loss": 39.367, + "step": 114650 + }, + { + "epoch": 0.46324090870526063, + "grad_norm": 470.560791015625, + "learning_rate": 3.2838211041889625e-05, + "loss": 56.22, + "step": 114660 + }, + { + "epoch": 0.4632813099706283, + "grad_norm": 692.4049072265625, + "learning_rate": 3.2834896310147336e-05, + "loss": 46.9886, + "step": 114670 + }, + { + "epoch": 0.4633217112359959, + "grad_norm": 1012.37890625, + "learning_rate": 3.283158142565971e-05, + "loss": 77.0895, + "step": 114680 + }, + { + "epoch": 0.46336211250136355, + "grad_norm": 1028.7650146484375, + "learning_rate": 3.282826638849138e-05, + "loss": 60.6332, + "step": 114690 + }, + { + "epoch": 0.4634025137667312, + "grad_norm": 1001.2539672851562, + "learning_rate": 3.2824951198706954e-05, + "loss": 55.1329, + "step": 114700 + }, + { + "epoch": 0.4634429150320988, + "grad_norm": 707.6488647460938, + "learning_rate": 3.2821635856371086e-05, + "loss": 68.9933, + "step": 114710 + }, + { + "epoch": 0.4634833162974664, + "grad_norm": 640.0159912109375, + "learning_rate": 3.28183203615484e-05, + "loss": 56.8567, + "step": 114720 + }, + { + "epoch": 0.46352371756283406, + "grad_norm": 1112.1865234375, + "learning_rate": 3.281500471430353e-05, + "loss": 63.7654, + "step": 114730 + }, + { + "epoch": 0.4635641188282017, + "grad_norm": 871.5289306640625, + "learning_rate": 3.281168891470112e-05, + "loss": 64.3242, + "step": 114740 + }, + { + "epoch": 0.46360452009356934, + "grad_norm": 232.17013549804688, + "learning_rate": 3.2808372962805816e-05, + "loss": 51.8217, + "step": 114750 + }, + { + "epoch": 0.463644921358937, + "grad_norm": 653.4492797851562, + "learning_rate": 3.280505685868226e-05, + "loss": 32.7491, + "step": 114760 + }, + { + "epoch": 0.4636853226243046, + "grad_norm": 806.7274780273438, + "learning_rate": 3.2801740602395105e-05, + "loss": 60.581, + "step": 114770 + }, + { + "epoch": 0.4637257238896722, + "grad_norm": 698.8681640625, + "learning_rate": 3.279842419400899e-05, + "loss": 81.199, + "step": 114780 + }, + { + "epoch": 0.46376612515503984, + "grad_norm": 1318.1488037109375, + "learning_rate": 3.2795107633588586e-05, + "loss": 61.5727, + "step": 114790 + }, + { + "epoch": 0.4638065264204075, + "grad_norm": 2012.5032958984375, + "learning_rate": 3.279179092119855e-05, + "loss": 63.0752, + "step": 114800 + }, + { + "epoch": 0.4638469276857751, + "grad_norm": 946.7776489257812, + "learning_rate": 3.278847405690353e-05, + "loss": 104.6616, + "step": 114810 + }, + { + "epoch": 0.46388732895114276, + "grad_norm": 332.1543273925781, + "learning_rate": 3.278515704076821e-05, + "loss": 57.2958, + "step": 114820 + }, + { + "epoch": 0.4639277302165104, + "grad_norm": 628.6360473632812, + "learning_rate": 3.278183987285724e-05, + "loss": 49.5415, + "step": 114830 + }, + { + "epoch": 0.463968131481878, + "grad_norm": 407.3933410644531, + "learning_rate": 3.277852255323529e-05, + "loss": 45.7567, + "step": 114840 + }, + { + "epoch": 0.46400853274724563, + "grad_norm": 1091.437255859375, + "learning_rate": 3.277520508196705e-05, + "loss": 76.9394, + "step": 114850 + }, + { + "epoch": 0.46404893401261327, + "grad_norm": 540.4053344726562, + "learning_rate": 3.277188745911717e-05, + "loss": 42.3653, + "step": 114860 + }, + { + "epoch": 0.4640893352779809, + "grad_norm": 1648.4383544921875, + "learning_rate": 3.276856968475035e-05, + "loss": 78.4631, + "step": 114870 + }, + { + "epoch": 0.46412973654334855, + "grad_norm": 469.8823547363281, + "learning_rate": 3.276525175893126e-05, + "loss": 59.8111, + "step": 114880 + }, + { + "epoch": 0.4641701378087162, + "grad_norm": 925.3463745117188, + "learning_rate": 3.27619336817246e-05, + "loss": 49.3538, + "step": 114890 + }, + { + "epoch": 0.46421053907408383, + "grad_norm": 1113.8001708984375, + "learning_rate": 3.2758615453195034e-05, + "loss": 64.5263, + "step": 114900 + }, + { + "epoch": 0.4642509403394514, + "grad_norm": 591.8302612304688, + "learning_rate": 3.275529707340728e-05, + "loss": 71.7116, + "step": 114910 + }, + { + "epoch": 0.46429134160481905, + "grad_norm": 564.6202392578125, + "learning_rate": 3.2751978542425995e-05, + "loss": 58.8196, + "step": 114920 + }, + { + "epoch": 0.4643317428701867, + "grad_norm": 1191.913330078125, + "learning_rate": 3.2748659860315916e-05, + "loss": 42.0783, + "step": 114930 + }, + { + "epoch": 0.46437214413555433, + "grad_norm": 599.6019287109375, + "learning_rate": 3.274534102714172e-05, + "loss": 50.7316, + "step": 114940 + }, + { + "epoch": 0.46441254540092197, + "grad_norm": 527.0614013671875, + "learning_rate": 3.2742022042968104e-05, + "loss": 51.8205, + "step": 114950 + }, + { + "epoch": 0.4644529466662896, + "grad_norm": 665.9529418945312, + "learning_rate": 3.273870290785979e-05, + "loss": 68.3194, + "step": 114960 + }, + { + "epoch": 0.4644933479316572, + "grad_norm": 515.5640258789062, + "learning_rate": 3.2735383621881485e-05, + "loss": 37.5466, + "step": 114970 + }, + { + "epoch": 0.46453374919702484, + "grad_norm": 822.676513671875, + "learning_rate": 3.273206418509788e-05, + "loss": 69.2502, + "step": 114980 + }, + { + "epoch": 0.4645741504623925, + "grad_norm": 1226.153076171875, + "learning_rate": 3.272874459757371e-05, + "loss": 50.1505, + "step": 114990 + }, + { + "epoch": 0.4646145517277601, + "grad_norm": 1813.3848876953125, + "learning_rate": 3.272542485937369e-05, + "loss": 77.2155, + "step": 115000 + }, + { + "epoch": 0.46465495299312776, + "grad_norm": 579.1246337890625, + "learning_rate": 3.2722104970562525e-05, + "loss": 93.8051, + "step": 115010 + }, + { + "epoch": 0.4646953542584954, + "grad_norm": 0.0, + "learning_rate": 3.271878493120496e-05, + "loss": 37.53, + "step": 115020 + }, + { + "epoch": 0.464735755523863, + "grad_norm": 285.748779296875, + "learning_rate": 3.27154647413657e-05, + "loss": 51.3141, + "step": 115030 + }, + { + "epoch": 0.4647761567892306, + "grad_norm": 585.9220581054688, + "learning_rate": 3.271214440110948e-05, + "loss": 66.3542, + "step": 115040 + }, + { + "epoch": 0.46481655805459826, + "grad_norm": 385.7046813964844, + "learning_rate": 3.270882391050104e-05, + "loss": 43.7319, + "step": 115050 + }, + { + "epoch": 0.4648569593199659, + "grad_norm": 799.7272338867188, + "learning_rate": 3.270550326960511e-05, + "loss": 73.2882, + "step": 115060 + }, + { + "epoch": 0.46489736058533354, + "grad_norm": 1642.924560546875, + "learning_rate": 3.270218247848642e-05, + "loss": 90.7406, + "step": 115070 + }, + { + "epoch": 0.4649377618507012, + "grad_norm": 890.0045776367188, + "learning_rate": 3.269886153720972e-05, + "loss": 65.4913, + "step": 115080 + }, + { + "epoch": 0.4649781631160688, + "grad_norm": 1087.22802734375, + "learning_rate": 3.2695540445839764e-05, + "loss": 66.0933, + "step": 115090 + }, + { + "epoch": 0.4650185643814364, + "grad_norm": 436.0012512207031, + "learning_rate": 3.269221920444127e-05, + "loss": 48.0393, + "step": 115100 + }, + { + "epoch": 0.46505896564680405, + "grad_norm": 821.9993286132812, + "learning_rate": 3.2688897813079005e-05, + "loss": 48.355, + "step": 115110 + }, + { + "epoch": 0.4650993669121717, + "grad_norm": 434.6961364746094, + "learning_rate": 3.2685576271817716e-05, + "loss": 54.5126, + "step": 115120 + }, + { + "epoch": 0.4651397681775393, + "grad_norm": 897.4439697265625, + "learning_rate": 3.268225458072217e-05, + "loss": 62.3177, + "step": 115130 + }, + { + "epoch": 0.46518016944290697, + "grad_norm": 2384.05224609375, + "learning_rate": 3.267893273985711e-05, + "loss": 72.2372, + "step": 115140 + }, + { + "epoch": 0.4652205707082746, + "grad_norm": 724.9418334960938, + "learning_rate": 3.26756107492873e-05, + "loss": 56.3772, + "step": 115150 + }, + { + "epoch": 0.4652609719736422, + "grad_norm": 666.811767578125, + "learning_rate": 3.267228860907751e-05, + "loss": 45.4823, + "step": 115160 + }, + { + "epoch": 0.46530137323900983, + "grad_norm": 846.5064086914062, + "learning_rate": 3.266896631929251e-05, + "loss": 49.0491, + "step": 115170 + }, + { + "epoch": 0.46534177450437747, + "grad_norm": 372.02447509765625, + "learning_rate": 3.2665643879997056e-05, + "loss": 78.3602, + "step": 115180 + }, + { + "epoch": 0.4653821757697451, + "grad_norm": 745.8118896484375, + "learning_rate": 3.266232129125593e-05, + "loss": 39.577, + "step": 115190 + }, + { + "epoch": 0.46542257703511275, + "grad_norm": 936.6083984375, + "learning_rate": 3.2658998553133895e-05, + "loss": 43.7622, + "step": 115200 + }, + { + "epoch": 0.4654629783004804, + "grad_norm": 458.98114013671875, + "learning_rate": 3.2655675665695754e-05, + "loss": 49.8332, + "step": 115210 + }, + { + "epoch": 0.46550337956584803, + "grad_norm": 636.2042846679688, + "learning_rate": 3.2652352629006275e-05, + "loss": 63.9273, + "step": 115220 + }, + { + "epoch": 0.4655437808312156, + "grad_norm": 556.1316528320312, + "learning_rate": 3.264902944313023e-05, + "loss": 46.3902, + "step": 115230 + }, + { + "epoch": 0.46558418209658325, + "grad_norm": 724.4549560546875, + "learning_rate": 3.2645706108132424e-05, + "loss": 54.3436, + "step": 115240 + }, + { + "epoch": 0.4656245833619509, + "grad_norm": 546.86767578125, + "learning_rate": 3.264238262407764e-05, + "loss": 50.2506, + "step": 115250 + }, + { + "epoch": 0.46566498462731853, + "grad_norm": 784.9192504882812, + "learning_rate": 3.263905899103068e-05, + "loss": 40.6821, + "step": 115260 + }, + { + "epoch": 0.4657053858926862, + "grad_norm": 686.7406616210938, + "learning_rate": 3.263573520905633e-05, + "loss": 52.2234, + "step": 115270 + }, + { + "epoch": 0.4657457871580538, + "grad_norm": 709.7639770507812, + "learning_rate": 3.263241127821938e-05, + "loss": 57.2129, + "step": 115280 + }, + { + "epoch": 0.4657861884234214, + "grad_norm": 704.1000366210938, + "learning_rate": 3.262908719858466e-05, + "loss": 44.5228, + "step": 115290 + }, + { + "epoch": 0.46582658968878904, + "grad_norm": 444.74029541015625, + "learning_rate": 3.262576297021695e-05, + "loss": 49.4663, + "step": 115300 + }, + { + "epoch": 0.4658669909541567, + "grad_norm": 951.1953125, + "learning_rate": 3.262243859318105e-05, + "loss": 60.8233, + "step": 115310 + }, + { + "epoch": 0.4659073922195243, + "grad_norm": 466.8185119628906, + "learning_rate": 3.2619114067541796e-05, + "loss": 92.2595, + "step": 115320 + }, + { + "epoch": 0.46594779348489196, + "grad_norm": 581.5831298828125, + "learning_rate": 3.2615789393363995e-05, + "loss": 53.4414, + "step": 115330 + }, + { + "epoch": 0.4659881947502596, + "grad_norm": 1562.436767578125, + "learning_rate": 3.261246457071245e-05, + "loss": 61.0114, + "step": 115340 + }, + { + "epoch": 0.4660285960156272, + "grad_norm": 1172.59130859375, + "learning_rate": 3.260913959965201e-05, + "loss": 71.2536, + "step": 115350 + }, + { + "epoch": 0.4660689972809948, + "grad_norm": 751.3859252929688, + "learning_rate": 3.260581448024745e-05, + "loss": 53.4582, + "step": 115360 + }, + { + "epoch": 0.46610939854636246, + "grad_norm": 479.7427673339844, + "learning_rate": 3.260248921256364e-05, + "loss": 38.3002, + "step": 115370 + }, + { + "epoch": 0.4661497998117301, + "grad_norm": 562.0839233398438, + "learning_rate": 3.2599163796665376e-05, + "loss": 74.0789, + "step": 115380 + }, + { + "epoch": 0.46619020107709774, + "grad_norm": 1441.49853515625, + "learning_rate": 3.25958382326175e-05, + "loss": 46.7784, + "step": 115390 + }, + { + "epoch": 0.4662306023424654, + "grad_norm": 586.0812377929688, + "learning_rate": 3.2592512520484856e-05, + "loss": 54.3181, + "step": 115400 + }, + { + "epoch": 0.466271003607833, + "grad_norm": 782.4759521484375, + "learning_rate": 3.2589186660332274e-05, + "loss": 59.9024, + "step": 115410 + }, + { + "epoch": 0.4663114048732006, + "grad_norm": 623.7604370117188, + "learning_rate": 3.2585860652224585e-05, + "loss": 52.6946, + "step": 115420 + }, + { + "epoch": 0.46635180613856825, + "grad_norm": 958.4971313476562, + "learning_rate": 3.2582534496226644e-05, + "loss": 56.1301, + "step": 115430 + }, + { + "epoch": 0.4663922074039359, + "grad_norm": 803.4427490234375, + "learning_rate": 3.257920819240328e-05, + "loss": 56.7052, + "step": 115440 + }, + { + "epoch": 0.46643260866930353, + "grad_norm": 481.8475646972656, + "learning_rate": 3.2575881740819355e-05, + "loss": 53.755, + "step": 115450 + }, + { + "epoch": 0.46647300993467117, + "grad_norm": 542.654052734375, + "learning_rate": 3.257255514153971e-05, + "loss": 66.2002, + "step": 115460 + }, + { + "epoch": 0.4665134112000388, + "grad_norm": 468.3689270019531, + "learning_rate": 3.256922839462921e-05, + "loss": 60.2367, + "step": 115470 + }, + { + "epoch": 0.4665538124654064, + "grad_norm": 1403.767333984375, + "learning_rate": 3.25659015001527e-05, + "loss": 108.456, + "step": 115480 + }, + { + "epoch": 0.46659421373077403, + "grad_norm": 708.3703002929688, + "learning_rate": 3.2562574458175044e-05, + "loss": 46.7288, + "step": 115490 + }, + { + "epoch": 0.4666346149961417, + "grad_norm": 1207.593017578125, + "learning_rate": 3.2559247268761115e-05, + "loss": 63.8019, + "step": 115500 + }, + { + "epoch": 0.4666750162615093, + "grad_norm": 919.4540405273438, + "learning_rate": 3.2555919931975766e-05, + "loss": 71.2822, + "step": 115510 + }, + { + "epoch": 0.46671541752687695, + "grad_norm": 2360.607421875, + "learning_rate": 3.2552592447883865e-05, + "loss": 63.8303, + "step": 115520 + }, + { + "epoch": 0.4667558187922446, + "grad_norm": 1612.3497314453125, + "learning_rate": 3.254926481655028e-05, + "loss": 45.8642, + "step": 115530 + }, + { + "epoch": 0.4667962200576122, + "grad_norm": 1008.3920288085938, + "learning_rate": 3.25459370380399e-05, + "loss": 49.0776, + "step": 115540 + }, + { + "epoch": 0.4668366213229798, + "grad_norm": 2217.866943359375, + "learning_rate": 3.254260911241759e-05, + "loss": 72.4225, + "step": 115550 + }, + { + "epoch": 0.46687702258834746, + "grad_norm": 413.4029235839844, + "learning_rate": 3.253928103974823e-05, + "loss": 66.9536, + "step": 115560 + }, + { + "epoch": 0.4669174238537151, + "grad_norm": 389.1944580078125, + "learning_rate": 3.253595282009671e-05, + "loss": 32.475, + "step": 115570 + }, + { + "epoch": 0.46695782511908274, + "grad_norm": 0.0, + "learning_rate": 3.253262445352791e-05, + "loss": 60.644, + "step": 115580 + }, + { + "epoch": 0.4669982263844504, + "grad_norm": 366.5119934082031, + "learning_rate": 3.252929594010671e-05, + "loss": 59.2907, + "step": 115590 + }, + { + "epoch": 0.467038627649818, + "grad_norm": 882.8370971679688, + "learning_rate": 3.2525967279898015e-05, + "loss": 70.1194, + "step": 115600 + }, + { + "epoch": 0.4670790289151856, + "grad_norm": 955.0043334960938, + "learning_rate": 3.252263847296671e-05, + "loss": 62.6897, + "step": 115610 + }, + { + "epoch": 0.46711943018055324, + "grad_norm": 815.323486328125, + "learning_rate": 3.25193095193777e-05, + "loss": 30.4579, + "step": 115620 + }, + { + "epoch": 0.4671598314459209, + "grad_norm": 721.734130859375, + "learning_rate": 3.251598041919587e-05, + "loss": 66.6569, + "step": 115630 + }, + { + "epoch": 0.4672002327112885, + "grad_norm": 870.7584838867188, + "learning_rate": 3.251265117248614e-05, + "loss": 52.2032, + "step": 115640 + }, + { + "epoch": 0.46724063397665616, + "grad_norm": 4967.96728515625, + "learning_rate": 3.25093217793134e-05, + "loss": 56.7999, + "step": 115650 + }, + { + "epoch": 0.4672810352420238, + "grad_norm": 805.152099609375, + "learning_rate": 3.250599223974258e-05, + "loss": 52.8073, + "step": 115660 + }, + { + "epoch": 0.4673214365073914, + "grad_norm": 614.6864624023438, + "learning_rate": 3.250266255383857e-05, + "loss": 54.9746, + "step": 115670 + }, + { + "epoch": 0.467361837772759, + "grad_norm": 1058.3001708984375, + "learning_rate": 3.249933272166629e-05, + "loss": 92.6986, + "step": 115680 + }, + { + "epoch": 0.46740223903812667, + "grad_norm": 1258.4139404296875, + "learning_rate": 3.249600274329066e-05, + "loss": 54.6076, + "step": 115690 + }, + { + "epoch": 0.4674426403034943, + "grad_norm": 712.8331909179688, + "learning_rate": 3.24926726187766e-05, + "loss": 65.3554, + "step": 115700 + }, + { + "epoch": 0.46748304156886195, + "grad_norm": 826.5805053710938, + "learning_rate": 3.248934234818902e-05, + "loss": 82.2117, + "step": 115710 + }, + { + "epoch": 0.4675234428342296, + "grad_norm": 770.5543212890625, + "learning_rate": 3.248601193159287e-05, + "loss": 73.6685, + "step": 115720 + }, + { + "epoch": 0.4675638440995972, + "grad_norm": 780.5651245117188, + "learning_rate": 3.248268136905304e-05, + "loss": 44.8846, + "step": 115730 + }, + { + "epoch": 0.4676042453649648, + "grad_norm": 1183.1986083984375, + "learning_rate": 3.247935066063451e-05, + "loss": 57.1673, + "step": 115740 + }, + { + "epoch": 0.46764464663033245, + "grad_norm": 461.81573486328125, + "learning_rate": 3.247601980640217e-05, + "loss": 45.7133, + "step": 115750 + }, + { + "epoch": 0.4676850478957001, + "grad_norm": 720.4097290039062, + "learning_rate": 3.247268880642098e-05, + "loss": 44.3927, + "step": 115760 + }, + { + "epoch": 0.46772544916106773, + "grad_norm": 1013.3065795898438, + "learning_rate": 3.246935766075588e-05, + "loss": 52.5788, + "step": 115770 + }, + { + "epoch": 0.46776585042643537, + "grad_norm": 947.147216796875, + "learning_rate": 3.24660263694718e-05, + "loss": 93.3442, + "step": 115780 + }, + { + "epoch": 0.467806251691803, + "grad_norm": 676.0074462890625, + "learning_rate": 3.24626949326337e-05, + "loss": 48.2604, + "step": 115790 + }, + { + "epoch": 0.4678466529571706, + "grad_norm": 433.5167541503906, + "learning_rate": 3.245936335030651e-05, + "loss": 68.8754, + "step": 115800 + }, + { + "epoch": 0.46788705422253823, + "grad_norm": 759.6268920898438, + "learning_rate": 3.2456031622555197e-05, + "loss": 42.3743, + "step": 115810 + }, + { + "epoch": 0.4679274554879059, + "grad_norm": 650.4202880859375, + "learning_rate": 3.245269974944471e-05, + "loss": 83.6153, + "step": 115820 + }, + { + "epoch": 0.4679678567532735, + "grad_norm": 963.0789794921875, + "learning_rate": 3.2449367731039996e-05, + "loss": 60.6028, + "step": 115830 + }, + { + "epoch": 0.46800825801864115, + "grad_norm": 2292.56005859375, + "learning_rate": 3.244603556740603e-05, + "loss": 74.8116, + "step": 115840 + }, + { + "epoch": 0.4680486592840088, + "grad_norm": 457.2271423339844, + "learning_rate": 3.2442703258607766e-05, + "loss": 37.3418, + "step": 115850 + }, + { + "epoch": 0.4680890605493764, + "grad_norm": 474.98126220703125, + "learning_rate": 3.243937080471017e-05, + "loss": 61.6, + "step": 115860 + }, + { + "epoch": 0.468129461814744, + "grad_norm": 528.2426147460938, + "learning_rate": 3.243603820577822e-05, + "loss": 37.5654, + "step": 115870 + }, + { + "epoch": 0.46816986308011166, + "grad_norm": 885.8614501953125, + "learning_rate": 3.243270546187687e-05, + "loss": 39.1532, + "step": 115880 + }, + { + "epoch": 0.4682102643454793, + "grad_norm": 2204.4306640625, + "learning_rate": 3.242937257307109e-05, + "loss": 67.4763, + "step": 115890 + }, + { + "epoch": 0.46825066561084694, + "grad_norm": 1517.2777099609375, + "learning_rate": 3.2426039539425876e-05, + "loss": 53.94, + "step": 115900 + }, + { + "epoch": 0.4682910668762146, + "grad_norm": 2055.982666015625, + "learning_rate": 3.2422706361006194e-05, + "loss": 58.1359, + "step": 115910 + }, + { + "epoch": 0.4683314681415822, + "grad_norm": 607.13330078125, + "learning_rate": 3.241937303787703e-05, + "loss": 49.6527, + "step": 115920 + }, + { + "epoch": 0.4683718694069498, + "grad_norm": 899.364990234375, + "learning_rate": 3.2416039570103375e-05, + "loss": 57.977, + "step": 115930 + }, + { + "epoch": 0.46841227067231744, + "grad_norm": 654.2115478515625, + "learning_rate": 3.241270595775021e-05, + "loss": 73.5976, + "step": 115940 + }, + { + "epoch": 0.4684526719376851, + "grad_norm": 786.1331176757812, + "learning_rate": 3.240937220088253e-05, + "loss": 35.1766, + "step": 115950 + }, + { + "epoch": 0.4684930732030527, + "grad_norm": 1251.7491455078125, + "learning_rate": 3.240603829956531e-05, + "loss": 62.6786, + "step": 115960 + }, + { + "epoch": 0.46853347446842036, + "grad_norm": 1378.3673095703125, + "learning_rate": 3.240270425386357e-05, + "loss": 75.6629, + "step": 115970 + }, + { + "epoch": 0.468573875733788, + "grad_norm": 344.47747802734375, + "learning_rate": 3.2399370063842294e-05, + "loss": 46.953, + "step": 115980 + }, + { + "epoch": 0.4686142769991556, + "grad_norm": 383.6866760253906, + "learning_rate": 3.23960357295665e-05, + "loss": 47.923, + "step": 115990 + }, + { + "epoch": 0.46865467826452323, + "grad_norm": 471.9056091308594, + "learning_rate": 3.239270125110117e-05, + "loss": 77.3322, + "step": 116000 + }, + { + "epoch": 0.46869507952989087, + "grad_norm": 879.043701171875, + "learning_rate": 3.238936662851133e-05, + "loss": 48.9826, + "step": 116010 + }, + { + "epoch": 0.4687354807952585, + "grad_norm": 632.9555053710938, + "learning_rate": 3.2386031861861976e-05, + "loss": 31.8865, + "step": 116020 + }, + { + "epoch": 0.46877588206062615, + "grad_norm": 1467.3824462890625, + "learning_rate": 3.2382696951218135e-05, + "loss": 95.1815, + "step": 116030 + }, + { + "epoch": 0.4688162833259938, + "grad_norm": 336.1209411621094, + "learning_rate": 3.2379361896644816e-05, + "loss": 54.7575, + "step": 116040 + }, + { + "epoch": 0.46885668459136143, + "grad_norm": 873.735595703125, + "learning_rate": 3.237602669820704e-05, + "loss": 63.7571, + "step": 116050 + }, + { + "epoch": 0.468897085856729, + "grad_norm": 537.8552856445312, + "learning_rate": 3.2372691355969816e-05, + "loss": 55.9651, + "step": 116060 + }, + { + "epoch": 0.46893748712209665, + "grad_norm": 353.98614501953125, + "learning_rate": 3.2369355869998185e-05, + "loss": 39.4995, + "step": 116070 + }, + { + "epoch": 0.4689778883874643, + "grad_norm": 195.72950744628906, + "learning_rate": 3.236602024035716e-05, + "loss": 53.4221, + "step": 116080 + }, + { + "epoch": 0.46901828965283193, + "grad_norm": 657.9056396484375, + "learning_rate": 3.236268446711179e-05, + "loss": 66.8073, + "step": 116090 + }, + { + "epoch": 0.4690586909181996, + "grad_norm": 483.935546875, + "learning_rate": 3.235934855032709e-05, + "loss": 51.066, + "step": 116100 + }, + { + "epoch": 0.4690990921835672, + "grad_norm": 1542.11865234375, + "learning_rate": 3.23560124900681e-05, + "loss": 89.0544, + "step": 116110 + }, + { + "epoch": 0.4691394934489348, + "grad_norm": 838.8958740234375, + "learning_rate": 3.235267628639987e-05, + "loss": 60.8755, + "step": 116120 + }, + { + "epoch": 0.46917989471430244, + "grad_norm": 861.8223266601562, + "learning_rate": 3.234933993938742e-05, + "loss": 47.4594, + "step": 116130 + }, + { + "epoch": 0.4692202959796701, + "grad_norm": 550.157958984375, + "learning_rate": 3.2346003449095805e-05, + "loss": 53.5828, + "step": 116140 + }, + { + "epoch": 0.4692606972450377, + "grad_norm": 503.9146728515625, + "learning_rate": 3.234266681559007e-05, + "loss": 78.4864, + "step": 116150 + }, + { + "epoch": 0.46930109851040536, + "grad_norm": 333.4444885253906, + "learning_rate": 3.2339330038935265e-05, + "loss": 54.1187, + "step": 116160 + }, + { + "epoch": 0.469341499775773, + "grad_norm": 666.8385620117188, + "learning_rate": 3.233599311919644e-05, + "loss": 60.4023, + "step": 116170 + }, + { + "epoch": 0.4693819010411406, + "grad_norm": 641.484619140625, + "learning_rate": 3.233265605643866e-05, + "loss": 63.9357, + "step": 116180 + }, + { + "epoch": 0.4694223023065082, + "grad_norm": 1227.126953125, + "learning_rate": 3.232931885072697e-05, + "loss": 61.4396, + "step": 116190 + }, + { + "epoch": 0.46946270357187586, + "grad_norm": 1222.384765625, + "learning_rate": 3.2325981502126433e-05, + "loss": 61.0376, + "step": 116200 + }, + { + "epoch": 0.4695031048372435, + "grad_norm": 706.1589965820312, + "learning_rate": 3.232264401070213e-05, + "loss": 58.2779, + "step": 116210 + }, + { + "epoch": 0.46954350610261114, + "grad_norm": 558.5364990234375, + "learning_rate": 3.231930637651909e-05, + "loss": 52.5935, + "step": 116220 + }, + { + "epoch": 0.4695839073679788, + "grad_norm": 928.905029296875, + "learning_rate": 3.231596859964242e-05, + "loss": 61.5201, + "step": 116230 + }, + { + "epoch": 0.4696243086333464, + "grad_norm": 630.4097290039062, + "learning_rate": 3.2312630680137175e-05, + "loss": 37.5664, + "step": 116240 + }, + { + "epoch": 0.469664709898714, + "grad_norm": 1384.9384765625, + "learning_rate": 3.230929261806842e-05, + "loss": 53.7496, + "step": 116250 + }, + { + "epoch": 0.46970511116408165, + "grad_norm": 147.7491912841797, + "learning_rate": 3.230595441350125e-05, + "loss": 55.0247, + "step": 116260 + }, + { + "epoch": 0.4697455124294493, + "grad_norm": 334.8374938964844, + "learning_rate": 3.2302616066500735e-05, + "loss": 46.3881, + "step": 116270 + }, + { + "epoch": 0.4697859136948169, + "grad_norm": 789.24755859375, + "learning_rate": 3.229927757713196e-05, + "loss": 70.6806, + "step": 116280 + }, + { + "epoch": 0.46982631496018457, + "grad_norm": 1363.5418701171875, + "learning_rate": 3.229593894546001e-05, + "loss": 58.1901, + "step": 116290 + }, + { + "epoch": 0.4698667162255522, + "grad_norm": 609.9053344726562, + "learning_rate": 3.229260017154997e-05, + "loss": 98.0226, + "step": 116300 + }, + { + "epoch": 0.4699071174909198, + "grad_norm": 865.0936279296875, + "learning_rate": 3.228926125546695e-05, + "loss": 61.3911, + "step": 116310 + }, + { + "epoch": 0.46994751875628743, + "grad_norm": 590.865234375, + "learning_rate": 3.228592219727602e-05, + "loss": 86.893, + "step": 116320 + }, + { + "epoch": 0.46998792002165507, + "grad_norm": 789.493408203125, + "learning_rate": 3.2282582997042285e-05, + "loss": 49.3227, + "step": 116330 + }, + { + "epoch": 0.4700283212870227, + "grad_norm": 707.34814453125, + "learning_rate": 3.2279243654830836e-05, + "loss": 69.9071, + "step": 116340 + }, + { + "epoch": 0.47006872255239035, + "grad_norm": 1062.26123046875, + "learning_rate": 3.2275904170706797e-05, + "loss": 55.3705, + "step": 116350 + }, + { + "epoch": 0.470109123817758, + "grad_norm": 967.4932861328125, + "learning_rate": 3.227256454473526e-05, + "loss": 42.835, + "step": 116360 + }, + { + "epoch": 0.47014952508312563, + "grad_norm": 645.4476318359375, + "learning_rate": 3.226922477698133e-05, + "loss": 57.326, + "step": 116370 + }, + { + "epoch": 0.4701899263484932, + "grad_norm": 884.6676025390625, + "learning_rate": 3.226588486751012e-05, + "loss": 66.296, + "step": 116380 + }, + { + "epoch": 0.47023032761386085, + "grad_norm": 3511.986083984375, + "learning_rate": 3.2262544816386745e-05, + "loss": 49.9027, + "step": 116390 + }, + { + "epoch": 0.4702707288792285, + "grad_norm": 476.053955078125, + "learning_rate": 3.225920462367632e-05, + "loss": 68.0334, + "step": 116400 + }, + { + "epoch": 0.47031113014459613, + "grad_norm": 367.8993225097656, + "learning_rate": 3.225586428944396e-05, + "loss": 41.9805, + "step": 116410 + }, + { + "epoch": 0.4703515314099638, + "grad_norm": 500.8946533203125, + "learning_rate": 3.225252381375479e-05, + "loss": 65.6448, + "step": 116420 + }, + { + "epoch": 0.4703919326753314, + "grad_norm": 1217.2655029296875, + "learning_rate": 3.224918319667394e-05, + "loss": 83.4919, + "step": 116430 + }, + { + "epoch": 0.470432333940699, + "grad_norm": 662.4497680664062, + "learning_rate": 3.2245842438266526e-05, + "loss": 65.4877, + "step": 116440 + }, + { + "epoch": 0.47047273520606664, + "grad_norm": 725.7619018554688, + "learning_rate": 3.224250153859769e-05, + "loss": 55.515, + "step": 116450 + }, + { + "epoch": 0.4705131364714343, + "grad_norm": 910.7822265625, + "learning_rate": 3.223916049773256e-05, + "loss": 43.4893, + "step": 116460 + }, + { + "epoch": 0.4705535377368019, + "grad_norm": 2010.1138916015625, + "learning_rate": 3.223581931573625e-05, + "loss": 58.6502, + "step": 116470 + }, + { + "epoch": 0.47059393900216956, + "grad_norm": 715.5496826171875, + "learning_rate": 3.223247799267394e-05, + "loss": 84.7735, + "step": 116480 + }, + { + "epoch": 0.4706343402675372, + "grad_norm": 737.7570190429688, + "learning_rate": 3.2229136528610736e-05, + "loss": 39.7872, + "step": 116490 + }, + { + "epoch": 0.4706747415329048, + "grad_norm": 532.3171997070312, + "learning_rate": 3.222579492361179e-05, + "loss": 67.2663, + "step": 116500 + }, + { + "epoch": 0.4707151427982724, + "grad_norm": 1556.9482421875, + "learning_rate": 3.222245317774226e-05, + "loss": 57.7589, + "step": 116510 + }, + { + "epoch": 0.47075554406364006, + "grad_norm": 1564.6119384765625, + "learning_rate": 3.221911129106728e-05, + "loss": 41.327, + "step": 116520 + }, + { + "epoch": 0.4707959453290077, + "grad_norm": 668.7022094726562, + "learning_rate": 3.221576926365202e-05, + "loss": 53.1316, + "step": 116530 + }, + { + "epoch": 0.47083634659437534, + "grad_norm": 940.4009399414062, + "learning_rate": 3.221242709556161e-05, + "loss": 52.2775, + "step": 116540 + }, + { + "epoch": 0.470876747859743, + "grad_norm": 929.6431884765625, + "learning_rate": 3.220908478686123e-05, + "loss": 63.8318, + "step": 116550 + }, + { + "epoch": 0.4709171491251106, + "grad_norm": 770.2941284179688, + "learning_rate": 3.220574233761603e-05, + "loss": 50.6914, + "step": 116560 + }, + { + "epoch": 0.4709575503904782, + "grad_norm": 2690.2001953125, + "learning_rate": 3.220239974789117e-05, + "loss": 103.2043, + "step": 116570 + }, + { + "epoch": 0.47099795165584585, + "grad_norm": 744.5656127929688, + "learning_rate": 3.219905701775182e-05, + "loss": 40.3893, + "step": 116580 + }, + { + "epoch": 0.4710383529212135, + "grad_norm": 2249.509765625, + "learning_rate": 3.219571414726315e-05, + "loss": 55.5555, + "step": 116590 + }, + { + "epoch": 0.47107875418658113, + "grad_norm": 392.30657958984375, + "learning_rate": 3.219237113649032e-05, + "loss": 63.3421, + "step": 116600 + }, + { + "epoch": 0.47111915545194877, + "grad_norm": 0.0, + "learning_rate": 3.2189027985498514e-05, + "loss": 82.4716, + "step": 116610 + }, + { + "epoch": 0.4711595567173164, + "grad_norm": 626.5792846679688, + "learning_rate": 3.2185684694352916e-05, + "loss": 38.248, + "step": 116620 + }, + { + "epoch": 0.471199957982684, + "grad_norm": 0.0, + "learning_rate": 3.218234126311869e-05, + "loss": 39.9608, + "step": 116630 + }, + { + "epoch": 0.47124035924805163, + "grad_norm": 959.5233154296875, + "learning_rate": 3.2178997691861014e-05, + "loss": 56.8771, + "step": 116640 + }, + { + "epoch": 0.4712807605134193, + "grad_norm": 368.611083984375, + "learning_rate": 3.217565398064509e-05, + "loss": 30.2148, + "step": 116650 + }, + { + "epoch": 0.4713211617787869, + "grad_norm": 2147.029296875, + "learning_rate": 3.2172310129536096e-05, + "loss": 56.8126, + "step": 116660 + }, + { + "epoch": 0.47136156304415455, + "grad_norm": 528.3900146484375, + "learning_rate": 3.2168966138599225e-05, + "loss": 93.9726, + "step": 116670 + }, + { + "epoch": 0.4714019643095222, + "grad_norm": 1705.246337890625, + "learning_rate": 3.2165622007899676e-05, + "loss": 42.9779, + "step": 116680 + }, + { + "epoch": 0.47144236557488983, + "grad_norm": 1022.4556884765625, + "learning_rate": 3.216227773750262e-05, + "loss": 40.3776, + "step": 116690 + }, + { + "epoch": 0.4714827668402574, + "grad_norm": 836.2701416015625, + "learning_rate": 3.215893332747328e-05, + "loss": 75.0597, + "step": 116700 + }, + { + "epoch": 0.47152316810562506, + "grad_norm": 493.19110107421875, + "learning_rate": 3.2155588777876856e-05, + "loss": 45.9303, + "step": 116710 + }, + { + "epoch": 0.4715635693709927, + "grad_norm": 800.6790771484375, + "learning_rate": 3.215224408877854e-05, + "loss": 52.3173, + "step": 116720 + }, + { + "epoch": 0.47160397063636034, + "grad_norm": 1031.7352294921875, + "learning_rate": 3.2148899260243545e-05, + "loss": 41.5077, + "step": 116730 + }, + { + "epoch": 0.471644371901728, + "grad_norm": 524.3075561523438, + "learning_rate": 3.214555429233707e-05, + "loss": 43.6509, + "step": 116740 + }, + { + "epoch": 0.4716847731670956, + "grad_norm": 2053.65576171875, + "learning_rate": 3.214220918512434e-05, + "loss": 52.429, + "step": 116750 + }, + { + "epoch": 0.4717251744324632, + "grad_norm": 660.8265380859375, + "learning_rate": 3.213886393867057e-05, + "loss": 59.9484, + "step": 116760 + }, + { + "epoch": 0.47176557569783084, + "grad_norm": 539.3421630859375, + "learning_rate": 3.2135518553040964e-05, + "loss": 56.5854, + "step": 116770 + }, + { + "epoch": 0.4718059769631985, + "grad_norm": 461.3349304199219, + "learning_rate": 3.2132173028300756e-05, + "loss": 56.3017, + "step": 116780 + }, + { + "epoch": 0.4718463782285661, + "grad_norm": 1058.8426513671875, + "learning_rate": 3.212882736451516e-05, + "loss": 47.9099, + "step": 116790 + }, + { + "epoch": 0.47188677949393376, + "grad_norm": 521.5872802734375, + "learning_rate": 3.21254815617494e-05, + "loss": 48.9531, + "step": 116800 + }, + { + "epoch": 0.4719271807593014, + "grad_norm": 789.1767578125, + "learning_rate": 3.212213562006872e-05, + "loss": 77.1051, + "step": 116810 + }, + { + "epoch": 0.471967582024669, + "grad_norm": 710.3841552734375, + "learning_rate": 3.2118789539538335e-05, + "loss": 69.6694, + "step": 116820 + }, + { + "epoch": 0.4720079832900366, + "grad_norm": 1419.693115234375, + "learning_rate": 3.211544332022348e-05, + "loss": 65.4239, + "step": 116830 + }, + { + "epoch": 0.47204838455540427, + "grad_norm": 518.5504760742188, + "learning_rate": 3.21120969621894e-05, + "loss": 60.0513, + "step": 116840 + }, + { + "epoch": 0.4720887858207719, + "grad_norm": 385.5600280761719, + "learning_rate": 3.210875046550132e-05, + "loss": 39.933, + "step": 116850 + }, + { + "epoch": 0.47212918708613955, + "grad_norm": 850.2272338867188, + "learning_rate": 3.210540383022449e-05, + "loss": 81.0095, + "step": 116860 + }, + { + "epoch": 0.4721695883515072, + "grad_norm": 443.19189453125, + "learning_rate": 3.210205705642416e-05, + "loss": 81.4084, + "step": 116870 + }, + { + "epoch": 0.4722099896168748, + "grad_norm": 490.591796875, + "learning_rate": 3.209871014416557e-05, + "loss": 44.2058, + "step": 116880 + }, + { + "epoch": 0.4722503908822424, + "grad_norm": 1418.0242919921875, + "learning_rate": 3.209536309351397e-05, + "loss": 80.6199, + "step": 116890 + }, + { + "epoch": 0.47229079214761005, + "grad_norm": 1384.6348876953125, + "learning_rate": 3.209201590453461e-05, + "loss": 90.8569, + "step": 116900 + }, + { + "epoch": 0.4723311934129777, + "grad_norm": 595.4918823242188, + "learning_rate": 3.208866857729276e-05, + "loss": 63.6417, + "step": 116910 + }, + { + "epoch": 0.47237159467834533, + "grad_norm": 689.3577880859375, + "learning_rate": 3.208532111185365e-05, + "loss": 57.5565, + "step": 116920 + }, + { + "epoch": 0.47241199594371297, + "grad_norm": 611.20751953125, + "learning_rate": 3.208197350828257e-05, + "loss": 80.1349, + "step": 116930 + }, + { + "epoch": 0.4724523972090806, + "grad_norm": 793.7510375976562, + "learning_rate": 3.207862576664477e-05, + "loss": 73.2373, + "step": 116940 + }, + { + "epoch": 0.4724927984744482, + "grad_norm": 777.3289184570312, + "learning_rate": 3.207527788700551e-05, + "loss": 74.5804, + "step": 116950 + }, + { + "epoch": 0.47253319973981583, + "grad_norm": 1934.6875, + "learning_rate": 3.207192986943006e-05, + "loss": 78.4664, + "step": 116960 + }, + { + "epoch": 0.4725736010051835, + "grad_norm": 573.094482421875, + "learning_rate": 3.206858171398371e-05, + "loss": 77.9668, + "step": 116970 + }, + { + "epoch": 0.4726140022705511, + "grad_norm": 290.15185546875, + "learning_rate": 3.206523342073172e-05, + "loss": 56.8567, + "step": 116980 + }, + { + "epoch": 0.47265440353591875, + "grad_norm": 603.3993530273438, + "learning_rate": 3.206188498973935e-05, + "loss": 62.5748, + "step": 116990 + }, + { + "epoch": 0.4726948048012864, + "grad_norm": 459.7473449707031, + "learning_rate": 3.205853642107192e-05, + "loss": 55.5894, + "step": 117000 + }, + { + "epoch": 0.47273520606665403, + "grad_norm": 2034.221923828125, + "learning_rate": 3.2055187714794674e-05, + "loss": 94.7735, + "step": 117010 + }, + { + "epoch": 0.4727756073320216, + "grad_norm": 556.3843383789062, + "learning_rate": 3.205183887097291e-05, + "loss": 49.9171, + "step": 117020 + }, + { + "epoch": 0.47281600859738926, + "grad_norm": 618.3563232421875, + "learning_rate": 3.2048489889671915e-05, + "loss": 64.9501, + "step": 117030 + }, + { + "epoch": 0.4728564098627569, + "grad_norm": 1050.610107421875, + "learning_rate": 3.204514077095699e-05, + "loss": 45.0024, + "step": 117040 + }, + { + "epoch": 0.47289681112812454, + "grad_norm": 676.1912841796875, + "learning_rate": 3.2041791514893416e-05, + "loss": 71.348, + "step": 117050 + }, + { + "epoch": 0.4729372123934922, + "grad_norm": 338.9596252441406, + "learning_rate": 3.2038442121546487e-05, + "loss": 60.3322, + "step": 117060 + }, + { + "epoch": 0.4729776136588598, + "grad_norm": 1104.4334716796875, + "learning_rate": 3.2035092590981514e-05, + "loss": 65.3458, + "step": 117070 + }, + { + "epoch": 0.4730180149242274, + "grad_norm": 977.1143188476562, + "learning_rate": 3.203174292326378e-05, + "loss": 65.3839, + "step": 117080 + }, + { + "epoch": 0.47305841618959504, + "grad_norm": 552.0875854492188, + "learning_rate": 3.20283931184586e-05, + "loss": 52.0979, + "step": 117090 + }, + { + "epoch": 0.4730988174549627, + "grad_norm": 615.712646484375, + "learning_rate": 3.202504317663128e-05, + "loss": 32.2021, + "step": 117100 + }, + { + "epoch": 0.4731392187203303, + "grad_norm": 550.6535034179688, + "learning_rate": 3.2021693097847125e-05, + "loss": 70.6825, + "step": 117110 + }, + { + "epoch": 0.47317961998569796, + "grad_norm": 610.3817749023438, + "learning_rate": 3.2018342882171445e-05, + "loss": 36.3148, + "step": 117120 + }, + { + "epoch": 0.4732200212510656, + "grad_norm": 755.9873657226562, + "learning_rate": 3.2014992529669566e-05, + "loss": 44.023, + "step": 117130 + }, + { + "epoch": 0.4732604225164332, + "grad_norm": 847.390625, + "learning_rate": 3.2011642040406784e-05, + "loss": 49.9786, + "step": 117140 + }, + { + "epoch": 0.47330082378180083, + "grad_norm": 644.954345703125, + "learning_rate": 3.200829141444844e-05, + "loss": 93.2963, + "step": 117150 + }, + { + "epoch": 0.47334122504716847, + "grad_norm": 646.0951538085938, + "learning_rate": 3.2004940651859844e-05, + "loss": 55.8603, + "step": 117160 + }, + { + "epoch": 0.4733816263125361, + "grad_norm": 1527.83447265625, + "learning_rate": 3.200158975270633e-05, + "loss": 69.3903, + "step": 117170 + }, + { + "epoch": 0.47342202757790375, + "grad_norm": 1161.49658203125, + "learning_rate": 3.1998238717053206e-05, + "loss": 63.7593, + "step": 117180 + }, + { + "epoch": 0.4734624288432714, + "grad_norm": 1293.619140625, + "learning_rate": 3.199488754496582e-05, + "loss": 63.1005, + "step": 117190 + }, + { + "epoch": 0.47350283010863903, + "grad_norm": 505.3819580078125, + "learning_rate": 3.19915362365095e-05, + "loss": 50.2402, + "step": 117200 + }, + { + "epoch": 0.4735432313740066, + "grad_norm": 648.815185546875, + "learning_rate": 3.198818479174959e-05, + "loss": 53.0908, + "step": 117210 + }, + { + "epoch": 0.47358363263937425, + "grad_norm": 601.2451171875, + "learning_rate": 3.198483321075141e-05, + "loss": 56.0472, + "step": 117220 + }, + { + "epoch": 0.4736240339047419, + "grad_norm": 401.19842529296875, + "learning_rate": 3.198148149358031e-05, + "loss": 58.7673, + "step": 117230 + }, + { + "epoch": 0.47366443517010953, + "grad_norm": 942.248046875, + "learning_rate": 3.197812964030164e-05, + "loss": 42.9004, + "step": 117240 + }, + { + "epoch": 0.4737048364354772, + "grad_norm": 1621.0218505859375, + "learning_rate": 3.1974777650980735e-05, + "loss": 59.7592, + "step": 117250 + }, + { + "epoch": 0.4737452377008448, + "grad_norm": 1157.42431640625, + "learning_rate": 3.197142552568295e-05, + "loss": 53.8315, + "step": 117260 + }, + { + "epoch": 0.4737856389662124, + "grad_norm": 2135.51611328125, + "learning_rate": 3.196807326447363e-05, + "loss": 74.3025, + "step": 117270 + }, + { + "epoch": 0.47382604023158004, + "grad_norm": 982.652587890625, + "learning_rate": 3.196472086741815e-05, + "loss": 56.9442, + "step": 117280 + }, + { + "epoch": 0.4738664414969477, + "grad_norm": 303.63623046875, + "learning_rate": 3.1961368334581844e-05, + "loss": 44.213, + "step": 117290 + }, + { + "epoch": 0.4739068427623153, + "grad_norm": 555.4284057617188, + "learning_rate": 3.195801566603007e-05, + "loss": 53.6509, + "step": 117300 + }, + { + "epoch": 0.47394724402768296, + "grad_norm": 946.455078125, + "learning_rate": 3.1954662861828204e-05, + "loss": 82.5156, + "step": 117310 + }, + { + "epoch": 0.4739876452930506, + "grad_norm": 1169.52685546875, + "learning_rate": 3.195130992204161e-05, + "loss": 57.4077, + "step": 117320 + }, + { + "epoch": 0.47402804655841824, + "grad_norm": 595.98291015625, + "learning_rate": 3.1947956846735645e-05, + "loss": 76.41, + "step": 117330 + }, + { + "epoch": 0.4740684478237858, + "grad_norm": 463.3590393066406, + "learning_rate": 3.194460363597569e-05, + "loss": 44.9645, + "step": 117340 + }, + { + "epoch": 0.47410884908915346, + "grad_norm": 443.7630615234375, + "learning_rate": 3.1941250289827104e-05, + "loss": 49.6183, + "step": 117350 + }, + { + "epoch": 0.4741492503545211, + "grad_norm": 2856.91259765625, + "learning_rate": 3.193789680835527e-05, + "loss": 59.4347, + "step": 117360 + }, + { + "epoch": 0.47418965161988874, + "grad_norm": 502.811279296875, + "learning_rate": 3.193454319162557e-05, + "loss": 52.4119, + "step": 117370 + }, + { + "epoch": 0.4742300528852564, + "grad_norm": 867.6959228515625, + "learning_rate": 3.193118943970338e-05, + "loss": 90.0065, + "step": 117380 + }, + { + "epoch": 0.474270454150624, + "grad_norm": 1995.4403076171875, + "learning_rate": 3.192783555265408e-05, + "loss": 68.9316, + "step": 117390 + }, + { + "epoch": 0.4743108554159916, + "grad_norm": 522.066162109375, + "learning_rate": 3.192448153054306e-05, + "loss": 41.8321, + "step": 117400 + }, + { + "epoch": 0.47435125668135925, + "grad_norm": 835.8606567382812, + "learning_rate": 3.1921127373435714e-05, + "loss": 67.2701, + "step": 117410 + }, + { + "epoch": 0.4743916579467269, + "grad_norm": 628.2836303710938, + "learning_rate": 3.191777308139742e-05, + "loss": 57.3309, + "step": 117420 + }, + { + "epoch": 0.4744320592120945, + "grad_norm": 582.8585205078125, + "learning_rate": 3.1914418654493586e-05, + "loss": 79.4639, + "step": 117430 + }, + { + "epoch": 0.47447246047746217, + "grad_norm": 452.00830078125, + "learning_rate": 3.191106409278959e-05, + "loss": 57.4894, + "step": 117440 + }, + { + "epoch": 0.4745128617428298, + "grad_norm": 867.4202270507812, + "learning_rate": 3.1907709396350844e-05, + "loss": 54.852, + "step": 117450 + }, + { + "epoch": 0.4745532630081974, + "grad_norm": 375.7869567871094, + "learning_rate": 3.190435456524275e-05, + "loss": 42.05, + "step": 117460 + }, + { + "epoch": 0.47459366427356503, + "grad_norm": 453.1134033203125, + "learning_rate": 3.190099959953071e-05, + "loss": 34.9451, + "step": 117470 + }, + { + "epoch": 0.47463406553893267, + "grad_norm": 614.6144409179688, + "learning_rate": 3.189764449928012e-05, + "loss": 86.9492, + "step": 117480 + }, + { + "epoch": 0.4746744668043003, + "grad_norm": 673.782470703125, + "learning_rate": 3.1894289264556417e-05, + "loss": 69.0601, + "step": 117490 + }, + { + "epoch": 0.47471486806966795, + "grad_norm": 541.9735107421875, + "learning_rate": 3.1890933895424976e-05, + "loss": 46.3959, + "step": 117500 + }, + { + "epoch": 0.4747552693350356, + "grad_norm": 379.82891845703125, + "learning_rate": 3.188757839195125e-05, + "loss": 44.2336, + "step": 117510 + }, + { + "epoch": 0.47479567060040323, + "grad_norm": 680.2620849609375, + "learning_rate": 3.1884222754200625e-05, + "loss": 54.9478, + "step": 117520 + }, + { + "epoch": 0.4748360718657708, + "grad_norm": 1045.8995361328125, + "learning_rate": 3.188086698223853e-05, + "loss": 62.2926, + "step": 117530 + }, + { + "epoch": 0.47487647313113845, + "grad_norm": 1066.854248046875, + "learning_rate": 3.1877511076130404e-05, + "loss": 52.8988, + "step": 117540 + }, + { + "epoch": 0.4749168743965061, + "grad_norm": 1133.3275146484375, + "learning_rate": 3.187415503594166e-05, + "loss": 50.2685, + "step": 117550 + }, + { + "epoch": 0.47495727566187373, + "grad_norm": 561.2869873046875, + "learning_rate": 3.1870798861737705e-05, + "loss": 71.2999, + "step": 117560 + }, + { + "epoch": 0.4749976769272414, + "grad_norm": 905.6627197265625, + "learning_rate": 3.1867442553584e-05, + "loss": 35.9215, + "step": 117570 + }, + { + "epoch": 0.475038078192609, + "grad_norm": 2765.510009765625, + "learning_rate": 3.186408611154597e-05, + "loss": 62.86, + "step": 117580 + }, + { + "epoch": 0.4750784794579766, + "grad_norm": 3249.31298828125, + "learning_rate": 3.186072953568905e-05, + "loss": 89.4793, + "step": 117590 + }, + { + "epoch": 0.47511888072334424, + "grad_norm": 1029.83642578125, + "learning_rate": 3.185737282607867e-05, + "loss": 52.9404, + "step": 117600 + }, + { + "epoch": 0.4751592819887119, + "grad_norm": 638.6604614257812, + "learning_rate": 3.1854015982780275e-05, + "loss": 70.1836, + "step": 117610 + }, + { + "epoch": 0.4751996832540795, + "grad_norm": 866.2382202148438, + "learning_rate": 3.185065900585931e-05, + "loss": 60.6042, + "step": 117620 + }, + { + "epoch": 0.47524008451944716, + "grad_norm": 967.8104858398438, + "learning_rate": 3.184730189538122e-05, + "loss": 67.1052, + "step": 117630 + }, + { + "epoch": 0.4752804857848148, + "grad_norm": 990.4212646484375, + "learning_rate": 3.1843944651411456e-05, + "loss": 44.4866, + "step": 117640 + }, + { + "epoch": 0.47532088705018244, + "grad_norm": 528.0682983398438, + "learning_rate": 3.184058727401546e-05, + "loss": 51.6501, + "step": 117650 + }, + { + "epoch": 0.47536128831555, + "grad_norm": 936.557373046875, + "learning_rate": 3.1837229763258705e-05, + "loss": 97.2483, + "step": 117660 + }, + { + "epoch": 0.47540168958091766, + "grad_norm": 870.8411865234375, + "learning_rate": 3.183387211920663e-05, + "loss": 81.8832, + "step": 117670 + }, + { + "epoch": 0.4754420908462853, + "grad_norm": 628.0894165039062, + "learning_rate": 3.183051434192471e-05, + "loss": 44.0012, + "step": 117680 + }, + { + "epoch": 0.47548249211165294, + "grad_norm": 423.3321228027344, + "learning_rate": 3.1827156431478386e-05, + "loss": 77.5057, + "step": 117690 + }, + { + "epoch": 0.4755228933770206, + "grad_norm": 483.94091796875, + "learning_rate": 3.1823798387933134e-05, + "loss": 39.6376, + "step": 117700 + }, + { + "epoch": 0.4755632946423882, + "grad_norm": 791.345703125, + "learning_rate": 3.182044021135442e-05, + "loss": 55.5527, + "step": 117710 + }, + { + "epoch": 0.4756036959077558, + "grad_norm": 498.8603210449219, + "learning_rate": 3.181708190180771e-05, + "loss": 41.6617, + "step": 117720 + }, + { + "epoch": 0.47564409717312345, + "grad_norm": 771.0953979492188, + "learning_rate": 3.181372345935848e-05, + "loss": 48.7883, + "step": 117730 + }, + { + "epoch": 0.4756844984384911, + "grad_norm": 661.4583740234375, + "learning_rate": 3.1810364884072205e-05, + "loss": 72.9009, + "step": 117740 + }, + { + "epoch": 0.47572489970385873, + "grad_norm": 632.1604614257812, + "learning_rate": 3.180700617601436e-05, + "loss": 59.3145, + "step": 117750 + }, + { + "epoch": 0.47576530096922637, + "grad_norm": 806.9335327148438, + "learning_rate": 3.180364733525043e-05, + "loss": 41.7695, + "step": 117760 + }, + { + "epoch": 0.475805702234594, + "grad_norm": 776.6650390625, + "learning_rate": 3.1800288361845883e-05, + "loss": 62.7163, + "step": 117770 + }, + { + "epoch": 0.4758461034999616, + "grad_norm": 601.4182739257812, + "learning_rate": 3.179692925586622e-05, + "loss": 54.4891, + "step": 117780 + }, + { + "epoch": 0.47588650476532923, + "grad_norm": 907.4159545898438, + "learning_rate": 3.179357001737692e-05, + "loss": 54.7433, + "step": 117790 + }, + { + "epoch": 0.4759269060306969, + "grad_norm": 792.1842651367188, + "learning_rate": 3.179021064644347e-05, + "loss": 73.5924, + "step": 117800 + }, + { + "epoch": 0.4759673072960645, + "grad_norm": 641.330810546875, + "learning_rate": 3.178685114313137e-05, + "loss": 35.0172, + "step": 117810 + }, + { + "epoch": 0.47600770856143215, + "grad_norm": 272.5650939941406, + "learning_rate": 3.178349150750612e-05, + "loss": 77.2498, + "step": 117820 + }, + { + "epoch": 0.4760481098267998, + "grad_norm": 917.3323974609375, + "learning_rate": 3.1780131739633204e-05, + "loss": 66.6369, + "step": 117830 + }, + { + "epoch": 0.47608851109216743, + "grad_norm": 924.8410034179688, + "learning_rate": 3.177677183957813e-05, + "loss": 50.9081, + "step": 117840 + }, + { + "epoch": 0.476128912357535, + "grad_norm": 539.0849609375, + "learning_rate": 3.17734118074064e-05, + "loss": 43.3925, + "step": 117850 + }, + { + "epoch": 0.47616931362290266, + "grad_norm": 1520.1893310546875, + "learning_rate": 3.177005164318353e-05, + "loss": 52.2887, + "step": 117860 + }, + { + "epoch": 0.4762097148882703, + "grad_norm": 0.0, + "learning_rate": 3.1766691346974996e-05, + "loss": 31.6033, + "step": 117870 + }, + { + "epoch": 0.47625011615363794, + "grad_norm": 0.0, + "learning_rate": 3.176333091884635e-05, + "loss": 59.7049, + "step": 117880 + }, + { + "epoch": 0.4762905174190056, + "grad_norm": 1278.6094970703125, + "learning_rate": 3.175997035886307e-05, + "loss": 56.4383, + "step": 117890 + }, + { + "epoch": 0.4763309186843732, + "grad_norm": 1077.4521484375, + "learning_rate": 3.1756609667090696e-05, + "loss": 70.2687, + "step": 117900 + }, + { + "epoch": 0.4763713199497408, + "grad_norm": 842.562255859375, + "learning_rate": 3.175324884359474e-05, + "loss": 112.2075, + "step": 117910 + }, + { + "epoch": 0.47641172121510844, + "grad_norm": 310.8381042480469, + "learning_rate": 3.174988788844072e-05, + "loss": 48.3912, + "step": 117920 + }, + { + "epoch": 0.4764521224804761, + "grad_norm": 1312.25244140625, + "learning_rate": 3.1746526801694156e-05, + "loss": 52.8015, + "step": 117930 + }, + { + "epoch": 0.4764925237458437, + "grad_norm": 885.5872802734375, + "learning_rate": 3.174316558342059e-05, + "loss": 78.4678, + "step": 117940 + }, + { + "epoch": 0.47653292501121136, + "grad_norm": 629.9498291015625, + "learning_rate": 3.173980423368553e-05, + "loss": 57.0286, + "step": 117950 + }, + { + "epoch": 0.476573326276579, + "grad_norm": 622.9977416992188, + "learning_rate": 3.173644275255451e-05, + "loss": 61.0304, + "step": 117960 + }, + { + "epoch": 0.47661372754194664, + "grad_norm": 772.329833984375, + "learning_rate": 3.173308114009308e-05, + "loss": 34.8157, + "step": 117970 + }, + { + "epoch": 0.4766541288073142, + "grad_norm": 884.4105834960938, + "learning_rate": 3.1729719396366765e-05, + "loss": 44.0258, + "step": 117980 + }, + { + "epoch": 0.47669453007268187, + "grad_norm": 1267.8201904296875, + "learning_rate": 3.172635752144111e-05, + "loss": 53.9944, + "step": 117990 + }, + { + "epoch": 0.4767349313380495, + "grad_norm": 1093.1011962890625, + "learning_rate": 3.172299551538164e-05, + "loss": 49.2085, + "step": 118000 + }, + { + "epoch": 0.47677533260341715, + "grad_norm": 345.0997009277344, + "learning_rate": 3.1719633378253924e-05, + "loss": 48.0018, + "step": 118010 + }, + { + "epoch": 0.4768157338687848, + "grad_norm": 610.406494140625, + "learning_rate": 3.171627111012349e-05, + "loss": 52.8992, + "step": 118020 + }, + { + "epoch": 0.4768561351341524, + "grad_norm": 1354.5128173828125, + "learning_rate": 3.1712908711055897e-05, + "loss": 72.9081, + "step": 118030 + }, + { + "epoch": 0.47689653639952, + "grad_norm": 446.1787109375, + "learning_rate": 3.170954618111669e-05, + "loss": 63.6848, + "step": 118040 + }, + { + "epoch": 0.47693693766488765, + "grad_norm": 704.4627075195312, + "learning_rate": 3.170618352037142e-05, + "loss": 58.9005, + "step": 118050 + }, + { + "epoch": 0.4769773389302553, + "grad_norm": 481.2560119628906, + "learning_rate": 3.170282072888566e-05, + "loss": 46.0265, + "step": 118060 + }, + { + "epoch": 0.47701774019562293, + "grad_norm": 560.6388549804688, + "learning_rate": 3.169945780672495e-05, + "loss": 42.9609, + "step": 118070 + }, + { + "epoch": 0.47705814146099057, + "grad_norm": 2451.45556640625, + "learning_rate": 3.169609475395486e-05, + "loss": 84.6672, + "step": 118080 + }, + { + "epoch": 0.4770985427263582, + "grad_norm": 458.1382141113281, + "learning_rate": 3.169273157064097e-05, + "loss": 59.9135, + "step": 118090 + }, + { + "epoch": 0.4771389439917258, + "grad_norm": 736.26416015625, + "learning_rate": 3.168936825684882e-05, + "loss": 42.6383, + "step": 118100 + }, + { + "epoch": 0.47717934525709343, + "grad_norm": 3540.630126953125, + "learning_rate": 3.1686004812644e-05, + "loss": 58.3743, + "step": 118110 + }, + { + "epoch": 0.4772197465224611, + "grad_norm": 609.9805908203125, + "learning_rate": 3.1682641238092064e-05, + "loss": 44.9807, + "step": 118120 + }, + { + "epoch": 0.4772601477878287, + "grad_norm": 864.6240234375, + "learning_rate": 3.16792775332586e-05, + "loss": 91.4741, + "step": 118130 + }, + { + "epoch": 0.47730054905319635, + "grad_norm": 804.3956298828125, + "learning_rate": 3.167591369820918e-05, + "loss": 64.7296, + "step": 118140 + }, + { + "epoch": 0.477340950318564, + "grad_norm": 365.2998962402344, + "learning_rate": 3.1672549733009396e-05, + "loss": 50.6288, + "step": 118150 + }, + { + "epoch": 0.47738135158393163, + "grad_norm": 402.1658630371094, + "learning_rate": 3.166918563772481e-05, + "loss": 53.5217, + "step": 118160 + }, + { + "epoch": 0.4774217528492992, + "grad_norm": 668.7144165039062, + "learning_rate": 3.1665821412421015e-05, + "loss": 54.7511, + "step": 118170 + }, + { + "epoch": 0.47746215411466686, + "grad_norm": 919.4798583984375, + "learning_rate": 3.1662457057163604e-05, + "loss": 47.6506, + "step": 118180 + }, + { + "epoch": 0.4775025553800345, + "grad_norm": 1031.98193359375, + "learning_rate": 3.165909257201816e-05, + "loss": 71.545, + "step": 118190 + }, + { + "epoch": 0.47754295664540214, + "grad_norm": 390.0589294433594, + "learning_rate": 3.1655727957050285e-05, + "loss": 49.304, + "step": 118200 + }, + { + "epoch": 0.4775833579107698, + "grad_norm": 1422.1484375, + "learning_rate": 3.165236321232557e-05, + "loss": 52.9677, + "step": 118210 + }, + { + "epoch": 0.4776237591761374, + "grad_norm": 846.0404052734375, + "learning_rate": 3.1648998337909594e-05, + "loss": 62.2265, + "step": 118220 + }, + { + "epoch": 0.477664160441505, + "grad_norm": 408.1695556640625, + "learning_rate": 3.164563333386798e-05, + "loss": 67.1796, + "step": 118230 + }, + { + "epoch": 0.47770456170687264, + "grad_norm": 256.5215759277344, + "learning_rate": 3.1642268200266317e-05, + "loss": 40.5587, + "step": 118240 + }, + { + "epoch": 0.4777449629722403, + "grad_norm": 1751.73486328125, + "learning_rate": 3.163890293717022e-05, + "loss": 52.8397, + "step": 118250 + }, + { + "epoch": 0.4777853642376079, + "grad_norm": 564.2115478515625, + "learning_rate": 3.1635537544645296e-05, + "loss": 67.6077, + "step": 118260 + }, + { + "epoch": 0.47782576550297556, + "grad_norm": 1232.5323486328125, + "learning_rate": 3.163217202275715e-05, + "loss": 49.1063, + "step": 118270 + }, + { + "epoch": 0.4778661667683432, + "grad_norm": 3210.296142578125, + "learning_rate": 3.162880637157139e-05, + "loss": 69.347, + "step": 118280 + }, + { + "epoch": 0.47790656803371084, + "grad_norm": 674.1715087890625, + "learning_rate": 3.1625440591153645e-05, + "loss": 63.0254, + "step": 118290 + }, + { + "epoch": 0.47794696929907843, + "grad_norm": 369.24017333984375, + "learning_rate": 3.162207468156952e-05, + "loss": 44.4211, + "step": 118300 + }, + { + "epoch": 0.47798737056444607, + "grad_norm": 634.0050659179688, + "learning_rate": 3.161870864288464e-05, + "loss": 58.934, + "step": 118310 + }, + { + "epoch": 0.4780277718298137, + "grad_norm": 1877.2578125, + "learning_rate": 3.1615342475164636e-05, + "loss": 62.1043, + "step": 118320 + }, + { + "epoch": 0.47806817309518135, + "grad_norm": 550.0879516601562, + "learning_rate": 3.161197617847511e-05, + "loss": 41.5093, + "step": 118330 + }, + { + "epoch": 0.478108574360549, + "grad_norm": 544.1702270507812, + "learning_rate": 3.160860975288171e-05, + "loss": 59.2241, + "step": 118340 + }, + { + "epoch": 0.47814897562591663, + "grad_norm": 316.4408264160156, + "learning_rate": 3.1605243198450066e-05, + "loss": 60.8573, + "step": 118350 + }, + { + "epoch": 0.4781893768912842, + "grad_norm": 412.7470703125, + "learning_rate": 3.16018765152458e-05, + "loss": 47.1499, + "step": 118360 + }, + { + "epoch": 0.47822977815665185, + "grad_norm": 998.5579833984375, + "learning_rate": 3.159850970333456e-05, + "loss": 62.5579, + "step": 118370 + }, + { + "epoch": 0.4782701794220195, + "grad_norm": 425.9685974121094, + "learning_rate": 3.159514276278197e-05, + "loss": 55.9611, + "step": 118380 + }, + { + "epoch": 0.47831058068738713, + "grad_norm": 393.94635009765625, + "learning_rate": 3.1591775693653674e-05, + "loss": 50.0992, + "step": 118390 + }, + { + "epoch": 0.4783509819527548, + "grad_norm": 1880.3455810546875, + "learning_rate": 3.158840849601532e-05, + "loss": 65.1443, + "step": 118400 + }, + { + "epoch": 0.4783913832181224, + "grad_norm": 503.0107421875, + "learning_rate": 3.1585041169932545e-05, + "loss": 44.3258, + "step": 118410 + }, + { + "epoch": 0.47843178448349, + "grad_norm": 0.0, + "learning_rate": 3.1581673715471006e-05, + "loss": 53.8743, + "step": 118420 + }, + { + "epoch": 0.47847218574885764, + "grad_norm": 608.0613403320312, + "learning_rate": 3.157830613269635e-05, + "loss": 54.61, + "step": 118430 + }, + { + "epoch": 0.4785125870142253, + "grad_norm": 427.8965148925781, + "learning_rate": 3.157493842167423e-05, + "loss": 63.0965, + "step": 118440 + }, + { + "epoch": 0.4785529882795929, + "grad_norm": 507.14190673828125, + "learning_rate": 3.15715705824703e-05, + "loss": 50.7432, + "step": 118450 + }, + { + "epoch": 0.47859338954496056, + "grad_norm": 983.0687255859375, + "learning_rate": 3.156820261515022e-05, + "loss": 87.0616, + "step": 118460 + }, + { + "epoch": 0.4786337908103282, + "grad_norm": 1234.0418701171875, + "learning_rate": 3.1564834519779647e-05, + "loss": 48.0925, + "step": 118470 + }, + { + "epoch": 0.47867419207569584, + "grad_norm": 935.4703369140625, + "learning_rate": 3.156146629642425e-05, + "loss": 53.1847, + "step": 118480 + }, + { + "epoch": 0.4787145933410634, + "grad_norm": 820.0416259765625, + "learning_rate": 3.155809794514968e-05, + "loss": 77.3875, + "step": 118490 + }, + { + "epoch": 0.47875499460643106, + "grad_norm": 877.6055297851562, + "learning_rate": 3.155472946602162e-05, + "loss": 59.0853, + "step": 118500 + }, + { + "epoch": 0.4787953958717987, + "grad_norm": 979.28662109375, + "learning_rate": 3.155136085910573e-05, + "loss": 60.1543, + "step": 118510 + }, + { + "epoch": 0.47883579713716634, + "grad_norm": 702.0006713867188, + "learning_rate": 3.15479921244677e-05, + "loss": 44.0273, + "step": 118520 + }, + { + "epoch": 0.478876198402534, + "grad_norm": 1405.5360107421875, + "learning_rate": 3.1544623262173176e-05, + "loss": 63.5543, + "step": 118530 + }, + { + "epoch": 0.4789165996679016, + "grad_norm": 971.0574340820312, + "learning_rate": 3.1541254272287865e-05, + "loss": 55.8479, + "step": 118540 + }, + { + "epoch": 0.4789570009332692, + "grad_norm": 1137.547119140625, + "learning_rate": 3.153788515487742e-05, + "loss": 66.1714, + "step": 118550 + }, + { + "epoch": 0.47899740219863685, + "grad_norm": 813.9231567382812, + "learning_rate": 3.153451591000756e-05, + "loss": 102.5478, + "step": 118560 + }, + { + "epoch": 0.4790378034640045, + "grad_norm": 870.7437133789062, + "learning_rate": 3.153114653774393e-05, + "loss": 81.2061, + "step": 118570 + }, + { + "epoch": 0.4790782047293721, + "grad_norm": 984.029296875, + "learning_rate": 3.152777703815223e-05, + "loss": 64.9325, + "step": 118580 + }, + { + "epoch": 0.47911860599473977, + "grad_norm": 853.7675170898438, + "learning_rate": 3.152440741129817e-05, + "loss": 69.4327, + "step": 118590 + }, + { + "epoch": 0.4791590072601074, + "grad_norm": 1823.9547119140625, + "learning_rate": 3.152103765724743e-05, + "loss": 51.6635, + "step": 118600 + }, + { + "epoch": 0.479199408525475, + "grad_norm": 1002.6013793945312, + "learning_rate": 3.1517667776065696e-05, + "loss": 53.4132, + "step": 118610 + }, + { + "epoch": 0.47923980979084263, + "grad_norm": 1199.445556640625, + "learning_rate": 3.151429776781868e-05, + "loss": 57.5089, + "step": 118620 + }, + { + "epoch": 0.47928021105621027, + "grad_norm": 545.897705078125, + "learning_rate": 3.151092763257206e-05, + "loss": 66.9297, + "step": 118630 + }, + { + "epoch": 0.4793206123215779, + "grad_norm": 1726.234375, + "learning_rate": 3.150755737039157e-05, + "loss": 53.6566, + "step": 118640 + }, + { + "epoch": 0.47936101358694555, + "grad_norm": 719.9454956054688, + "learning_rate": 3.150418698134289e-05, + "loss": 48.0167, + "step": 118650 + }, + { + "epoch": 0.4794014148523132, + "grad_norm": 864.4921264648438, + "learning_rate": 3.150081646549174e-05, + "loss": 84.6051, + "step": 118660 + }, + { + "epoch": 0.47944181611768083, + "grad_norm": 824.414306640625, + "learning_rate": 3.149744582290383e-05, + "loss": 53.0983, + "step": 118670 + }, + { + "epoch": 0.4794822173830484, + "grad_norm": 471.1961975097656, + "learning_rate": 3.149407505364486e-05, + "loss": 69.1806, + "step": 118680 + }, + { + "epoch": 0.47952261864841605, + "grad_norm": 684.9292602539062, + "learning_rate": 3.149070415778056e-05, + "loss": 43.8025, + "step": 118690 + }, + { + "epoch": 0.4795630199137837, + "grad_norm": 507.1294860839844, + "learning_rate": 3.148733313537664e-05, + "loss": 53.6068, + "step": 118700 + }, + { + "epoch": 0.47960342117915133, + "grad_norm": 1452.51953125, + "learning_rate": 3.148396198649882e-05, + "loss": 62.2304, + "step": 118710 + }, + { + "epoch": 0.479643822444519, + "grad_norm": 568.4696655273438, + "learning_rate": 3.148059071121282e-05, + "loss": 69.3875, + "step": 118720 + }, + { + "epoch": 0.4796842237098866, + "grad_norm": 527.501953125, + "learning_rate": 3.147721930958437e-05, + "loss": 53.4141, + "step": 118730 + }, + { + "epoch": 0.4797246249752542, + "grad_norm": 1219.31640625, + "learning_rate": 3.14738477816792e-05, + "loss": 58.4269, + "step": 118740 + }, + { + "epoch": 0.47976502624062184, + "grad_norm": 1039.8255615234375, + "learning_rate": 3.147047612756302e-05, + "loss": 39.0244, + "step": 118750 + }, + { + "epoch": 0.4798054275059895, + "grad_norm": 1056.8941650390625, + "learning_rate": 3.146710434730159e-05, + "loss": 54.2339, + "step": 118760 + }, + { + "epoch": 0.4798458287713571, + "grad_norm": 756.7150268554688, + "learning_rate": 3.1463732440960625e-05, + "loss": 53.7804, + "step": 118770 + }, + { + "epoch": 0.47988623003672476, + "grad_norm": 990.2318725585938, + "learning_rate": 3.1460360408605866e-05, + "loss": 45.0402, + "step": 118780 + }, + { + "epoch": 0.4799266313020924, + "grad_norm": 782.805908203125, + "learning_rate": 3.145698825030307e-05, + "loss": 52.3285, + "step": 118790 + }, + { + "epoch": 0.47996703256746004, + "grad_norm": 781.7633666992188, + "learning_rate": 3.145361596611795e-05, + "loss": 53.3584, + "step": 118800 + }, + { + "epoch": 0.4800074338328276, + "grad_norm": 429.9420166015625, + "learning_rate": 3.1450243556116266e-05, + "loss": 61.5989, + "step": 118810 + }, + { + "epoch": 0.48004783509819526, + "grad_norm": 1086.17822265625, + "learning_rate": 3.144687102036376e-05, + "loss": 44.8307, + "step": 118820 + }, + { + "epoch": 0.4800882363635629, + "grad_norm": 858.8822631835938, + "learning_rate": 3.1443498358926186e-05, + "loss": 63.265, + "step": 118830 + }, + { + "epoch": 0.48012863762893054, + "grad_norm": 530.9268188476562, + "learning_rate": 3.1440125571869306e-05, + "loss": 45.8561, + "step": 118840 + }, + { + "epoch": 0.4801690388942982, + "grad_norm": 426.1288146972656, + "learning_rate": 3.143675265925885e-05, + "loss": 52.3246, + "step": 118850 + }, + { + "epoch": 0.4802094401596658, + "grad_norm": 374.76849365234375, + "learning_rate": 3.1433379621160586e-05, + "loss": 44.3146, + "step": 118860 + }, + { + "epoch": 0.4802498414250334, + "grad_norm": 727.5894165039062, + "learning_rate": 3.143000645764028e-05, + "loss": 45.2602, + "step": 118870 + }, + { + "epoch": 0.48029024269040105, + "grad_norm": 914.193603515625, + "learning_rate": 3.142663316876368e-05, + "loss": 67.587, + "step": 118880 + }, + { + "epoch": 0.4803306439557687, + "grad_norm": 119.14258575439453, + "learning_rate": 3.1423259754596576e-05, + "loss": 48.5501, + "step": 118890 + }, + { + "epoch": 0.48037104522113633, + "grad_norm": 947.4161376953125, + "learning_rate": 3.1419886215204694e-05, + "loss": 56.8426, + "step": 118900 + }, + { + "epoch": 0.48041144648650397, + "grad_norm": 593.0452270507812, + "learning_rate": 3.1416512550653835e-05, + "loss": 70.3968, + "step": 118910 + }, + { + "epoch": 0.4804518477518716, + "grad_norm": 757.6504516601562, + "learning_rate": 3.141313876100976e-05, + "loss": 59.1005, + "step": 118920 + }, + { + "epoch": 0.4804922490172392, + "grad_norm": 3180.365966796875, + "learning_rate": 3.1409764846338245e-05, + "loss": 64.8498, + "step": 118930 + }, + { + "epoch": 0.48053265028260683, + "grad_norm": 566.63671875, + "learning_rate": 3.140639080670507e-05, + "loss": 64.9276, + "step": 118940 + }, + { + "epoch": 0.4805730515479745, + "grad_norm": 714.4104614257812, + "learning_rate": 3.140301664217599e-05, + "loss": 59.2069, + "step": 118950 + }, + { + "epoch": 0.4806134528133421, + "grad_norm": 575.2021484375, + "learning_rate": 3.139964235281682e-05, + "loss": 43.4193, + "step": 118960 + }, + { + "epoch": 0.48065385407870975, + "grad_norm": 1377.7861328125, + "learning_rate": 3.1396267938693316e-05, + "loss": 63.2737, + "step": 118970 + }, + { + "epoch": 0.4806942553440774, + "grad_norm": 661.6658935546875, + "learning_rate": 3.1392893399871295e-05, + "loss": 60.5852, + "step": 118980 + }, + { + "epoch": 0.48073465660944503, + "grad_norm": 513.0794067382812, + "learning_rate": 3.1389518736416507e-05, + "loss": 50.5535, + "step": 118990 + }, + { + "epoch": 0.4807750578748126, + "grad_norm": 1068.8321533203125, + "learning_rate": 3.138614394839476e-05, + "loss": 56.0899, + "step": 119000 + }, + { + "epoch": 0.48081545914018026, + "grad_norm": 346.4224548339844, + "learning_rate": 3.138276903587186e-05, + "loss": 62.7875, + "step": 119010 + }, + { + "epoch": 0.4808558604055479, + "grad_norm": 591.9736328125, + "learning_rate": 3.137939399891359e-05, + "loss": 50.5829, + "step": 119020 + }, + { + "epoch": 0.48089626167091554, + "grad_norm": 762.9810791015625, + "learning_rate": 3.1376018837585747e-05, + "loss": 66.904, + "step": 119030 + }, + { + "epoch": 0.4809366629362832, + "grad_norm": 700.6834106445312, + "learning_rate": 3.137264355195413e-05, + "loss": 66.1691, + "step": 119040 + }, + { + "epoch": 0.4809770642016508, + "grad_norm": 315.9108581542969, + "learning_rate": 3.1369268142084556e-05, + "loss": 49.7968, + "step": 119050 + }, + { + "epoch": 0.4810174654670184, + "grad_norm": 0.0, + "learning_rate": 3.136589260804282e-05, + "loss": 47.6848, + "step": 119060 + }, + { + "epoch": 0.48105786673238604, + "grad_norm": 395.9146423339844, + "learning_rate": 3.1362516949894725e-05, + "loss": 57.0576, + "step": 119070 + }, + { + "epoch": 0.4810982679977537, + "grad_norm": 476.3524475097656, + "learning_rate": 3.135914116770609e-05, + "loss": 58.1471, + "step": 119080 + }, + { + "epoch": 0.4811386692631213, + "grad_norm": 626.7835083007812, + "learning_rate": 3.135576526154272e-05, + "loss": 39.7096, + "step": 119090 + }, + { + "epoch": 0.48117907052848896, + "grad_norm": 1119.859619140625, + "learning_rate": 3.135238923147043e-05, + "loss": 87.1097, + "step": 119100 + }, + { + "epoch": 0.4812194717938566, + "grad_norm": 804.6915283203125, + "learning_rate": 3.1349013077555045e-05, + "loss": 36.7657, + "step": 119110 + }, + { + "epoch": 0.48125987305922424, + "grad_norm": 769.036865234375, + "learning_rate": 3.134563679986238e-05, + "loss": 40.638, + "step": 119120 + }, + { + "epoch": 0.4813002743245918, + "grad_norm": 191.26966857910156, + "learning_rate": 3.134226039845827e-05, + "loss": 50.2752, + "step": 119130 + }, + { + "epoch": 0.48134067558995947, + "grad_norm": 778.8662109375, + "learning_rate": 3.1338883873408516e-05, + "loss": 58.7929, + "step": 119140 + }, + { + "epoch": 0.4813810768553271, + "grad_norm": 514.7074584960938, + "learning_rate": 3.133550722477896e-05, + "loss": 66.8902, + "step": 119150 + }, + { + "epoch": 0.48142147812069475, + "grad_norm": 355.43096923828125, + "learning_rate": 3.133213045263543e-05, + "loss": 58.968, + "step": 119160 + }, + { + "epoch": 0.4814618793860624, + "grad_norm": 3042.458984375, + "learning_rate": 3.132875355704376e-05, + "loss": 88.0314, + "step": 119170 + }, + { + "epoch": 0.48150228065143, + "grad_norm": 656.7515869140625, + "learning_rate": 3.1325376538069776e-05, + "loss": 68.5067, + "step": 119180 + }, + { + "epoch": 0.4815426819167976, + "grad_norm": 495.6517028808594, + "learning_rate": 3.132199939577932e-05, + "loss": 73.5789, + "step": 119190 + }, + { + "epoch": 0.48158308318216525, + "grad_norm": 1507.7938232421875, + "learning_rate": 3.1318622130238236e-05, + "loss": 55.6776, + "step": 119200 + }, + { + "epoch": 0.4816234844475329, + "grad_norm": 802.6456298828125, + "learning_rate": 3.1315244741512356e-05, + "loss": 49.8764, + "step": 119210 + }, + { + "epoch": 0.48166388571290053, + "grad_norm": 814.6461181640625, + "learning_rate": 3.131186722966753e-05, + "loss": 51.0948, + "step": 119220 + }, + { + "epoch": 0.48170428697826817, + "grad_norm": 826.6341552734375, + "learning_rate": 3.1308489594769605e-05, + "loss": 54.8559, + "step": 119230 + }, + { + "epoch": 0.4817446882436358, + "grad_norm": 536.5158081054688, + "learning_rate": 3.1305111836884425e-05, + "loss": 49.7287, + "step": 119240 + }, + { + "epoch": 0.4817850895090034, + "grad_norm": 1080.1739501953125, + "learning_rate": 3.130173395607785e-05, + "loss": 96.4672, + "step": 119250 + }, + { + "epoch": 0.48182549077437103, + "grad_norm": 875.5479125976562, + "learning_rate": 3.129835595241571e-05, + "loss": 41.3818, + "step": 119260 + }, + { + "epoch": 0.4818658920397387, + "grad_norm": 977.9671020507812, + "learning_rate": 3.129497782596389e-05, + "loss": 44.1907, + "step": 119270 + }, + { + "epoch": 0.4819062933051063, + "grad_norm": 469.0617370605469, + "learning_rate": 3.129159957678824e-05, + "loss": 50.5765, + "step": 119280 + }, + { + "epoch": 0.48194669457047395, + "grad_norm": 851.140869140625, + "learning_rate": 3.128822120495462e-05, + "loss": 50.6099, + "step": 119290 + }, + { + "epoch": 0.4819870958358416, + "grad_norm": 608.3344116210938, + "learning_rate": 3.1284842710528876e-05, + "loss": 72.0118, + "step": 119300 + }, + { + "epoch": 0.48202749710120923, + "grad_norm": 380.56195068359375, + "learning_rate": 3.128146409357689e-05, + "loss": 83.0865, + "step": 119310 + }, + { + "epoch": 0.4820678983665768, + "grad_norm": 601.8255615234375, + "learning_rate": 3.127808535416454e-05, + "loss": 48.3086, + "step": 119320 + }, + { + "epoch": 0.48210829963194446, + "grad_norm": 949.0869140625, + "learning_rate": 3.127470649235768e-05, + "loss": 94.7472, + "step": 119330 + }, + { + "epoch": 0.4821487008973121, + "grad_norm": 398.5122985839844, + "learning_rate": 3.1271327508222174e-05, + "loss": 73.9752, + "step": 119340 + }, + { + "epoch": 0.48218910216267974, + "grad_norm": 683.5873413085938, + "learning_rate": 3.126794840182392e-05, + "loss": 65.0503, + "step": 119350 + }, + { + "epoch": 0.4822295034280474, + "grad_norm": 574.1619873046875, + "learning_rate": 3.126456917322878e-05, + "loss": 55.5617, + "step": 119360 + }, + { + "epoch": 0.482269904693415, + "grad_norm": 918.1862182617188, + "learning_rate": 3.1261189822502644e-05, + "loss": 48.1121, + "step": 119370 + }, + { + "epoch": 0.4823103059587826, + "grad_norm": 1102.67919921875, + "learning_rate": 3.125781034971139e-05, + "loss": 79.4511, + "step": 119380 + }, + { + "epoch": 0.48235070722415024, + "grad_norm": 865.8079223632812, + "learning_rate": 3.125443075492089e-05, + "loss": 58.5068, + "step": 119390 + }, + { + "epoch": 0.4823911084895179, + "grad_norm": 644.5775756835938, + "learning_rate": 3.1251051038197055e-05, + "loss": 53.9188, + "step": 119400 + }, + { + "epoch": 0.4824315097548855, + "grad_norm": 472.7004699707031, + "learning_rate": 3.124767119960576e-05, + "loss": 46.8185, + "step": 119410 + }, + { + "epoch": 0.48247191102025316, + "grad_norm": 988.4273681640625, + "learning_rate": 3.1244291239212896e-05, + "loss": 64.0862, + "step": 119420 + }, + { + "epoch": 0.4825123122856208, + "grad_norm": 320.8127746582031, + "learning_rate": 3.124091115708436e-05, + "loss": 66.8379, + "step": 119430 + }, + { + "epoch": 0.48255271355098844, + "grad_norm": 703.6538696289062, + "learning_rate": 3.123753095328604e-05, + "loss": 74.7884, + "step": 119440 + }, + { + "epoch": 0.48259311481635603, + "grad_norm": 741.8955078125, + "learning_rate": 3.123415062788385e-05, + "loss": 47.4477, + "step": 119450 + }, + { + "epoch": 0.48263351608172367, + "grad_norm": 764.6326904296875, + "learning_rate": 3.123077018094369e-05, + "loss": 48.5175, + "step": 119460 + }, + { + "epoch": 0.4826739173470913, + "grad_norm": 436.60516357421875, + "learning_rate": 3.122738961253145e-05, + "loss": 66.414, + "step": 119470 + }, + { + "epoch": 0.48271431861245895, + "grad_norm": 369.33343505859375, + "learning_rate": 3.1224008922713044e-05, + "loss": 71.549, + "step": 119480 + }, + { + "epoch": 0.4827547198778266, + "grad_norm": 921.5698852539062, + "learning_rate": 3.122062811155438e-05, + "loss": 50.5309, + "step": 119490 + }, + { + "epoch": 0.48279512114319423, + "grad_norm": 597.4305419921875, + "learning_rate": 3.121724717912138e-05, + "loss": 50.5436, + "step": 119500 + }, + { + "epoch": 0.4828355224085618, + "grad_norm": 1035.259765625, + "learning_rate": 3.121386612547993e-05, + "loss": 31.3353, + "step": 119510 + }, + { + "epoch": 0.48287592367392945, + "grad_norm": 1129.0030517578125, + "learning_rate": 3.121048495069596e-05, + "loss": 74.6693, + "step": 119520 + }, + { + "epoch": 0.4829163249392971, + "grad_norm": 1025.2454833984375, + "learning_rate": 3.1207103654835394e-05, + "loss": 102.4706, + "step": 119530 + }, + { + "epoch": 0.48295672620466473, + "grad_norm": 379.8052062988281, + "learning_rate": 3.120372223796415e-05, + "loss": 49.5203, + "step": 119540 + }, + { + "epoch": 0.4829971274700324, + "grad_norm": 607.2550048828125, + "learning_rate": 3.120034070014814e-05, + "loss": 70.4186, + "step": 119550 + }, + { + "epoch": 0.4830375287354, + "grad_norm": 690.1021728515625, + "learning_rate": 3.11969590414533e-05, + "loss": 43.4509, + "step": 119560 + }, + { + "epoch": 0.4830779300007676, + "grad_norm": 1215.8643798828125, + "learning_rate": 3.119357726194556e-05, + "loss": 42.1141, + "step": 119570 + }, + { + "epoch": 0.48311833126613524, + "grad_norm": 259.37872314453125, + "learning_rate": 3.119019536169083e-05, + "loss": 58.9376, + "step": 119580 + }, + { + "epoch": 0.4831587325315029, + "grad_norm": 755.9671020507812, + "learning_rate": 3.118681334075506e-05, + "loss": 45.3444, + "step": 119590 + }, + { + "epoch": 0.4831991337968705, + "grad_norm": 1142.39892578125, + "learning_rate": 3.118343119920418e-05, + "loss": 58.759, + "step": 119600 + }, + { + "epoch": 0.48323953506223816, + "grad_norm": 651.2678833007812, + "learning_rate": 3.1180048937104114e-05, + "loss": 51.0291, + "step": 119610 + }, + { + "epoch": 0.4832799363276058, + "grad_norm": 555.2761840820312, + "learning_rate": 3.117666655452083e-05, + "loss": 48.9106, + "step": 119620 + }, + { + "epoch": 0.48332033759297344, + "grad_norm": 1292.247802734375, + "learning_rate": 3.117328405152024e-05, + "loss": 67.4017, + "step": 119630 + }, + { + "epoch": 0.483360738858341, + "grad_norm": 579.1093139648438, + "learning_rate": 3.11699014281683e-05, + "loss": 56.0868, + "step": 119640 + }, + { + "epoch": 0.48340114012370866, + "grad_norm": 571.940673828125, + "learning_rate": 3.116651868453097e-05, + "loss": 57.993, + "step": 119650 + }, + { + "epoch": 0.4834415413890763, + "grad_norm": 592.5040893554688, + "learning_rate": 3.116313582067416e-05, + "loss": 51.3045, + "step": 119660 + }, + { + "epoch": 0.48348194265444394, + "grad_norm": 820.21337890625, + "learning_rate": 3.115975283666386e-05, + "loss": 50.0175, + "step": 119670 + }, + { + "epoch": 0.4835223439198116, + "grad_norm": 1067.8109130859375, + "learning_rate": 3.1156369732566006e-05, + "loss": 31.4268, + "step": 119680 + }, + { + "epoch": 0.4835627451851792, + "grad_norm": 589.9420776367188, + "learning_rate": 3.115298650844655e-05, + "loss": 85.4477, + "step": 119690 + }, + { + "epoch": 0.4836031464505468, + "grad_norm": 485.7039489746094, + "learning_rate": 3.114960316437145e-05, + "loss": 53.188, + "step": 119700 + }, + { + "epoch": 0.48364354771591445, + "grad_norm": 809.940673828125, + "learning_rate": 3.1146219700406674e-05, + "loss": 73.8831, + "step": 119710 + }, + { + "epoch": 0.4836839489812821, + "grad_norm": 1266.80419921875, + "learning_rate": 3.114283611661818e-05, + "loss": 65.2104, + "step": 119720 + }, + { + "epoch": 0.4837243502466497, + "grad_norm": 1203.2113037109375, + "learning_rate": 3.113945241307194e-05, + "loss": 64.743, + "step": 119730 + }, + { + "epoch": 0.48376475151201737, + "grad_norm": 655.8514404296875, + "learning_rate": 3.1136068589833914e-05, + "loss": 70.6579, + "step": 119740 + }, + { + "epoch": 0.483805152777385, + "grad_norm": 842.0756225585938, + "learning_rate": 3.1132684646970064e-05, + "loss": 41.7421, + "step": 119750 + }, + { + "epoch": 0.48384555404275265, + "grad_norm": 859.2930297851562, + "learning_rate": 3.1129300584546375e-05, + "loss": 61.2274, + "step": 119760 + }, + { + "epoch": 0.48388595530812023, + "grad_norm": 972.3688354492188, + "learning_rate": 3.1125916402628814e-05, + "loss": 58.9962, + "step": 119770 + }, + { + "epoch": 0.48392635657348787, + "grad_norm": 983.5337524414062, + "learning_rate": 3.112253210128336e-05, + "loss": 57.9547, + "step": 119780 + }, + { + "epoch": 0.4839667578388555, + "grad_norm": 997.2120361328125, + "learning_rate": 3.111914768057599e-05, + "loss": 47.62, + "step": 119790 + }, + { + "epoch": 0.48400715910422315, + "grad_norm": 1432.3392333984375, + "learning_rate": 3.111576314057268e-05, + "loss": 48.3912, + "step": 119800 + }, + { + "epoch": 0.4840475603695908, + "grad_norm": 502.978759765625, + "learning_rate": 3.1112378481339425e-05, + "loss": 60.9147, + "step": 119810 + }, + { + "epoch": 0.48408796163495843, + "grad_norm": 881.69580078125, + "learning_rate": 3.1108993702942205e-05, + "loss": 63.0135, + "step": 119820 + }, + { + "epoch": 0.484128362900326, + "grad_norm": 619.3832397460938, + "learning_rate": 3.110560880544701e-05, + "loss": 55.6975, + "step": 119830 + }, + { + "epoch": 0.48416876416569365, + "grad_norm": 933.4210815429688, + "learning_rate": 3.1102223788919824e-05, + "loss": 45.4596, + "step": 119840 + }, + { + "epoch": 0.4842091654310613, + "grad_norm": 1533.591552734375, + "learning_rate": 3.1098838653426645e-05, + "loss": 62.3054, + "step": 119850 + }, + { + "epoch": 0.48424956669642893, + "grad_norm": 815.6768798828125, + "learning_rate": 3.1095453399033466e-05, + "loss": 53.5172, + "step": 119860 + }, + { + "epoch": 0.4842899679617966, + "grad_norm": 1024.9600830078125, + "learning_rate": 3.109206802580629e-05, + "loss": 63.8872, + "step": 119870 + }, + { + "epoch": 0.4843303692271642, + "grad_norm": 0.0, + "learning_rate": 3.10886825338111e-05, + "loss": 48.7491, + "step": 119880 + }, + { + "epoch": 0.4843707704925318, + "grad_norm": 610.7237548828125, + "learning_rate": 3.108529692311391e-05, + "loss": 59.5073, + "step": 119890 + }, + { + "epoch": 0.48441117175789944, + "grad_norm": 999.9380493164062, + "learning_rate": 3.108191119378073e-05, + "loss": 54.3617, + "step": 119900 + }, + { + "epoch": 0.4844515730232671, + "grad_norm": 589.252685546875, + "learning_rate": 3.107852534587756e-05, + "loss": 70.6535, + "step": 119910 + }, + { + "epoch": 0.4844919742886347, + "grad_norm": 1168.3255615234375, + "learning_rate": 3.107513937947041e-05, + "loss": 65.7924, + "step": 119920 + }, + { + "epoch": 0.48453237555400236, + "grad_norm": 452.32415771484375, + "learning_rate": 3.107175329462529e-05, + "loss": 33.2393, + "step": 119930 + }, + { + "epoch": 0.48457277681937, + "grad_norm": 365.3132629394531, + "learning_rate": 3.106836709140821e-05, + "loss": 61.387, + "step": 119940 + }, + { + "epoch": 0.48461317808473764, + "grad_norm": 431.0849914550781, + "learning_rate": 3.1064980769885187e-05, + "loss": 49.9083, + "step": 119950 + }, + { + "epoch": 0.4846535793501052, + "grad_norm": 706.0770874023438, + "learning_rate": 3.1061594330122246e-05, + "loss": 41.8204, + "step": 119960 + }, + { + "epoch": 0.48469398061547286, + "grad_norm": 519.536865234375, + "learning_rate": 3.10582077721854e-05, + "loss": 39.0427, + "step": 119970 + }, + { + "epoch": 0.4847343818808405, + "grad_norm": 1396.991455078125, + "learning_rate": 3.1054821096140676e-05, + "loss": 70.4659, + "step": 119980 + }, + { + "epoch": 0.48477478314620814, + "grad_norm": 1468.1710205078125, + "learning_rate": 3.10514343020541e-05, + "loss": 91.4469, + "step": 119990 + }, + { + "epoch": 0.4848151844115758, + "grad_norm": 483.8679504394531, + "learning_rate": 3.104804738999169e-05, + "loss": 31.8263, + "step": 120000 + }, + { + "epoch": 0.4848555856769434, + "grad_norm": 1103.7783203125, + "learning_rate": 3.10446603600195e-05, + "loss": 48.717, + "step": 120010 + }, + { + "epoch": 0.484895986942311, + "grad_norm": 0.0, + "learning_rate": 3.104127321220353e-05, + "loss": 44.1308, + "step": 120020 + }, + { + "epoch": 0.48493638820767865, + "grad_norm": 2047.26025390625, + "learning_rate": 3.1037885946609824e-05, + "loss": 61.4208, + "step": 120030 + }, + { + "epoch": 0.4849767894730463, + "grad_norm": 2711.773681640625, + "learning_rate": 3.103449856330443e-05, + "loss": 70.1431, + "step": 120040 + }, + { + "epoch": 0.48501719073841393, + "grad_norm": 1064.7301025390625, + "learning_rate": 3.1031111062353373e-05, + "loss": 59.401, + "step": 120050 + }, + { + "epoch": 0.48505759200378157, + "grad_norm": 1243.82763671875, + "learning_rate": 3.102772344382271e-05, + "loss": 78.9579, + "step": 120060 + }, + { + "epoch": 0.4850979932691492, + "grad_norm": 935.9784545898438, + "learning_rate": 3.102433570777847e-05, + "loss": 56.5204, + "step": 120070 + }, + { + "epoch": 0.48513839453451685, + "grad_norm": 583.158935546875, + "learning_rate": 3.102094785428671e-05, + "loss": 63.3833, + "step": 120080 + }, + { + "epoch": 0.48517879579988443, + "grad_norm": 749.0221557617188, + "learning_rate": 3.101755988341347e-05, + "loss": 48.2709, + "step": 120090 + }, + { + "epoch": 0.4852191970652521, + "grad_norm": 876.2115478515625, + "learning_rate": 3.101417179522479e-05, + "loss": 53.4147, + "step": 120100 + }, + { + "epoch": 0.4852595983306197, + "grad_norm": 1196.973388671875, + "learning_rate": 3.101078358978675e-05, + "loss": 56.0996, + "step": 120110 + }, + { + "epoch": 0.48529999959598735, + "grad_norm": 580.7915649414062, + "learning_rate": 3.100739526716538e-05, + "loss": 58.37, + "step": 120120 + }, + { + "epoch": 0.485340400861355, + "grad_norm": 942.7879028320312, + "learning_rate": 3.100400682742675e-05, + "loss": 64.2288, + "step": 120130 + }, + { + "epoch": 0.48538080212672263, + "grad_norm": 627.4921264648438, + "learning_rate": 3.100061827063692e-05, + "loss": 58.1287, + "step": 120140 + }, + { + "epoch": 0.4854212033920902, + "grad_norm": 1478.2410888671875, + "learning_rate": 3.0997229596861944e-05, + "loss": 91.4508, + "step": 120150 + }, + { + "epoch": 0.48546160465745786, + "grad_norm": 1782.7314453125, + "learning_rate": 3.099384080616789e-05, + "loss": 56.9644, + "step": 120160 + }, + { + "epoch": 0.4855020059228255, + "grad_norm": 359.5166320800781, + "learning_rate": 3.099045189862081e-05, + "loss": 43.2427, + "step": 120170 + }, + { + "epoch": 0.48554240718819314, + "grad_norm": 604.357177734375, + "learning_rate": 3.0987062874286804e-05, + "loss": 43.9985, + "step": 120180 + }, + { + "epoch": 0.4855828084535608, + "grad_norm": 536.3892211914062, + "learning_rate": 3.098367373323192e-05, + "loss": 60.2531, + "step": 120190 + }, + { + "epoch": 0.4856232097189284, + "grad_norm": 762.7628784179688, + "learning_rate": 3.098028447552224e-05, + "loss": 45.4865, + "step": 120200 + }, + { + "epoch": 0.485663610984296, + "grad_norm": 621.0496215820312, + "learning_rate": 3.097689510122382e-05, + "loss": 49.9217, + "step": 120210 + }, + { + "epoch": 0.48570401224966364, + "grad_norm": 1028.444580078125, + "learning_rate": 3.0973505610402765e-05, + "loss": 54.5276, + "step": 120220 + }, + { + "epoch": 0.4857444135150313, + "grad_norm": 639.4829711914062, + "learning_rate": 3.0970116003125146e-05, + "loss": 46.6422, + "step": 120230 + }, + { + "epoch": 0.4857848147803989, + "grad_norm": 955.115234375, + "learning_rate": 3.0966726279457034e-05, + "loss": 59.0818, + "step": 120240 + }, + { + "epoch": 0.48582521604576656, + "grad_norm": 1233.369873046875, + "learning_rate": 3.0963336439464526e-05, + "loss": 56.1891, + "step": 120250 + }, + { + "epoch": 0.4858656173111342, + "grad_norm": 3286.7587890625, + "learning_rate": 3.09599464832137e-05, + "loss": 70.8362, + "step": 120260 + }, + { + "epoch": 0.48590601857650184, + "grad_norm": 749.9498291015625, + "learning_rate": 3.0956556410770655e-05, + "loss": 71.6872, + "step": 120270 + }, + { + "epoch": 0.4859464198418694, + "grad_norm": 337.7810974121094, + "learning_rate": 3.0953166222201476e-05, + "loss": 34.0317, + "step": 120280 + }, + { + "epoch": 0.48598682110723707, + "grad_norm": 969.7774047851562, + "learning_rate": 3.094977591757224e-05, + "loss": 39.0394, + "step": 120290 + }, + { + "epoch": 0.4860272223726047, + "grad_norm": 600.6663208007812, + "learning_rate": 3.094638549694908e-05, + "loss": 72.5898, + "step": 120300 + }, + { + "epoch": 0.48606762363797235, + "grad_norm": 388.6318664550781, + "learning_rate": 3.0942994960398064e-05, + "loss": 58.7893, + "step": 120310 + }, + { + "epoch": 0.48610802490334, + "grad_norm": 1114.3638916015625, + "learning_rate": 3.09396043079853e-05, + "loss": 67.1577, + "step": 120320 + }, + { + "epoch": 0.4861484261687076, + "grad_norm": 723.9693603515625, + "learning_rate": 3.0936213539776895e-05, + "loss": 66.5562, + "step": 120330 + }, + { + "epoch": 0.4861888274340752, + "grad_norm": 608.6270141601562, + "learning_rate": 3.093282265583895e-05, + "loss": 61.2591, + "step": 120340 + }, + { + "epoch": 0.48622922869944285, + "grad_norm": 455.3746643066406, + "learning_rate": 3.092943165623758e-05, + "loss": 57.6604, + "step": 120350 + }, + { + "epoch": 0.4862696299648105, + "grad_norm": 494.3013610839844, + "learning_rate": 3.092604054103888e-05, + "loss": 42.1755, + "step": 120360 + }, + { + "epoch": 0.48631003123017813, + "grad_norm": 2547.077392578125, + "learning_rate": 3.092264931030897e-05, + "loss": 86.2226, + "step": 120370 + }, + { + "epoch": 0.48635043249554577, + "grad_norm": 671.0562133789062, + "learning_rate": 3.0919257964113964e-05, + "loss": 60.703, + "step": 120380 + }, + { + "epoch": 0.4863908337609134, + "grad_norm": 471.1807556152344, + "learning_rate": 3.0915866502519975e-05, + "loss": 55.5357, + "step": 120390 + }, + { + "epoch": 0.48643123502628105, + "grad_norm": 656.9327392578125, + "learning_rate": 3.091247492559312e-05, + "loss": 66.8196, + "step": 120400 + }, + { + "epoch": 0.48647163629164863, + "grad_norm": 2174.91064453125, + "learning_rate": 3.090908323339952e-05, + "loss": 62.3699, + "step": 120410 + }, + { + "epoch": 0.4865120375570163, + "grad_norm": 663.234619140625, + "learning_rate": 3.090569142600531e-05, + "loss": 58.3517, + "step": 120420 + }, + { + "epoch": 0.4865524388223839, + "grad_norm": 1044.9910888671875, + "learning_rate": 3.09022995034766e-05, + "loss": 45.2753, + "step": 120430 + }, + { + "epoch": 0.48659284008775155, + "grad_norm": 1259.57666015625, + "learning_rate": 3.089890746587953e-05, + "loss": 37.8244, + "step": 120440 + }, + { + "epoch": 0.4866332413531192, + "grad_norm": 619.0317993164062, + "learning_rate": 3.089551531328021e-05, + "loss": 52.6852, + "step": 120450 + }, + { + "epoch": 0.48667364261848683, + "grad_norm": 547.9066772460938, + "learning_rate": 3.0892123045744785e-05, + "loss": 35.2198, + "step": 120460 + }, + { + "epoch": 0.4867140438838544, + "grad_norm": 538.3446044921875, + "learning_rate": 3.08887306633394e-05, + "loss": 49.3895, + "step": 120470 + }, + { + "epoch": 0.48675444514922206, + "grad_norm": 1102.380859375, + "learning_rate": 3.088533816613017e-05, + "loss": 78.057, + "step": 120480 + }, + { + "epoch": 0.4867948464145897, + "grad_norm": 836.9898071289062, + "learning_rate": 3.0881945554183235e-05, + "loss": 73.6896, + "step": 120490 + }, + { + "epoch": 0.48683524767995734, + "grad_norm": 657.3259887695312, + "learning_rate": 3.087855282756475e-05, + "loss": 45.1531, + "step": 120500 + }, + { + "epoch": 0.486875648945325, + "grad_norm": 440.7289733886719, + "learning_rate": 3.087515998634085e-05, + "loss": 57.9621, + "step": 120510 + }, + { + "epoch": 0.4869160502106926, + "grad_norm": 955.15966796875, + "learning_rate": 3.087176703057769e-05, + "loss": 56.2972, + "step": 120520 + }, + { + "epoch": 0.4869564514760602, + "grad_norm": 293.07012939453125, + "learning_rate": 3.08683739603414e-05, + "loss": 42.8358, + "step": 120530 + }, + { + "epoch": 0.48699685274142784, + "grad_norm": 516.2140502929688, + "learning_rate": 3.0864980775698145e-05, + "loss": 92.3896, + "step": 120540 + }, + { + "epoch": 0.4870372540067955, + "grad_norm": 425.08343505859375, + "learning_rate": 3.086158747671406e-05, + "loss": 90.2946, + "step": 120550 + }, + { + "epoch": 0.4870776552721631, + "grad_norm": 302.85626220703125, + "learning_rate": 3.085819406345532e-05, + "loss": 73.1443, + "step": 120560 + }, + { + "epoch": 0.48711805653753076, + "grad_norm": 910.6437377929688, + "learning_rate": 3.0854800535988064e-05, + "loss": 54.2145, + "step": 120570 + }, + { + "epoch": 0.4871584578028984, + "grad_norm": 1036.3970947265625, + "learning_rate": 3.085140689437846e-05, + "loss": 49.4283, + "step": 120580 + }, + { + "epoch": 0.48719885906826604, + "grad_norm": 1020.8902587890625, + "learning_rate": 3.084801313869266e-05, + "loss": 50.4306, + "step": 120590 + }, + { + "epoch": 0.48723926033363363, + "grad_norm": 683.2181396484375, + "learning_rate": 3.0844619268996845e-05, + "loss": 52.7355, + "step": 120600 + }, + { + "epoch": 0.48727966159900127, + "grad_norm": 712.2384033203125, + "learning_rate": 3.084122528535717e-05, + "loss": 76.377, + "step": 120610 + }, + { + "epoch": 0.4873200628643689, + "grad_norm": 1040.790771484375, + "learning_rate": 3.0837831187839784e-05, + "loss": 66.5839, + "step": 120620 + }, + { + "epoch": 0.48736046412973655, + "grad_norm": 826.3641967773438, + "learning_rate": 3.083443697651088e-05, + "loss": 81.0511, + "step": 120630 + }, + { + "epoch": 0.4874008653951042, + "grad_norm": 1687.8861083984375, + "learning_rate": 3.083104265143663e-05, + "loss": 73.7149, + "step": 120640 + }, + { + "epoch": 0.48744126666047183, + "grad_norm": 700.1195678710938, + "learning_rate": 3.08276482126832e-05, + "loss": 49.3427, + "step": 120650 + }, + { + "epoch": 0.4874816679258394, + "grad_norm": 661.125244140625, + "learning_rate": 3.082425366031676e-05, + "loss": 37.9126, + "step": 120660 + }, + { + "epoch": 0.48752206919120705, + "grad_norm": 773.9281005859375, + "learning_rate": 3.08208589944035e-05, + "loss": 47.0344, + "step": 120670 + }, + { + "epoch": 0.4875624704565747, + "grad_norm": 1276.460205078125, + "learning_rate": 3.08174642150096e-05, + "loss": 53.2406, + "step": 120680 + }, + { + "epoch": 0.48760287172194233, + "grad_norm": 669.7227172851562, + "learning_rate": 3.081406932220123e-05, + "loss": 39.3701, + "step": 120690 + }, + { + "epoch": 0.48764327298731, + "grad_norm": 2131.654541015625, + "learning_rate": 3.08106743160446e-05, + "loss": 64.2498, + "step": 120700 + }, + { + "epoch": 0.4876836742526776, + "grad_norm": 621.3330078125, + "learning_rate": 3.0807279196605876e-05, + "loss": 64.5872, + "step": 120710 + }, + { + "epoch": 0.48772407551804525, + "grad_norm": 565.8031616210938, + "learning_rate": 3.0803883963951255e-05, + "loss": 59.7642, + "step": 120720 + }, + { + "epoch": 0.48776447678341284, + "grad_norm": 936.9571533203125, + "learning_rate": 3.080048861814693e-05, + "loss": 34.2556, + "step": 120730 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 361.34063720703125, + "learning_rate": 3.0797093159259085e-05, + "loss": 52.7916, + "step": 120740 + }, + { + "epoch": 0.4878452793141481, + "grad_norm": 325.8556213378906, + "learning_rate": 3.079369758735393e-05, + "loss": 41.6967, + "step": 120750 + }, + { + "epoch": 0.48788568057951576, + "grad_norm": 522.2239990234375, + "learning_rate": 3.0790301902497666e-05, + "loss": 44.1217, + "step": 120760 + }, + { + "epoch": 0.4879260818448834, + "grad_norm": 1932.948486328125, + "learning_rate": 3.078690610475647e-05, + "loss": 70.4253, + "step": 120770 + }, + { + "epoch": 0.48796648311025104, + "grad_norm": 1131.1263427734375, + "learning_rate": 3.0783510194196576e-05, + "loss": 101.964, + "step": 120780 + }, + { + "epoch": 0.4880068843756186, + "grad_norm": 1616.0198974609375, + "learning_rate": 3.078011417088416e-05, + "loss": 55.159, + "step": 120790 + }, + { + "epoch": 0.48804728564098626, + "grad_norm": 942.608642578125, + "learning_rate": 3.0776718034885454e-05, + "loss": 47.1413, + "step": 120800 + }, + { + "epoch": 0.4880876869063539, + "grad_norm": 503.5368347167969, + "learning_rate": 3.0773321786266644e-05, + "loss": 36.5475, + "step": 120810 + }, + { + "epoch": 0.48812808817172154, + "grad_norm": 844.5652465820312, + "learning_rate": 3.076992542509396e-05, + "loss": 52.4938, + "step": 120820 + }, + { + "epoch": 0.4881684894370892, + "grad_norm": 856.3523559570312, + "learning_rate": 3.07665289514336e-05, + "loss": 83.7883, + "step": 120830 + }, + { + "epoch": 0.4882088907024568, + "grad_norm": 400.933349609375, + "learning_rate": 3.07631323653518e-05, + "loss": 55.8697, + "step": 120840 + }, + { + "epoch": 0.4882492919678244, + "grad_norm": 604.0474853515625, + "learning_rate": 3.075973566691477e-05, + "loss": 72.4179, + "step": 120850 + }, + { + "epoch": 0.48828969323319205, + "grad_norm": 584.9152221679688, + "learning_rate": 3.0756338856188716e-05, + "loss": 59.9413, + "step": 120860 + }, + { + "epoch": 0.4883300944985597, + "grad_norm": 734.94287109375, + "learning_rate": 3.075294193323988e-05, + "loss": 81.9922, + "step": 120870 + }, + { + "epoch": 0.4883704957639273, + "grad_norm": 775.8441162109375, + "learning_rate": 3.074954489813449e-05, + "loss": 59.0604, + "step": 120880 + }, + { + "epoch": 0.48841089702929497, + "grad_norm": 464.788330078125, + "learning_rate": 3.074614775093874e-05, + "loss": 77.2058, + "step": 120890 + }, + { + "epoch": 0.4884512982946626, + "grad_norm": 250.41958618164062, + "learning_rate": 3.074275049171889e-05, + "loss": 40.2773, + "step": 120900 + }, + { + "epoch": 0.48849169956003025, + "grad_norm": 608.3348388671875, + "learning_rate": 3.073935312054117e-05, + "loss": 34.6437, + "step": 120910 + }, + { + "epoch": 0.48853210082539783, + "grad_norm": 1844.59033203125, + "learning_rate": 3.0735955637471794e-05, + "loss": 71.1501, + "step": 120920 + }, + { + "epoch": 0.48857250209076547, + "grad_norm": 385.8973388671875, + "learning_rate": 3.073255804257702e-05, + "loss": 56.5991, + "step": 120930 + }, + { + "epoch": 0.4886129033561331, + "grad_norm": 503.91558837890625, + "learning_rate": 3.072916033592307e-05, + "loss": 62.6247, + "step": 120940 + }, + { + "epoch": 0.48865330462150075, + "grad_norm": 590.6072387695312, + "learning_rate": 3.0725762517576195e-05, + "loss": 39.7041, + "step": 120950 + }, + { + "epoch": 0.4886937058868684, + "grad_norm": 369.4613952636719, + "learning_rate": 3.072236458760262e-05, + "loss": 37.5118, + "step": 120960 + }, + { + "epoch": 0.48873410715223603, + "grad_norm": 743.9142456054688, + "learning_rate": 3.071896654606862e-05, + "loss": 48.3398, + "step": 120970 + }, + { + "epoch": 0.4887745084176036, + "grad_norm": 611.325439453125, + "learning_rate": 3.0715568393040405e-05, + "loss": 82.7277, + "step": 120980 + }, + { + "epoch": 0.48881490968297125, + "grad_norm": 1614.0037841796875, + "learning_rate": 3.071217012858425e-05, + "loss": 102.5452, + "step": 120990 + }, + { + "epoch": 0.4888553109483389, + "grad_norm": 984.4487915039062, + "learning_rate": 3.0708771752766394e-05, + "loss": 56.9584, + "step": 121000 + }, + { + "epoch": 0.48889571221370653, + "grad_norm": 1377.5753173828125, + "learning_rate": 3.07053732656531e-05, + "loss": 75.7293, + "step": 121010 + }, + { + "epoch": 0.4889361134790742, + "grad_norm": 438.0140686035156, + "learning_rate": 3.070197466731061e-05, + "loss": 43.8571, + "step": 121020 + }, + { + "epoch": 0.4889765147444418, + "grad_norm": 710.8450317382812, + "learning_rate": 3.069857595780519e-05, + "loss": 66.9329, + "step": 121030 + }, + { + "epoch": 0.48901691600980945, + "grad_norm": 762.4956665039062, + "learning_rate": 3.06951771372031e-05, + "loss": 52.6469, + "step": 121040 + }, + { + "epoch": 0.48905731727517704, + "grad_norm": 732.955322265625, + "learning_rate": 3.06917782055706e-05, + "loss": 84.5691, + "step": 121050 + }, + { + "epoch": 0.4890977185405447, + "grad_norm": 899.1510009765625, + "learning_rate": 3.0688379162973955e-05, + "loss": 78.2393, + "step": 121060 + }, + { + "epoch": 0.4891381198059123, + "grad_norm": 798.7470092773438, + "learning_rate": 3.0684980009479424e-05, + "loss": 58.7748, + "step": 121070 + }, + { + "epoch": 0.48917852107127996, + "grad_norm": 897.6027221679688, + "learning_rate": 3.068158074515328e-05, + "loss": 52.4273, + "step": 121080 + }, + { + "epoch": 0.4892189223366476, + "grad_norm": 503.83392333984375, + "learning_rate": 3.0678181370061805e-05, + "loss": 59.4657, + "step": 121090 + }, + { + "epoch": 0.48925932360201524, + "grad_norm": 1317.728759765625, + "learning_rate": 3.0674781884271254e-05, + "loss": 93.741, + "step": 121100 + }, + { + "epoch": 0.4892997248673828, + "grad_norm": 1355.2291259765625, + "learning_rate": 3.067138228784791e-05, + "loss": 84.8171, + "step": 121110 + }, + { + "epoch": 0.48934012613275046, + "grad_norm": 1263.6253662109375, + "learning_rate": 3.0667982580858044e-05, + "loss": 65.6653, + "step": 121120 + }, + { + "epoch": 0.4893805273981181, + "grad_norm": 630.547607421875, + "learning_rate": 3.066458276336794e-05, + "loss": 62.2357, + "step": 121130 + }, + { + "epoch": 0.48942092866348574, + "grad_norm": 1021.1478271484375, + "learning_rate": 3.0661182835443884e-05, + "loss": 57.1426, + "step": 121140 + }, + { + "epoch": 0.4894613299288534, + "grad_norm": 1333.4888916015625, + "learning_rate": 3.065778279715215e-05, + "loss": 49.1796, + "step": 121150 + }, + { + "epoch": 0.489501731194221, + "grad_norm": 481.13165283203125, + "learning_rate": 3.0654382648559026e-05, + "loss": 65.8724, + "step": 121160 + }, + { + "epoch": 0.4895421324595886, + "grad_norm": 359.9427795410156, + "learning_rate": 3.065098238973081e-05, + "loss": 77.4488, + "step": 121170 + }, + { + "epoch": 0.48958253372495625, + "grad_norm": 274.55328369140625, + "learning_rate": 3.064758202073377e-05, + "loss": 54.833, + "step": 121180 + }, + { + "epoch": 0.4896229349903239, + "grad_norm": 489.7810974121094, + "learning_rate": 3.064418154163422e-05, + "loss": 54.8473, + "step": 121190 + }, + { + "epoch": 0.48966333625569153, + "grad_norm": 674.3745727539062, + "learning_rate": 3.064078095249844e-05, + "loss": 51.562, + "step": 121200 + }, + { + "epoch": 0.48970373752105917, + "grad_norm": 443.5577087402344, + "learning_rate": 3.0637380253392736e-05, + "loss": 68.5722, + "step": 121210 + }, + { + "epoch": 0.4897441387864268, + "grad_norm": 824.37158203125, + "learning_rate": 3.06339794443834e-05, + "loss": 46.6327, + "step": 121220 + }, + { + "epoch": 0.48978454005179445, + "grad_norm": 275.5827331542969, + "learning_rate": 3.063057852553674e-05, + "loss": 32.1468, + "step": 121230 + }, + { + "epoch": 0.48982494131716203, + "grad_norm": 573.8633422851562, + "learning_rate": 3.062717749691904e-05, + "loss": 58.9486, + "step": 121240 + }, + { + "epoch": 0.4898653425825297, + "grad_norm": 383.3703918457031, + "learning_rate": 3.062377635859663e-05, + "loss": 39.3934, + "step": 121250 + }, + { + "epoch": 0.4899057438478973, + "grad_norm": 942.3863525390625, + "learning_rate": 3.06203751106358e-05, + "loss": 49.6613, + "step": 121260 + }, + { + "epoch": 0.48994614511326495, + "grad_norm": 0.0, + "learning_rate": 3.0616973753102856e-05, + "loss": 37.8872, + "step": 121270 + }, + { + "epoch": 0.4899865463786326, + "grad_norm": 1657.1973876953125, + "learning_rate": 3.0613572286064125e-05, + "loss": 68.007, + "step": 121280 + }, + { + "epoch": 0.49002694764400023, + "grad_norm": 647.4043579101562, + "learning_rate": 3.061017070958591e-05, + "loss": 92.0626, + "step": 121290 + }, + { + "epoch": 0.4900673489093678, + "grad_norm": 885.7959594726562, + "learning_rate": 3.0606769023734536e-05, + "loss": 30.4313, + "step": 121300 + }, + { + "epoch": 0.49010775017473546, + "grad_norm": 1316.2650146484375, + "learning_rate": 3.060336722857631e-05, + "loss": 83.0374, + "step": 121310 + }, + { + "epoch": 0.4901481514401031, + "grad_norm": 1012.3235473632812, + "learning_rate": 3.059996532417754e-05, + "loss": 41.8788, + "step": 121320 + }, + { + "epoch": 0.49018855270547074, + "grad_norm": 449.5831604003906, + "learning_rate": 3.059656331060458e-05, + "loss": 55.4948, + "step": 121330 + }, + { + "epoch": 0.4902289539708384, + "grad_norm": 723.0786743164062, + "learning_rate": 3.0593161187923736e-05, + "loss": 68.319, + "step": 121340 + }, + { + "epoch": 0.490269355236206, + "grad_norm": 569.5224609375, + "learning_rate": 3.0589758956201327e-05, + "loss": 62.8122, + "step": 121350 + }, + { + "epoch": 0.49030975650157366, + "grad_norm": 572.9276733398438, + "learning_rate": 3.058635661550369e-05, + "loss": 60.2772, + "step": 121360 + }, + { + "epoch": 0.49035015776694124, + "grad_norm": 379.9477844238281, + "learning_rate": 3.058295416589716e-05, + "loss": 69.8202, + "step": 121370 + }, + { + "epoch": 0.4903905590323089, + "grad_norm": 534.4418334960938, + "learning_rate": 3.0579551607448066e-05, + "loss": 51.7835, + "step": 121380 + }, + { + "epoch": 0.4904309602976765, + "grad_norm": 803.1372680664062, + "learning_rate": 3.057614894022274e-05, + "loss": 43.7992, + "step": 121390 + }, + { + "epoch": 0.49047136156304416, + "grad_norm": 2386.29736328125, + "learning_rate": 3.0572746164287514e-05, + "loss": 54.0955, + "step": 121400 + }, + { + "epoch": 0.4905117628284118, + "grad_norm": 997.156005859375, + "learning_rate": 3.0569343279708734e-05, + "loss": 65.676, + "step": 121410 + }, + { + "epoch": 0.49055216409377944, + "grad_norm": 525.251953125, + "learning_rate": 3.056594028655274e-05, + "loss": 52.9306, + "step": 121420 + }, + { + "epoch": 0.490592565359147, + "grad_norm": 1752.92236328125, + "learning_rate": 3.056253718488588e-05, + "loss": 56.5857, + "step": 121430 + }, + { + "epoch": 0.49063296662451467, + "grad_norm": 704.6538696289062, + "learning_rate": 3.055913397477448e-05, + "loss": 68.621, + "step": 121440 + }, + { + "epoch": 0.4906733678898823, + "grad_norm": 615.273681640625, + "learning_rate": 3.0555730656284914e-05, + "loss": 54.718, + "step": 121450 + }, + { + "epoch": 0.49071376915524995, + "grad_norm": 676.6206665039062, + "learning_rate": 3.0552327229483515e-05, + "loss": 60.8578, + "step": 121460 + }, + { + "epoch": 0.4907541704206176, + "grad_norm": 512.4281005859375, + "learning_rate": 3.054892369443663e-05, + "loss": 48.8229, + "step": 121470 + }, + { + "epoch": 0.4907945716859852, + "grad_norm": 885.903076171875, + "learning_rate": 3.054552005121064e-05, + "loss": 83.9689, + "step": 121480 + }, + { + "epoch": 0.4908349729513528, + "grad_norm": 405.67919921875, + "learning_rate": 3.054211629987187e-05, + "loss": 50.5087, + "step": 121490 + }, + { + "epoch": 0.49087537421672045, + "grad_norm": 430.7709045410156, + "learning_rate": 3.053871244048669e-05, + "loss": 54.2436, + "step": 121500 + }, + { + "epoch": 0.4909157754820881, + "grad_norm": 394.2481689453125, + "learning_rate": 3.0535308473121455e-05, + "loss": 37.6645, + "step": 121510 + }, + { + "epoch": 0.49095617674745573, + "grad_norm": 2119.940673828125, + "learning_rate": 3.053190439784253e-05, + "loss": 85.2859, + "step": 121520 + }, + { + "epoch": 0.49099657801282337, + "grad_norm": 509.3744201660156, + "learning_rate": 3.052850021471629e-05, + "loss": 74.4669, + "step": 121530 + }, + { + "epoch": 0.491036979278191, + "grad_norm": 0.0, + "learning_rate": 3.052509592380909e-05, + "loss": 51.207, + "step": 121540 + }, + { + "epoch": 0.49107738054355865, + "grad_norm": 615.7112426757812, + "learning_rate": 3.052169152518729e-05, + "loss": 37.4327, + "step": 121550 + }, + { + "epoch": 0.49111778180892623, + "grad_norm": 624.2453002929688, + "learning_rate": 3.051828701891729e-05, + "loss": 52.5991, + "step": 121560 + }, + { + "epoch": 0.4911581830742939, + "grad_norm": 558.4168090820312, + "learning_rate": 3.0514882405065432e-05, + "loss": 63.3106, + "step": 121570 + }, + { + "epoch": 0.4911985843396615, + "grad_norm": 509.6629333496094, + "learning_rate": 3.0511477683698108e-05, + "loss": 63.4842, + "step": 121580 + }, + { + "epoch": 0.49123898560502915, + "grad_norm": 806.631103515625, + "learning_rate": 3.050807285488168e-05, + "loss": 62.4659, + "step": 121590 + }, + { + "epoch": 0.4912793868703968, + "grad_norm": 814.9642944335938, + "learning_rate": 3.050466791868254e-05, + "loss": 66.1909, + "step": 121600 + }, + { + "epoch": 0.49131978813576443, + "grad_norm": 712.5091552734375, + "learning_rate": 3.0501262875167063e-05, + "loss": 74.5207, + "step": 121610 + }, + { + "epoch": 0.491360189401132, + "grad_norm": 615.8709106445312, + "learning_rate": 3.0497857724401642e-05, + "loss": 79.785, + "step": 121620 + }, + { + "epoch": 0.49140059066649966, + "grad_norm": 729.620361328125, + "learning_rate": 3.0494452466452644e-05, + "loss": 75.2466, + "step": 121630 + }, + { + "epoch": 0.4914409919318673, + "grad_norm": 232.82257080078125, + "learning_rate": 3.049104710138647e-05, + "loss": 40.4371, + "step": 121640 + }, + { + "epoch": 0.49148139319723494, + "grad_norm": 528.4829711914062, + "learning_rate": 3.0487641629269516e-05, + "loss": 49.6708, + "step": 121650 + }, + { + "epoch": 0.4915217944626026, + "grad_norm": 2002.76416015625, + "learning_rate": 3.0484236050168153e-05, + "loss": 52.0956, + "step": 121660 + }, + { + "epoch": 0.4915621957279702, + "grad_norm": 575.5763549804688, + "learning_rate": 3.048083036414878e-05, + "loss": 39.8568, + "step": 121670 + }, + { + "epoch": 0.4916025969933378, + "grad_norm": 0.0, + "learning_rate": 3.0477424571277807e-05, + "loss": 39.302, + "step": 121680 + }, + { + "epoch": 0.49164299825870544, + "grad_norm": 412.2666015625, + "learning_rate": 3.047401867162162e-05, + "loss": 46.3011, + "step": 121690 + }, + { + "epoch": 0.4916833995240731, + "grad_norm": 749.7333374023438, + "learning_rate": 3.0470612665246618e-05, + "loss": 63.5483, + "step": 121700 + }, + { + "epoch": 0.4917238007894407, + "grad_norm": 686.0115966796875, + "learning_rate": 3.0467206552219208e-05, + "loss": 37.7942, + "step": 121710 + }, + { + "epoch": 0.49176420205480836, + "grad_norm": 1040.0323486328125, + "learning_rate": 3.0463800332605784e-05, + "loss": 69.1257, + "step": 121720 + }, + { + "epoch": 0.491804603320176, + "grad_norm": 547.2194213867188, + "learning_rate": 3.046039400647277e-05, + "loss": 28.7517, + "step": 121730 + }, + { + "epoch": 0.49184500458554364, + "grad_norm": 1607.9515380859375, + "learning_rate": 3.0456987573886564e-05, + "loss": 67.1605, + "step": 121740 + }, + { + "epoch": 0.49188540585091123, + "grad_norm": 1343.8623046875, + "learning_rate": 3.045358103491357e-05, + "loss": 61.9395, + "step": 121750 + }, + { + "epoch": 0.49192580711627887, + "grad_norm": 360.9034729003906, + "learning_rate": 3.0450174389620205e-05, + "loss": 60.9107, + "step": 121760 + }, + { + "epoch": 0.4919662083816465, + "grad_norm": 658.2755126953125, + "learning_rate": 3.044676763807288e-05, + "loss": 44.2099, + "step": 121770 + }, + { + "epoch": 0.49200660964701415, + "grad_norm": 463.0184326171875, + "learning_rate": 3.044336078033803e-05, + "loss": 50.352, + "step": 121780 + }, + { + "epoch": 0.4920470109123818, + "grad_norm": 1313.499755859375, + "learning_rate": 3.043995381648205e-05, + "loss": 65.2679, + "step": 121790 + }, + { + "epoch": 0.49208741217774943, + "grad_norm": 662.1671142578125, + "learning_rate": 3.0436546746571372e-05, + "loss": 47.1219, + "step": 121800 + }, + { + "epoch": 0.492127813443117, + "grad_norm": 3354.996826171875, + "learning_rate": 3.0433139570672407e-05, + "loss": 87.668, + "step": 121810 + }, + { + "epoch": 0.49216821470848465, + "grad_norm": 931.5249633789062, + "learning_rate": 3.0429732288851603e-05, + "loss": 44.4428, + "step": 121820 + }, + { + "epoch": 0.4922086159738523, + "grad_norm": 798.1958618164062, + "learning_rate": 3.0426324901175374e-05, + "loss": 51.0042, + "step": 121830 + }, + { + "epoch": 0.49224901723921993, + "grad_norm": 674.7509765625, + "learning_rate": 3.0422917407710137e-05, + "loss": 33.4516, + "step": 121840 + }, + { + "epoch": 0.4922894185045876, + "grad_norm": 734.49169921875, + "learning_rate": 3.0419509808522334e-05, + "loss": 77.9193, + "step": 121850 + }, + { + "epoch": 0.4923298197699552, + "grad_norm": 561.5974731445312, + "learning_rate": 3.0416102103678402e-05, + "loss": 52.3013, + "step": 121860 + }, + { + "epoch": 0.49237022103532285, + "grad_norm": 292.3045959472656, + "learning_rate": 3.041269429324477e-05, + "loss": 31.1926, + "step": 121870 + }, + { + "epoch": 0.49241062230069044, + "grad_norm": 613.70361328125, + "learning_rate": 3.040928637728787e-05, + "loss": 56.5007, + "step": 121880 + }, + { + "epoch": 0.4924510235660581, + "grad_norm": 1232.8988037109375, + "learning_rate": 3.040587835587415e-05, + "loss": 73.1134, + "step": 121890 + }, + { + "epoch": 0.4924914248314257, + "grad_norm": 831.9037475585938, + "learning_rate": 3.0402470229070056e-05, + "loss": 46.9302, + "step": 121900 + }, + { + "epoch": 0.49253182609679336, + "grad_norm": 409.0111389160156, + "learning_rate": 3.039906199694202e-05, + "loss": 39.5951, + "step": 121910 + }, + { + "epoch": 0.492572227362161, + "grad_norm": 646.8606567382812, + "learning_rate": 3.0395653659556488e-05, + "loss": 47.5818, + "step": 121920 + }, + { + "epoch": 0.49261262862752864, + "grad_norm": 363.55609130859375, + "learning_rate": 3.039224521697991e-05, + "loss": 72.8517, + "step": 121930 + }, + { + "epoch": 0.4926530298928962, + "grad_norm": 498.0539855957031, + "learning_rate": 3.0388836669278738e-05, + "loss": 53.5023, + "step": 121940 + }, + { + "epoch": 0.49269343115826386, + "grad_norm": 943.4515991210938, + "learning_rate": 3.038542801651941e-05, + "loss": 59.0278, + "step": 121950 + }, + { + "epoch": 0.4927338324236315, + "grad_norm": 3113.823486328125, + "learning_rate": 3.0382019258768403e-05, + "loss": 72.7263, + "step": 121960 + }, + { + "epoch": 0.49277423368899914, + "grad_norm": 509.8355407714844, + "learning_rate": 3.0378610396092154e-05, + "loss": 51.3322, + "step": 121970 + }, + { + "epoch": 0.4928146349543668, + "grad_norm": 791.736572265625, + "learning_rate": 3.0375201428557132e-05, + "loss": 48.9846, + "step": 121980 + }, + { + "epoch": 0.4928550362197344, + "grad_norm": 532.8504638671875, + "learning_rate": 3.0371792356229783e-05, + "loss": 35.4836, + "step": 121990 + }, + { + "epoch": 0.492895437485102, + "grad_norm": 419.5154113769531, + "learning_rate": 3.0368383179176585e-05, + "loss": 69.5128, + "step": 122000 + }, + { + "epoch": 0.49293583875046965, + "grad_norm": 3489.129150390625, + "learning_rate": 3.036497389746399e-05, + "loss": 68.8055, + "step": 122010 + }, + { + "epoch": 0.4929762400158373, + "grad_norm": 1044.6541748046875, + "learning_rate": 3.0361564511158457e-05, + "loss": 62.8193, + "step": 122020 + }, + { + "epoch": 0.4930166412812049, + "grad_norm": 1858.55615234375, + "learning_rate": 3.0358155020326477e-05, + "loss": 77.7083, + "step": 122030 + }, + { + "epoch": 0.49305704254657257, + "grad_norm": 682.7332153320312, + "learning_rate": 3.0354745425034498e-05, + "loss": 64.1814, + "step": 122040 + }, + { + "epoch": 0.4930974438119402, + "grad_norm": 653.3331298828125, + "learning_rate": 3.0351335725349e-05, + "loss": 49.824, + "step": 122050 + }, + { + "epoch": 0.49313784507730785, + "grad_norm": 859.71337890625, + "learning_rate": 3.0347925921336463e-05, + "loss": 54.182, + "step": 122060 + }, + { + "epoch": 0.49317824634267543, + "grad_norm": 1686.3477783203125, + "learning_rate": 3.0344516013063357e-05, + "loss": 43.8826, + "step": 122070 + }, + { + "epoch": 0.49321864760804307, + "grad_norm": 1858.2529296875, + "learning_rate": 3.034110600059616e-05, + "loss": 42.0115, + "step": 122080 + }, + { + "epoch": 0.4932590488734107, + "grad_norm": 857.298828125, + "learning_rate": 3.0337695884001343e-05, + "loss": 44.892, + "step": 122090 + }, + { + "epoch": 0.49329945013877835, + "grad_norm": 3229.34765625, + "learning_rate": 3.0334285663345404e-05, + "loss": 49.8066, + "step": 122100 + }, + { + "epoch": 0.493339851404146, + "grad_norm": 782.3743896484375, + "learning_rate": 3.033087533869482e-05, + "loss": 46.6393, + "step": 122110 + }, + { + "epoch": 0.49338025266951363, + "grad_norm": 1177.9285888671875, + "learning_rate": 3.032746491011607e-05, + "loss": 54.9906, + "step": 122120 + }, + { + "epoch": 0.4934206539348812, + "grad_norm": 997.4578247070312, + "learning_rate": 3.0324054377675654e-05, + "loss": 47.8525, + "step": 122130 + }, + { + "epoch": 0.49346105520024885, + "grad_norm": 1065.852294921875, + "learning_rate": 3.032064374144005e-05, + "loss": 58.0819, + "step": 122140 + }, + { + "epoch": 0.4935014564656165, + "grad_norm": 2079.648681640625, + "learning_rate": 3.031723300147577e-05, + "loss": 95.1813, + "step": 122150 + }, + { + "epoch": 0.49354185773098413, + "grad_norm": 759.1595458984375, + "learning_rate": 3.0313822157849287e-05, + "loss": 54.4963, + "step": 122160 + }, + { + "epoch": 0.4935822589963518, + "grad_norm": 2073.466064453125, + "learning_rate": 3.031041121062711e-05, + "loss": 70.8082, + "step": 122170 + }, + { + "epoch": 0.4936226602617194, + "grad_norm": 1743.0040283203125, + "learning_rate": 3.030700015987573e-05, + "loss": 57.2004, + "step": 122180 + }, + { + "epoch": 0.49366306152708705, + "grad_norm": 714.07275390625, + "learning_rate": 3.030358900566165e-05, + "loss": 60.7911, + "step": 122190 + }, + { + "epoch": 0.49370346279245464, + "grad_norm": 1210.780029296875, + "learning_rate": 3.0300177748051373e-05, + "loss": 57.1338, + "step": 122200 + }, + { + "epoch": 0.4937438640578223, + "grad_norm": 815.665771484375, + "learning_rate": 3.02967663871114e-05, + "loss": 74.5411, + "step": 122210 + }, + { + "epoch": 0.4937842653231899, + "grad_norm": 1104.837646484375, + "learning_rate": 3.0293354922908235e-05, + "loss": 69.4025, + "step": 122220 + }, + { + "epoch": 0.49382466658855756, + "grad_norm": 693.127197265625, + "learning_rate": 3.0289943355508392e-05, + "loss": 63.0513, + "step": 122230 + }, + { + "epoch": 0.4938650678539252, + "grad_norm": 683.1317749023438, + "learning_rate": 3.028653168497838e-05, + "loss": 46.7121, + "step": 122240 + }, + { + "epoch": 0.49390546911929284, + "grad_norm": 317.39532470703125, + "learning_rate": 3.028311991138472e-05, + "loss": 63.5555, + "step": 122250 + }, + { + "epoch": 0.4939458703846604, + "grad_norm": 466.8677062988281, + "learning_rate": 3.0279708034793907e-05, + "loss": 45.8454, + "step": 122260 + }, + { + "epoch": 0.49398627165002806, + "grad_norm": 1304.4661865234375, + "learning_rate": 3.027629605527248e-05, + "loss": 67.0937, + "step": 122270 + }, + { + "epoch": 0.4940266729153957, + "grad_norm": 341.1806335449219, + "learning_rate": 3.0272883972886935e-05, + "loss": 32.4989, + "step": 122280 + }, + { + "epoch": 0.49406707418076334, + "grad_norm": 668.6641235351562, + "learning_rate": 3.02694717877038e-05, + "loss": 58.9114, + "step": 122290 + }, + { + "epoch": 0.494107475446131, + "grad_norm": 1312.1033935546875, + "learning_rate": 3.02660594997896e-05, + "loss": 52.9595, + "step": 122300 + }, + { + "epoch": 0.4941478767114986, + "grad_norm": 206.2220916748047, + "learning_rate": 3.0262647109210867e-05, + "loss": 42.2671, + "step": 122310 + }, + { + "epoch": 0.4941882779768662, + "grad_norm": 533.5427856445312, + "learning_rate": 3.0259234616034116e-05, + "loss": 54.6035, + "step": 122320 + }, + { + "epoch": 0.49422867924223385, + "grad_norm": 389.654296875, + "learning_rate": 3.0255822020325873e-05, + "loss": 46.6373, + "step": 122330 + }, + { + "epoch": 0.4942690805076015, + "grad_norm": 509.89276123046875, + "learning_rate": 3.025240932215268e-05, + "loss": 65.848, + "step": 122340 + }, + { + "epoch": 0.49430948177296913, + "grad_norm": 738.1375732421875, + "learning_rate": 3.024899652158107e-05, + "loss": 46.7456, + "step": 122350 + }, + { + "epoch": 0.49434988303833677, + "grad_norm": 272.38751220703125, + "learning_rate": 3.0245583618677558e-05, + "loss": 58.7914, + "step": 122360 + }, + { + "epoch": 0.4943902843037044, + "grad_norm": 599.438232421875, + "learning_rate": 3.0242170613508692e-05, + "loss": 57.4107, + "step": 122370 + }, + { + "epoch": 0.49443068556907205, + "grad_norm": 323.33062744140625, + "learning_rate": 3.0238757506141012e-05, + "loss": 48.1619, + "step": 122380 + }, + { + "epoch": 0.49447108683443963, + "grad_norm": 1464.4964599609375, + "learning_rate": 3.0235344296641067e-05, + "loss": 58.9648, + "step": 122390 + }, + { + "epoch": 0.4945114880998073, + "grad_norm": 2316.94873046875, + "learning_rate": 3.023193098507538e-05, + "loss": 80.6663, + "step": 122400 + }, + { + "epoch": 0.4945518893651749, + "grad_norm": 777.03173828125, + "learning_rate": 3.0228517571510507e-05, + "loss": 74.6564, + "step": 122410 + }, + { + "epoch": 0.49459229063054255, + "grad_norm": 832.2376098632812, + "learning_rate": 3.0225104056013e-05, + "loss": 50.8063, + "step": 122420 + }, + { + "epoch": 0.4946326918959102, + "grad_norm": 1374.7584228515625, + "learning_rate": 3.0221690438649386e-05, + "loss": 84.333, + "step": 122430 + }, + { + "epoch": 0.49467309316127783, + "grad_norm": 586.2657470703125, + "learning_rate": 3.0218276719486244e-05, + "loss": 54.8292, + "step": 122440 + }, + { + "epoch": 0.4947134944266454, + "grad_norm": 362.1689147949219, + "learning_rate": 3.0214862898590095e-05, + "loss": 45.3425, + "step": 122450 + }, + { + "epoch": 0.49475389569201306, + "grad_norm": 740.5028076171875, + "learning_rate": 3.021144897602752e-05, + "loss": 65.865, + "step": 122460 + }, + { + "epoch": 0.4947942969573807, + "grad_norm": 776.755615234375, + "learning_rate": 3.020803495186506e-05, + "loss": 45.3244, + "step": 122470 + }, + { + "epoch": 0.49483469822274834, + "grad_norm": 1503.520263671875, + "learning_rate": 3.020462082616928e-05, + "loss": 44.012, + "step": 122480 + }, + { + "epoch": 0.494875099488116, + "grad_norm": 772.3239135742188, + "learning_rate": 3.0201206599006733e-05, + "loss": 50.5993, + "step": 122490 + }, + { + "epoch": 0.4949155007534836, + "grad_norm": 343.9445495605469, + "learning_rate": 3.0197792270443982e-05, + "loss": 43.6323, + "step": 122500 + }, + { + "epoch": 0.49495590201885126, + "grad_norm": 765.3672485351562, + "learning_rate": 3.0194377840547606e-05, + "loss": 48.1465, + "step": 122510 + }, + { + "epoch": 0.49499630328421884, + "grad_norm": 936.2794799804688, + "learning_rate": 3.0190963309384156e-05, + "loss": 58.2882, + "step": 122520 + }, + { + "epoch": 0.4950367045495865, + "grad_norm": 770.3196411132812, + "learning_rate": 3.01875486770202e-05, + "loss": 43.9133, + "step": 122530 + }, + { + "epoch": 0.4950771058149541, + "grad_norm": 411.43255615234375, + "learning_rate": 3.0184133943522314e-05, + "loss": 50.147, + "step": 122540 + }, + { + "epoch": 0.49511750708032176, + "grad_norm": 589.0484619140625, + "learning_rate": 3.0180719108957063e-05, + "loss": 48.991, + "step": 122550 + }, + { + "epoch": 0.4951579083456894, + "grad_norm": 317.48046875, + "learning_rate": 3.0177304173391037e-05, + "loss": 47.1381, + "step": 122560 + }, + { + "epoch": 0.49519830961105704, + "grad_norm": 2199.10888671875, + "learning_rate": 3.0173889136890786e-05, + "loss": 60.0664, + "step": 122570 + }, + { + "epoch": 0.4952387108764246, + "grad_norm": 2371.26025390625, + "learning_rate": 3.0170473999522915e-05, + "loss": 53.2513, + "step": 122580 + }, + { + "epoch": 0.49527911214179227, + "grad_norm": 2204.5810546875, + "learning_rate": 3.016705876135399e-05, + "loss": 75.0456, + "step": 122590 + }, + { + "epoch": 0.4953195134071599, + "grad_norm": 1063.3651123046875, + "learning_rate": 3.016364342245059e-05, + "loss": 85.0364, + "step": 122600 + }, + { + "epoch": 0.49535991467252755, + "grad_norm": 1718.4310302734375, + "learning_rate": 3.016022798287931e-05, + "loss": 66.0194, + "step": 122610 + }, + { + "epoch": 0.4954003159378952, + "grad_norm": 953.0413208007812, + "learning_rate": 3.0156812442706715e-05, + "loss": 54.6552, + "step": 122620 + }, + { + "epoch": 0.4954407172032628, + "grad_norm": 702.347900390625, + "learning_rate": 3.015339680199941e-05, + "loss": 39.405, + "step": 122630 + }, + { + "epoch": 0.4954811184686304, + "grad_norm": 477.78302001953125, + "learning_rate": 3.0149981060823995e-05, + "loss": 39.2307, + "step": 122640 + }, + { + "epoch": 0.49552151973399805, + "grad_norm": 893.2791137695312, + "learning_rate": 3.0146565219247036e-05, + "loss": 70.0523, + "step": 122650 + }, + { + "epoch": 0.4955619209993657, + "grad_norm": 493.2530517578125, + "learning_rate": 3.0143149277335138e-05, + "loss": 59.8175, + "step": 122660 + }, + { + "epoch": 0.49560232226473333, + "grad_norm": 2252.927734375, + "learning_rate": 3.01397332351549e-05, + "loss": 79.7985, + "step": 122670 + }, + { + "epoch": 0.49564272353010097, + "grad_norm": 639.860595703125, + "learning_rate": 3.013631709277292e-05, + "loss": 80.3618, + "step": 122680 + }, + { + "epoch": 0.4956831247954686, + "grad_norm": 542.6490478515625, + "learning_rate": 3.013290085025579e-05, + "loss": 64.4751, + "step": 122690 + }, + { + "epoch": 0.49572352606083625, + "grad_norm": 933.4337158203125, + "learning_rate": 3.0129484507670115e-05, + "loss": 57.9422, + "step": 122700 + }, + { + "epoch": 0.49576392732620383, + "grad_norm": 1414.308837890625, + "learning_rate": 3.0126068065082504e-05, + "loss": 50.3865, + "step": 122710 + }, + { + "epoch": 0.4958043285915715, + "grad_norm": 900.0178833007812, + "learning_rate": 3.0122651522559553e-05, + "loss": 54.273, + "step": 122720 + }, + { + "epoch": 0.4958447298569391, + "grad_norm": 1027.331787109375, + "learning_rate": 3.0119234880167867e-05, + "loss": 46.3332, + "step": 122730 + }, + { + "epoch": 0.49588513112230675, + "grad_norm": 527.3615112304688, + "learning_rate": 3.0115818137974067e-05, + "loss": 98.7442, + "step": 122740 + }, + { + "epoch": 0.4959255323876744, + "grad_norm": 546.8230590820312, + "learning_rate": 3.0112401296044757e-05, + "loss": 68.4313, + "step": 122750 + }, + { + "epoch": 0.49596593365304203, + "grad_norm": 835.651611328125, + "learning_rate": 3.0108984354446556e-05, + "loss": 45.3258, + "step": 122760 + }, + { + "epoch": 0.4960063349184096, + "grad_norm": 459.29144287109375, + "learning_rate": 3.0105567313246074e-05, + "loss": 70.6443, + "step": 122770 + }, + { + "epoch": 0.49604673618377726, + "grad_norm": 1120.5982666015625, + "learning_rate": 3.010215017250993e-05, + "loss": 73.4087, + "step": 122780 + }, + { + "epoch": 0.4960871374491449, + "grad_norm": 815.575927734375, + "learning_rate": 3.0098732932304734e-05, + "loss": 51.7506, + "step": 122790 + }, + { + "epoch": 0.49612753871451254, + "grad_norm": 529.2800903320312, + "learning_rate": 3.0095315592697126e-05, + "loss": 43.3278, + "step": 122800 + }, + { + "epoch": 0.4961679399798802, + "grad_norm": 352.13128662109375, + "learning_rate": 3.0091898153753705e-05, + "loss": 52.9101, + "step": 122810 + }, + { + "epoch": 0.4962083412452478, + "grad_norm": 2164.66064453125, + "learning_rate": 3.0088480615541113e-05, + "loss": 69.647, + "step": 122820 + }, + { + "epoch": 0.49624874251061546, + "grad_norm": 468.87158203125, + "learning_rate": 3.0085062978125967e-05, + "loss": 65.0103, + "step": 122830 + }, + { + "epoch": 0.49628914377598304, + "grad_norm": 771.53857421875, + "learning_rate": 3.008164524157491e-05, + "loss": 47.0131, + "step": 122840 + }, + { + "epoch": 0.4963295450413507, + "grad_norm": 630.9580078125, + "learning_rate": 3.0078227405954557e-05, + "loss": 36.1967, + "step": 122850 + }, + { + "epoch": 0.4963699463067183, + "grad_norm": 632.0338745117188, + "learning_rate": 3.007480947133155e-05, + "loss": 78.9644, + "step": 122860 + }, + { + "epoch": 0.49641034757208596, + "grad_norm": 748.582763671875, + "learning_rate": 3.0071391437772516e-05, + "loss": 52.5742, + "step": 122870 + }, + { + "epoch": 0.4964507488374536, + "grad_norm": 402.97882080078125, + "learning_rate": 3.00679733053441e-05, + "loss": 49.2446, + "step": 122880 + }, + { + "epoch": 0.49649115010282124, + "grad_norm": 814.5822143554688, + "learning_rate": 3.0064555074112927e-05, + "loss": 49.1269, + "step": 122890 + }, + { + "epoch": 0.49653155136818883, + "grad_norm": 484.45751953125, + "learning_rate": 3.0061136744145652e-05, + "loss": 46.5377, + "step": 122900 + }, + { + "epoch": 0.49657195263355647, + "grad_norm": 1381.5684814453125, + "learning_rate": 3.0057718315508905e-05, + "loss": 56.9945, + "step": 122910 + }, + { + "epoch": 0.4966123538989241, + "grad_norm": 1052.2681884765625, + "learning_rate": 3.005429978826934e-05, + "loss": 54.9941, + "step": 122920 + }, + { + "epoch": 0.49665275516429175, + "grad_norm": 1034.386474609375, + "learning_rate": 3.0050881162493593e-05, + "loss": 46.4848, + "step": 122930 + }, + { + "epoch": 0.4966931564296594, + "grad_norm": 919.5922241210938, + "learning_rate": 3.004746243824833e-05, + "loss": 56.9942, + "step": 122940 + }, + { + "epoch": 0.49673355769502703, + "grad_norm": 787.20947265625, + "learning_rate": 3.0044043615600175e-05, + "loss": 65.2977, + "step": 122950 + }, + { + "epoch": 0.4967739589603946, + "grad_norm": 0.0, + "learning_rate": 3.0040624694615803e-05, + "loss": 41.2134, + "step": 122960 + }, + { + "epoch": 0.49681436022576225, + "grad_norm": 683.5494995117188, + "learning_rate": 3.003720567536185e-05, + "loss": 72.0073, + "step": 122970 + }, + { + "epoch": 0.4968547614911299, + "grad_norm": 575.966796875, + "learning_rate": 3.003378655790498e-05, + "loss": 65.4787, + "step": 122980 + }, + { + "epoch": 0.49689516275649753, + "grad_norm": 1353.0836181640625, + "learning_rate": 3.0030367342311848e-05, + "loss": 67.0195, + "step": 122990 + }, + { + "epoch": 0.4969355640218652, + "grad_norm": 590.2861328125, + "learning_rate": 3.002694802864912e-05, + "loss": 53.1483, + "step": 123000 + }, + { + "epoch": 0.4969759652872328, + "grad_norm": 1127.41259765625, + "learning_rate": 3.002352861698345e-05, + "loss": 88.6338, + "step": 123010 + }, + { + "epoch": 0.49701636655260045, + "grad_norm": 902.4027099609375, + "learning_rate": 3.00201091073815e-05, + "loss": 39.8632, + "step": 123020 + }, + { + "epoch": 0.49705676781796804, + "grad_norm": 760.8033447265625, + "learning_rate": 3.0016689499909945e-05, + "loss": 37.9242, + "step": 123030 + }, + { + "epoch": 0.4970971690833357, + "grad_norm": 612.1544189453125, + "learning_rate": 3.0013269794635446e-05, + "loss": 56.6241, + "step": 123040 + }, + { + "epoch": 0.4971375703487033, + "grad_norm": 777.5595092773438, + "learning_rate": 3.0009849991624662e-05, + "loss": 82.5385, + "step": 123050 + }, + { + "epoch": 0.49717797161407096, + "grad_norm": 1702.484619140625, + "learning_rate": 3.0006430090944277e-05, + "loss": 59.1968, + "step": 123060 + }, + { + "epoch": 0.4972183728794386, + "grad_norm": 537.38134765625, + "learning_rate": 3.000301009266096e-05, + "loss": 46.0222, + "step": 123070 + }, + { + "epoch": 0.49725877414480624, + "grad_norm": 305.5016784667969, + "learning_rate": 2.9999589996841386e-05, + "loss": 57.2696, + "step": 123080 + }, + { + "epoch": 0.4972991754101738, + "grad_norm": 844.5637817382812, + "learning_rate": 2.9996169803552233e-05, + "loss": 56.0423, + "step": 123090 + }, + { + "epoch": 0.49733957667554146, + "grad_norm": 883.1146850585938, + "learning_rate": 2.9992749512860173e-05, + "loss": 32.5165, + "step": 123100 + }, + { + "epoch": 0.4973799779409091, + "grad_norm": 1908.025634765625, + "learning_rate": 2.99893291248319e-05, + "loss": 110.2845, + "step": 123110 + }, + { + "epoch": 0.49742037920627674, + "grad_norm": 1058.1068115234375, + "learning_rate": 2.9985908639534075e-05, + "loss": 67.6201, + "step": 123120 + }, + { + "epoch": 0.4974607804716444, + "grad_norm": 1846.614013671875, + "learning_rate": 2.998248805703341e-05, + "loss": 49.1556, + "step": 123130 + }, + { + "epoch": 0.497501181737012, + "grad_norm": 1011.847412109375, + "learning_rate": 2.9979067377396565e-05, + "loss": 25.5965, + "step": 123140 + }, + { + "epoch": 0.49754158300237966, + "grad_norm": 627.6660766601562, + "learning_rate": 2.9975646600690234e-05, + "loss": 34.3208, + "step": 123150 + }, + { + "epoch": 0.49758198426774725, + "grad_norm": 770.2060546875, + "learning_rate": 2.9972225726981113e-05, + "loss": 52.1218, + "step": 123160 + }, + { + "epoch": 0.4976223855331149, + "grad_norm": 398.234130859375, + "learning_rate": 2.99688047563359e-05, + "loss": 60.0683, + "step": 123170 + }, + { + "epoch": 0.4976627867984825, + "grad_norm": 1440.9393310546875, + "learning_rate": 2.996538368882127e-05, + "loss": 94.9321, + "step": 123180 + }, + { + "epoch": 0.49770318806385017, + "grad_norm": 580.39892578125, + "learning_rate": 2.9961962524503927e-05, + "loss": 46.6695, + "step": 123190 + }, + { + "epoch": 0.4977435893292178, + "grad_norm": 402.80419921875, + "learning_rate": 2.9958541263450584e-05, + "loss": 58.4956, + "step": 123200 + }, + { + "epoch": 0.49778399059458545, + "grad_norm": 709.5952758789062, + "learning_rate": 2.9955119905727925e-05, + "loss": 54.5307, + "step": 123210 + }, + { + "epoch": 0.49782439185995303, + "grad_norm": 418.1485290527344, + "learning_rate": 2.995169845140264e-05, + "loss": 33.9692, + "step": 123220 + }, + { + "epoch": 0.49786479312532067, + "grad_norm": 555.658203125, + "learning_rate": 2.994827690054145e-05, + "loss": 57.7686, + "step": 123230 + }, + { + "epoch": 0.4979051943906883, + "grad_norm": 1826.1614990234375, + "learning_rate": 2.9944855253211052e-05, + "loss": 59.2031, + "step": 123240 + }, + { + "epoch": 0.49794559565605595, + "grad_norm": 728.795166015625, + "learning_rate": 2.9941433509478156e-05, + "loss": 64.7671, + "step": 123250 + }, + { + "epoch": 0.4979859969214236, + "grad_norm": 562.8650512695312, + "learning_rate": 2.993801166940947e-05, + "loss": 30.4642, + "step": 123260 + }, + { + "epoch": 0.49802639818679123, + "grad_norm": 718.48681640625, + "learning_rate": 2.9934589733071704e-05, + "loss": 43.2636, + "step": 123270 + }, + { + "epoch": 0.4980667994521588, + "grad_norm": 775.666748046875, + "learning_rate": 2.9931167700531578e-05, + "loss": 64.5946, + "step": 123280 + }, + { + "epoch": 0.49810720071752645, + "grad_norm": 941.3685302734375, + "learning_rate": 2.9927745571855786e-05, + "loss": 68.1941, + "step": 123290 + }, + { + "epoch": 0.4981476019828941, + "grad_norm": 806.8750610351562, + "learning_rate": 2.9924323347111073e-05, + "loss": 75.4404, + "step": 123300 + }, + { + "epoch": 0.49818800324826173, + "grad_norm": 1879.86279296875, + "learning_rate": 2.992090102636413e-05, + "loss": 82.7253, + "step": 123310 + }, + { + "epoch": 0.4982284045136294, + "grad_norm": 500.6525573730469, + "learning_rate": 2.991747860968168e-05, + "loss": 59.617, + "step": 123320 + }, + { + "epoch": 0.498268805778997, + "grad_norm": 576.9788818359375, + "learning_rate": 2.9914056097130473e-05, + "loss": 37.7997, + "step": 123330 + }, + { + "epoch": 0.49830920704436465, + "grad_norm": 1132.8011474609375, + "learning_rate": 2.9910633488777196e-05, + "loss": 60.6151, + "step": 123340 + }, + { + "epoch": 0.49834960830973224, + "grad_norm": 1793.87109375, + "learning_rate": 2.99072107846886e-05, + "loss": 49.606, + "step": 123350 + }, + { + "epoch": 0.4983900095750999, + "grad_norm": 226.36434936523438, + "learning_rate": 2.9903787984931396e-05, + "loss": 29.6053, + "step": 123360 + }, + { + "epoch": 0.4984304108404675, + "grad_norm": 257.8052673339844, + "learning_rate": 2.9900365089572328e-05, + "loss": 42.242, + "step": 123370 + }, + { + "epoch": 0.49847081210583516, + "grad_norm": 719.3456420898438, + "learning_rate": 2.9896942098678122e-05, + "loss": 46.6997, + "step": 123380 + }, + { + "epoch": 0.4985112133712028, + "grad_norm": 641.3980102539062, + "learning_rate": 2.9893519012315503e-05, + "loss": 67.5891, + "step": 123390 + }, + { + "epoch": 0.49855161463657044, + "grad_norm": 685.2548828125, + "learning_rate": 2.9890095830551207e-05, + "loss": 60.4898, + "step": 123400 + }, + { + "epoch": 0.498592015901938, + "grad_norm": 632.763671875, + "learning_rate": 2.9886672553451985e-05, + "loss": 61.0664, + "step": 123410 + }, + { + "epoch": 0.49863241716730566, + "grad_norm": 0.0, + "learning_rate": 2.988324918108456e-05, + "loss": 56.5434, + "step": 123420 + }, + { + "epoch": 0.4986728184326733, + "grad_norm": 1752.1431884765625, + "learning_rate": 2.9879825713515676e-05, + "loss": 105.4006, + "step": 123430 + }, + { + "epoch": 0.49871321969804094, + "grad_norm": 575.599853515625, + "learning_rate": 2.9876402150812078e-05, + "loss": 52.491, + "step": 123440 + }, + { + "epoch": 0.4987536209634086, + "grad_norm": 620.3259887695312, + "learning_rate": 2.9872978493040514e-05, + "loss": 48.9634, + "step": 123450 + }, + { + "epoch": 0.4987940222287762, + "grad_norm": 837.0609741210938, + "learning_rate": 2.9869554740267724e-05, + "loss": 46.9197, + "step": 123460 + }, + { + "epoch": 0.49883442349414386, + "grad_norm": 231.35792541503906, + "learning_rate": 2.986613089256046e-05, + "loss": 62.4719, + "step": 123470 + }, + { + "epoch": 0.49887482475951145, + "grad_norm": 848.2174072265625, + "learning_rate": 2.9862706949985463e-05, + "loss": 63.5692, + "step": 123480 + }, + { + "epoch": 0.4989152260248791, + "grad_norm": 482.8079833984375, + "learning_rate": 2.9859282912609497e-05, + "loss": 48.4904, + "step": 123490 + }, + { + "epoch": 0.49895562729024673, + "grad_norm": 786.4293212890625, + "learning_rate": 2.98558587804993e-05, + "loss": 51.905, + "step": 123500 + }, + { + "epoch": 0.49899602855561437, + "grad_norm": 967.361083984375, + "learning_rate": 2.9852434553721642e-05, + "loss": 55.1196, + "step": 123510 + }, + { + "epoch": 0.499036429820982, + "grad_norm": 691.3804321289062, + "learning_rate": 2.984901023234327e-05, + "loss": 63.2347, + "step": 123520 + }, + { + "epoch": 0.49907683108634965, + "grad_norm": 2461.232421875, + "learning_rate": 2.9845585816430955e-05, + "loss": 86.1027, + "step": 123530 + }, + { + "epoch": 0.49911723235171723, + "grad_norm": 625.51220703125, + "learning_rate": 2.9842161306051446e-05, + "loss": 59.4591, + "step": 123540 + }, + { + "epoch": 0.4991576336170849, + "grad_norm": 937.5235595703125, + "learning_rate": 2.9838736701271514e-05, + "loss": 86.4743, + "step": 123550 + }, + { + "epoch": 0.4991980348824525, + "grad_norm": 719.2523803710938, + "learning_rate": 2.9835312002157913e-05, + "loss": 57.3074, + "step": 123560 + }, + { + "epoch": 0.49923843614782015, + "grad_norm": 1146.9417724609375, + "learning_rate": 2.983188720877741e-05, + "loss": 59.0976, + "step": 123570 + }, + { + "epoch": 0.4992788374131878, + "grad_norm": 643.1884155273438, + "learning_rate": 2.9828462321196788e-05, + "loss": 59.2565, + "step": 123580 + }, + { + "epoch": 0.49931923867855543, + "grad_norm": 717.5197143554688, + "learning_rate": 2.9825037339482804e-05, + "loss": 62.1937, + "step": 123590 + }, + { + "epoch": 0.499359639943923, + "grad_norm": 911.0812377929688, + "learning_rate": 2.9821612263702226e-05, + "loss": 53.7806, + "step": 123600 + }, + { + "epoch": 0.49940004120929066, + "grad_norm": 655.2685546875, + "learning_rate": 2.981818709392184e-05, + "loss": 76.8916, + "step": 123610 + }, + { + "epoch": 0.4994404424746583, + "grad_norm": 694.6813354492188, + "learning_rate": 2.981476183020842e-05, + "loss": 54.2385, + "step": 123620 + }, + { + "epoch": 0.49948084374002594, + "grad_norm": 277.7393798828125, + "learning_rate": 2.9811336472628737e-05, + "loss": 56.6968, + "step": 123630 + }, + { + "epoch": 0.4995212450053936, + "grad_norm": 2142.0654296875, + "learning_rate": 2.9807911021249573e-05, + "loss": 71.4143, + "step": 123640 + }, + { + "epoch": 0.4995616462707612, + "grad_norm": 685.448974609375, + "learning_rate": 2.9804485476137706e-05, + "loss": 41.1674, + "step": 123650 + }, + { + "epoch": 0.49960204753612886, + "grad_norm": 0.0, + "learning_rate": 2.9801059837359925e-05, + "loss": 67.2189, + "step": 123660 + }, + { + "epoch": 0.49964244880149644, + "grad_norm": 563.3724975585938, + "learning_rate": 2.979763410498301e-05, + "loss": 42.097, + "step": 123670 + }, + { + "epoch": 0.4996828500668641, + "grad_norm": 823.16259765625, + "learning_rate": 2.9794208279073743e-05, + "loss": 46.5123, + "step": 123680 + }, + { + "epoch": 0.4997232513322317, + "grad_norm": 396.4298095703125, + "learning_rate": 2.9790782359698914e-05, + "loss": 38.3301, + "step": 123690 + }, + { + "epoch": 0.49976365259759936, + "grad_norm": 487.17205810546875, + "learning_rate": 2.9787356346925327e-05, + "loss": 54.4319, + "step": 123700 + }, + { + "epoch": 0.499804053862967, + "grad_norm": 1401.6353759765625, + "learning_rate": 2.9783930240819758e-05, + "loss": 89.3769, + "step": 123710 + }, + { + "epoch": 0.49984445512833464, + "grad_norm": 401.6158447265625, + "learning_rate": 2.978050404144901e-05, + "loss": 39.3288, + "step": 123720 + }, + { + "epoch": 0.4998848563937022, + "grad_norm": 1638.397216796875, + "learning_rate": 2.977707774887987e-05, + "loss": 85.3148, + "step": 123730 + }, + { + "epoch": 0.49992525765906987, + "grad_norm": 525.0819702148438, + "learning_rate": 2.9773651363179144e-05, + "loss": 62.3121, + "step": 123740 + }, + { + "epoch": 0.4999656589244375, + "grad_norm": 576.019775390625, + "learning_rate": 2.9770224884413623e-05, + "loss": 34.6271, + "step": 123750 + }, + { + "epoch": 0.5000060601898051, + "grad_norm": 596.5224609375, + "learning_rate": 2.9766798312650112e-05, + "loss": 45.5462, + "step": 123760 + }, + { + "epoch": 0.5000464614551727, + "grad_norm": 1210.31005859375, + "learning_rate": 2.976337164795541e-05, + "loss": 43.6167, + "step": 123770 + }, + { + "epoch": 0.5000868627205404, + "grad_norm": 1921.7060546875, + "learning_rate": 2.975994489039634e-05, + "loss": 52.5438, + "step": 123780 + }, + { + "epoch": 0.500127263985908, + "grad_norm": 906.3182373046875, + "learning_rate": 2.9756518040039682e-05, + "loss": 64.2235, + "step": 123790 + }, + { + "epoch": 0.5001676652512757, + "grad_norm": 724.0418701171875, + "learning_rate": 2.9753091096952255e-05, + "loss": 63.5248, + "step": 123800 + }, + { + "epoch": 0.5002080665166433, + "grad_norm": 1490.1466064453125, + "learning_rate": 2.9749664061200877e-05, + "loss": 38.7024, + "step": 123810 + }, + { + "epoch": 0.5002484677820109, + "grad_norm": 262.531494140625, + "learning_rate": 2.9746236932852355e-05, + "loss": 66.6208, + "step": 123820 + }, + { + "epoch": 0.5002888690473786, + "grad_norm": 283.681640625, + "learning_rate": 2.974280971197349e-05, + "loss": 45.1853, + "step": 123830 + }, + { + "epoch": 0.5003292703127462, + "grad_norm": 636.9791259765625, + "learning_rate": 2.973938239863111e-05, + "loss": 54.2482, + "step": 123840 + }, + { + "epoch": 0.5003696715781139, + "grad_norm": 1471.0166015625, + "learning_rate": 2.9735954992892033e-05, + "loss": 73.4811, + "step": 123850 + }, + { + "epoch": 0.5004100728434815, + "grad_norm": 870.6261596679688, + "learning_rate": 2.9732527494823083e-05, + "loss": 69.1818, + "step": 123860 + }, + { + "epoch": 0.5004504741088491, + "grad_norm": 1015.466064453125, + "learning_rate": 2.9729099904491058e-05, + "loss": 65.9109, + "step": 123870 + }, + { + "epoch": 0.5004908753742168, + "grad_norm": 769.5751953125, + "learning_rate": 2.97256722219628e-05, + "loss": 41.2782, + "step": 123880 + }, + { + "epoch": 0.5005312766395843, + "grad_norm": 814.8590087890625, + "learning_rate": 2.9722244447305135e-05, + "loss": 84.8106, + "step": 123890 + }, + { + "epoch": 0.5005716779049519, + "grad_norm": 476.1192932128906, + "learning_rate": 2.9718816580584884e-05, + "loss": 43.7513, + "step": 123900 + }, + { + "epoch": 0.5006120791703196, + "grad_norm": 729.9635620117188, + "learning_rate": 2.9715388621868873e-05, + "loss": 44.9804, + "step": 123910 + }, + { + "epoch": 0.5006524804356872, + "grad_norm": 657.6481323242188, + "learning_rate": 2.971196057122393e-05, + "loss": 68.3173, + "step": 123920 + }, + { + "epoch": 0.5006928817010549, + "grad_norm": 620.4969482421875, + "learning_rate": 2.9708532428716883e-05, + "loss": 50.9084, + "step": 123930 + }, + { + "epoch": 0.5007332829664225, + "grad_norm": 840.12890625, + "learning_rate": 2.9705104194414586e-05, + "loss": 63.0937, + "step": 123940 + }, + { + "epoch": 0.5007736842317901, + "grad_norm": 1180.94970703125, + "learning_rate": 2.9701675868383848e-05, + "loss": 69.3689, + "step": 123950 + }, + { + "epoch": 0.5008140854971578, + "grad_norm": 630.738037109375, + "learning_rate": 2.9698247450691525e-05, + "loss": 56.1486, + "step": 123960 + }, + { + "epoch": 0.5008544867625254, + "grad_norm": 1723.3427734375, + "learning_rate": 2.9694818941404444e-05, + "loss": 52.5067, + "step": 123970 + }, + { + "epoch": 0.5008948880278931, + "grad_norm": 1080.7830810546875, + "learning_rate": 2.9691390340589466e-05, + "loss": 63.3121, + "step": 123980 + }, + { + "epoch": 0.5009352892932607, + "grad_norm": 428.3757629394531, + "learning_rate": 2.9687961648313405e-05, + "loss": 45.4763, + "step": 123990 + }, + { + "epoch": 0.5009756905586283, + "grad_norm": 585.084228515625, + "learning_rate": 2.9684532864643122e-05, + "loss": 47.8538, + "step": 124000 + }, + { + "epoch": 0.501016091823996, + "grad_norm": 951.522705078125, + "learning_rate": 2.9681103989645453e-05, + "loss": 64.0775, + "step": 124010 + }, + { + "epoch": 0.5010564930893635, + "grad_norm": 414.710693359375, + "learning_rate": 2.9677675023387258e-05, + "loss": 35.3562, + "step": 124020 + }, + { + "epoch": 0.5010968943547311, + "grad_norm": 1510.4676513671875, + "learning_rate": 2.9674245965935378e-05, + "loss": 85.7635, + "step": 124030 + }, + { + "epoch": 0.5011372956200988, + "grad_norm": 555.8351440429688, + "learning_rate": 2.9670816817356668e-05, + "loss": 39.8303, + "step": 124040 + }, + { + "epoch": 0.5011776968854664, + "grad_norm": 976.0780639648438, + "learning_rate": 2.9667387577717976e-05, + "loss": 61.9712, + "step": 124050 + }, + { + "epoch": 0.5012180981508341, + "grad_norm": 1387.890869140625, + "learning_rate": 2.9663958247086166e-05, + "loss": 58.7585, + "step": 124060 + }, + { + "epoch": 0.5012584994162017, + "grad_norm": 1036.8333740234375, + "learning_rate": 2.966052882552809e-05, + "loss": 44.4313, + "step": 124070 + }, + { + "epoch": 0.5012989006815693, + "grad_norm": 1022.4268798828125, + "learning_rate": 2.9657099313110593e-05, + "loss": 57.0276, + "step": 124080 + }, + { + "epoch": 0.501339301946937, + "grad_norm": 1398.85546875, + "learning_rate": 2.9653669709900555e-05, + "loss": 58.4346, + "step": 124090 + }, + { + "epoch": 0.5013797032123046, + "grad_norm": 496.8158264160156, + "learning_rate": 2.9650240015964825e-05, + "loss": 62.5083, + "step": 124100 + }, + { + "epoch": 0.5014201044776723, + "grad_norm": 911.7360229492188, + "learning_rate": 2.964681023137028e-05, + "loss": 52.9003, + "step": 124110 + }, + { + "epoch": 0.5014605057430399, + "grad_norm": 932.399658203125, + "learning_rate": 2.9643380356183775e-05, + "loss": 56.112, + "step": 124120 + }, + { + "epoch": 0.5015009070084075, + "grad_norm": 0.0, + "learning_rate": 2.9639950390472177e-05, + "loss": 45.8005, + "step": 124130 + }, + { + "epoch": 0.5015413082737752, + "grad_norm": 789.1591796875, + "learning_rate": 2.9636520334302354e-05, + "loss": 73.2515, + "step": 124140 + }, + { + "epoch": 0.5015817095391427, + "grad_norm": 1213.167236328125, + "learning_rate": 2.9633090187741185e-05, + "loss": 45.1217, + "step": 124150 + }, + { + "epoch": 0.5016221108045104, + "grad_norm": 1115.8275146484375, + "learning_rate": 2.9629659950855544e-05, + "loss": 48.8063, + "step": 124160 + }, + { + "epoch": 0.501662512069878, + "grad_norm": 1056.0667724609375, + "learning_rate": 2.9626229623712288e-05, + "loss": 59.1212, + "step": 124170 + }, + { + "epoch": 0.5017029133352456, + "grad_norm": 269.9258728027344, + "learning_rate": 2.9622799206378305e-05, + "loss": 83.2247, + "step": 124180 + }, + { + "epoch": 0.5017433146006133, + "grad_norm": 539.0936889648438, + "learning_rate": 2.961936869892048e-05, + "loss": 53.4077, + "step": 124190 + }, + { + "epoch": 0.5017837158659809, + "grad_norm": 3030.404052734375, + "learning_rate": 2.9615938101405676e-05, + "loss": 76.7048, + "step": 124200 + }, + { + "epoch": 0.5018241171313486, + "grad_norm": 1794.911376953125, + "learning_rate": 2.961250741390078e-05, + "loss": 60.769, + "step": 124210 + }, + { + "epoch": 0.5018645183967162, + "grad_norm": 1027.421875, + "learning_rate": 2.960907663647268e-05, + "loss": 43.8524, + "step": 124220 + }, + { + "epoch": 0.5019049196620838, + "grad_norm": 726.89697265625, + "learning_rate": 2.9605645769188268e-05, + "loss": 57.9974, + "step": 124230 + }, + { + "epoch": 0.5019453209274515, + "grad_norm": 741.7726440429688, + "learning_rate": 2.9602214812114415e-05, + "loss": 45.0123, + "step": 124240 + }, + { + "epoch": 0.5019857221928191, + "grad_norm": 572.0324096679688, + "learning_rate": 2.9598783765318007e-05, + "loss": 41.7237, + "step": 124250 + }, + { + "epoch": 0.5020261234581868, + "grad_norm": 905.0762329101562, + "learning_rate": 2.9595352628865947e-05, + "loss": 60.8909, + "step": 124260 + }, + { + "epoch": 0.5020665247235544, + "grad_norm": 418.3033447265625, + "learning_rate": 2.9591921402825123e-05, + "loss": 56.9024, + "step": 124270 + }, + { + "epoch": 0.5021069259889219, + "grad_norm": 642.4514770507812, + "learning_rate": 2.958849008726242e-05, + "loss": 56.9975, + "step": 124280 + }, + { + "epoch": 0.5021473272542896, + "grad_norm": 545.2382202148438, + "learning_rate": 2.9585058682244748e-05, + "loss": 54.7912, + "step": 124290 + }, + { + "epoch": 0.5021877285196572, + "grad_norm": 1271.816162109375, + "learning_rate": 2.9581627187838994e-05, + "loss": 67.879, + "step": 124300 + }, + { + "epoch": 0.5022281297850248, + "grad_norm": 404.8785095214844, + "learning_rate": 2.9578195604112064e-05, + "loss": 68.5172, + "step": 124310 + }, + { + "epoch": 0.5022685310503925, + "grad_norm": 1688.64599609375, + "learning_rate": 2.9574763931130843e-05, + "loss": 53.8717, + "step": 124320 + }, + { + "epoch": 0.5023089323157601, + "grad_norm": 574.3252563476562, + "learning_rate": 2.9571332168962256e-05, + "loss": 66.5893, + "step": 124330 + }, + { + "epoch": 0.5023493335811278, + "grad_norm": 418.29864501953125, + "learning_rate": 2.956790031767319e-05, + "loss": 47.6842, + "step": 124340 + }, + { + "epoch": 0.5023897348464954, + "grad_norm": 1550.780029296875, + "learning_rate": 2.9564468377330556e-05, + "loss": 66.8126, + "step": 124350 + }, + { + "epoch": 0.502430136111863, + "grad_norm": 5627.90478515625, + "learning_rate": 2.956103634800126e-05, + "loss": 86.4269, + "step": 124360 + }, + { + "epoch": 0.5024705373772307, + "grad_norm": 2004.539306640625, + "learning_rate": 2.9557604229752212e-05, + "loss": 57.9298, + "step": 124370 + }, + { + "epoch": 0.5025109386425983, + "grad_norm": 531.3135986328125, + "learning_rate": 2.9554172022650317e-05, + "loss": 53.1231, + "step": 124380 + }, + { + "epoch": 0.502551339907966, + "grad_norm": 1106.7730712890625, + "learning_rate": 2.9550739726762507e-05, + "loss": 62.664, + "step": 124390 + }, + { + "epoch": 0.5025917411733335, + "grad_norm": 1226.38818359375, + "learning_rate": 2.9547307342155673e-05, + "loss": 94.7886, + "step": 124400 + }, + { + "epoch": 0.5026321424387011, + "grad_norm": 608.1491088867188, + "learning_rate": 2.9543874868896747e-05, + "loss": 71.896, + "step": 124410 + }, + { + "epoch": 0.5026725437040688, + "grad_norm": 240.57858276367188, + "learning_rate": 2.954044230705264e-05, + "loss": 96.0706, + "step": 124420 + }, + { + "epoch": 0.5027129449694364, + "grad_norm": 596.6292114257812, + "learning_rate": 2.9537009656690275e-05, + "loss": 50.2395, + "step": 124430 + }, + { + "epoch": 0.502753346234804, + "grad_norm": 742.9195556640625, + "learning_rate": 2.953357691787656e-05, + "loss": 39.4977, + "step": 124440 + }, + { + "epoch": 0.5027937475001717, + "grad_norm": 715.0127563476562, + "learning_rate": 2.9530144090678435e-05, + "loss": 40.7906, + "step": 124450 + }, + { + "epoch": 0.5028341487655393, + "grad_norm": 494.5076599121094, + "learning_rate": 2.952671117516282e-05, + "loss": 57.1695, + "step": 124460 + }, + { + "epoch": 0.502874550030907, + "grad_norm": 766.6427001953125, + "learning_rate": 2.952327817139664e-05, + "loss": 62.9475, + "step": 124470 + }, + { + "epoch": 0.5029149512962746, + "grad_norm": 670.2885131835938, + "learning_rate": 2.9519845079446823e-05, + "loss": 47.9745, + "step": 124480 + }, + { + "epoch": 0.5029553525616423, + "grad_norm": 636.1236572265625, + "learning_rate": 2.9516411899380296e-05, + "loss": 57.113, + "step": 124490 + }, + { + "epoch": 0.5029957538270099, + "grad_norm": 434.45477294921875, + "learning_rate": 2.9512978631264006e-05, + "loss": 44.9944, + "step": 124500 + }, + { + "epoch": 0.5030361550923775, + "grad_norm": 687.5653686523438, + "learning_rate": 2.950954527516487e-05, + "loss": 50.9721, + "step": 124510 + }, + { + "epoch": 0.5030765563577452, + "grad_norm": 451.1505126953125, + "learning_rate": 2.9506111831149818e-05, + "loss": 44.7546, + "step": 124520 + }, + { + "epoch": 0.5031169576231127, + "grad_norm": 1212.3389892578125, + "learning_rate": 2.9502678299285798e-05, + "loss": 44.7764, + "step": 124530 + }, + { + "epoch": 0.5031573588884803, + "grad_norm": 572.8779907226562, + "learning_rate": 2.949924467963975e-05, + "loss": 48.557, + "step": 124540 + }, + { + "epoch": 0.503197760153848, + "grad_norm": 520.7388916015625, + "learning_rate": 2.949581097227861e-05, + "loss": 80.5346, + "step": 124550 + }, + { + "epoch": 0.5032381614192156, + "grad_norm": 713.9329833984375, + "learning_rate": 2.9492377177269315e-05, + "loss": 32.9715, + "step": 124560 + }, + { + "epoch": 0.5032785626845833, + "grad_norm": 978.047119140625, + "learning_rate": 2.9488943294678818e-05, + "loss": 52.3525, + "step": 124570 + }, + { + "epoch": 0.5033189639499509, + "grad_norm": 1501.214599609375, + "learning_rate": 2.948550932457407e-05, + "loss": 55.6765, + "step": 124580 + }, + { + "epoch": 0.5033593652153185, + "grad_norm": 625.777587890625, + "learning_rate": 2.9482075267021995e-05, + "loss": 46.8336, + "step": 124590 + }, + { + "epoch": 0.5033997664806862, + "grad_norm": 601.8237915039062, + "learning_rate": 2.9478641122089562e-05, + "loss": 51.4297, + "step": 124600 + }, + { + "epoch": 0.5034401677460538, + "grad_norm": 924.4441528320312, + "learning_rate": 2.947520688984371e-05, + "loss": 60.7378, + "step": 124610 + }, + { + "epoch": 0.5034805690114215, + "grad_norm": 516.689453125, + "learning_rate": 2.9471772570351398e-05, + "loss": 31.4356, + "step": 124620 + }, + { + "epoch": 0.5035209702767891, + "grad_norm": 1193.8798828125, + "learning_rate": 2.9468338163679577e-05, + "loss": 60.812, + "step": 124630 + }, + { + "epoch": 0.5035613715421567, + "grad_norm": 663.6973266601562, + "learning_rate": 2.9464903669895205e-05, + "loss": 35.3694, + "step": 124640 + }, + { + "epoch": 0.5036017728075244, + "grad_norm": 1353.3345947265625, + "learning_rate": 2.9461469089065234e-05, + "loss": 51.9646, + "step": 124650 + }, + { + "epoch": 0.5036421740728919, + "grad_norm": 529.1851806640625, + "learning_rate": 2.945803442125663e-05, + "loss": 48.2301, + "step": 124660 + }, + { + "epoch": 0.5036825753382596, + "grad_norm": 331.9324035644531, + "learning_rate": 2.9454599666536347e-05, + "loss": 54.444, + "step": 124670 + }, + { + "epoch": 0.5037229766036272, + "grad_norm": 620.619384765625, + "learning_rate": 2.9451164824971356e-05, + "loss": 92.8183, + "step": 124680 + }, + { + "epoch": 0.5037633778689948, + "grad_norm": 441.77734375, + "learning_rate": 2.9447729896628612e-05, + "loss": 47.082, + "step": 124690 + }, + { + "epoch": 0.5038037791343625, + "grad_norm": 891.2379150390625, + "learning_rate": 2.944429488157508e-05, + "loss": 70.2383, + "step": 124700 + }, + { + "epoch": 0.5038441803997301, + "grad_norm": 1449.03955078125, + "learning_rate": 2.9440859779877728e-05, + "loss": 57.3963, + "step": 124710 + }, + { + "epoch": 0.5038845816650978, + "grad_norm": 1024.277587890625, + "learning_rate": 2.943742459160354e-05, + "loss": 48.08, + "step": 124720 + }, + { + "epoch": 0.5039249829304654, + "grad_norm": 272.8819885253906, + "learning_rate": 2.9433989316819467e-05, + "loss": 65.6629, + "step": 124730 + }, + { + "epoch": 0.503965384195833, + "grad_norm": 665.0719604492188, + "learning_rate": 2.943055395559249e-05, + "loss": 60.4535, + "step": 124740 + }, + { + "epoch": 0.5040057854612007, + "grad_norm": 1616.3466796875, + "learning_rate": 2.9427118507989586e-05, + "loss": 88.4401, + "step": 124750 + }, + { + "epoch": 0.5040461867265683, + "grad_norm": 1536.4730224609375, + "learning_rate": 2.942368297407772e-05, + "loss": 67.7257, + "step": 124760 + }, + { + "epoch": 0.504086587991936, + "grad_norm": 258.504638671875, + "learning_rate": 2.942024735392389e-05, + "loss": 45.7606, + "step": 124770 + }, + { + "epoch": 0.5041269892573036, + "grad_norm": 1484.1357421875, + "learning_rate": 2.9416811647595048e-05, + "loss": 65.3528, + "step": 124780 + }, + { + "epoch": 0.5041673905226711, + "grad_norm": 933.3783569335938, + "learning_rate": 2.94133758551582e-05, + "loss": 30.6998, + "step": 124790 + }, + { + "epoch": 0.5042077917880388, + "grad_norm": 437.1009216308594, + "learning_rate": 2.9409939976680313e-05, + "loss": 73.0693, + "step": 124800 + }, + { + "epoch": 0.5042481930534064, + "grad_norm": 888.6316528320312, + "learning_rate": 2.9406504012228375e-05, + "loss": 38.9805, + "step": 124810 + }, + { + "epoch": 0.504288594318774, + "grad_norm": 454.36627197265625, + "learning_rate": 2.9403067961869367e-05, + "loss": 32.8442, + "step": 124820 + }, + { + "epoch": 0.5043289955841417, + "grad_norm": 648.9022827148438, + "learning_rate": 2.9399631825670292e-05, + "loss": 40.4288, + "step": 124830 + }, + { + "epoch": 0.5043693968495093, + "grad_norm": 967.1868286132812, + "learning_rate": 2.939619560369813e-05, + "loss": 63.7735, + "step": 124840 + }, + { + "epoch": 0.504409798114877, + "grad_norm": 1663.5084228515625, + "learning_rate": 2.9392759296019867e-05, + "loss": 57.5468, + "step": 124850 + }, + { + "epoch": 0.5044501993802446, + "grad_norm": 1367.6448974609375, + "learning_rate": 2.9389322902702497e-05, + "loss": 61.3237, + "step": 124860 + }, + { + "epoch": 0.5044906006456122, + "grad_norm": 756.0995483398438, + "learning_rate": 2.9385886423813024e-05, + "loss": 65.3766, + "step": 124870 + }, + { + "epoch": 0.5045310019109799, + "grad_norm": 630.2203369140625, + "learning_rate": 2.938244985941844e-05, + "loss": 42.1302, + "step": 124880 + }, + { + "epoch": 0.5045714031763475, + "grad_norm": 451.697998046875, + "learning_rate": 2.9379013209585726e-05, + "loss": 47.7857, + "step": 124890 + }, + { + "epoch": 0.5046118044417152, + "grad_norm": 890.1043701171875, + "learning_rate": 2.9375576474381905e-05, + "loss": 51.9966, + "step": 124900 + }, + { + "epoch": 0.5046522057070827, + "grad_norm": 901.4338989257812, + "learning_rate": 2.9372139653873958e-05, + "loss": 65.1977, + "step": 124910 + }, + { + "epoch": 0.5046926069724503, + "grad_norm": 1069.64208984375, + "learning_rate": 2.9368702748128912e-05, + "loss": 55.9412, + "step": 124920 + }, + { + "epoch": 0.504733008237818, + "grad_norm": 1394.1937255859375, + "learning_rate": 2.9365265757213745e-05, + "loss": 66.5137, + "step": 124930 + }, + { + "epoch": 0.5047734095031856, + "grad_norm": 260.0003967285156, + "learning_rate": 2.9361828681195484e-05, + "loss": 46.3195, + "step": 124940 + }, + { + "epoch": 0.5048138107685533, + "grad_norm": 521.545654296875, + "learning_rate": 2.9358391520141122e-05, + "loss": 54.6765, + "step": 124950 + }, + { + "epoch": 0.5048542120339209, + "grad_norm": 676.0771484375, + "learning_rate": 2.935495427411768e-05, + "loss": 27.8798, + "step": 124960 + }, + { + "epoch": 0.5048946132992885, + "grad_norm": 554.6319580078125, + "learning_rate": 2.9351516943192155e-05, + "loss": 39.7234, + "step": 124970 + }, + { + "epoch": 0.5049350145646562, + "grad_norm": 1820.2799072265625, + "learning_rate": 2.9348079527431567e-05, + "loss": 47.2992, + "step": 124980 + }, + { + "epoch": 0.5049754158300238, + "grad_norm": 730.9981689453125, + "learning_rate": 2.9344642026902924e-05, + "loss": 58.4385, + "step": 124990 + }, + { + "epoch": 0.5050158170953915, + "grad_norm": 232.58773803710938, + "learning_rate": 2.9341204441673266e-05, + "loss": 51.1445, + "step": 125000 + }, + { + "epoch": 0.5050562183607591, + "grad_norm": 744.6976318359375, + "learning_rate": 2.9337766771809577e-05, + "loss": 56.7142, + "step": 125010 + }, + { + "epoch": 0.5050966196261267, + "grad_norm": 807.66748046875, + "learning_rate": 2.9334329017378898e-05, + "loss": 66.2732, + "step": 125020 + }, + { + "epoch": 0.5051370208914944, + "grad_norm": 1156.4183349609375, + "learning_rate": 2.933089117844824e-05, + "loss": 53.764, + "step": 125030 + }, + { + "epoch": 0.5051774221568619, + "grad_norm": 1409.17626953125, + "learning_rate": 2.9327453255084638e-05, + "loss": 51.1774, + "step": 125040 + }, + { + "epoch": 0.5052178234222295, + "grad_norm": 1206.735107421875, + "learning_rate": 2.9324015247355098e-05, + "loss": 50.6902, + "step": 125050 + }, + { + "epoch": 0.5052582246875972, + "grad_norm": 1590.0701904296875, + "learning_rate": 2.932057715532665e-05, + "loss": 61.7914, + "step": 125060 + }, + { + "epoch": 0.5052986259529648, + "grad_norm": 905.3733520507812, + "learning_rate": 2.9317138979066327e-05, + "loss": 44.9613, + "step": 125070 + }, + { + "epoch": 0.5053390272183325, + "grad_norm": 747.811279296875, + "learning_rate": 2.9313700718641167e-05, + "loss": 65.6674, + "step": 125080 + }, + { + "epoch": 0.5053794284837001, + "grad_norm": 0.0, + "learning_rate": 2.9310262374118185e-05, + "loss": 45.0609, + "step": 125090 + }, + { + "epoch": 0.5054198297490677, + "grad_norm": 528.493408203125, + "learning_rate": 2.9306823945564422e-05, + "loss": 58.0582, + "step": 125100 + }, + { + "epoch": 0.5054602310144354, + "grad_norm": 680.7048950195312, + "learning_rate": 2.9303385433046902e-05, + "loss": 36.582, + "step": 125110 + }, + { + "epoch": 0.505500632279803, + "grad_norm": 767.6009521484375, + "learning_rate": 2.9299946836632673e-05, + "loss": 40.1557, + "step": 125120 + }, + { + "epoch": 0.5055410335451707, + "grad_norm": 1023.16015625, + "learning_rate": 2.929650815638877e-05, + "loss": 47.9998, + "step": 125130 + }, + { + "epoch": 0.5055814348105383, + "grad_norm": 588.2609252929688, + "learning_rate": 2.9293069392382224e-05, + "loss": 50.6491, + "step": 125140 + }, + { + "epoch": 0.5056218360759059, + "grad_norm": 831.1854858398438, + "learning_rate": 2.9289630544680075e-05, + "loss": 68.4844, + "step": 125150 + }, + { + "epoch": 0.5056622373412736, + "grad_norm": 1181.1595458984375, + "learning_rate": 2.9286191613349374e-05, + "loss": 84.0284, + "step": 125160 + }, + { + "epoch": 0.5057026386066411, + "grad_norm": 435.8255920410156, + "learning_rate": 2.9282752598457165e-05, + "loss": 58.2437, + "step": 125170 + }, + { + "epoch": 0.5057430398720087, + "grad_norm": 727.278564453125, + "learning_rate": 2.9279313500070483e-05, + "loss": 66.3914, + "step": 125180 + }, + { + "epoch": 0.5057834411373764, + "grad_norm": 302.980224609375, + "learning_rate": 2.927587431825639e-05, + "loss": 44.221, + "step": 125190 + }, + { + "epoch": 0.505823842402744, + "grad_norm": 758.2953491210938, + "learning_rate": 2.9272435053081922e-05, + "loss": 65.575, + "step": 125200 + }, + { + "epoch": 0.5058642436681117, + "grad_norm": 181.88172912597656, + "learning_rate": 2.9268995704614132e-05, + "loss": 40.2345, + "step": 125210 + }, + { + "epoch": 0.5059046449334793, + "grad_norm": 920.4985961914062, + "learning_rate": 2.926555627292007e-05, + "loss": 51.6828, + "step": 125220 + }, + { + "epoch": 0.505945046198847, + "grad_norm": 1029.88330078125, + "learning_rate": 2.9262116758066793e-05, + "loss": 64.6292, + "step": 125230 + }, + { + "epoch": 0.5059854474642146, + "grad_norm": 733.74951171875, + "learning_rate": 2.9258677160121352e-05, + "loss": 53.2531, + "step": 125240 + }, + { + "epoch": 0.5060258487295822, + "grad_norm": 2031.1961669921875, + "learning_rate": 2.9255237479150816e-05, + "loss": 62.1065, + "step": 125250 + }, + { + "epoch": 0.5060662499949499, + "grad_norm": 900.5405883789062, + "learning_rate": 2.925179771522223e-05, + "loss": 52.0671, + "step": 125260 + }, + { + "epoch": 0.5061066512603175, + "grad_norm": 554.5584106445312, + "learning_rate": 2.924835786840266e-05, + "loss": 38.7164, + "step": 125270 + }, + { + "epoch": 0.5061470525256851, + "grad_norm": 386.526123046875, + "learning_rate": 2.9244917938759163e-05, + "loss": 54.9804, + "step": 125280 + }, + { + "epoch": 0.5061874537910528, + "grad_norm": 616.0713500976562, + "learning_rate": 2.9241477926358818e-05, + "loss": 56.0183, + "step": 125290 + }, + { + "epoch": 0.5062278550564203, + "grad_norm": 831.8729858398438, + "learning_rate": 2.923803783126866e-05, + "loss": 46.269, + "step": 125300 + }, + { + "epoch": 0.506268256321788, + "grad_norm": 506.3004455566406, + "learning_rate": 2.923459765355578e-05, + "loss": 44.9364, + "step": 125310 + }, + { + "epoch": 0.5063086575871556, + "grad_norm": 662.8009033203125, + "learning_rate": 2.9231157393287234e-05, + "loss": 44.294, + "step": 125320 + }, + { + "epoch": 0.5063490588525232, + "grad_norm": 1044.3468017578125, + "learning_rate": 2.9227717050530107e-05, + "loss": 59.7929, + "step": 125330 + }, + { + "epoch": 0.5063894601178909, + "grad_norm": 836.830322265625, + "learning_rate": 2.922427662535145e-05, + "loss": 50.6092, + "step": 125340 + }, + { + "epoch": 0.5064298613832585, + "grad_norm": 848.728759765625, + "learning_rate": 2.9220836117818344e-05, + "loss": 46.637, + "step": 125350 + }, + { + "epoch": 0.5064702626486262, + "grad_norm": 456.876953125, + "learning_rate": 2.9217395527997875e-05, + "loss": 45.2742, + "step": 125360 + }, + { + "epoch": 0.5065106639139938, + "grad_norm": 780.4397583007812, + "learning_rate": 2.921395485595711e-05, + "loss": 47.9569, + "step": 125370 + }, + { + "epoch": 0.5065510651793614, + "grad_norm": 851.5180053710938, + "learning_rate": 2.9210514101763113e-05, + "loss": 74.7248, + "step": 125380 + }, + { + "epoch": 0.5065914664447291, + "grad_norm": 776.1827392578125, + "learning_rate": 2.9207073265482982e-05, + "loss": 49.9978, + "step": 125390 + }, + { + "epoch": 0.5066318677100967, + "grad_norm": 528.63330078125, + "learning_rate": 2.920363234718379e-05, + "loss": 73.2618, + "step": 125400 + }, + { + "epoch": 0.5066722689754644, + "grad_norm": 1279.852294921875, + "learning_rate": 2.9200191346932627e-05, + "loss": 48.9025, + "step": 125410 + }, + { + "epoch": 0.506712670240832, + "grad_norm": 1353.2442626953125, + "learning_rate": 2.919675026479656e-05, + "loss": 51.6571, + "step": 125420 + }, + { + "epoch": 0.5067530715061995, + "grad_norm": 1191.01416015625, + "learning_rate": 2.9193309100842693e-05, + "loss": 46.1766, + "step": 125430 + }, + { + "epoch": 0.5067934727715672, + "grad_norm": 1029.6173095703125, + "learning_rate": 2.9189867855138103e-05, + "loss": 58.42, + "step": 125440 + }, + { + "epoch": 0.5068338740369348, + "grad_norm": 575.3421630859375, + "learning_rate": 2.918642652774989e-05, + "loss": 49.3698, + "step": 125450 + }, + { + "epoch": 0.5068742753023024, + "grad_norm": 710.7742919921875, + "learning_rate": 2.9182985118745136e-05, + "loss": 46.0317, + "step": 125460 + }, + { + "epoch": 0.5069146765676701, + "grad_norm": 794.8711547851562, + "learning_rate": 2.9179543628190925e-05, + "loss": 61.0076, + "step": 125470 + }, + { + "epoch": 0.5069550778330377, + "grad_norm": 621.3440551757812, + "learning_rate": 2.9176102056154363e-05, + "loss": 58.3854, + "step": 125480 + }, + { + "epoch": 0.5069954790984054, + "grad_norm": 617.9365844726562, + "learning_rate": 2.9172660402702546e-05, + "loss": 47.3791, + "step": 125490 + }, + { + "epoch": 0.507035880363773, + "grad_norm": 902.1868286132812, + "learning_rate": 2.916921866790256e-05, + "loss": 49.7716, + "step": 125500 + }, + { + "epoch": 0.5070762816291406, + "grad_norm": 882.5235595703125, + "learning_rate": 2.9165776851821508e-05, + "loss": 44.4464, + "step": 125510 + }, + { + "epoch": 0.5071166828945083, + "grad_norm": 893.1301879882812, + "learning_rate": 2.9162334954526493e-05, + "loss": 40.7801, + "step": 125520 + }, + { + "epoch": 0.5071570841598759, + "grad_norm": 457.0363464355469, + "learning_rate": 2.915889297608462e-05, + "loss": 43.9219, + "step": 125530 + }, + { + "epoch": 0.5071974854252436, + "grad_norm": 504.5679626464844, + "learning_rate": 2.9155450916562994e-05, + "loss": 33.9357, + "step": 125540 + }, + { + "epoch": 0.5072378866906111, + "grad_norm": 866.373046875, + "learning_rate": 2.91520087760287e-05, + "loss": 72.3561, + "step": 125550 + }, + { + "epoch": 0.5072782879559787, + "grad_norm": 599.7020874023438, + "learning_rate": 2.9148566554548857e-05, + "loss": 53.9173, + "step": 125560 + }, + { + "epoch": 0.5073186892213464, + "grad_norm": 567.0419921875, + "learning_rate": 2.914512425219058e-05, + "loss": 71.651, + "step": 125570 + }, + { + "epoch": 0.507359090486714, + "grad_norm": 628.0565795898438, + "learning_rate": 2.914168186902097e-05, + "loss": 56.5678, + "step": 125580 + }, + { + "epoch": 0.5073994917520817, + "grad_norm": 528.4154663085938, + "learning_rate": 2.9138239405107136e-05, + "loss": 42.4691, + "step": 125590 + }, + { + "epoch": 0.5074398930174493, + "grad_norm": 829.2301635742188, + "learning_rate": 2.9134796860516194e-05, + "loss": 49.0979, + "step": 125600 + }, + { + "epoch": 0.5074802942828169, + "grad_norm": 1135.746826171875, + "learning_rate": 2.9131354235315268e-05, + "loss": 60.1448, + "step": 125610 + }, + { + "epoch": 0.5075206955481846, + "grad_norm": 402.7625427246094, + "learning_rate": 2.912791152957145e-05, + "loss": 69.6794, + "step": 125620 + }, + { + "epoch": 0.5075610968135522, + "grad_norm": 654.6475219726562, + "learning_rate": 2.9124468743351884e-05, + "loss": 58.2971, + "step": 125630 + }, + { + "epoch": 0.5076014980789199, + "grad_norm": 552.7247924804688, + "learning_rate": 2.9121025876723674e-05, + "loss": 66.4654, + "step": 125640 + }, + { + "epoch": 0.5076418993442875, + "grad_norm": 887.902099609375, + "learning_rate": 2.9117582929753932e-05, + "loss": 76.861, + "step": 125650 + }, + { + "epoch": 0.5076823006096551, + "grad_norm": 966.7318725585938, + "learning_rate": 2.9114139902509807e-05, + "loss": 57.0813, + "step": 125660 + }, + { + "epoch": 0.5077227018750228, + "grad_norm": 1011.2301635742188, + "learning_rate": 2.9110696795058394e-05, + "loss": 45.7711, + "step": 125670 + }, + { + "epoch": 0.5077631031403903, + "grad_norm": 557.7918090820312, + "learning_rate": 2.9107253607466832e-05, + "loss": 55.3871, + "step": 125680 + }, + { + "epoch": 0.5078035044057579, + "grad_norm": 3118.763916015625, + "learning_rate": 2.910381033980225e-05, + "loss": 50.6585, + "step": 125690 + }, + { + "epoch": 0.5078439056711256, + "grad_norm": 863.99658203125, + "learning_rate": 2.910036699213178e-05, + "loss": 68.7242, + "step": 125700 + }, + { + "epoch": 0.5078843069364932, + "grad_norm": 1471.3134765625, + "learning_rate": 2.909692356452254e-05, + "loss": 52.9448, + "step": 125710 + }, + { + "epoch": 0.5079247082018609, + "grad_norm": 553.6021118164062, + "learning_rate": 2.9093480057041662e-05, + "loss": 55.443, + "step": 125720 + }, + { + "epoch": 0.5079651094672285, + "grad_norm": 329.5192565917969, + "learning_rate": 2.9090036469756276e-05, + "loss": 67.455, + "step": 125730 + }, + { + "epoch": 0.5080055107325961, + "grad_norm": 1218.230712890625, + "learning_rate": 2.9086592802733536e-05, + "loss": 84.1644, + "step": 125740 + }, + { + "epoch": 0.5080459119979638, + "grad_norm": 140.51890563964844, + "learning_rate": 2.908314905604056e-05, + "loss": 58.1322, + "step": 125750 + }, + { + "epoch": 0.5080863132633314, + "grad_norm": 557.7669677734375, + "learning_rate": 2.9079705229744493e-05, + "loss": 58.6665, + "step": 125760 + }, + { + "epoch": 0.5081267145286991, + "grad_norm": 357.09234619140625, + "learning_rate": 2.907626132391246e-05, + "loss": 33.4991, + "step": 125770 + }, + { + "epoch": 0.5081671157940667, + "grad_norm": 404.6373596191406, + "learning_rate": 2.9072817338611636e-05, + "loss": 89.8573, + "step": 125780 + }, + { + "epoch": 0.5082075170594343, + "grad_norm": 606.2252807617188, + "learning_rate": 2.9069373273909123e-05, + "loss": 61.2002, + "step": 125790 + }, + { + "epoch": 0.508247918324802, + "grad_norm": 458.4631652832031, + "learning_rate": 2.9065929129872094e-05, + "loss": 46.1727, + "step": 125800 + }, + { + "epoch": 0.5082883195901695, + "grad_norm": 901.8555908203125, + "learning_rate": 2.906248490656768e-05, + "loss": 48.2973, + "step": 125810 + }, + { + "epoch": 0.5083287208555372, + "grad_norm": 676.2325439453125, + "learning_rate": 2.905904060406303e-05, + "loss": 57.2065, + "step": 125820 + }, + { + "epoch": 0.5083691221209048, + "grad_norm": 1410.12158203125, + "learning_rate": 2.905559622242529e-05, + "loss": 45.1312, + "step": 125830 + }, + { + "epoch": 0.5084095233862724, + "grad_norm": 386.5525207519531, + "learning_rate": 2.9052151761721617e-05, + "loss": 73.7721, + "step": 125840 + }, + { + "epoch": 0.5084499246516401, + "grad_norm": 1108.298583984375, + "learning_rate": 2.9048707222019154e-05, + "loss": 66.7324, + "step": 125850 + }, + { + "epoch": 0.5084903259170077, + "grad_norm": 752.5873413085938, + "learning_rate": 2.904526260338507e-05, + "loss": 52.5618, + "step": 125860 + }, + { + "epoch": 0.5085307271823754, + "grad_norm": 393.9751281738281, + "learning_rate": 2.9041817905886504e-05, + "loss": 55.0743, + "step": 125870 + }, + { + "epoch": 0.508571128447743, + "grad_norm": 509.97735595703125, + "learning_rate": 2.9038373129590622e-05, + "loss": 51.6777, + "step": 125880 + }, + { + "epoch": 0.5086115297131106, + "grad_norm": 1160.9713134765625, + "learning_rate": 2.903492827456457e-05, + "loss": 59.5145, + "step": 125890 + }, + { + "epoch": 0.5086519309784783, + "grad_norm": 510.33514404296875, + "learning_rate": 2.903148334087552e-05, + "loss": 34.4883, + "step": 125900 + }, + { + "epoch": 0.5086923322438459, + "grad_norm": 778.451171875, + "learning_rate": 2.9028038328590617e-05, + "loss": 57.813, + "step": 125910 + }, + { + "epoch": 0.5087327335092136, + "grad_norm": 604.14990234375, + "learning_rate": 2.9024593237777037e-05, + "loss": 44.8038, + "step": 125920 + }, + { + "epoch": 0.5087731347745812, + "grad_norm": 1083.8634033203125, + "learning_rate": 2.902114806850194e-05, + "loss": 66.7281, + "step": 125930 + }, + { + "epoch": 0.5088135360399487, + "grad_norm": 626.0941772460938, + "learning_rate": 2.9017702820832498e-05, + "loss": 64.0597, + "step": 125940 + }, + { + "epoch": 0.5088539373053164, + "grad_norm": 2549.89453125, + "learning_rate": 2.9014257494835862e-05, + "loss": 83.0733, + "step": 125950 + }, + { + "epoch": 0.508894338570684, + "grad_norm": 271.8528137207031, + "learning_rate": 2.901081209057921e-05, + "loss": 72.4616, + "step": 125960 + }, + { + "epoch": 0.5089347398360516, + "grad_norm": 418.9285583496094, + "learning_rate": 2.900736660812972e-05, + "loss": 32.0242, + "step": 125970 + }, + { + "epoch": 0.5089751411014193, + "grad_norm": 748.1174926757812, + "learning_rate": 2.900392104755455e-05, + "loss": 62.5196, + "step": 125980 + }, + { + "epoch": 0.5090155423667869, + "grad_norm": 1057.87060546875, + "learning_rate": 2.900047540892088e-05, + "loss": 66.0844, + "step": 125990 + }, + { + "epoch": 0.5090559436321546, + "grad_norm": 1053.06689453125, + "learning_rate": 2.8997029692295874e-05, + "loss": 59.8133, + "step": 126000 + }, + { + "epoch": 0.5090963448975222, + "grad_norm": 1008.4521484375, + "learning_rate": 2.8993583897746717e-05, + "loss": 67.719, + "step": 126010 + }, + { + "epoch": 0.5091367461628898, + "grad_norm": 400.8494567871094, + "learning_rate": 2.8990138025340596e-05, + "loss": 55.9787, + "step": 126020 + }, + { + "epoch": 0.5091771474282575, + "grad_norm": 776.619140625, + "learning_rate": 2.8986692075144673e-05, + "loss": 53.733, + "step": 126030 + }, + { + "epoch": 0.5092175486936251, + "grad_norm": 344.3357238769531, + "learning_rate": 2.8983246047226135e-05, + "loss": 49.7655, + "step": 126040 + }, + { + "epoch": 0.5092579499589928, + "grad_norm": 461.5896911621094, + "learning_rate": 2.897979994165217e-05, + "loss": 66.4076, + "step": 126050 + }, + { + "epoch": 0.5092983512243604, + "grad_norm": 0.0, + "learning_rate": 2.8976353758489955e-05, + "loss": 44.4132, + "step": 126060 + }, + { + "epoch": 0.5093387524897279, + "grad_norm": 446.7651062011719, + "learning_rate": 2.897290749780667e-05, + "loss": 48.4824, + "step": 126070 + }, + { + "epoch": 0.5093791537550956, + "grad_norm": 1131.740966796875, + "learning_rate": 2.8969461159669513e-05, + "loss": 45.0098, + "step": 126080 + }, + { + "epoch": 0.5094195550204632, + "grad_norm": 1675.897705078125, + "learning_rate": 2.8966014744145663e-05, + "loss": 93.9438, + "step": 126090 + }, + { + "epoch": 0.5094599562858309, + "grad_norm": 665.18994140625, + "learning_rate": 2.8962568251302324e-05, + "loss": 52.6255, + "step": 126100 + }, + { + "epoch": 0.5095003575511985, + "grad_norm": 404.408203125, + "learning_rate": 2.895912168120667e-05, + "loss": 57.1605, + "step": 126110 + }, + { + "epoch": 0.5095407588165661, + "grad_norm": 690.7555541992188, + "learning_rate": 2.8955675033925895e-05, + "loss": 63.0719, + "step": 126120 + }, + { + "epoch": 0.5095811600819338, + "grad_norm": 937.3909912109375, + "learning_rate": 2.89522283095272e-05, + "loss": 61.4134, + "step": 126130 + }, + { + "epoch": 0.5096215613473014, + "grad_norm": 1157.58642578125, + "learning_rate": 2.8948781508077786e-05, + "loss": 43.2224, + "step": 126140 + }, + { + "epoch": 0.509661962612669, + "grad_norm": 766.99609375, + "learning_rate": 2.894533462964485e-05, + "loss": 45.9795, + "step": 126150 + }, + { + "epoch": 0.5097023638780367, + "grad_norm": 1816.978515625, + "learning_rate": 2.894188767429557e-05, + "loss": 53.0026, + "step": 126160 + }, + { + "epoch": 0.5097427651434043, + "grad_norm": 813.2989501953125, + "learning_rate": 2.8938440642097164e-05, + "loss": 37.4644, + "step": 126170 + }, + { + "epoch": 0.509783166408772, + "grad_norm": 691.785888671875, + "learning_rate": 2.893499353311683e-05, + "loss": 51.4019, + "step": 126180 + }, + { + "epoch": 0.5098235676741395, + "grad_norm": 1490.9388427734375, + "learning_rate": 2.8931546347421773e-05, + "loss": 45.6698, + "step": 126190 + }, + { + "epoch": 0.5098639689395071, + "grad_norm": 548.1686401367188, + "learning_rate": 2.8928099085079197e-05, + "loss": 41.4219, + "step": 126200 + }, + { + "epoch": 0.5099043702048748, + "grad_norm": 1110.064697265625, + "learning_rate": 2.89246517461563e-05, + "loss": 71.6463, + "step": 126210 + }, + { + "epoch": 0.5099447714702424, + "grad_norm": 549.9130859375, + "learning_rate": 2.892120433072031e-05, + "loss": 45.8994, + "step": 126220 + }, + { + "epoch": 0.5099851727356101, + "grad_norm": 4628.6025390625, + "learning_rate": 2.8917756838838418e-05, + "loss": 75.2251, + "step": 126230 + }, + { + "epoch": 0.5100255740009777, + "grad_norm": 518.2777099609375, + "learning_rate": 2.8914309270577834e-05, + "loss": 48.5844, + "step": 126240 + }, + { + "epoch": 0.5100659752663453, + "grad_norm": 677.185302734375, + "learning_rate": 2.8910861626005776e-05, + "loss": 61.1778, + "step": 126250 + }, + { + "epoch": 0.510106376531713, + "grad_norm": 1070.945556640625, + "learning_rate": 2.8907413905189456e-05, + "loss": 43.0625, + "step": 126260 + }, + { + "epoch": 0.5101467777970806, + "grad_norm": 1414.0343017578125, + "learning_rate": 2.8903966108196096e-05, + "loss": 69.5534, + "step": 126270 + }, + { + "epoch": 0.5101871790624483, + "grad_norm": 1134.17041015625, + "learning_rate": 2.8900518235092905e-05, + "loss": 73.4212, + "step": 126280 + }, + { + "epoch": 0.5102275803278159, + "grad_norm": 970.3851318359375, + "learning_rate": 2.8897070285947098e-05, + "loss": 58.0343, + "step": 126290 + }, + { + "epoch": 0.5102679815931835, + "grad_norm": 340.81231689453125, + "learning_rate": 2.8893622260825904e-05, + "loss": 76.9128, + "step": 126300 + }, + { + "epoch": 0.5103083828585512, + "grad_norm": 637.6676025390625, + "learning_rate": 2.889017415979654e-05, + "loss": 68.674, + "step": 126310 + }, + { + "epoch": 0.5103487841239187, + "grad_norm": 643.8228759765625, + "learning_rate": 2.8886725982926232e-05, + "loss": 81.6261, + "step": 126320 + }, + { + "epoch": 0.5103891853892863, + "grad_norm": 378.8822021484375, + "learning_rate": 2.8883277730282194e-05, + "loss": 46.2597, + "step": 126330 + }, + { + "epoch": 0.510429586654654, + "grad_norm": 1168.7510986328125, + "learning_rate": 2.8879829401931652e-05, + "loss": 71.7926, + "step": 126340 + }, + { + "epoch": 0.5104699879200216, + "grad_norm": 522.0606689453125, + "learning_rate": 2.8876380997941847e-05, + "loss": 54.6862, + "step": 126350 + }, + { + "epoch": 0.5105103891853893, + "grad_norm": 396.95263671875, + "learning_rate": 2.8872932518379997e-05, + "loss": 44.9638, + "step": 126360 + }, + { + "epoch": 0.5105507904507569, + "grad_norm": 794.6138305664062, + "learning_rate": 2.886948396331333e-05, + "loss": 76.7646, + "step": 126370 + }, + { + "epoch": 0.5105911917161245, + "grad_norm": 1046.785888671875, + "learning_rate": 2.8866035332809084e-05, + "loss": 53.4936, + "step": 126380 + }, + { + "epoch": 0.5106315929814922, + "grad_norm": 869.3177490234375, + "learning_rate": 2.886258662693449e-05, + "loss": 43.9115, + "step": 126390 + }, + { + "epoch": 0.5106719942468598, + "grad_norm": 724.6814575195312, + "learning_rate": 2.8859137845756784e-05, + "loss": 45.5164, + "step": 126400 + }, + { + "epoch": 0.5107123955122275, + "grad_norm": 411.1709289550781, + "learning_rate": 2.8855688989343193e-05, + "loss": 57.5156, + "step": 126410 + }, + { + "epoch": 0.5107527967775951, + "grad_norm": 326.8053283691406, + "learning_rate": 2.885224005776096e-05, + "loss": 51.5223, + "step": 126420 + }, + { + "epoch": 0.5107931980429627, + "grad_norm": 843.7996215820312, + "learning_rate": 2.884879105107733e-05, + "loss": 44.6353, + "step": 126430 + }, + { + "epoch": 0.5108335993083304, + "grad_norm": 1049.7962646484375, + "learning_rate": 2.884534196935953e-05, + "loss": 58.139, + "step": 126440 + }, + { + "epoch": 0.5108740005736979, + "grad_norm": 2668.488037109375, + "learning_rate": 2.8841892812674808e-05, + "loss": 37.7585, + "step": 126450 + }, + { + "epoch": 0.5109144018390656, + "grad_norm": 1897.6781005859375, + "learning_rate": 2.8838443581090412e-05, + "loss": 50.773, + "step": 126460 + }, + { + "epoch": 0.5109548031044332, + "grad_norm": 556.2504272460938, + "learning_rate": 2.8834994274673582e-05, + "loss": 56.0956, + "step": 126470 + }, + { + "epoch": 0.5109952043698008, + "grad_norm": 292.6050109863281, + "learning_rate": 2.8831544893491563e-05, + "loss": 49.7071, + "step": 126480 + }, + { + "epoch": 0.5110356056351685, + "grad_norm": 442.0137939453125, + "learning_rate": 2.882809543761161e-05, + "loss": 51.5451, + "step": 126490 + }, + { + "epoch": 0.5110760069005361, + "grad_norm": 810.0142211914062, + "learning_rate": 2.8824645907100954e-05, + "loss": 55.3643, + "step": 126500 + }, + { + "epoch": 0.5111164081659038, + "grad_norm": 1782.1546630859375, + "learning_rate": 2.8821196302026863e-05, + "loss": 68.3655, + "step": 126510 + }, + { + "epoch": 0.5111568094312714, + "grad_norm": 900.7822875976562, + "learning_rate": 2.881774662245658e-05, + "loss": 48.6357, + "step": 126520 + }, + { + "epoch": 0.511197210696639, + "grad_norm": 648.9537963867188, + "learning_rate": 2.8814296868457364e-05, + "loss": 51.299, + "step": 126530 + }, + { + "epoch": 0.5112376119620067, + "grad_norm": 949.2380981445312, + "learning_rate": 2.8810847040096467e-05, + "loss": 66.241, + "step": 126540 + }, + { + "epoch": 0.5112780132273743, + "grad_norm": 227.76019287109375, + "learning_rate": 2.8807397137441145e-05, + "loss": 32.5676, + "step": 126550 + }, + { + "epoch": 0.511318414492742, + "grad_norm": 545.3878173828125, + "learning_rate": 2.8803947160558652e-05, + "loss": 77.758, + "step": 126560 + }, + { + "epoch": 0.5113588157581096, + "grad_norm": 621.6892700195312, + "learning_rate": 2.8800497109516263e-05, + "loss": 58.1789, + "step": 126570 + }, + { + "epoch": 0.5113992170234771, + "grad_norm": 1468.47509765625, + "learning_rate": 2.8797046984381208e-05, + "loss": 72.7109, + "step": 126580 + }, + { + "epoch": 0.5114396182888448, + "grad_norm": 406.4592590332031, + "learning_rate": 2.8793596785220783e-05, + "loss": 64.9677, + "step": 126590 + }, + { + "epoch": 0.5114800195542124, + "grad_norm": 290.8427429199219, + "learning_rate": 2.879014651210223e-05, + "loss": 47.5232, + "step": 126600 + }, + { + "epoch": 0.51152042081958, + "grad_norm": 1523.126220703125, + "learning_rate": 2.8786696165092812e-05, + "loss": 42.1534, + "step": 126610 + }, + { + "epoch": 0.5115608220849477, + "grad_norm": 1009.9312133789062, + "learning_rate": 2.8783245744259806e-05, + "loss": 61.5785, + "step": 126620 + }, + { + "epoch": 0.5116012233503153, + "grad_norm": 527.927490234375, + "learning_rate": 2.877979524967048e-05, + "loss": 61.9816, + "step": 126630 + }, + { + "epoch": 0.511641624615683, + "grad_norm": 575.1068725585938, + "learning_rate": 2.8776344681392105e-05, + "loss": 60.9859, + "step": 126640 + }, + { + "epoch": 0.5116820258810506, + "grad_norm": 338.4177551269531, + "learning_rate": 2.877289403949194e-05, + "loss": 54.107, + "step": 126650 + }, + { + "epoch": 0.5117224271464182, + "grad_norm": 354.56005859375, + "learning_rate": 2.876944332403726e-05, + "loss": 54.2842, + "step": 126660 + }, + { + "epoch": 0.5117628284117859, + "grad_norm": 989.342041015625, + "learning_rate": 2.8765992535095345e-05, + "loss": 100.1021, + "step": 126670 + }, + { + "epoch": 0.5118032296771535, + "grad_norm": 547.06982421875, + "learning_rate": 2.8762541672733472e-05, + "loss": 48.5291, + "step": 126680 + }, + { + "epoch": 0.5118436309425212, + "grad_norm": 1068.9014892578125, + "learning_rate": 2.8759090737018902e-05, + "loss": 58.7899, + "step": 126690 + }, + { + "epoch": 0.5118840322078888, + "grad_norm": 1199.883544921875, + "learning_rate": 2.875563972801893e-05, + "loss": 43.4314, + "step": 126700 + }, + { + "epoch": 0.5119244334732563, + "grad_norm": 616.8649291992188, + "learning_rate": 2.8752188645800822e-05, + "loss": 46.0097, + "step": 126710 + }, + { + "epoch": 0.511964834738624, + "grad_norm": 2177.439697265625, + "learning_rate": 2.874873749043187e-05, + "loss": 33.2443, + "step": 126720 + }, + { + "epoch": 0.5120052360039916, + "grad_norm": 3227.6328125, + "learning_rate": 2.8745286261979348e-05, + "loss": 64.0699, + "step": 126730 + }, + { + "epoch": 0.5120456372693593, + "grad_norm": 754.943115234375, + "learning_rate": 2.874183496051055e-05, + "loss": 40.789, + "step": 126740 + }, + { + "epoch": 0.5120860385347269, + "grad_norm": 560.1809692382812, + "learning_rate": 2.8738383586092745e-05, + "loss": 42.9098, + "step": 126750 + }, + { + "epoch": 0.5121264398000945, + "grad_norm": 1200.2210693359375, + "learning_rate": 2.8734932138793225e-05, + "loss": 58.1281, + "step": 126760 + }, + { + "epoch": 0.5121668410654622, + "grad_norm": 569.3571166992188, + "learning_rate": 2.8731480618679285e-05, + "loss": 71.4217, + "step": 126770 + }, + { + "epoch": 0.5122072423308298, + "grad_norm": 1209.4700927734375, + "learning_rate": 2.8728029025818204e-05, + "loss": 51.1106, + "step": 126780 + }, + { + "epoch": 0.5122476435961975, + "grad_norm": 541.9495849609375, + "learning_rate": 2.872457736027728e-05, + "loss": 42.31, + "step": 126790 + }, + { + "epoch": 0.5122880448615651, + "grad_norm": 1008.5130004882812, + "learning_rate": 2.8721125622123806e-05, + "loss": 101.0686, + "step": 126800 + }, + { + "epoch": 0.5123284461269327, + "grad_norm": 794.3250732421875, + "learning_rate": 2.8717673811425072e-05, + "loss": 56.3601, + "step": 126810 + }, + { + "epoch": 0.5123688473923004, + "grad_norm": 796.3021240234375, + "learning_rate": 2.8714221928248368e-05, + "loss": 50.3433, + "step": 126820 + }, + { + "epoch": 0.5124092486576679, + "grad_norm": 536.2626953125, + "learning_rate": 2.8710769972661e-05, + "loss": 53.0116, + "step": 126830 + }, + { + "epoch": 0.5124496499230355, + "grad_norm": 816.8209228515625, + "learning_rate": 2.8707317944730268e-05, + "loss": 42.8337, + "step": 126840 + }, + { + "epoch": 0.5124900511884032, + "grad_norm": 1031.8271484375, + "learning_rate": 2.8703865844523452e-05, + "loss": 47.6963, + "step": 126850 + }, + { + "epoch": 0.5125304524537708, + "grad_norm": 1931.7418212890625, + "learning_rate": 2.8700413672107866e-05, + "loss": 52.6271, + "step": 126860 + }, + { + "epoch": 0.5125708537191385, + "grad_norm": 747.9002075195312, + "learning_rate": 2.869696142755081e-05, + "loss": 57.4997, + "step": 126870 + }, + { + "epoch": 0.5126112549845061, + "grad_norm": 460.6169128417969, + "learning_rate": 2.8693509110919598e-05, + "loss": 49.3172, + "step": 126880 + }, + { + "epoch": 0.5126516562498737, + "grad_norm": 395.9313659667969, + "learning_rate": 2.8690056722281513e-05, + "loss": 50.4048, + "step": 126890 + }, + { + "epoch": 0.5126920575152414, + "grad_norm": 673.880859375, + "learning_rate": 2.8686604261703875e-05, + "loss": 44.8503, + "step": 126900 + }, + { + "epoch": 0.512732458780609, + "grad_norm": 745.4364624023438, + "learning_rate": 2.8683151729253994e-05, + "loss": 50.778, + "step": 126910 + }, + { + "epoch": 0.5127728600459767, + "grad_norm": 887.1217041015625, + "learning_rate": 2.8679699124999166e-05, + "loss": 44.0796, + "step": 126920 + }, + { + "epoch": 0.5128132613113443, + "grad_norm": 565.948486328125, + "learning_rate": 2.8676246449006715e-05, + "loss": 79.4067, + "step": 126930 + }, + { + "epoch": 0.5128536625767119, + "grad_norm": 602.2057495117188, + "learning_rate": 2.8672793701343946e-05, + "loss": 53.1035, + "step": 126940 + }, + { + "epoch": 0.5128940638420796, + "grad_norm": 670.5474243164062, + "learning_rate": 2.8669340882078166e-05, + "loss": 60.5767, + "step": 126950 + }, + { + "epoch": 0.5129344651074471, + "grad_norm": 447.6339111328125, + "learning_rate": 2.866588799127671e-05, + "loss": 67.3426, + "step": 126960 + }, + { + "epoch": 0.5129748663728148, + "grad_norm": 728.9623413085938, + "learning_rate": 2.8662435029006868e-05, + "loss": 72.0813, + "step": 126970 + }, + { + "epoch": 0.5130152676381824, + "grad_norm": 600.4077758789062, + "learning_rate": 2.865898199533597e-05, + "loss": 54.8805, + "step": 126980 + }, + { + "epoch": 0.51305566890355, + "grad_norm": 580.8352661132812, + "learning_rate": 2.865552889033134e-05, + "loss": 55.582, + "step": 126990 + }, + { + "epoch": 0.5130960701689177, + "grad_norm": 1138.0733642578125, + "learning_rate": 2.8652075714060295e-05, + "loss": 64.3382, + "step": 127000 + }, + { + "epoch": 0.5131364714342853, + "grad_norm": 281.50640869140625, + "learning_rate": 2.864862246659015e-05, + "loss": 45.8135, + "step": 127010 + }, + { + "epoch": 0.513176872699653, + "grad_norm": 1133.2386474609375, + "learning_rate": 2.8645169147988226e-05, + "loss": 56.7795, + "step": 127020 + }, + { + "epoch": 0.5132172739650206, + "grad_norm": 846.5953979492188, + "learning_rate": 2.8641715758321857e-05, + "loss": 31.9575, + "step": 127030 + }, + { + "epoch": 0.5132576752303882, + "grad_norm": 3238.64013671875, + "learning_rate": 2.8638262297658368e-05, + "loss": 78.6456, + "step": 127040 + }, + { + "epoch": 0.5132980764957559, + "grad_norm": 1550.1759033203125, + "learning_rate": 2.863480876606508e-05, + "loss": 56.8611, + "step": 127050 + }, + { + "epoch": 0.5133384777611235, + "grad_norm": 442.9187927246094, + "learning_rate": 2.863135516360932e-05, + "loss": 41.2573, + "step": 127060 + }, + { + "epoch": 0.5133788790264912, + "grad_norm": 1137.1116943359375, + "learning_rate": 2.8627901490358422e-05, + "loss": 46.5267, + "step": 127070 + }, + { + "epoch": 0.5134192802918588, + "grad_norm": 1043.6861572265625, + "learning_rate": 2.8624447746379722e-05, + "loss": 60.2835, + "step": 127080 + }, + { + "epoch": 0.5134596815572263, + "grad_norm": 739.7015380859375, + "learning_rate": 2.862099393174055e-05, + "loss": 48.624, + "step": 127090 + }, + { + "epoch": 0.513500082822594, + "grad_norm": 945.224853515625, + "learning_rate": 2.861754004650823e-05, + "loss": 63.0741, + "step": 127100 + }, + { + "epoch": 0.5135404840879616, + "grad_norm": 425.7527160644531, + "learning_rate": 2.8614086090750103e-05, + "loss": 46.5734, + "step": 127110 + }, + { + "epoch": 0.5135808853533292, + "grad_norm": 1409.0302734375, + "learning_rate": 2.8610632064533517e-05, + "loss": 47.2388, + "step": 127120 + }, + { + "epoch": 0.5136212866186969, + "grad_norm": 581.88916015625, + "learning_rate": 2.8607177967925792e-05, + "loss": 69.2831, + "step": 127130 + }, + { + "epoch": 0.5136616878840645, + "grad_norm": 478.259765625, + "learning_rate": 2.8603723800994275e-05, + "loss": 60.1264, + "step": 127140 + }, + { + "epoch": 0.5137020891494322, + "grad_norm": 2480.73095703125, + "learning_rate": 2.8600269563806302e-05, + "loss": 68.6383, + "step": 127150 + }, + { + "epoch": 0.5137424904147998, + "grad_norm": 673.7091064453125, + "learning_rate": 2.859681525642923e-05, + "loss": 65.6581, + "step": 127160 + }, + { + "epoch": 0.5137828916801674, + "grad_norm": 675.0384521484375, + "learning_rate": 2.8593360878930392e-05, + "loss": 57.0633, + "step": 127170 + }, + { + "epoch": 0.5138232929455351, + "grad_norm": 613.3917236328125, + "learning_rate": 2.8589906431377134e-05, + "loss": 50.5051, + "step": 127180 + }, + { + "epoch": 0.5138636942109027, + "grad_norm": 358.6791687011719, + "learning_rate": 2.8586451913836797e-05, + "loss": 48.4163, + "step": 127190 + }, + { + "epoch": 0.5139040954762704, + "grad_norm": 1500.3143310546875, + "learning_rate": 2.858299732637674e-05, + "loss": 69.6514, + "step": 127200 + }, + { + "epoch": 0.513944496741638, + "grad_norm": 303.2386474609375, + "learning_rate": 2.8579542669064296e-05, + "loss": 46.2417, + "step": 127210 + }, + { + "epoch": 0.5139848980070055, + "grad_norm": 936.28662109375, + "learning_rate": 2.8576087941966835e-05, + "loss": 59.2663, + "step": 127220 + }, + { + "epoch": 0.5140252992723732, + "grad_norm": 550.0284423828125, + "learning_rate": 2.857263314515169e-05, + "loss": 46.5445, + "step": 127230 + }, + { + "epoch": 0.5140657005377408, + "grad_norm": 1154.9842529296875, + "learning_rate": 2.856917827868622e-05, + "loss": 48.4179, + "step": 127240 + }, + { + "epoch": 0.5141061018031085, + "grad_norm": 400.9862365722656, + "learning_rate": 2.8565723342637796e-05, + "loss": 84.6768, + "step": 127250 + }, + { + "epoch": 0.5141465030684761, + "grad_norm": 1014.5621948242188, + "learning_rate": 2.856226833707375e-05, + "loss": 60.3505, + "step": 127260 + }, + { + "epoch": 0.5141869043338437, + "grad_norm": 351.1904602050781, + "learning_rate": 2.855881326206145e-05, + "loss": 59.1501, + "step": 127270 + }, + { + "epoch": 0.5142273055992114, + "grad_norm": 520.452392578125, + "learning_rate": 2.855535811766825e-05, + "loss": 60.3091, + "step": 127280 + }, + { + "epoch": 0.514267706864579, + "grad_norm": 717.9920043945312, + "learning_rate": 2.8551902903961526e-05, + "loss": 40.0301, + "step": 127290 + }, + { + "epoch": 0.5143081081299467, + "grad_norm": 560.2972412109375, + "learning_rate": 2.854844762100861e-05, + "loss": 50.7043, + "step": 127300 + }, + { + "epoch": 0.5143485093953143, + "grad_norm": 1860.7342529296875, + "learning_rate": 2.854499226887689e-05, + "loss": 63.0763, + "step": 127310 + }, + { + "epoch": 0.5143889106606819, + "grad_norm": 135.43519592285156, + "learning_rate": 2.8541536847633717e-05, + "loss": 45.6506, + "step": 127320 + }, + { + "epoch": 0.5144293119260496, + "grad_norm": 766.4346313476562, + "learning_rate": 2.8538081357346465e-05, + "loss": 72.0964, + "step": 127330 + }, + { + "epoch": 0.5144697131914172, + "grad_norm": 778.751220703125, + "learning_rate": 2.8534625798082488e-05, + "loss": 65.8823, + "step": 127340 + }, + { + "epoch": 0.5145101144567847, + "grad_norm": 490.54461669921875, + "learning_rate": 2.853117016990917e-05, + "loss": 41.0561, + "step": 127350 + }, + { + "epoch": 0.5145505157221524, + "grad_norm": 450.0224914550781, + "learning_rate": 2.8527714472893862e-05, + "loss": 61.406, + "step": 127360 + }, + { + "epoch": 0.51459091698752, + "grad_norm": 298.4858093261719, + "learning_rate": 2.8524258707103957e-05, + "loss": 88.5857, + "step": 127370 + }, + { + "epoch": 0.5146313182528877, + "grad_norm": 1860.1826171875, + "learning_rate": 2.85208028726068e-05, + "loss": 108.9013, + "step": 127380 + }, + { + "epoch": 0.5146717195182553, + "grad_norm": 948.806396484375, + "learning_rate": 2.8517346969469782e-05, + "loss": 47.6132, + "step": 127390 + }, + { + "epoch": 0.5147121207836229, + "grad_norm": 1325.517578125, + "learning_rate": 2.8513890997760272e-05, + "loss": 45.2896, + "step": 127400 + }, + { + "epoch": 0.5147525220489906, + "grad_norm": 605.529052734375, + "learning_rate": 2.851043495754566e-05, + "loss": 43.9705, + "step": 127410 + }, + { + "epoch": 0.5147929233143582, + "grad_norm": 426.92193603515625, + "learning_rate": 2.8506978848893302e-05, + "loss": 39.5349, + "step": 127420 + }, + { + "epoch": 0.5148333245797259, + "grad_norm": 1157.5235595703125, + "learning_rate": 2.8503522671870585e-05, + "loss": 56.4158, + "step": 127430 + }, + { + "epoch": 0.5148737258450935, + "grad_norm": 782.9813232421875, + "learning_rate": 2.8500066426544896e-05, + "loss": 85.5858, + "step": 127440 + }, + { + "epoch": 0.5149141271104611, + "grad_norm": 1053.0633544921875, + "learning_rate": 2.849661011298361e-05, + "loss": 64.2638, + "step": 127450 + }, + { + "epoch": 0.5149545283758288, + "grad_norm": 652.576171875, + "learning_rate": 2.8493153731254102e-05, + "loss": 43.2801, + "step": 127460 + }, + { + "epoch": 0.5149949296411963, + "grad_norm": 1399.2457275390625, + "learning_rate": 2.8489697281423767e-05, + "loss": 57.9567, + "step": 127470 + }, + { + "epoch": 0.515035330906564, + "grad_norm": 1636.6573486328125, + "learning_rate": 2.8486240763559986e-05, + "loss": 87.6768, + "step": 127480 + }, + { + "epoch": 0.5150757321719316, + "grad_norm": 0.0, + "learning_rate": 2.848278417773015e-05, + "loss": 68.869, + "step": 127490 + }, + { + "epoch": 0.5151161334372992, + "grad_norm": 499.08416748046875, + "learning_rate": 2.8479327524001636e-05, + "loss": 44.9667, + "step": 127500 + }, + { + "epoch": 0.5151565347026669, + "grad_norm": 905.2075805664062, + "learning_rate": 2.8475870802441844e-05, + "loss": 51.4555, + "step": 127510 + }, + { + "epoch": 0.5151969359680345, + "grad_norm": 567.349365234375, + "learning_rate": 2.847241401311817e-05, + "loss": 37.061, + "step": 127520 + }, + { + "epoch": 0.5152373372334021, + "grad_norm": 499.4542541503906, + "learning_rate": 2.846895715609799e-05, + "loss": 46.9687, + "step": 127530 + }, + { + "epoch": 0.5152777384987698, + "grad_norm": 1374.189208984375, + "learning_rate": 2.8465500231448704e-05, + "loss": 74.0274, + "step": 127540 + }, + { + "epoch": 0.5153181397641374, + "grad_norm": 1301.90673828125, + "learning_rate": 2.8462043239237707e-05, + "loss": 51.4661, + "step": 127550 + }, + { + "epoch": 0.5153585410295051, + "grad_norm": 290.5340576171875, + "learning_rate": 2.845858617953239e-05, + "loss": 49.4092, + "step": 127560 + }, + { + "epoch": 0.5153989422948727, + "grad_norm": 617.203857421875, + "learning_rate": 2.8455129052400166e-05, + "loss": 48.6922, + "step": 127570 + }, + { + "epoch": 0.5154393435602403, + "grad_norm": 877.1514282226562, + "learning_rate": 2.8451671857908415e-05, + "loss": 57.4519, + "step": 127580 + }, + { + "epoch": 0.515479744825608, + "grad_norm": 1293.734375, + "learning_rate": 2.844821459612454e-05, + "loss": 97.9196, + "step": 127590 + }, + { + "epoch": 0.5155201460909755, + "grad_norm": 628.9718017578125, + "learning_rate": 2.844475726711595e-05, + "loss": 44.7918, + "step": 127600 + }, + { + "epoch": 0.5155605473563432, + "grad_norm": 645.7540283203125, + "learning_rate": 2.844129987095005e-05, + "loss": 42.1978, + "step": 127610 + }, + { + "epoch": 0.5156009486217108, + "grad_norm": 470.0179138183594, + "learning_rate": 2.8437842407694236e-05, + "loss": 59.6109, + "step": 127620 + }, + { + "epoch": 0.5156413498870784, + "grad_norm": 548.3775024414062, + "learning_rate": 2.843438487741591e-05, + "loss": 53.7288, + "step": 127630 + }, + { + "epoch": 0.5156817511524461, + "grad_norm": 294.67889404296875, + "learning_rate": 2.843092728018248e-05, + "loss": 49.3608, + "step": 127640 + }, + { + "epoch": 0.5157221524178137, + "grad_norm": 2212.821533203125, + "learning_rate": 2.8427469616061364e-05, + "loss": 59.0455, + "step": 127650 + }, + { + "epoch": 0.5157625536831814, + "grad_norm": 849.8643798828125, + "learning_rate": 2.8424011885119954e-05, + "loss": 44.9502, + "step": 127660 + }, + { + "epoch": 0.515802954948549, + "grad_norm": 565.5281372070312, + "learning_rate": 2.842055408742567e-05, + "loss": 53.8755, + "step": 127670 + }, + { + "epoch": 0.5158433562139166, + "grad_norm": 874.5379638671875, + "learning_rate": 2.8417096223045925e-05, + "loss": 53.4064, + "step": 127680 + }, + { + "epoch": 0.5158837574792843, + "grad_norm": 761.3839111328125, + "learning_rate": 2.841363829204814e-05, + "loss": 40.8466, + "step": 127690 + }, + { + "epoch": 0.5159241587446519, + "grad_norm": 409.4591369628906, + "learning_rate": 2.841018029449971e-05, + "loss": 40.9471, + "step": 127700 + }, + { + "epoch": 0.5159645600100196, + "grad_norm": 896.9284057617188, + "learning_rate": 2.8406722230468063e-05, + "loss": 69.344, + "step": 127710 + }, + { + "epoch": 0.5160049612753872, + "grad_norm": 248.2589569091797, + "learning_rate": 2.840326410002061e-05, + "loss": 53.1701, + "step": 127720 + }, + { + "epoch": 0.5160453625407547, + "grad_norm": 770.6051025390625, + "learning_rate": 2.839980590322477e-05, + "loss": 37.5784, + "step": 127730 + }, + { + "epoch": 0.5160857638061224, + "grad_norm": 1029.7786865234375, + "learning_rate": 2.8396347640147962e-05, + "loss": 57.3108, + "step": 127740 + }, + { + "epoch": 0.51612616507149, + "grad_norm": 450.9689636230469, + "learning_rate": 2.8392889310857612e-05, + "loss": 47.8311, + "step": 127750 + }, + { + "epoch": 0.5161665663368576, + "grad_norm": 471.0567932128906, + "learning_rate": 2.8389430915421132e-05, + "loss": 58.49, + "step": 127760 + }, + { + "epoch": 0.5162069676022253, + "grad_norm": 991.8955688476562, + "learning_rate": 2.8385972453905958e-05, + "loss": 43.4331, + "step": 127770 + }, + { + "epoch": 0.5162473688675929, + "grad_norm": 0.0, + "learning_rate": 2.8382513926379504e-05, + "loss": 66.81, + "step": 127780 + }, + { + "epoch": 0.5162877701329606, + "grad_norm": 1275.402587890625, + "learning_rate": 2.837905533290921e-05, + "loss": 33.7486, + "step": 127790 + }, + { + "epoch": 0.5163281713983282, + "grad_norm": 921.3496704101562, + "learning_rate": 2.8375596673562482e-05, + "loss": 66.0726, + "step": 127800 + }, + { + "epoch": 0.5163685726636958, + "grad_norm": 1107.1654052734375, + "learning_rate": 2.8372137948406762e-05, + "loss": 64.4679, + "step": 127810 + }, + { + "epoch": 0.5164089739290635, + "grad_norm": 978.436279296875, + "learning_rate": 2.8368679157509477e-05, + "loss": 55.1411, + "step": 127820 + }, + { + "epoch": 0.5164493751944311, + "grad_norm": 794.9774169921875, + "learning_rate": 2.8365220300938055e-05, + "loss": 44.3256, + "step": 127830 + }, + { + "epoch": 0.5164897764597988, + "grad_norm": 2009.2308349609375, + "learning_rate": 2.8361761378759934e-05, + "loss": 52.7043, + "step": 127840 + }, + { + "epoch": 0.5165301777251664, + "grad_norm": 989.152099609375, + "learning_rate": 2.8358302391042536e-05, + "loss": 37.631, + "step": 127850 + }, + { + "epoch": 0.5165705789905339, + "grad_norm": 356.483642578125, + "learning_rate": 2.8354843337853314e-05, + "loss": 77.6594, + "step": 127860 + }, + { + "epoch": 0.5166109802559016, + "grad_norm": 1363.230224609375, + "learning_rate": 2.835138421925969e-05, + "loss": 79.6859, + "step": 127870 + }, + { + "epoch": 0.5166513815212692, + "grad_norm": 811.1392211914062, + "learning_rate": 2.834792503532911e-05, + "loss": 75.7948, + "step": 127880 + }, + { + "epoch": 0.5166917827866369, + "grad_norm": 1689.7584228515625, + "learning_rate": 2.8344465786129e-05, + "loss": 79.6354, + "step": 127890 + }, + { + "epoch": 0.5167321840520045, + "grad_norm": 251.05508422851562, + "learning_rate": 2.8341006471726816e-05, + "loss": 38.7622, + "step": 127900 + }, + { + "epoch": 0.5167725853173721, + "grad_norm": 1307.02880859375, + "learning_rate": 2.833754709218998e-05, + "loss": 75.8891, + "step": 127910 + }, + { + "epoch": 0.5168129865827398, + "grad_norm": 743.4976196289062, + "learning_rate": 2.833408764758595e-05, + "loss": 37.3798, + "step": 127920 + }, + { + "epoch": 0.5168533878481074, + "grad_norm": 395.01214599609375, + "learning_rate": 2.833062813798216e-05, + "loss": 53.7051, + "step": 127930 + }, + { + "epoch": 0.516893789113475, + "grad_norm": 1006.5745849609375, + "learning_rate": 2.832716856344607e-05, + "loss": 53.2251, + "step": 127940 + }, + { + "epoch": 0.5169341903788427, + "grad_norm": 649.6722412109375, + "learning_rate": 2.832370892404511e-05, + "loss": 47.6598, + "step": 127950 + }, + { + "epoch": 0.5169745916442103, + "grad_norm": 952.0791625976562, + "learning_rate": 2.832024921984674e-05, + "loss": 46.0533, + "step": 127960 + }, + { + "epoch": 0.517014992909578, + "grad_norm": 663.2807006835938, + "learning_rate": 2.8316789450918396e-05, + "loss": 60.6055, + "step": 127970 + }, + { + "epoch": 0.5170553941749455, + "grad_norm": 1854.5562744140625, + "learning_rate": 2.8313329617327537e-05, + "loss": 104.9597, + "step": 127980 + }, + { + "epoch": 0.5170957954403131, + "grad_norm": 399.9706726074219, + "learning_rate": 2.8309869719141608e-05, + "loss": 73.0087, + "step": 127990 + }, + { + "epoch": 0.5171361967056808, + "grad_norm": 742.0582275390625, + "learning_rate": 2.8306409756428064e-05, + "loss": 63.6711, + "step": 128000 + }, + { + "epoch": 0.5171765979710484, + "grad_norm": 426.78814697265625, + "learning_rate": 2.8302949729254358e-05, + "loss": 44.5783, + "step": 128010 + }, + { + "epoch": 0.5172169992364161, + "grad_norm": 537.81494140625, + "learning_rate": 2.8299489637687954e-05, + "loss": 46.2207, + "step": 128020 + }, + { + "epoch": 0.5172574005017837, + "grad_norm": 545.3671264648438, + "learning_rate": 2.8296029481796292e-05, + "loss": 52.6649, + "step": 128030 + }, + { + "epoch": 0.5172978017671513, + "grad_norm": 1167.154541015625, + "learning_rate": 2.829256926164685e-05, + "loss": 68.5006, + "step": 128040 + }, + { + "epoch": 0.517338203032519, + "grad_norm": 743.8500366210938, + "learning_rate": 2.8289108977307067e-05, + "loss": 49.9936, + "step": 128050 + }, + { + "epoch": 0.5173786042978866, + "grad_norm": 708.7972412109375, + "learning_rate": 2.8285648628844413e-05, + "loss": 58.3343, + "step": 128060 + }, + { + "epoch": 0.5174190055632543, + "grad_norm": 952.4853515625, + "learning_rate": 2.8282188216326345e-05, + "loss": 51.3421, + "step": 128070 + }, + { + "epoch": 0.5174594068286219, + "grad_norm": 761.9249267578125, + "learning_rate": 2.8278727739820333e-05, + "loss": 47.8939, + "step": 128080 + }, + { + "epoch": 0.5174998080939895, + "grad_norm": 1164.4027099609375, + "learning_rate": 2.827526719939383e-05, + "loss": 112.3727, + "step": 128090 + }, + { + "epoch": 0.5175402093593572, + "grad_norm": 1486.0279541015625, + "learning_rate": 2.827180659511431e-05, + "loss": 54.5274, + "step": 128100 + }, + { + "epoch": 0.5175806106247247, + "grad_norm": 1370.9776611328125, + "learning_rate": 2.8268345927049234e-05, + "loss": 65.6802, + "step": 128110 + }, + { + "epoch": 0.5176210118900924, + "grad_norm": 796.0413818359375, + "learning_rate": 2.8264885195266065e-05, + "loss": 58.2697, + "step": 128120 + }, + { + "epoch": 0.51766141315546, + "grad_norm": 703.8768310546875, + "learning_rate": 2.8261424399832293e-05, + "loss": 49.1636, + "step": 128130 + }, + { + "epoch": 0.5177018144208276, + "grad_norm": 419.2386779785156, + "learning_rate": 2.825796354081537e-05, + "loss": 39.2348, + "step": 128140 + }, + { + "epoch": 0.5177422156861953, + "grad_norm": 687.80859375, + "learning_rate": 2.8254502618282763e-05, + "loss": 33.7805, + "step": 128150 + }, + { + "epoch": 0.5177826169515629, + "grad_norm": 0.0, + "learning_rate": 2.8251041632301957e-05, + "loss": 54.0625, + "step": 128160 + }, + { + "epoch": 0.5178230182169306, + "grad_norm": 458.79937744140625, + "learning_rate": 2.8247580582940413e-05, + "loss": 60.1953, + "step": 128170 + }, + { + "epoch": 0.5178634194822982, + "grad_norm": 739.254638671875, + "learning_rate": 2.824411947026563e-05, + "loss": 67.9315, + "step": 128180 + }, + { + "epoch": 0.5179038207476658, + "grad_norm": 998.814208984375, + "learning_rate": 2.824065829434505e-05, + "loss": 52.8274, + "step": 128190 + }, + { + "epoch": 0.5179442220130335, + "grad_norm": 862.947265625, + "learning_rate": 2.8237197055246172e-05, + "loss": 47.5776, + "step": 128200 + }, + { + "epoch": 0.5179846232784011, + "grad_norm": 0.0, + "learning_rate": 2.8233735753036484e-05, + "loss": 58.8695, + "step": 128210 + }, + { + "epoch": 0.5180250245437688, + "grad_norm": 1318.5653076171875, + "learning_rate": 2.823027438778344e-05, + "loss": 46.4017, + "step": 128220 + }, + { + "epoch": 0.5180654258091364, + "grad_norm": 1210.0972900390625, + "learning_rate": 2.8226812959554537e-05, + "loss": 61.901, + "step": 128230 + }, + { + "epoch": 0.5181058270745039, + "grad_norm": 756.2396850585938, + "learning_rate": 2.8223351468417254e-05, + "loss": 49.0368, + "step": 128240 + }, + { + "epoch": 0.5181462283398716, + "grad_norm": 419.0620422363281, + "learning_rate": 2.8219889914439074e-05, + "loss": 52.058, + "step": 128250 + }, + { + "epoch": 0.5181866296052392, + "grad_norm": 868.4646606445312, + "learning_rate": 2.821642829768748e-05, + "loss": 78.654, + "step": 128260 + }, + { + "epoch": 0.5182270308706068, + "grad_norm": 424.6618957519531, + "learning_rate": 2.8212966618229964e-05, + "loss": 94.6811, + "step": 128270 + }, + { + "epoch": 0.5182674321359745, + "grad_norm": 535.59228515625, + "learning_rate": 2.8209504876134007e-05, + "loss": 44.2588, + "step": 128280 + }, + { + "epoch": 0.5183078334013421, + "grad_norm": 578.2286987304688, + "learning_rate": 2.8206043071467102e-05, + "loss": 38.9842, + "step": 128290 + }, + { + "epoch": 0.5183482346667098, + "grad_norm": 948.8009033203125, + "learning_rate": 2.8202581204296742e-05, + "loss": 66.6254, + "step": 128300 + }, + { + "epoch": 0.5183886359320774, + "grad_norm": 956.4691772460938, + "learning_rate": 2.819911927469041e-05, + "loss": 83.3204, + "step": 128310 + }, + { + "epoch": 0.518429037197445, + "grad_norm": 662.3033447265625, + "learning_rate": 2.8195657282715594e-05, + "loss": 33.0041, + "step": 128320 + }, + { + "epoch": 0.5184694384628127, + "grad_norm": 841.9630737304688, + "learning_rate": 2.81921952284398e-05, + "loss": 40.0045, + "step": 128330 + }, + { + "epoch": 0.5185098397281803, + "grad_norm": 644.8792114257812, + "learning_rate": 2.818873311193051e-05, + "loss": 49.6458, + "step": 128340 + }, + { + "epoch": 0.518550240993548, + "grad_norm": 1007.34765625, + "learning_rate": 2.8185270933255237e-05, + "loss": 45.5391, + "step": 128350 + }, + { + "epoch": 0.5185906422589156, + "grad_norm": 412.73883056640625, + "learning_rate": 2.8181808692481453e-05, + "loss": 45.3399, + "step": 128360 + }, + { + "epoch": 0.5186310435242831, + "grad_norm": 775.2849731445312, + "learning_rate": 2.817834638967668e-05, + "loss": 56.8275, + "step": 128370 + }, + { + "epoch": 0.5186714447896508, + "grad_norm": 880.3590698242188, + "learning_rate": 2.817488402490841e-05, + "loss": 62.0417, + "step": 128380 + }, + { + "epoch": 0.5187118460550184, + "grad_norm": 0.0, + "learning_rate": 2.8171421598244134e-05, + "loss": 100.0851, + "step": 128390 + }, + { + "epoch": 0.518752247320386, + "grad_norm": 757.4847412109375, + "learning_rate": 2.816795910975137e-05, + "loss": 81.3571, + "step": 128400 + }, + { + "epoch": 0.5187926485857537, + "grad_norm": 986.7409057617188, + "learning_rate": 2.8164496559497605e-05, + "loss": 99.9801, + "step": 128410 + }, + { + "epoch": 0.5188330498511213, + "grad_norm": 1231.8388671875, + "learning_rate": 2.816103394755035e-05, + "loss": 36.7881, + "step": 128420 + }, + { + "epoch": 0.518873451116489, + "grad_norm": 551.5934448242188, + "learning_rate": 2.8157571273977117e-05, + "loss": 48.0908, + "step": 128430 + }, + { + "epoch": 0.5189138523818566, + "grad_norm": 957.9308471679688, + "learning_rate": 2.8154108538845404e-05, + "loss": 54.7105, + "step": 128440 + }, + { + "epoch": 0.5189542536472243, + "grad_norm": 746.921875, + "learning_rate": 2.8150645742222714e-05, + "loss": 47.9935, + "step": 128450 + }, + { + "epoch": 0.5189946549125919, + "grad_norm": 817.0953369140625, + "learning_rate": 2.814718288417657e-05, + "loss": 34.7214, + "step": 128460 + }, + { + "epoch": 0.5190350561779595, + "grad_norm": 652.5723876953125, + "learning_rate": 2.814371996477448e-05, + "loss": 51.5591, + "step": 128470 + }, + { + "epoch": 0.5190754574433272, + "grad_norm": 1426.8663330078125, + "learning_rate": 2.8140256984083947e-05, + "loss": 53.6987, + "step": 128480 + }, + { + "epoch": 0.5191158587086948, + "grad_norm": 1349.2745361328125, + "learning_rate": 2.8136793942172483e-05, + "loss": 67.8195, + "step": 128490 + }, + { + "epoch": 0.5191562599740623, + "grad_norm": 1255.9248046875, + "learning_rate": 2.8133330839107608e-05, + "loss": 89.4671, + "step": 128500 + }, + { + "epoch": 0.51919666123943, + "grad_norm": 772.8133544921875, + "learning_rate": 2.8129867674956838e-05, + "loss": 99.116, + "step": 128510 + }, + { + "epoch": 0.5192370625047976, + "grad_norm": 799.4025268554688, + "learning_rate": 2.8126404449787685e-05, + "loss": 37.4048, + "step": 128520 + }, + { + "epoch": 0.5192774637701653, + "grad_norm": 767.2107543945312, + "learning_rate": 2.8122941163667667e-05, + "loss": 53.6676, + "step": 128530 + }, + { + "epoch": 0.5193178650355329, + "grad_norm": 717.8088989257812, + "learning_rate": 2.8119477816664296e-05, + "loss": 53.9604, + "step": 128540 + }, + { + "epoch": 0.5193582663009005, + "grad_norm": 568.8512573242188, + "learning_rate": 2.8116014408845116e-05, + "loss": 52.336, + "step": 128550 + }, + { + "epoch": 0.5193986675662682, + "grad_norm": 1033.583984375, + "learning_rate": 2.8112550940277616e-05, + "loss": 60.4758, + "step": 128560 + }, + { + "epoch": 0.5194390688316358, + "grad_norm": 267.0958251953125, + "learning_rate": 2.810908741102934e-05, + "loss": 87.3894, + "step": 128570 + }, + { + "epoch": 0.5194794700970035, + "grad_norm": 966.0076293945312, + "learning_rate": 2.8105623821167804e-05, + "loss": 64.1264, + "step": 128580 + }, + { + "epoch": 0.5195198713623711, + "grad_norm": 888.2479858398438, + "learning_rate": 2.810216017076053e-05, + "loss": 37.3439, + "step": 128590 + }, + { + "epoch": 0.5195602726277387, + "grad_norm": 1010.2241821289062, + "learning_rate": 2.8098696459875046e-05, + "loss": 59.4499, + "step": 128600 + }, + { + "epoch": 0.5196006738931064, + "grad_norm": 672.6810302734375, + "learning_rate": 2.8095232688578883e-05, + "loss": 59.5572, + "step": 128610 + }, + { + "epoch": 0.5196410751584739, + "grad_norm": 1322.0794677734375, + "learning_rate": 2.809176885693956e-05, + "loss": 75.077, + "step": 128620 + }, + { + "epoch": 0.5196814764238415, + "grad_norm": 647.8616943359375, + "learning_rate": 2.8088304965024614e-05, + "loss": 54.4772, + "step": 128630 + }, + { + "epoch": 0.5197218776892092, + "grad_norm": 895.9517211914062, + "learning_rate": 2.8084841012901574e-05, + "loss": 32.6104, + "step": 128640 + }, + { + "epoch": 0.5197622789545768, + "grad_norm": 346.7025451660156, + "learning_rate": 2.808137700063797e-05, + "loss": 39.3208, + "step": 128650 + }, + { + "epoch": 0.5198026802199445, + "grad_norm": 911.7136840820312, + "learning_rate": 2.807791292830133e-05, + "loss": 58.0953, + "step": 128660 + }, + { + "epoch": 0.5198430814853121, + "grad_norm": 522.1300659179688, + "learning_rate": 2.8074448795959203e-05, + "loss": 73.4923, + "step": 128670 + }, + { + "epoch": 0.5198834827506797, + "grad_norm": 657.18505859375, + "learning_rate": 2.8070984603679107e-05, + "loss": 42.6445, + "step": 128680 + }, + { + "epoch": 0.5199238840160474, + "grad_norm": 389.8962707519531, + "learning_rate": 2.8067520351528587e-05, + "loss": 43.4139, + "step": 128690 + }, + { + "epoch": 0.519964285281415, + "grad_norm": 851.1239013671875, + "learning_rate": 2.806405603957517e-05, + "loss": 72.5371, + "step": 128700 + }, + { + "epoch": 0.5200046865467827, + "grad_norm": 3148.775634765625, + "learning_rate": 2.8060591667886416e-05, + "loss": 56.5808, + "step": 128710 + }, + { + "epoch": 0.5200450878121503, + "grad_norm": 1202.496826171875, + "learning_rate": 2.8057127236529844e-05, + "loss": 46.4309, + "step": 128720 + }, + { + "epoch": 0.520085489077518, + "grad_norm": 962.0496826171875, + "learning_rate": 2.805366274557301e-05, + "loss": 50.884, + "step": 128730 + }, + { + "epoch": 0.5201258903428856, + "grad_norm": 453.456298828125, + "learning_rate": 2.8050198195083444e-05, + "loss": 59.2252, + "step": 128740 + }, + { + "epoch": 0.5201662916082531, + "grad_norm": 239.76718139648438, + "learning_rate": 2.8046733585128687e-05, + "loss": 70.7534, + "step": 128750 + }, + { + "epoch": 0.5202066928736208, + "grad_norm": 457.1228942871094, + "learning_rate": 2.80432689157763e-05, + "loss": 52.9828, + "step": 128760 + }, + { + "epoch": 0.5202470941389884, + "grad_norm": 877.8587036132812, + "learning_rate": 2.8039804187093816e-05, + "loss": 52.8952, + "step": 128770 + }, + { + "epoch": 0.520287495404356, + "grad_norm": 1153.672119140625, + "learning_rate": 2.803633939914878e-05, + "loss": 60.0604, + "step": 128780 + }, + { + "epoch": 0.5203278966697237, + "grad_norm": 120.61048126220703, + "learning_rate": 2.803287455200875e-05, + "loss": 47.4182, + "step": 128790 + }, + { + "epoch": 0.5203682979350913, + "grad_norm": 1153.82080078125, + "learning_rate": 2.8029409645741267e-05, + "loss": 59.4981, + "step": 128800 + }, + { + "epoch": 0.520408699200459, + "grad_norm": 418.24920654296875, + "learning_rate": 2.8025944680413878e-05, + "loss": 37.4401, + "step": 128810 + }, + { + "epoch": 0.5204491004658266, + "grad_norm": 781.8113403320312, + "learning_rate": 2.8022479656094154e-05, + "loss": 48.7074, + "step": 128820 + }, + { + "epoch": 0.5204895017311942, + "grad_norm": 836.5515747070312, + "learning_rate": 2.801901457284962e-05, + "loss": 51.3323, + "step": 128830 + }, + { + "epoch": 0.5205299029965619, + "grad_norm": 1007.6831665039062, + "learning_rate": 2.8015549430747852e-05, + "loss": 63.1766, + "step": 128840 + }, + { + "epoch": 0.5205703042619295, + "grad_norm": 1059.2291259765625, + "learning_rate": 2.8012084229856382e-05, + "loss": 48.2031, + "step": 128850 + }, + { + "epoch": 0.5206107055272972, + "grad_norm": 435.08837890625, + "learning_rate": 2.800861897024279e-05, + "loss": 61.0167, + "step": 128860 + }, + { + "epoch": 0.5206511067926648, + "grad_norm": 520.9664916992188, + "learning_rate": 2.8005153651974614e-05, + "loss": 68.8364, + "step": 128870 + }, + { + "epoch": 0.5206915080580323, + "grad_norm": 1375.3497314453125, + "learning_rate": 2.8001688275119432e-05, + "loss": 73.5849, + "step": 128880 + }, + { + "epoch": 0.5207319093234, + "grad_norm": 570.526611328125, + "learning_rate": 2.799822283974478e-05, + "loss": 45.2284, + "step": 128890 + }, + { + "epoch": 0.5207723105887676, + "grad_norm": 3453.25927734375, + "learning_rate": 2.7994757345918244e-05, + "loss": 65.3527, + "step": 128900 + }, + { + "epoch": 0.5208127118541352, + "grad_norm": 807.7914428710938, + "learning_rate": 2.7991291793707357e-05, + "loss": 37.759, + "step": 128910 + }, + { + "epoch": 0.5208531131195029, + "grad_norm": 2692.131103515625, + "learning_rate": 2.7987826183179712e-05, + "loss": 70.0045, + "step": 128920 + }, + { + "epoch": 0.5208935143848705, + "grad_norm": 316.426025390625, + "learning_rate": 2.798436051440284e-05, + "loss": 50.6802, + "step": 128930 + }, + { + "epoch": 0.5209339156502382, + "grad_norm": 1145.9749755859375, + "learning_rate": 2.7980894787444334e-05, + "loss": 44.7317, + "step": 128940 + }, + { + "epoch": 0.5209743169156058, + "grad_norm": 571.5775146484375, + "learning_rate": 2.7977429002371747e-05, + "loss": 39.6738, + "step": 128950 + }, + { + "epoch": 0.5210147181809734, + "grad_norm": 625.548828125, + "learning_rate": 2.797396315925265e-05, + "loss": 34.7972, + "step": 128960 + }, + { + "epoch": 0.5210551194463411, + "grad_norm": 733.4710083007812, + "learning_rate": 2.7970497258154603e-05, + "loss": 38.4996, + "step": 128970 + }, + { + "epoch": 0.5210955207117087, + "grad_norm": 1038.670654296875, + "learning_rate": 2.7967031299145193e-05, + "loss": 49.1648, + "step": 128980 + }, + { + "epoch": 0.5211359219770764, + "grad_norm": 545.0449829101562, + "learning_rate": 2.7963565282291977e-05, + "loss": 48.0368, + "step": 128990 + }, + { + "epoch": 0.521176323242444, + "grad_norm": 428.3600769042969, + "learning_rate": 2.7960099207662532e-05, + "loss": 60.4256, + "step": 129000 + }, + { + "epoch": 0.5212167245078115, + "grad_norm": 714.7028198242188, + "learning_rate": 2.7956633075324424e-05, + "loss": 60.8093, + "step": 129010 + }, + { + "epoch": 0.5212571257731792, + "grad_norm": 1208.1380615234375, + "learning_rate": 2.795316688534523e-05, + "loss": 74.678, + "step": 129020 + }, + { + "epoch": 0.5212975270385468, + "grad_norm": 534.9729614257812, + "learning_rate": 2.794970063779253e-05, + "loss": 51.2989, + "step": 129030 + }, + { + "epoch": 0.5213379283039145, + "grad_norm": 756.8164672851562, + "learning_rate": 2.79462343327339e-05, + "loss": 66.086, + "step": 129040 + }, + { + "epoch": 0.5213783295692821, + "grad_norm": 575.9217529296875, + "learning_rate": 2.794276797023691e-05, + "loss": 49.4426, + "step": 129050 + }, + { + "epoch": 0.5214187308346497, + "grad_norm": 498.71441650390625, + "learning_rate": 2.7939301550369146e-05, + "loss": 48.8029, + "step": 129060 + }, + { + "epoch": 0.5214591321000174, + "grad_norm": 594.4951782226562, + "learning_rate": 2.7935835073198192e-05, + "loss": 34.5576, + "step": 129070 + }, + { + "epoch": 0.521499533365385, + "grad_norm": 1014.3102416992188, + "learning_rate": 2.793236853879161e-05, + "loss": 58.8379, + "step": 129080 + }, + { + "epoch": 0.5215399346307527, + "grad_norm": 560.2127685546875, + "learning_rate": 2.7928901947217008e-05, + "loss": 63.1494, + "step": 129090 + }, + { + "epoch": 0.5215803358961203, + "grad_norm": 666.1370239257812, + "learning_rate": 2.792543529854194e-05, + "loss": 53.1794, + "step": 129100 + }, + { + "epoch": 0.5216207371614879, + "grad_norm": 636.662353515625, + "learning_rate": 2.7921968592834006e-05, + "loss": 25.9828, + "step": 129110 + }, + { + "epoch": 0.5216611384268556, + "grad_norm": 835.5521850585938, + "learning_rate": 2.79185018301608e-05, + "loss": 57.4081, + "step": 129120 + }, + { + "epoch": 0.5217015396922232, + "grad_norm": 992.4476928710938, + "learning_rate": 2.791503501058989e-05, + "loss": 52.8603, + "step": 129130 + }, + { + "epoch": 0.5217419409575907, + "grad_norm": 328.57794189453125, + "learning_rate": 2.7911568134188875e-05, + "loss": 88.1504, + "step": 129140 + }, + { + "epoch": 0.5217823422229584, + "grad_norm": 1140.0035400390625, + "learning_rate": 2.7908101201025337e-05, + "loss": 48.8337, + "step": 129150 + }, + { + "epoch": 0.521822743488326, + "grad_norm": 745.549072265625, + "learning_rate": 2.7904634211166876e-05, + "loss": 55.3195, + "step": 129160 + }, + { + "epoch": 0.5218631447536937, + "grad_norm": 969.3555297851562, + "learning_rate": 2.7901167164681073e-05, + "loss": 55.3426, + "step": 129170 + }, + { + "epoch": 0.5219035460190613, + "grad_norm": 726.4353637695312, + "learning_rate": 2.7897700061635517e-05, + "loss": 49.0015, + "step": 129180 + }, + { + "epoch": 0.5219439472844289, + "grad_norm": 374.95843505859375, + "learning_rate": 2.7894232902097813e-05, + "loss": 46.3873, + "step": 129190 + }, + { + "epoch": 0.5219843485497966, + "grad_norm": 511.00885009765625, + "learning_rate": 2.7890765686135544e-05, + "loss": 47.9072, + "step": 129200 + }, + { + "epoch": 0.5220247498151642, + "grad_norm": 512.494873046875, + "learning_rate": 2.788729841381631e-05, + "loss": 29.6466, + "step": 129210 + }, + { + "epoch": 0.5220651510805319, + "grad_norm": 551.0260620117188, + "learning_rate": 2.7883831085207707e-05, + "loss": 65.4076, + "step": 129220 + }, + { + "epoch": 0.5221055523458995, + "grad_norm": 621.8154907226562, + "learning_rate": 2.788036370037733e-05, + "loss": 55.3526, + "step": 129230 + }, + { + "epoch": 0.5221459536112671, + "grad_norm": 477.7518310546875, + "learning_rate": 2.7876896259392788e-05, + "loss": 70.7479, + "step": 129240 + }, + { + "epoch": 0.5221863548766348, + "grad_norm": 228.69644165039062, + "learning_rate": 2.787342876232167e-05, + "loss": 45.7491, + "step": 129250 + }, + { + "epoch": 0.5222267561420023, + "grad_norm": 406.46026611328125, + "learning_rate": 2.7869961209231577e-05, + "loss": 40.5636, + "step": 129260 + }, + { + "epoch": 0.52226715740737, + "grad_norm": 1248.6951904296875, + "learning_rate": 2.7866493600190107e-05, + "loss": 43.6552, + "step": 129270 + }, + { + "epoch": 0.5223075586727376, + "grad_norm": 780.5540161132812, + "learning_rate": 2.7863025935264875e-05, + "loss": 42.3198, + "step": 129280 + }, + { + "epoch": 0.5223479599381052, + "grad_norm": 830.3927612304688, + "learning_rate": 2.785955821452348e-05, + "loss": 50.129, + "step": 129290 + }, + { + "epoch": 0.5223883612034729, + "grad_norm": 162.7823028564453, + "learning_rate": 2.7856090438033522e-05, + "loss": 35.8086, + "step": 129300 + }, + { + "epoch": 0.5224287624688405, + "grad_norm": 577.8681030273438, + "learning_rate": 2.785262260586261e-05, + "loss": 48.4552, + "step": 129310 + }, + { + "epoch": 0.5224691637342082, + "grad_norm": 495.5697021484375, + "learning_rate": 2.7849154718078346e-05, + "loss": 41.2047, + "step": 129320 + }, + { + "epoch": 0.5225095649995758, + "grad_norm": 650.5645141601562, + "learning_rate": 2.784568677474836e-05, + "loss": 65.8662, + "step": 129330 + }, + { + "epoch": 0.5225499662649434, + "grad_norm": 276.66241455078125, + "learning_rate": 2.7842218775940237e-05, + "loss": 33.7441, + "step": 129340 + }, + { + "epoch": 0.5225903675303111, + "grad_norm": 840.7539672851562, + "learning_rate": 2.783875072172159e-05, + "loss": 49.3532, + "step": 129350 + }, + { + "epoch": 0.5226307687956787, + "grad_norm": 315.2930603027344, + "learning_rate": 2.783528261216004e-05, + "loss": 37.2258, + "step": 129360 + }, + { + "epoch": 0.5226711700610464, + "grad_norm": 398.0323791503906, + "learning_rate": 2.78318144473232e-05, + "loss": 85.9904, + "step": 129370 + }, + { + "epoch": 0.522711571326414, + "grad_norm": 779.0153198242188, + "learning_rate": 2.7828346227278674e-05, + "loss": 53.5858, + "step": 129380 + }, + { + "epoch": 0.5227519725917815, + "grad_norm": 941.3084106445312, + "learning_rate": 2.782487795209408e-05, + "loss": 77.1186, + "step": 129390 + }, + { + "epoch": 0.5227923738571492, + "grad_norm": 1087.8790283203125, + "learning_rate": 2.782140962183704e-05, + "loss": 75.8238, + "step": 129400 + }, + { + "epoch": 0.5228327751225168, + "grad_norm": 836.9957275390625, + "learning_rate": 2.7817941236575173e-05, + "loss": 60.6787, + "step": 129410 + }, + { + "epoch": 0.5228731763878844, + "grad_norm": 542.893798828125, + "learning_rate": 2.781447279637608e-05, + "loss": 54.1359, + "step": 129420 + }, + { + "epoch": 0.5229135776532521, + "grad_norm": 706.9591064453125, + "learning_rate": 2.7811004301307403e-05, + "loss": 42.0238, + "step": 129430 + }, + { + "epoch": 0.5229539789186197, + "grad_norm": 975.5703735351562, + "learning_rate": 2.7807535751436738e-05, + "loss": 42.8671, + "step": 129440 + }, + { + "epoch": 0.5229943801839874, + "grad_norm": 1424.2943115234375, + "learning_rate": 2.7804067146831725e-05, + "loss": 69.0784, + "step": 129450 + }, + { + "epoch": 0.523034781449355, + "grad_norm": 1137.7862548828125, + "learning_rate": 2.7800598487559975e-05, + "loss": 46.7197, + "step": 129460 + }, + { + "epoch": 0.5230751827147226, + "grad_norm": 195.2952423095703, + "learning_rate": 2.7797129773689118e-05, + "loss": 55.5766, + "step": 129470 + }, + { + "epoch": 0.5231155839800903, + "grad_norm": 810.8615112304688, + "learning_rate": 2.7793661005286774e-05, + "loss": 42.1326, + "step": 129480 + }, + { + "epoch": 0.5231559852454579, + "grad_norm": 259.7433166503906, + "learning_rate": 2.7790192182420578e-05, + "loss": 40.0814, + "step": 129490 + }, + { + "epoch": 0.5231963865108256, + "grad_norm": 465.7834777832031, + "learning_rate": 2.7786723305158136e-05, + "loss": 41.8416, + "step": 129500 + }, + { + "epoch": 0.5232367877761932, + "grad_norm": 309.6441650390625, + "learning_rate": 2.7783254373567103e-05, + "loss": 92.4435, + "step": 129510 + }, + { + "epoch": 0.5232771890415607, + "grad_norm": 689.2532348632812, + "learning_rate": 2.7779785387715078e-05, + "loss": 50.8395, + "step": 129520 + }, + { + "epoch": 0.5233175903069284, + "grad_norm": 551.0718994140625, + "learning_rate": 2.7776316347669722e-05, + "loss": 53.9811, + "step": 129530 + }, + { + "epoch": 0.523357991572296, + "grad_norm": 792.376953125, + "learning_rate": 2.7772847253498636e-05, + "loss": 38.2251, + "step": 129540 + }, + { + "epoch": 0.5233983928376637, + "grad_norm": 687.2542724609375, + "learning_rate": 2.7769378105269467e-05, + "loss": 72.9509, + "step": 129550 + }, + { + "epoch": 0.5234387941030313, + "grad_norm": 856.1763916015625, + "learning_rate": 2.7765908903049848e-05, + "loss": 42.2359, + "step": 129560 + }, + { + "epoch": 0.5234791953683989, + "grad_norm": 865.5512084960938, + "learning_rate": 2.7762439646907417e-05, + "loss": 46.5808, + "step": 129570 + }, + { + "epoch": 0.5235195966337666, + "grad_norm": 218.27801513671875, + "learning_rate": 2.7758970336909795e-05, + "loss": 58.7573, + "step": 129580 + }, + { + "epoch": 0.5235599978991342, + "grad_norm": 797.5692749023438, + "learning_rate": 2.7755500973124625e-05, + "loss": 64.8211, + "step": 129590 + }, + { + "epoch": 0.5236003991645019, + "grad_norm": 1161.782470703125, + "learning_rate": 2.7752031555619555e-05, + "loss": 62.3158, + "step": 129600 + }, + { + "epoch": 0.5236408004298695, + "grad_norm": 495.8021545410156, + "learning_rate": 2.774856208446221e-05, + "loss": 34.1338, + "step": 129610 + }, + { + "epoch": 0.5236812016952371, + "grad_norm": 538.5377807617188, + "learning_rate": 2.7745092559720227e-05, + "loss": 84.3193, + "step": 129620 + }, + { + "epoch": 0.5237216029606048, + "grad_norm": 472.7118225097656, + "learning_rate": 2.7741622981461253e-05, + "loss": 88.2121, + "step": 129630 + }, + { + "epoch": 0.5237620042259724, + "grad_norm": 680.1412353515625, + "learning_rate": 2.773815334975292e-05, + "loss": 61.683, + "step": 129640 + }, + { + "epoch": 0.5238024054913399, + "grad_norm": 194.98385620117188, + "learning_rate": 2.7734683664662892e-05, + "loss": 35.9615, + "step": 129650 + }, + { + "epoch": 0.5238428067567076, + "grad_norm": 664.1810913085938, + "learning_rate": 2.7731213926258794e-05, + "loss": 76.9602, + "step": 129660 + }, + { + "epoch": 0.5238832080220752, + "grad_norm": 675.6226806640625, + "learning_rate": 2.7727744134608263e-05, + "loss": 49.6265, + "step": 129670 + }, + { + "epoch": 0.5239236092874429, + "grad_norm": 424.91912841796875, + "learning_rate": 2.7724274289778974e-05, + "loss": 49.0146, + "step": 129680 + }, + { + "epoch": 0.5239640105528105, + "grad_norm": 1368.484619140625, + "learning_rate": 2.7720804391838544e-05, + "loss": 53.2469, + "step": 129690 + }, + { + "epoch": 0.5240044118181781, + "grad_norm": 988.5595092773438, + "learning_rate": 2.771733444085463e-05, + "loss": 72.9639, + "step": 129700 + }, + { + "epoch": 0.5240448130835458, + "grad_norm": 1024.283447265625, + "learning_rate": 2.771386443689489e-05, + "loss": 35.8563, + "step": 129710 + }, + { + "epoch": 0.5240852143489134, + "grad_norm": 2501.96142578125, + "learning_rate": 2.7710394380026954e-05, + "loss": 93.8633, + "step": 129720 + }, + { + "epoch": 0.5241256156142811, + "grad_norm": 397.55413818359375, + "learning_rate": 2.7706924270318496e-05, + "loss": 51.6667, + "step": 129730 + }, + { + "epoch": 0.5241660168796487, + "grad_norm": 401.48687744140625, + "learning_rate": 2.770345410783715e-05, + "loss": 53.5041, + "step": 129740 + }, + { + "epoch": 0.5242064181450163, + "grad_norm": 561.6734619140625, + "learning_rate": 2.7699983892650573e-05, + "loss": 60.5054, + "step": 129750 + }, + { + "epoch": 0.524246819410384, + "grad_norm": 1459.1732177734375, + "learning_rate": 2.769651362482642e-05, + "loss": 80.3514, + "step": 129760 + }, + { + "epoch": 0.5242872206757516, + "grad_norm": 211.4439239501953, + "learning_rate": 2.7693043304432354e-05, + "loss": 53.1344, + "step": 129770 + }, + { + "epoch": 0.5243276219411191, + "grad_norm": 535.6337890625, + "learning_rate": 2.7689572931536017e-05, + "loss": 59.1036, + "step": 129780 + }, + { + "epoch": 0.5243680232064868, + "grad_norm": 212.34646606445312, + "learning_rate": 2.7686102506205068e-05, + "loss": 50.72, + "step": 129790 + }, + { + "epoch": 0.5244084244718544, + "grad_norm": 1038.571044921875, + "learning_rate": 2.7682632028507167e-05, + "loss": 43.0176, + "step": 129800 + }, + { + "epoch": 0.5244488257372221, + "grad_norm": 621.1820678710938, + "learning_rate": 2.7679161498509976e-05, + "loss": 25.86, + "step": 129810 + }, + { + "epoch": 0.5244892270025897, + "grad_norm": 441.5834045410156, + "learning_rate": 2.7675690916281156e-05, + "loss": 54.9453, + "step": 129820 + }, + { + "epoch": 0.5245296282679573, + "grad_norm": 317.9010925292969, + "learning_rate": 2.7672220281888357e-05, + "loss": 95.5096, + "step": 129830 + }, + { + "epoch": 0.524570029533325, + "grad_norm": 862.2459106445312, + "learning_rate": 2.766874959539925e-05, + "loss": 69.8221, + "step": 129840 + }, + { + "epoch": 0.5246104307986926, + "grad_norm": 531.95263671875, + "learning_rate": 2.76652788568815e-05, + "loss": 49.4422, + "step": 129850 + }, + { + "epoch": 0.5246508320640603, + "grad_norm": 694.7820434570312, + "learning_rate": 2.7661808066402767e-05, + "loss": 46.5116, + "step": 129860 + }, + { + "epoch": 0.5246912333294279, + "grad_norm": 1010.5025024414062, + "learning_rate": 2.765833722403071e-05, + "loss": 71.7905, + "step": 129870 + }, + { + "epoch": 0.5247316345947955, + "grad_norm": 285.8402099609375, + "learning_rate": 2.7654866329833002e-05, + "loss": 37.7266, + "step": 129880 + }, + { + "epoch": 0.5247720358601632, + "grad_norm": 556.6085205078125, + "learning_rate": 2.7651395383877304e-05, + "loss": 54.9936, + "step": 129890 + }, + { + "epoch": 0.5248124371255307, + "grad_norm": 1335.543701171875, + "learning_rate": 2.76479243862313e-05, + "loss": 71.0658, + "step": 129900 + }, + { + "epoch": 0.5248528383908984, + "grad_norm": 510.7142639160156, + "learning_rate": 2.7644453336962633e-05, + "loss": 86.5262, + "step": 129910 + }, + { + "epoch": 0.524893239656266, + "grad_norm": 629.845458984375, + "learning_rate": 2.7640982236138992e-05, + "loss": 47.5349, + "step": 129920 + }, + { + "epoch": 0.5249336409216336, + "grad_norm": 1113.2122802734375, + "learning_rate": 2.7637511083828043e-05, + "loss": 66.3521, + "step": 129930 + }, + { + "epoch": 0.5249740421870013, + "grad_norm": 365.0244140625, + "learning_rate": 2.763403988009746e-05, + "loss": 54.905, + "step": 129940 + }, + { + "epoch": 0.5250144434523689, + "grad_norm": 543.3900756835938, + "learning_rate": 2.7630568625014917e-05, + "loss": 48.9247, + "step": 129950 + }, + { + "epoch": 0.5250548447177366, + "grad_norm": 381.6798400878906, + "learning_rate": 2.7627097318648076e-05, + "loss": 39.9984, + "step": 129960 + }, + { + "epoch": 0.5250952459831042, + "grad_norm": 1278.4371337890625, + "learning_rate": 2.7623625961064618e-05, + "loss": 60.6886, + "step": 129970 + }, + { + "epoch": 0.5251356472484718, + "grad_norm": 500.2679443359375, + "learning_rate": 2.7620154552332232e-05, + "loss": 54.9953, + "step": 129980 + }, + { + "epoch": 0.5251760485138395, + "grad_norm": 482.51458740234375, + "learning_rate": 2.7616683092518576e-05, + "loss": 55.6544, + "step": 129990 + }, + { + "epoch": 0.5252164497792071, + "grad_norm": 359.9608154296875, + "learning_rate": 2.761321158169134e-05, + "loss": 64.5541, + "step": 130000 + }, + { + "epoch": 0.5252568510445748, + "grad_norm": 398.5904541015625, + "learning_rate": 2.7609740019918197e-05, + "loss": 53.5339, + "step": 130010 + }, + { + "epoch": 0.5252972523099424, + "grad_norm": 698.3560791015625, + "learning_rate": 2.7606268407266827e-05, + "loss": 61.4191, + "step": 130020 + }, + { + "epoch": 0.5253376535753099, + "grad_norm": 768.4830932617188, + "learning_rate": 2.7602796743804922e-05, + "loss": 47.3355, + "step": 130030 + }, + { + "epoch": 0.5253780548406776, + "grad_norm": 775.7195434570312, + "learning_rate": 2.7599325029600143e-05, + "loss": 25.0439, + "step": 130040 + }, + { + "epoch": 0.5254184561060452, + "grad_norm": 1195.4205322265625, + "learning_rate": 2.7595853264720184e-05, + "loss": 70.0329, + "step": 130050 + }, + { + "epoch": 0.5254588573714128, + "grad_norm": 606.2637329101562, + "learning_rate": 2.759238144923274e-05, + "loss": 53.4816, + "step": 130060 + }, + { + "epoch": 0.5254992586367805, + "grad_norm": 469.8358459472656, + "learning_rate": 2.7588909583205475e-05, + "loss": 49.5405, + "step": 130070 + }, + { + "epoch": 0.5255396599021481, + "grad_norm": 648.6143188476562, + "learning_rate": 2.7585437666706087e-05, + "loss": 69.9888, + "step": 130080 + }, + { + "epoch": 0.5255800611675158, + "grad_norm": 612.1096801757812, + "learning_rate": 2.758196569980226e-05, + "loss": 51.9348, + "step": 130090 + }, + { + "epoch": 0.5256204624328834, + "grad_norm": 1045.827392578125, + "learning_rate": 2.7578493682561685e-05, + "loss": 75.3227, + "step": 130100 + }, + { + "epoch": 0.525660863698251, + "grad_norm": 990.9442138671875, + "learning_rate": 2.757502161505205e-05, + "loss": 73.4528, + "step": 130110 + }, + { + "epoch": 0.5257012649636187, + "grad_norm": 510.2374572753906, + "learning_rate": 2.7571549497341042e-05, + "loss": 60.0831, + "step": 130120 + }, + { + "epoch": 0.5257416662289863, + "grad_norm": 1088.753662109375, + "learning_rate": 2.756807732949635e-05, + "loss": 63.8334, + "step": 130130 + }, + { + "epoch": 0.525782067494354, + "grad_norm": 1749.695068359375, + "learning_rate": 2.756460511158567e-05, + "loss": 54.9062, + "step": 130140 + }, + { + "epoch": 0.5258224687597216, + "grad_norm": 992.9537963867188, + "learning_rate": 2.756113284367669e-05, + "loss": 55.7644, + "step": 130150 + }, + { + "epoch": 0.5258628700250891, + "grad_norm": 679.5419311523438, + "learning_rate": 2.7557660525837108e-05, + "loss": 51.0159, + "step": 130160 + }, + { + "epoch": 0.5259032712904568, + "grad_norm": 444.13031005859375, + "learning_rate": 2.7554188158134616e-05, + "loss": 19.1531, + "step": 130170 + }, + { + "epoch": 0.5259436725558244, + "grad_norm": 416.7135314941406, + "learning_rate": 2.7550715740636917e-05, + "loss": 46.4561, + "step": 130180 + }, + { + "epoch": 0.525984073821192, + "grad_norm": 933.5376586914062, + "learning_rate": 2.7547243273411695e-05, + "loss": 66.3492, + "step": 130190 + }, + { + "epoch": 0.5260244750865597, + "grad_norm": 444.9652404785156, + "learning_rate": 2.754377075652666e-05, + "loss": 50.9244, + "step": 130200 + }, + { + "epoch": 0.5260648763519273, + "grad_norm": 652.8956298828125, + "learning_rate": 2.7540298190049503e-05, + "loss": 59.4898, + "step": 130210 + }, + { + "epoch": 0.526105277617295, + "grad_norm": 645.0548095703125, + "learning_rate": 2.7536825574047925e-05, + "loss": 35.8312, + "step": 130220 + }, + { + "epoch": 0.5261456788826626, + "grad_norm": 534.1641845703125, + "learning_rate": 2.7533352908589622e-05, + "loss": 40.8292, + "step": 130230 + }, + { + "epoch": 0.5261860801480303, + "grad_norm": 420.173583984375, + "learning_rate": 2.7529880193742297e-05, + "loss": 30.2268, + "step": 130240 + }, + { + "epoch": 0.5262264814133979, + "grad_norm": 465.4990234375, + "learning_rate": 2.7526407429573657e-05, + "loss": 64.3663, + "step": 130250 + }, + { + "epoch": 0.5262668826787655, + "grad_norm": 511.3827819824219, + "learning_rate": 2.7522934616151414e-05, + "loss": 56.9852, + "step": 130260 + }, + { + "epoch": 0.5263072839441332, + "grad_norm": 624.65087890625, + "learning_rate": 2.751946175354325e-05, + "loss": 63.1493, + "step": 130270 + }, + { + "epoch": 0.5263476852095008, + "grad_norm": 751.966064453125, + "learning_rate": 2.7515988841816887e-05, + "loss": 69.2688, + "step": 130280 + }, + { + "epoch": 0.5263880864748683, + "grad_norm": 907.3563842773438, + "learning_rate": 2.7512515881040028e-05, + "loss": 54.0088, + "step": 130290 + }, + { + "epoch": 0.526428487740236, + "grad_norm": 893.8932495117188, + "learning_rate": 2.7509042871280372e-05, + "loss": 54.6191, + "step": 130300 + }, + { + "epoch": 0.5264688890056036, + "grad_norm": 1132.3380126953125, + "learning_rate": 2.750556981260564e-05, + "loss": 51.4256, + "step": 130310 + }, + { + "epoch": 0.5265092902709713, + "grad_norm": 408.7311096191406, + "learning_rate": 2.7502096705083535e-05, + "loss": 44.0828, + "step": 130320 + }, + { + "epoch": 0.5265496915363389, + "grad_norm": 569.1919555664062, + "learning_rate": 2.749862354878176e-05, + "loss": 34.8458, + "step": 130330 + }, + { + "epoch": 0.5265900928017065, + "grad_norm": 759.1968383789062, + "learning_rate": 2.7495150343768034e-05, + "loss": 59.1251, + "step": 130340 + }, + { + "epoch": 0.5266304940670742, + "grad_norm": 563.669677734375, + "learning_rate": 2.7491677090110076e-05, + "loss": 48.0987, + "step": 130350 + }, + { + "epoch": 0.5266708953324418, + "grad_norm": 1401.8712158203125, + "learning_rate": 2.7488203787875577e-05, + "loss": 67.7033, + "step": 130360 + }, + { + "epoch": 0.5267112965978095, + "grad_norm": 496.8099060058594, + "learning_rate": 2.7484730437132278e-05, + "loss": 44.0139, + "step": 130370 + }, + { + "epoch": 0.5267516978631771, + "grad_norm": 533.447998046875, + "learning_rate": 2.7481257037947872e-05, + "loss": 47.0373, + "step": 130380 + }, + { + "epoch": 0.5267920991285447, + "grad_norm": 825.5570068359375, + "learning_rate": 2.7477783590390082e-05, + "loss": 63.7042, + "step": 130390 + }, + { + "epoch": 0.5268325003939124, + "grad_norm": 577.7359619140625, + "learning_rate": 2.747431009452663e-05, + "loss": 43.8473, + "step": 130400 + }, + { + "epoch": 0.52687290165928, + "grad_norm": 2243.215087890625, + "learning_rate": 2.747083655042522e-05, + "loss": 58.3573, + "step": 130410 + }, + { + "epoch": 0.5269133029246476, + "grad_norm": 897.6618041992188, + "learning_rate": 2.7467362958153587e-05, + "loss": 85.0911, + "step": 130420 + }, + { + "epoch": 0.5269537041900152, + "grad_norm": 976.666259765625, + "learning_rate": 2.7463889317779446e-05, + "loss": 55.5755, + "step": 130430 + }, + { + "epoch": 0.5269941054553828, + "grad_norm": 1133.8804931640625, + "learning_rate": 2.7460415629370508e-05, + "loss": 59.8554, + "step": 130440 + }, + { + "epoch": 0.5270345067207505, + "grad_norm": 757.4534912109375, + "learning_rate": 2.7456941892994497e-05, + "loss": 43.9233, + "step": 130450 + }, + { + "epoch": 0.5270749079861181, + "grad_norm": 962.9010009765625, + "learning_rate": 2.7453468108719145e-05, + "loss": 65.6795, + "step": 130460 + }, + { + "epoch": 0.5271153092514858, + "grad_norm": 394.7574157714844, + "learning_rate": 2.744999427661217e-05, + "loss": 69.4007, + "step": 130470 + }, + { + "epoch": 0.5271557105168534, + "grad_norm": 762.9472045898438, + "learning_rate": 2.744652039674129e-05, + "loss": 49.1858, + "step": 130480 + }, + { + "epoch": 0.527196111782221, + "grad_norm": 2199.121337890625, + "learning_rate": 2.7443046469174237e-05, + "loss": 77.0622, + "step": 130490 + }, + { + "epoch": 0.5272365130475887, + "grad_norm": 370.2516784667969, + "learning_rate": 2.7439572493978736e-05, + "loss": 51.3727, + "step": 130500 + }, + { + "epoch": 0.5272769143129563, + "grad_norm": 1177.574951171875, + "learning_rate": 2.7436098471222522e-05, + "loss": 47.053, + "step": 130510 + }, + { + "epoch": 0.527317315578324, + "grad_norm": 1359.2659912109375, + "learning_rate": 2.74326244009733e-05, + "loss": 61.0058, + "step": 130520 + }, + { + "epoch": 0.5273577168436916, + "grad_norm": 1003.060546875, + "learning_rate": 2.7429150283298817e-05, + "loss": 52.1394, + "step": 130530 + }, + { + "epoch": 0.5273981181090591, + "grad_norm": 1656.4512939453125, + "learning_rate": 2.7425676118266808e-05, + "loss": 46.5512, + "step": 130540 + }, + { + "epoch": 0.5274385193744268, + "grad_norm": 509.01220703125, + "learning_rate": 2.7422201905944982e-05, + "loss": 52.2351, + "step": 130550 + }, + { + "epoch": 0.5274789206397944, + "grad_norm": 1386.0987548828125, + "learning_rate": 2.7418727646401094e-05, + "loss": 44.8062, + "step": 130560 + }, + { + "epoch": 0.527519321905162, + "grad_norm": 708.5176391601562, + "learning_rate": 2.741525333970285e-05, + "loss": 36.5751, + "step": 130570 + }, + { + "epoch": 0.5275597231705297, + "grad_norm": 3067.193115234375, + "learning_rate": 2.7411778985918006e-05, + "loss": 98.0832, + "step": 130580 + }, + { + "epoch": 0.5276001244358973, + "grad_norm": 598.675537109375, + "learning_rate": 2.7408304585114298e-05, + "loss": 60.4569, + "step": 130590 + }, + { + "epoch": 0.527640525701265, + "grad_norm": 821.6907348632812, + "learning_rate": 2.7404830137359444e-05, + "loss": 71.1968, + "step": 130600 + }, + { + "epoch": 0.5276809269666326, + "grad_norm": 796.5547485351562, + "learning_rate": 2.740135564272119e-05, + "loss": 64.8786, + "step": 130610 + }, + { + "epoch": 0.5277213282320002, + "grad_norm": 587.248291015625, + "learning_rate": 2.7397881101267263e-05, + "loss": 61.1327, + "step": 130620 + }, + { + "epoch": 0.5277617294973679, + "grad_norm": 603.5186157226562, + "learning_rate": 2.7394406513065423e-05, + "loss": 54.6609, + "step": 130630 + }, + { + "epoch": 0.5278021307627355, + "grad_norm": 1664.0540771484375, + "learning_rate": 2.739093187818339e-05, + "loss": 69.9045, + "step": 130640 + }, + { + "epoch": 0.5278425320281032, + "grad_norm": 847.181640625, + "learning_rate": 2.7387457196688908e-05, + "loss": 60.1619, + "step": 130650 + }, + { + "epoch": 0.5278829332934708, + "grad_norm": 1435.385498046875, + "learning_rate": 2.7383982468649714e-05, + "loss": 48.167, + "step": 130660 + }, + { + "epoch": 0.5279233345588383, + "grad_norm": 884.9189453125, + "learning_rate": 2.738050769413357e-05, + "loss": 57.8182, + "step": 130670 + }, + { + "epoch": 0.527963735824206, + "grad_norm": 568.2256469726562, + "learning_rate": 2.7377032873208186e-05, + "loss": 66.162, + "step": 130680 + }, + { + "epoch": 0.5280041370895736, + "grad_norm": 744.9805908203125, + "learning_rate": 2.737355800594133e-05, + "loss": 60.4696, + "step": 130690 + }, + { + "epoch": 0.5280445383549413, + "grad_norm": 907.050537109375, + "learning_rate": 2.7370083092400735e-05, + "loss": 43.196, + "step": 130700 + }, + { + "epoch": 0.5280849396203089, + "grad_norm": 778.4954833984375, + "learning_rate": 2.7366608132654154e-05, + "loss": 51.7738, + "step": 130710 + }, + { + "epoch": 0.5281253408856765, + "grad_norm": 405.15338134765625, + "learning_rate": 2.7363133126769325e-05, + "loss": 64.0481, + "step": 130720 + }, + { + "epoch": 0.5281657421510442, + "grad_norm": 591.9571533203125, + "learning_rate": 2.735965807481401e-05, + "loss": 45.1595, + "step": 130730 + }, + { + "epoch": 0.5282061434164118, + "grad_norm": 985.9591064453125, + "learning_rate": 2.7356182976855934e-05, + "loss": 35.7408, + "step": 130740 + }, + { + "epoch": 0.5282465446817795, + "grad_norm": 460.90887451171875, + "learning_rate": 2.7352707832962865e-05, + "loss": 52.3368, + "step": 130750 + }, + { + "epoch": 0.5282869459471471, + "grad_norm": 900.5870971679688, + "learning_rate": 2.734923264320254e-05, + "loss": 31.7685, + "step": 130760 + }, + { + "epoch": 0.5283273472125147, + "grad_norm": 851.9434814453125, + "learning_rate": 2.7345757407642714e-05, + "loss": 41.7662, + "step": 130770 + }, + { + "epoch": 0.5283677484778824, + "grad_norm": 2293.946533203125, + "learning_rate": 2.7342282126351144e-05, + "loss": 96.9833, + "step": 130780 + }, + { + "epoch": 0.52840814974325, + "grad_norm": 837.9186401367188, + "learning_rate": 2.7338806799395577e-05, + "loss": 53.9366, + "step": 130790 + }, + { + "epoch": 0.5284485510086175, + "grad_norm": 314.82135009765625, + "learning_rate": 2.733533142684377e-05, + "loss": 56.3716, + "step": 130800 + }, + { + "epoch": 0.5284889522739852, + "grad_norm": 918.69287109375, + "learning_rate": 2.7331856008763472e-05, + "loss": 50.4389, + "step": 130810 + }, + { + "epoch": 0.5285293535393528, + "grad_norm": 775.286865234375, + "learning_rate": 2.7328380545222436e-05, + "loss": 64.7548, + "step": 130820 + }, + { + "epoch": 0.5285697548047205, + "grad_norm": 461.6344909667969, + "learning_rate": 2.732490503628843e-05, + "loss": 53.2063, + "step": 130830 + }, + { + "epoch": 0.5286101560700881, + "grad_norm": 415.2874755859375, + "learning_rate": 2.73214294820292e-05, + "loss": 94.1432, + "step": 130840 + }, + { + "epoch": 0.5286505573354557, + "grad_norm": 718.537109375, + "learning_rate": 2.7317953882512504e-05, + "loss": 47.0089, + "step": 130850 + }, + { + "epoch": 0.5286909586008234, + "grad_norm": 524.0968017578125, + "learning_rate": 2.7314478237806107e-05, + "loss": 50.3915, + "step": 130860 + }, + { + "epoch": 0.528731359866191, + "grad_norm": 685.7463989257812, + "learning_rate": 2.7311002547977766e-05, + "loss": 47.7177, + "step": 130870 + }, + { + "epoch": 0.5287717611315587, + "grad_norm": 658.6242065429688, + "learning_rate": 2.730752681309524e-05, + "loss": 44.9115, + "step": 130880 + }, + { + "epoch": 0.5288121623969263, + "grad_norm": 831.5072021484375, + "learning_rate": 2.730405103322629e-05, + "loss": 36.1585, + "step": 130890 + }, + { + "epoch": 0.5288525636622939, + "grad_norm": 2113.5791015625, + "learning_rate": 2.7300575208438683e-05, + "loss": 64.7953, + "step": 130900 + }, + { + "epoch": 0.5288929649276616, + "grad_norm": 1694.7403564453125, + "learning_rate": 2.729709933880017e-05, + "loss": 64.9504, + "step": 130910 + }, + { + "epoch": 0.5289333661930292, + "grad_norm": 959.0403442382812, + "learning_rate": 2.7293623424378535e-05, + "loss": 62.3716, + "step": 130920 + }, + { + "epoch": 0.5289737674583967, + "grad_norm": 2491.924072265625, + "learning_rate": 2.7290147465241517e-05, + "loss": 52.1772, + "step": 130930 + }, + { + "epoch": 0.5290141687237644, + "grad_norm": 243.4221954345703, + "learning_rate": 2.7286671461456897e-05, + "loss": 38.3009, + "step": 130940 + }, + { + "epoch": 0.529054569989132, + "grad_norm": 1660.8577880859375, + "learning_rate": 2.7283195413092445e-05, + "loss": 60.126, + "step": 130950 + }, + { + "epoch": 0.5290949712544997, + "grad_norm": 609.1519165039062, + "learning_rate": 2.7279719320215924e-05, + "loss": 59.0399, + "step": 130960 + }, + { + "epoch": 0.5291353725198673, + "grad_norm": 379.8237609863281, + "learning_rate": 2.7276243182895094e-05, + "loss": 64.3225, + "step": 130970 + }, + { + "epoch": 0.529175773785235, + "grad_norm": 535.1643676757812, + "learning_rate": 2.7272767001197742e-05, + "loss": 37.2136, + "step": 130980 + }, + { + "epoch": 0.5292161750506026, + "grad_norm": 587.9727783203125, + "learning_rate": 2.726929077519162e-05, + "loss": 41.9734, + "step": 130990 + }, + { + "epoch": 0.5292565763159702, + "grad_norm": 652.3465576171875, + "learning_rate": 2.726581450494451e-05, + "loss": 83.3366, + "step": 131000 + }, + { + "epoch": 0.5292969775813379, + "grad_norm": 395.1925964355469, + "learning_rate": 2.7262338190524173e-05, + "loss": 70.538, + "step": 131010 + }, + { + "epoch": 0.5293373788467055, + "grad_norm": 602.4820556640625, + "learning_rate": 2.7258861831998388e-05, + "loss": 55.8564, + "step": 131020 + }, + { + "epoch": 0.5293777801120731, + "grad_norm": 2387.6884765625, + "learning_rate": 2.7255385429434932e-05, + "loss": 77.701, + "step": 131030 + }, + { + "epoch": 0.5294181813774408, + "grad_norm": 626.4151000976562, + "learning_rate": 2.725190898290158e-05, + "loss": 71.4265, + "step": 131040 + }, + { + "epoch": 0.5294585826428083, + "grad_norm": 1227.336669921875, + "learning_rate": 2.7248432492466096e-05, + "loss": 63.7168, + "step": 131050 + }, + { + "epoch": 0.529498983908176, + "grad_norm": 506.81475830078125, + "learning_rate": 2.7244955958196265e-05, + "loss": 38.2177, + "step": 131060 + }, + { + "epoch": 0.5295393851735436, + "grad_norm": 550.5950317382812, + "learning_rate": 2.7241479380159868e-05, + "loss": 69.3767, + "step": 131070 + }, + { + "epoch": 0.5295797864389112, + "grad_norm": 751.3677368164062, + "learning_rate": 2.723800275842468e-05, + "loss": 45.0376, + "step": 131080 + }, + { + "epoch": 0.5296201877042789, + "grad_norm": 459.6380310058594, + "learning_rate": 2.7234526093058464e-05, + "loss": 59.1021, + "step": 131090 + }, + { + "epoch": 0.5296605889696465, + "grad_norm": 463.3095703125, + "learning_rate": 2.7231049384129016e-05, + "loss": 37.5911, + "step": 131100 + }, + { + "epoch": 0.5297009902350142, + "grad_norm": 0.0, + "learning_rate": 2.7227572631704107e-05, + "loss": 52.0239, + "step": 131110 + }, + { + "epoch": 0.5297413915003818, + "grad_norm": 1287.0667724609375, + "learning_rate": 2.7224095835851525e-05, + "loss": 85.9998, + "step": 131120 + }, + { + "epoch": 0.5297817927657494, + "grad_norm": 710.130615234375, + "learning_rate": 2.722061899663905e-05, + "loss": 41.3392, + "step": 131130 + }, + { + "epoch": 0.5298221940311171, + "grad_norm": 241.11277770996094, + "learning_rate": 2.7217142114134463e-05, + "loss": 75.5654, + "step": 131140 + }, + { + "epoch": 0.5298625952964847, + "grad_norm": 2309.976806640625, + "learning_rate": 2.7213665188405556e-05, + "loss": 62.305, + "step": 131150 + }, + { + "epoch": 0.5299029965618524, + "grad_norm": 592.4625244140625, + "learning_rate": 2.721018821952011e-05, + "loss": 49.6338, + "step": 131160 + }, + { + "epoch": 0.52994339782722, + "grad_norm": 3443.605224609375, + "learning_rate": 2.7206711207545893e-05, + "loss": 65.5017, + "step": 131170 + }, + { + "epoch": 0.5299837990925875, + "grad_norm": 924.45654296875, + "learning_rate": 2.7203234152550712e-05, + "loss": 52.5658, + "step": 131180 + }, + { + "epoch": 0.5300242003579552, + "grad_norm": 385.39910888671875, + "learning_rate": 2.719975705460234e-05, + "loss": 35.2366, + "step": 131190 + }, + { + "epoch": 0.5300646016233228, + "grad_norm": 709.4951171875, + "learning_rate": 2.7196279913768584e-05, + "loss": 56.7616, + "step": 131200 + }, + { + "epoch": 0.5301050028886904, + "grad_norm": 849.27294921875, + "learning_rate": 2.719280273011721e-05, + "loss": 62.0372, + "step": 131210 + }, + { + "epoch": 0.5301454041540581, + "grad_norm": 943.2574462890625, + "learning_rate": 2.7189325503716022e-05, + "loss": 81.1754, + "step": 131220 + }, + { + "epoch": 0.5301858054194257, + "grad_norm": 839.4765014648438, + "learning_rate": 2.7185848234632803e-05, + "loss": 65.2041, + "step": 131230 + }, + { + "epoch": 0.5302262066847934, + "grad_norm": 879.865966796875, + "learning_rate": 2.7182370922935353e-05, + "loss": 57.2173, + "step": 131240 + }, + { + "epoch": 0.530266607950161, + "grad_norm": 980.9358520507812, + "learning_rate": 2.717889356869146e-05, + "loss": 47.6376, + "step": 131250 + }, + { + "epoch": 0.5303070092155286, + "grad_norm": 680.2994384765625, + "learning_rate": 2.717541617196891e-05, + "loss": 35.0096, + "step": 131260 + }, + { + "epoch": 0.5303474104808963, + "grad_norm": 870.2879638671875, + "learning_rate": 2.71719387328355e-05, + "loss": 54.6478, + "step": 131270 + }, + { + "epoch": 0.5303878117462639, + "grad_norm": 456.5863952636719, + "learning_rate": 2.716846125135903e-05, + "loss": 66.5173, + "step": 131280 + }, + { + "epoch": 0.5304282130116316, + "grad_norm": 1249.5938720703125, + "learning_rate": 2.716498372760729e-05, + "loss": 44.1854, + "step": 131290 + }, + { + "epoch": 0.5304686142769992, + "grad_norm": 364.5703430175781, + "learning_rate": 2.7161506161648076e-05, + "loss": 58.9766, + "step": 131300 + }, + { + "epoch": 0.5305090155423667, + "grad_norm": 890.297119140625, + "learning_rate": 2.7158028553549187e-05, + "loss": 56.6682, + "step": 131310 + }, + { + "epoch": 0.5305494168077344, + "grad_norm": 925.9620361328125, + "learning_rate": 2.715455090337842e-05, + "loss": 43.9666, + "step": 131320 + }, + { + "epoch": 0.530589818073102, + "grad_norm": 511.55609130859375, + "learning_rate": 2.715107321120358e-05, + "loss": 51.9629, + "step": 131330 + }, + { + "epoch": 0.5306302193384697, + "grad_norm": 547.1328125, + "learning_rate": 2.7147595477092457e-05, + "loss": 43.9675, + "step": 131340 + }, + { + "epoch": 0.5306706206038373, + "grad_norm": 784.9970703125, + "learning_rate": 2.7144117701112846e-05, + "loss": 65.4148, + "step": 131350 + }, + { + "epoch": 0.5307110218692049, + "grad_norm": 653.8527221679688, + "learning_rate": 2.7140639883332564e-05, + "loss": 64.9822, + "step": 131360 + }, + { + "epoch": 0.5307514231345726, + "grad_norm": 480.8045959472656, + "learning_rate": 2.713716202381941e-05, + "loss": 53.8297, + "step": 131370 + }, + { + "epoch": 0.5307918243999402, + "grad_norm": 251.93240356445312, + "learning_rate": 2.713368412264118e-05, + "loss": 55.8464, + "step": 131380 + }, + { + "epoch": 0.5308322256653079, + "grad_norm": 609.9501953125, + "learning_rate": 2.713020617986567e-05, + "loss": 38.5014, + "step": 131390 + }, + { + "epoch": 0.5308726269306755, + "grad_norm": 921.5916748046875, + "learning_rate": 2.7126728195560702e-05, + "loss": 52.8066, + "step": 131400 + }, + { + "epoch": 0.5309130281960431, + "grad_norm": 912.7449951171875, + "learning_rate": 2.7123250169794075e-05, + "loss": 54.2252, + "step": 131410 + }, + { + "epoch": 0.5309534294614108, + "grad_norm": 718.9556274414062, + "learning_rate": 2.711977210263359e-05, + "loss": 57.7856, + "step": 131420 + }, + { + "epoch": 0.5309938307267784, + "grad_norm": 457.2162170410156, + "learning_rate": 2.7116293994147053e-05, + "loss": 64.9568, + "step": 131430 + }, + { + "epoch": 0.5310342319921459, + "grad_norm": 621.4869995117188, + "learning_rate": 2.711281584440228e-05, + "loss": 62.9255, + "step": 131440 + }, + { + "epoch": 0.5310746332575136, + "grad_norm": 792.2979125976562, + "learning_rate": 2.710933765346707e-05, + "loss": 47.9132, + "step": 131450 + }, + { + "epoch": 0.5311150345228812, + "grad_norm": 539.8186645507812, + "learning_rate": 2.710585942140924e-05, + "loss": 54.2183, + "step": 131460 + }, + { + "epoch": 0.5311554357882489, + "grad_norm": 558.9265747070312, + "learning_rate": 2.710238114829659e-05, + "loss": 49.2721, + "step": 131470 + }, + { + "epoch": 0.5311958370536165, + "grad_norm": 709.4071655273438, + "learning_rate": 2.7098902834196943e-05, + "loss": 58.9437, + "step": 131480 + }, + { + "epoch": 0.5312362383189841, + "grad_norm": 654.2186889648438, + "learning_rate": 2.7095424479178106e-05, + "loss": 36.4761, + "step": 131490 + }, + { + "epoch": 0.5312766395843518, + "grad_norm": 889.7582397460938, + "learning_rate": 2.7091946083307896e-05, + "loss": 68.6755, + "step": 131500 + }, + { + "epoch": 0.5313170408497194, + "grad_norm": 1404.284912109375, + "learning_rate": 2.708846764665411e-05, + "loss": 67.0801, + "step": 131510 + }, + { + "epoch": 0.5313574421150871, + "grad_norm": 774.0541381835938, + "learning_rate": 2.7084989169284568e-05, + "loss": 70.2885, + "step": 131520 + }, + { + "epoch": 0.5313978433804547, + "grad_norm": 676.1631469726562, + "learning_rate": 2.70815106512671e-05, + "loss": 57.3858, + "step": 131530 + }, + { + "epoch": 0.5314382446458223, + "grad_norm": 288.2684631347656, + "learning_rate": 2.7078032092669502e-05, + "loss": 57.0647, + "step": 131540 + }, + { + "epoch": 0.53147864591119, + "grad_norm": 692.8833618164062, + "learning_rate": 2.70745534935596e-05, + "loss": 52.4784, + "step": 131550 + }, + { + "epoch": 0.5315190471765576, + "grad_norm": 360.534912109375, + "learning_rate": 2.707107485400521e-05, + "loss": 76.2931, + "step": 131560 + }, + { + "epoch": 0.5315594484419252, + "grad_norm": 902.7174682617188, + "learning_rate": 2.7067596174074155e-05, + "loss": 66.0437, + "step": 131570 + }, + { + "epoch": 0.5315998497072928, + "grad_norm": 1684.62451171875, + "learning_rate": 2.7064117453834243e-05, + "loss": 48.1203, + "step": 131580 + }, + { + "epoch": 0.5316402509726604, + "grad_norm": 74.62276458740234, + "learning_rate": 2.70606386933533e-05, + "loss": 69.9313, + "step": 131590 + }, + { + "epoch": 0.5316806522380281, + "grad_norm": 347.76605224609375, + "learning_rate": 2.705715989269914e-05, + "loss": 67.9091, + "step": 131600 + }, + { + "epoch": 0.5317210535033957, + "grad_norm": 2573.411865234375, + "learning_rate": 2.70536810519396e-05, + "loss": 58.865, + "step": 131610 + }, + { + "epoch": 0.5317614547687634, + "grad_norm": 265.0232849121094, + "learning_rate": 2.705020217114248e-05, + "loss": 47.0783, + "step": 131620 + }, + { + "epoch": 0.531801856034131, + "grad_norm": 801.228515625, + "learning_rate": 2.7046723250375617e-05, + "loss": 51.892, + "step": 131630 + }, + { + "epoch": 0.5318422572994986, + "grad_norm": 833.2240600585938, + "learning_rate": 2.7043244289706826e-05, + "loss": 49.0188, + "step": 131640 + }, + { + "epoch": 0.5318826585648663, + "grad_norm": 984.8556518554688, + "learning_rate": 2.7039765289203946e-05, + "loss": 55.2805, + "step": 131650 + }, + { + "epoch": 0.5319230598302339, + "grad_norm": 779.8922119140625, + "learning_rate": 2.703628624893478e-05, + "loss": 40.7662, + "step": 131660 + }, + { + "epoch": 0.5319634610956016, + "grad_norm": 687.851318359375, + "learning_rate": 2.703280716896717e-05, + "loss": 71.506, + "step": 131670 + }, + { + "epoch": 0.5320038623609692, + "grad_norm": 561.60888671875, + "learning_rate": 2.702932804936894e-05, + "loss": 76.0556, + "step": 131680 + }, + { + "epoch": 0.5320442636263367, + "grad_norm": 597.3526611328125, + "learning_rate": 2.7025848890207917e-05, + "loss": 51.0563, + "step": 131690 + }, + { + "epoch": 0.5320846648917044, + "grad_norm": 1162.3173828125, + "learning_rate": 2.7022369691551917e-05, + "loss": 67.7418, + "step": 131700 + }, + { + "epoch": 0.532125066157072, + "grad_norm": 1007.927734375, + "learning_rate": 2.7018890453468788e-05, + "loss": 55.7545, + "step": 131710 + }, + { + "epoch": 0.5321654674224396, + "grad_norm": 583.6637573242188, + "learning_rate": 2.7015411176026344e-05, + "loss": 59.1643, + "step": 131720 + }, + { + "epoch": 0.5322058686878073, + "grad_norm": 503.95025634765625, + "learning_rate": 2.7011931859292427e-05, + "loss": 57.7244, + "step": 131730 + }, + { + "epoch": 0.5322462699531749, + "grad_norm": 545.329345703125, + "learning_rate": 2.7008452503334858e-05, + "loss": 46.0308, + "step": 131740 + }, + { + "epoch": 0.5322866712185426, + "grad_norm": 380.2509765625, + "learning_rate": 2.7004973108221472e-05, + "loss": 40.5332, + "step": 131750 + }, + { + "epoch": 0.5323270724839102, + "grad_norm": 740.1156005859375, + "learning_rate": 2.700149367402011e-05, + "loss": 60.6691, + "step": 131760 + }, + { + "epoch": 0.5323674737492778, + "grad_norm": 1089.05712890625, + "learning_rate": 2.69980142007986e-05, + "loss": 65.5629, + "step": 131770 + }, + { + "epoch": 0.5324078750146455, + "grad_norm": 537.0314331054688, + "learning_rate": 2.699453468862477e-05, + "loss": 55.9601, + "step": 131780 + }, + { + "epoch": 0.5324482762800131, + "grad_norm": 266.5636291503906, + "learning_rate": 2.699105513756645e-05, + "loss": 59.2656, + "step": 131790 + }, + { + "epoch": 0.5324886775453808, + "grad_norm": 889.6873779296875, + "learning_rate": 2.6987575547691497e-05, + "loss": 76.0885, + "step": 131800 + }, + { + "epoch": 0.5325290788107484, + "grad_norm": 1049.7486572265625, + "learning_rate": 2.698409591906773e-05, + "loss": 50.9139, + "step": 131810 + }, + { + "epoch": 0.5325694800761159, + "grad_norm": 820.5618286132812, + "learning_rate": 2.6980616251762997e-05, + "loss": 36.7931, + "step": 131820 + }, + { + "epoch": 0.5326098813414836, + "grad_norm": 643.490966796875, + "learning_rate": 2.6977136545845122e-05, + "loss": 56.8342, + "step": 131830 + }, + { + "epoch": 0.5326502826068512, + "grad_norm": 853.44970703125, + "learning_rate": 2.6973656801381963e-05, + "loss": 60.0717, + "step": 131840 + }, + { + "epoch": 0.5326906838722189, + "grad_norm": 440.1711730957031, + "learning_rate": 2.697017701844134e-05, + "loss": 45.1691, + "step": 131850 + }, + { + "epoch": 0.5327310851375865, + "grad_norm": 468.0743408203125, + "learning_rate": 2.6966697197091108e-05, + "loss": 54.0596, + "step": 131860 + }, + { + "epoch": 0.5327714864029541, + "grad_norm": 1050.264892578125, + "learning_rate": 2.69632173373991e-05, + "loss": 55.2322, + "step": 131870 + }, + { + "epoch": 0.5328118876683218, + "grad_norm": 720.6689453125, + "learning_rate": 2.695973743943315e-05, + "loss": 55.2848, + "step": 131880 + }, + { + "epoch": 0.5328522889336894, + "grad_norm": 1108.028564453125, + "learning_rate": 2.6956257503261116e-05, + "loss": 40.9585, + "step": 131890 + }, + { + "epoch": 0.532892690199057, + "grad_norm": 1092.5072021484375, + "learning_rate": 2.695277752895084e-05, + "loss": 44.4573, + "step": 131900 + }, + { + "epoch": 0.5329330914644247, + "grad_norm": 625.7965087890625, + "learning_rate": 2.6949297516570156e-05, + "loss": 28.5109, + "step": 131910 + }, + { + "epoch": 0.5329734927297923, + "grad_norm": 681.8804931640625, + "learning_rate": 2.6945817466186912e-05, + "loss": 43.6512, + "step": 131920 + }, + { + "epoch": 0.53301389399516, + "grad_norm": 756.3519897460938, + "learning_rate": 2.694233737786896e-05, + "loss": 76.8343, + "step": 131930 + }, + { + "epoch": 0.5330542952605276, + "grad_norm": 1092.2066650390625, + "learning_rate": 2.693885725168414e-05, + "loss": 46.8568, + "step": 131940 + }, + { + "epoch": 0.5330946965258951, + "grad_norm": 988.5387573242188, + "learning_rate": 2.6935377087700297e-05, + "loss": 73.0625, + "step": 131950 + }, + { + "epoch": 0.5331350977912628, + "grad_norm": 940.03076171875, + "learning_rate": 2.693189688598528e-05, + "loss": 63.5997, + "step": 131960 + }, + { + "epoch": 0.5331754990566304, + "grad_norm": 372.99151611328125, + "learning_rate": 2.6928416646606936e-05, + "loss": 57.4221, + "step": 131970 + }, + { + "epoch": 0.5332159003219981, + "grad_norm": 1288.4371337890625, + "learning_rate": 2.6924936369633125e-05, + "loss": 48.3105, + "step": 131980 + }, + { + "epoch": 0.5332563015873657, + "grad_norm": 2535.275634765625, + "learning_rate": 2.6921456055131683e-05, + "loss": 67.3763, + "step": 131990 + }, + { + "epoch": 0.5332967028527333, + "grad_norm": 3868.3837890625, + "learning_rate": 2.6917975703170466e-05, + "loss": 65.5701, + "step": 132000 + }, + { + "epoch": 0.533337104118101, + "grad_norm": 810.7564697265625, + "learning_rate": 2.691449531381733e-05, + "loss": 64.9525, + "step": 132010 + }, + { + "epoch": 0.5333775053834686, + "grad_norm": 1662.32373046875, + "learning_rate": 2.6911014887140122e-05, + "loss": 57.4445, + "step": 132020 + }, + { + "epoch": 0.5334179066488363, + "grad_norm": 528.4577026367188, + "learning_rate": 2.6907534423206692e-05, + "loss": 49.0796, + "step": 132030 + }, + { + "epoch": 0.5334583079142039, + "grad_norm": 798.1748657226562, + "learning_rate": 2.6904053922084895e-05, + "loss": 51.6413, + "step": 132040 + }, + { + "epoch": 0.5334987091795715, + "grad_norm": 589.7070922851562, + "learning_rate": 2.6900573383842583e-05, + "loss": 33.4644, + "step": 132050 + }, + { + "epoch": 0.5335391104449392, + "grad_norm": 757.95458984375, + "learning_rate": 2.689709280854762e-05, + "loss": 52.9057, + "step": 132060 + }, + { + "epoch": 0.5335795117103068, + "grad_norm": 567.9570922851562, + "learning_rate": 2.6893612196267853e-05, + "loss": 46.4571, + "step": 132070 + }, + { + "epoch": 0.5336199129756743, + "grad_norm": 732.6300048828125, + "learning_rate": 2.6890131547071147e-05, + "loss": 66.3549, + "step": 132080 + }, + { + "epoch": 0.533660314241042, + "grad_norm": 1564.1744384765625, + "learning_rate": 2.6886650861025343e-05, + "loss": 82.0941, + "step": 132090 + }, + { + "epoch": 0.5337007155064096, + "grad_norm": 6092.22314453125, + "learning_rate": 2.6883170138198323e-05, + "loss": 73.8271, + "step": 132100 + }, + { + "epoch": 0.5337411167717773, + "grad_norm": 3677.495361328125, + "learning_rate": 2.6879689378657923e-05, + "loss": 83.4729, + "step": 132110 + }, + { + "epoch": 0.5337815180371449, + "grad_norm": 830.318603515625, + "learning_rate": 2.6876208582472012e-05, + "loss": 43.1947, + "step": 132120 + }, + { + "epoch": 0.5338219193025125, + "grad_norm": 861.20849609375, + "learning_rate": 2.687272774970845e-05, + "loss": 79.1685, + "step": 132130 + }, + { + "epoch": 0.5338623205678802, + "grad_norm": 541.7510375976562, + "learning_rate": 2.6869246880435095e-05, + "loss": 50.6851, + "step": 132140 + }, + { + "epoch": 0.5339027218332478, + "grad_norm": 2233.982177734375, + "learning_rate": 2.686576597471981e-05, + "loss": 64.9713, + "step": 132150 + }, + { + "epoch": 0.5339431230986155, + "grad_norm": 801.9125366210938, + "learning_rate": 2.686228503263045e-05, + "loss": 67.6367, + "step": 132160 + }, + { + "epoch": 0.5339835243639831, + "grad_norm": 577.6140747070312, + "learning_rate": 2.685880405423489e-05, + "loss": 56.8992, + "step": 132170 + }, + { + "epoch": 0.5340239256293507, + "grad_norm": 303.9769592285156, + "learning_rate": 2.6855323039601e-05, + "loss": 55.4841, + "step": 132180 + }, + { + "epoch": 0.5340643268947184, + "grad_norm": 0.0, + "learning_rate": 2.685184198879662e-05, + "loss": 79.864, + "step": 132190 + }, + { + "epoch": 0.534104728160086, + "grad_norm": 1271.53759765625, + "learning_rate": 2.684836090188963e-05, + "loss": 53.1909, + "step": 132200 + }, + { + "epoch": 0.5341451294254536, + "grad_norm": 390.48504638671875, + "learning_rate": 2.6844879778947884e-05, + "loss": 45.1817, + "step": 132210 + }, + { + "epoch": 0.5341855306908212, + "grad_norm": 1045.8829345703125, + "learning_rate": 2.6841398620039273e-05, + "loss": 57.4035, + "step": 132220 + }, + { + "epoch": 0.5342259319561888, + "grad_norm": 211.4333038330078, + "learning_rate": 2.6837917425231633e-05, + "loss": 93.9413, + "step": 132230 + }, + { + "epoch": 0.5342663332215565, + "grad_norm": 362.2946472167969, + "learning_rate": 2.6834436194592853e-05, + "loss": 33.1701, + "step": 132240 + }, + { + "epoch": 0.5343067344869241, + "grad_norm": 875.4388427734375, + "learning_rate": 2.6830954928190794e-05, + "loss": 76.5484, + "step": 132250 + }, + { + "epoch": 0.5343471357522918, + "grad_norm": 546.877197265625, + "learning_rate": 2.682747362609333e-05, + "loss": 38.9788, + "step": 132260 + }, + { + "epoch": 0.5343875370176594, + "grad_norm": 674.6320190429688, + "learning_rate": 2.6823992288368322e-05, + "loss": 70.6211, + "step": 132270 + }, + { + "epoch": 0.534427938283027, + "grad_norm": 868.1038818359375, + "learning_rate": 2.6820510915083648e-05, + "loss": 49.9378, + "step": 132280 + }, + { + "epoch": 0.5344683395483947, + "grad_norm": 355.66741943359375, + "learning_rate": 2.681702950630717e-05, + "loss": 34.1495, + "step": 132290 + }, + { + "epoch": 0.5345087408137623, + "grad_norm": 460.1276550292969, + "learning_rate": 2.6813548062106775e-05, + "loss": 64.0979, + "step": 132300 + }, + { + "epoch": 0.53454914207913, + "grad_norm": 0.0, + "learning_rate": 2.6810066582550324e-05, + "loss": 59.0976, + "step": 132310 + }, + { + "epoch": 0.5345895433444976, + "grad_norm": 310.49627685546875, + "learning_rate": 2.6806585067705692e-05, + "loss": 64.1751, + "step": 132320 + }, + { + "epoch": 0.5346299446098651, + "grad_norm": 862.5602416992188, + "learning_rate": 2.680310351764075e-05, + "loss": 63.3662, + "step": 132330 + }, + { + "epoch": 0.5346703458752328, + "grad_norm": 802.3466186523438, + "learning_rate": 2.679962193242338e-05, + "loss": 71.8971, + "step": 132340 + }, + { + "epoch": 0.5347107471406004, + "grad_norm": 1650.000244140625, + "learning_rate": 2.6796140312121458e-05, + "loss": 59.4168, + "step": 132350 + }, + { + "epoch": 0.534751148405968, + "grad_norm": 832.1773071289062, + "learning_rate": 2.6792658656802856e-05, + "loss": 45.569, + "step": 132360 + }, + { + "epoch": 0.5347915496713357, + "grad_norm": 917.97412109375, + "learning_rate": 2.6789176966535444e-05, + "loss": 45.4459, + "step": 132370 + }, + { + "epoch": 0.5348319509367033, + "grad_norm": 583.6998901367188, + "learning_rate": 2.678569524138711e-05, + "loss": 66.7321, + "step": 132380 + }, + { + "epoch": 0.534872352202071, + "grad_norm": 418.53680419921875, + "learning_rate": 2.678221348142573e-05, + "loss": 34.0593, + "step": 132390 + }, + { + "epoch": 0.5349127534674386, + "grad_norm": 402.7685241699219, + "learning_rate": 2.6778731686719178e-05, + "loss": 55.3928, + "step": 132400 + }, + { + "epoch": 0.5349531547328062, + "grad_norm": 372.7942199707031, + "learning_rate": 2.6775249857335333e-05, + "loss": 30.0867, + "step": 132410 + }, + { + "epoch": 0.5349935559981739, + "grad_norm": 1119.2479248046875, + "learning_rate": 2.677176799334208e-05, + "loss": 58.895, + "step": 132420 + }, + { + "epoch": 0.5350339572635415, + "grad_norm": 407.2303771972656, + "learning_rate": 2.6768286094807298e-05, + "loss": 64.0234, + "step": 132430 + }, + { + "epoch": 0.5350743585289092, + "grad_norm": 855.4847412109375, + "learning_rate": 2.6764804161798867e-05, + "loss": 89.9496, + "step": 132440 + }, + { + "epoch": 0.5351147597942768, + "grad_norm": 1015.7791748046875, + "learning_rate": 2.6761322194384674e-05, + "loss": 60.1903, + "step": 132450 + }, + { + "epoch": 0.5351551610596443, + "grad_norm": 1254.9368896484375, + "learning_rate": 2.6757840192632598e-05, + "loss": 61.4563, + "step": 132460 + }, + { + "epoch": 0.535195562325012, + "grad_norm": 299.9359436035156, + "learning_rate": 2.6754358156610525e-05, + "loss": 38.3524, + "step": 132470 + }, + { + "epoch": 0.5352359635903796, + "grad_norm": 811.1012573242188, + "learning_rate": 2.6750876086386328e-05, + "loss": 47.9699, + "step": 132480 + }, + { + "epoch": 0.5352763648557473, + "grad_norm": 893.4951171875, + "learning_rate": 2.6747393982027903e-05, + "loss": 48.4848, + "step": 132490 + }, + { + "epoch": 0.5353167661211149, + "grad_norm": 727.3240966796875, + "learning_rate": 2.674391184360313e-05, + "loss": 34.6639, + "step": 132500 + }, + { + "epoch": 0.5353571673864825, + "grad_norm": 2415.583251953125, + "learning_rate": 2.6740429671179907e-05, + "loss": 73.8259, + "step": 132510 + }, + { + "epoch": 0.5353975686518502, + "grad_norm": 443.5478515625, + "learning_rate": 2.6736947464826108e-05, + "loss": 48.6447, + "step": 132520 + }, + { + "epoch": 0.5354379699172178, + "grad_norm": 1031.6845703125, + "learning_rate": 2.6733465224609622e-05, + "loss": 54.8037, + "step": 132530 + }, + { + "epoch": 0.5354783711825855, + "grad_norm": 564.192138671875, + "learning_rate": 2.6729982950598338e-05, + "loss": 51.8397, + "step": 132540 + }, + { + "epoch": 0.5355187724479531, + "grad_norm": 472.5278625488281, + "learning_rate": 2.6726500642860154e-05, + "loss": 49.6563, + "step": 132550 + }, + { + "epoch": 0.5355591737133207, + "grad_norm": 551.7787475585938, + "learning_rate": 2.6723018301462937e-05, + "loss": 43.9799, + "step": 132560 + }, + { + "epoch": 0.5355995749786884, + "grad_norm": 1193.7701416015625, + "learning_rate": 2.67195359264746e-05, + "loss": 64.9433, + "step": 132570 + }, + { + "epoch": 0.535639976244056, + "grad_norm": 799.9347534179688, + "learning_rate": 2.671605351796302e-05, + "loss": 41.734, + "step": 132580 + }, + { + "epoch": 0.5356803775094235, + "grad_norm": 759.9945068359375, + "learning_rate": 2.67125710759961e-05, + "loss": 76.4194, + "step": 132590 + }, + { + "epoch": 0.5357207787747912, + "grad_norm": 647.0215454101562, + "learning_rate": 2.6709088600641717e-05, + "loss": 43.6254, + "step": 132600 + }, + { + "epoch": 0.5357611800401588, + "grad_norm": 720.9840698242188, + "learning_rate": 2.6705606091967778e-05, + "loss": 49.2521, + "step": 132610 + }, + { + "epoch": 0.5358015813055265, + "grad_norm": 1699.47216796875, + "learning_rate": 2.670212355004217e-05, + "loss": 48.1515, + "step": 132620 + }, + { + "epoch": 0.5358419825708941, + "grad_norm": 912.8600463867188, + "learning_rate": 2.6698640974932793e-05, + "loss": 60.5469, + "step": 132630 + }, + { + "epoch": 0.5358823838362617, + "grad_norm": 562.8499755859375, + "learning_rate": 2.6695158366707522e-05, + "loss": 53.9552, + "step": 132640 + }, + { + "epoch": 0.5359227851016294, + "grad_norm": 778.0060424804688, + "learning_rate": 2.6691675725434272e-05, + "loss": 52.4513, + "step": 132650 + }, + { + "epoch": 0.535963186366997, + "grad_norm": 773.4092407226562, + "learning_rate": 2.6688193051180933e-05, + "loss": 39.1972, + "step": 132660 + }, + { + "epoch": 0.5360035876323647, + "grad_norm": 608.3802490234375, + "learning_rate": 2.66847103440154e-05, + "loss": 37.7895, + "step": 132670 + }, + { + "epoch": 0.5360439888977323, + "grad_norm": 775.6356811523438, + "learning_rate": 2.6681227604005576e-05, + "loss": 41.3886, + "step": 132680 + }, + { + "epoch": 0.5360843901630999, + "grad_norm": 1359.913818359375, + "learning_rate": 2.6677744831219348e-05, + "loss": 69.0232, + "step": 132690 + }, + { + "epoch": 0.5361247914284676, + "grad_norm": 540.3826293945312, + "learning_rate": 2.6674262025724627e-05, + "loss": 32.8396, + "step": 132700 + }, + { + "epoch": 0.5361651926938352, + "grad_norm": 862.3062744140625, + "learning_rate": 2.66707791875893e-05, + "loss": 64.9376, + "step": 132710 + }, + { + "epoch": 0.5362055939592028, + "grad_norm": 557.792236328125, + "learning_rate": 2.666729631688128e-05, + "loss": 53.6906, + "step": 132720 + }, + { + "epoch": 0.5362459952245704, + "grad_norm": 711.6216430664062, + "learning_rate": 2.6663813413668455e-05, + "loss": 55.813, + "step": 132730 + }, + { + "epoch": 0.536286396489938, + "grad_norm": 795.2600708007812, + "learning_rate": 2.6660330478018726e-05, + "loss": 57.3366, + "step": 132740 + }, + { + "epoch": 0.5363267977553057, + "grad_norm": 375.18975830078125, + "learning_rate": 2.6656847510000012e-05, + "loss": 64.0476, + "step": 132750 + }, + { + "epoch": 0.5363671990206733, + "grad_norm": 765.218994140625, + "learning_rate": 2.6653364509680188e-05, + "loss": 52.8319, + "step": 132760 + }, + { + "epoch": 0.536407600286041, + "grad_norm": 609.1416015625, + "learning_rate": 2.664988147712718e-05, + "loss": 34.5201, + "step": 132770 + }, + { + "epoch": 0.5364480015514086, + "grad_norm": 629.1380615234375, + "learning_rate": 2.664639841240888e-05, + "loss": 30.9847, + "step": 132780 + }, + { + "epoch": 0.5364884028167762, + "grad_norm": 0.0, + "learning_rate": 2.6642915315593204e-05, + "loss": 41.8538, + "step": 132790 + }, + { + "epoch": 0.5365288040821439, + "grad_norm": 751.7651977539062, + "learning_rate": 2.6639432186748043e-05, + "loss": 53.989, + "step": 132800 + }, + { + "epoch": 0.5365692053475115, + "grad_norm": 829.0548095703125, + "learning_rate": 2.6635949025941303e-05, + "loss": 55.0998, + "step": 132810 + }, + { + "epoch": 0.5366096066128792, + "grad_norm": 1614.1934814453125, + "learning_rate": 2.6632465833240893e-05, + "loss": 67.0182, + "step": 132820 + }, + { + "epoch": 0.5366500078782468, + "grad_norm": 1317.6927490234375, + "learning_rate": 2.662898260871473e-05, + "loss": 61.6694, + "step": 132830 + }, + { + "epoch": 0.5366904091436144, + "grad_norm": 427.5072937011719, + "learning_rate": 2.662549935243071e-05, + "loss": 50.9423, + "step": 132840 + }, + { + "epoch": 0.536730810408982, + "grad_norm": 1174.3800048828125, + "learning_rate": 2.6622016064456738e-05, + "loss": 72.6584, + "step": 132850 + }, + { + "epoch": 0.5367712116743496, + "grad_norm": 379.9467468261719, + "learning_rate": 2.661853274486073e-05, + "loss": 64.3494, + "step": 132860 + }, + { + "epoch": 0.5368116129397172, + "grad_norm": 1121.682861328125, + "learning_rate": 2.6615049393710596e-05, + "loss": 87.2399, + "step": 132870 + }, + { + "epoch": 0.5368520142050849, + "grad_norm": 1015.9825439453125, + "learning_rate": 2.661156601107424e-05, + "loss": 76.0801, + "step": 132880 + }, + { + "epoch": 0.5368924154704525, + "grad_norm": 364.2757568359375, + "learning_rate": 2.660808259701958e-05, + "loss": 44.1075, + "step": 132890 + }, + { + "epoch": 0.5369328167358202, + "grad_norm": 350.0528259277344, + "learning_rate": 2.6604599151614513e-05, + "loss": 41.6351, + "step": 132900 + }, + { + "epoch": 0.5369732180011878, + "grad_norm": 742.4920654296875, + "learning_rate": 2.660111567492696e-05, + "loss": 47.2528, + "step": 132910 + }, + { + "epoch": 0.5370136192665554, + "grad_norm": 588.64599609375, + "learning_rate": 2.6597632167024843e-05, + "loss": 49.0769, + "step": 132920 + }, + { + "epoch": 0.5370540205319231, + "grad_norm": 519.329345703125, + "learning_rate": 2.6594148627976056e-05, + "loss": 27.8517, + "step": 132930 + }, + { + "epoch": 0.5370944217972907, + "grad_norm": 620.1698608398438, + "learning_rate": 2.659066505784852e-05, + "loss": 30.6305, + "step": 132940 + }, + { + "epoch": 0.5371348230626584, + "grad_norm": 615.6215209960938, + "learning_rate": 2.6587181456710153e-05, + "loss": 70.2314, + "step": 132950 + }, + { + "epoch": 0.537175224328026, + "grad_norm": 766.043212890625, + "learning_rate": 2.6583697824628868e-05, + "loss": 59.5871, + "step": 132960 + }, + { + "epoch": 0.5372156255933935, + "grad_norm": 719.8123168945312, + "learning_rate": 2.6580214161672577e-05, + "loss": 92.2357, + "step": 132970 + }, + { + "epoch": 0.5372560268587612, + "grad_norm": 0.0, + "learning_rate": 2.65767304679092e-05, + "loss": 56.5093, + "step": 132980 + }, + { + "epoch": 0.5372964281241288, + "grad_norm": 615.7906494140625, + "learning_rate": 2.6573246743406643e-05, + "loss": 82.8496, + "step": 132990 + }, + { + "epoch": 0.5373368293894965, + "grad_norm": 377.28985595703125, + "learning_rate": 2.656976298823284e-05, + "loss": 61.0799, + "step": 133000 + }, + { + "epoch": 0.5373772306548641, + "grad_norm": 839.7172241210938, + "learning_rate": 2.656627920245569e-05, + "loss": 57.0245, + "step": 133010 + }, + { + "epoch": 0.5374176319202317, + "grad_norm": 296.6985168457031, + "learning_rate": 2.6562795386143126e-05, + "loss": 83.4302, + "step": 133020 + }, + { + "epoch": 0.5374580331855994, + "grad_norm": 615.6727294921875, + "learning_rate": 2.6559311539363057e-05, + "loss": 83.2225, + "step": 133030 + }, + { + "epoch": 0.537498434450967, + "grad_norm": 518.0780029296875, + "learning_rate": 2.6555827662183414e-05, + "loss": 61.7653, + "step": 133040 + }, + { + "epoch": 0.5375388357163347, + "grad_norm": 585.662841796875, + "learning_rate": 2.6552343754672103e-05, + "loss": 30.4691, + "step": 133050 + }, + { + "epoch": 0.5375792369817023, + "grad_norm": 763.933349609375, + "learning_rate": 2.654885981689706e-05, + "loss": 47.9403, + "step": 133060 + }, + { + "epoch": 0.5376196382470699, + "grad_norm": 4562.193359375, + "learning_rate": 2.654537584892619e-05, + "loss": 94.3141, + "step": 133070 + }, + { + "epoch": 0.5376600395124376, + "grad_norm": 2078.553466796875, + "learning_rate": 2.6541891850827427e-05, + "loss": 59.5855, + "step": 133080 + }, + { + "epoch": 0.5377004407778052, + "grad_norm": 882.0210571289062, + "learning_rate": 2.653840782266868e-05, + "loss": 48.3455, + "step": 133090 + }, + { + "epoch": 0.5377408420431727, + "grad_norm": 643.2105712890625, + "learning_rate": 2.653492376451789e-05, + "loss": 50.6237, + "step": 133100 + }, + { + "epoch": 0.5377812433085404, + "grad_norm": 468.3760986328125, + "learning_rate": 2.6531439676442966e-05, + "loss": 50.7116, + "step": 133110 + }, + { + "epoch": 0.537821644573908, + "grad_norm": 763.0037841796875, + "learning_rate": 2.6527955558511842e-05, + "loss": 78.7773, + "step": 133120 + }, + { + "epoch": 0.5378620458392757, + "grad_norm": 289.5901184082031, + "learning_rate": 2.652447141079243e-05, + "loss": 46.9934, + "step": 133130 + }, + { + "epoch": 0.5379024471046433, + "grad_norm": 776.5145874023438, + "learning_rate": 2.6520987233352668e-05, + "loss": 65.4828, + "step": 133140 + }, + { + "epoch": 0.5379428483700109, + "grad_norm": 2137.632568359375, + "learning_rate": 2.6517503026260477e-05, + "loss": 51.402, + "step": 133150 + }, + { + "epoch": 0.5379832496353786, + "grad_norm": 1670.868896484375, + "learning_rate": 2.6514018789583784e-05, + "loss": 58.1956, + "step": 133160 + }, + { + "epoch": 0.5380236509007462, + "grad_norm": 451.3445739746094, + "learning_rate": 2.651053452339051e-05, + "loss": 47.8556, + "step": 133170 + }, + { + "epoch": 0.5380640521661139, + "grad_norm": 993.9606323242188, + "learning_rate": 2.650705022774859e-05, + "loss": 53.2049, + "step": 133180 + }, + { + "epoch": 0.5381044534314815, + "grad_norm": 836.609130859375, + "learning_rate": 2.6503565902725945e-05, + "loss": 43.5089, + "step": 133190 + }, + { + "epoch": 0.5381448546968491, + "grad_norm": 999.381103515625, + "learning_rate": 2.650008154839052e-05, + "loss": 61.5361, + "step": 133200 + }, + { + "epoch": 0.5381852559622168, + "grad_norm": 806.6781616210938, + "learning_rate": 2.6496597164810228e-05, + "loss": 82.2622, + "step": 133210 + }, + { + "epoch": 0.5382256572275844, + "grad_norm": 576.73974609375, + "learning_rate": 2.6493112752053e-05, + "loss": 32.6687, + "step": 133220 + }, + { + "epoch": 0.538266058492952, + "grad_norm": 362.6260986328125, + "learning_rate": 2.6489628310186777e-05, + "loss": 27.9188, + "step": 133230 + }, + { + "epoch": 0.5383064597583196, + "grad_norm": 422.7489013671875, + "learning_rate": 2.6486143839279487e-05, + "loss": 54.0248, + "step": 133240 + }, + { + "epoch": 0.5383468610236872, + "grad_norm": 578.5451049804688, + "learning_rate": 2.6482659339399045e-05, + "loss": 60.191, + "step": 133250 + }, + { + "epoch": 0.5383872622890549, + "grad_norm": 2537.281005859375, + "learning_rate": 2.64791748106134e-05, + "loss": 58.6423, + "step": 133260 + }, + { + "epoch": 0.5384276635544225, + "grad_norm": 537.8567504882812, + "learning_rate": 2.647569025299048e-05, + "loss": 39.9818, + "step": 133270 + }, + { + "epoch": 0.5384680648197901, + "grad_norm": 415.552001953125, + "learning_rate": 2.647220566659822e-05, + "loss": 41.9125, + "step": 133280 + }, + { + "epoch": 0.5385084660851578, + "grad_norm": 603.8895874023438, + "learning_rate": 2.6468721051504554e-05, + "loss": 74.081, + "step": 133290 + }, + { + "epoch": 0.5385488673505254, + "grad_norm": 918.3076171875, + "learning_rate": 2.646523640777741e-05, + "loss": 67.8838, + "step": 133300 + }, + { + "epoch": 0.5385892686158931, + "grad_norm": 644.3486938476562, + "learning_rate": 2.646175173548474e-05, + "loss": 39.718, + "step": 133310 + }, + { + "epoch": 0.5386296698812607, + "grad_norm": 1127.198486328125, + "learning_rate": 2.6458267034694463e-05, + "loss": 50.1396, + "step": 133320 + }, + { + "epoch": 0.5386700711466283, + "grad_norm": 1032.7508544921875, + "learning_rate": 2.645478230547451e-05, + "loss": 60.5964, + "step": 133330 + }, + { + "epoch": 0.538710472411996, + "grad_norm": 2308.70654296875, + "learning_rate": 2.6451297547892834e-05, + "loss": 86.4414, + "step": 133340 + }, + { + "epoch": 0.5387508736773636, + "grad_norm": 250.09999084472656, + "learning_rate": 2.644781276201736e-05, + "loss": 38.5809, + "step": 133350 + }, + { + "epoch": 0.5387912749427312, + "grad_norm": 892.7601318359375, + "learning_rate": 2.6444327947916036e-05, + "loss": 54.9328, + "step": 133360 + }, + { + "epoch": 0.5388316762080988, + "grad_norm": 742.9862670898438, + "learning_rate": 2.6440843105656793e-05, + "loss": 53.7666, + "step": 133370 + }, + { + "epoch": 0.5388720774734664, + "grad_norm": 580.2630004882812, + "learning_rate": 2.6437358235307576e-05, + "loss": 42.9191, + "step": 133380 + }, + { + "epoch": 0.5389124787388341, + "grad_norm": 657.9290161132812, + "learning_rate": 2.643387333693631e-05, + "loss": 48.353, + "step": 133390 + }, + { + "epoch": 0.5389528800042017, + "grad_norm": 278.8210144042969, + "learning_rate": 2.6430388410610955e-05, + "loss": 28.9319, + "step": 133400 + }, + { + "epoch": 0.5389932812695694, + "grad_norm": 4367.41015625, + "learning_rate": 2.6426903456399442e-05, + "loss": 80.4687, + "step": 133410 + }, + { + "epoch": 0.539033682534937, + "grad_norm": 192.4734649658203, + "learning_rate": 2.6423418474369704e-05, + "loss": 34.9511, + "step": 133420 + }, + { + "epoch": 0.5390740838003046, + "grad_norm": 2244.14892578125, + "learning_rate": 2.6419933464589695e-05, + "loss": 88.4328, + "step": 133430 + }, + { + "epoch": 0.5391144850656723, + "grad_norm": 520.8280639648438, + "learning_rate": 2.641644842712735e-05, + "loss": 60.3455, + "step": 133440 + }, + { + "epoch": 0.5391548863310399, + "grad_norm": 356.8963623046875, + "learning_rate": 2.6412963362050618e-05, + "loss": 42.0201, + "step": 133450 + }, + { + "epoch": 0.5391952875964076, + "grad_norm": 578.19873046875, + "learning_rate": 2.640947826942743e-05, + "loss": 80.3492, + "step": 133460 + }, + { + "epoch": 0.5392356888617752, + "grad_norm": 855.69677734375, + "learning_rate": 2.640599314932574e-05, + "loss": 64.0816, + "step": 133470 + }, + { + "epoch": 0.5392760901271428, + "grad_norm": 1241.7418212890625, + "learning_rate": 2.6402508001813496e-05, + "loss": 59.2113, + "step": 133480 + }, + { + "epoch": 0.5393164913925104, + "grad_norm": 245.85995483398438, + "learning_rate": 2.6399022826958635e-05, + "loss": 40.4469, + "step": 133490 + }, + { + "epoch": 0.539356892657878, + "grad_norm": 1086.880615234375, + "learning_rate": 2.6395537624829096e-05, + "loss": 47.158, + "step": 133500 + }, + { + "epoch": 0.5393972939232456, + "grad_norm": 1283.5257568359375, + "learning_rate": 2.639205239549284e-05, + "loss": 75.8677, + "step": 133510 + }, + { + "epoch": 0.5394376951886133, + "grad_norm": 973.451904296875, + "learning_rate": 2.63885671390178e-05, + "loss": 60.6037, + "step": 133520 + }, + { + "epoch": 0.5394780964539809, + "grad_norm": 743.983642578125, + "learning_rate": 2.6385081855471937e-05, + "loss": 46.7652, + "step": 133530 + }, + { + "epoch": 0.5395184977193486, + "grad_norm": 338.3520812988281, + "learning_rate": 2.638159654492318e-05, + "loss": 84.5076, + "step": 133540 + }, + { + "epoch": 0.5395588989847162, + "grad_norm": 460.87103271484375, + "learning_rate": 2.6378111207439494e-05, + "loss": 51.0171, + "step": 133550 + }, + { + "epoch": 0.5395993002500838, + "grad_norm": 0.0, + "learning_rate": 2.637462584308881e-05, + "loss": 49.7129, + "step": 133560 + }, + { + "epoch": 0.5396397015154515, + "grad_norm": 1022.263427734375, + "learning_rate": 2.6371140451939103e-05, + "loss": 69.8698, + "step": 133570 + }, + { + "epoch": 0.5396801027808191, + "grad_norm": 1827.470947265625, + "learning_rate": 2.6367655034058302e-05, + "loss": 79.5532, + "step": 133580 + }, + { + "epoch": 0.5397205040461868, + "grad_norm": 608.9415893554688, + "learning_rate": 2.6364169589514358e-05, + "loss": 61.6891, + "step": 133590 + }, + { + "epoch": 0.5397609053115544, + "grad_norm": 1445.496826171875, + "learning_rate": 2.636068411837523e-05, + "loss": 67.8332, + "step": 133600 + }, + { + "epoch": 0.5398013065769219, + "grad_norm": 633.453369140625, + "learning_rate": 2.6357198620708868e-05, + "loss": 41.0135, + "step": 133610 + }, + { + "epoch": 0.5398417078422896, + "grad_norm": 1123.755615234375, + "learning_rate": 2.635371309658321e-05, + "loss": 65.9337, + "step": 133620 + }, + { + "epoch": 0.5398821091076572, + "grad_norm": 412.05029296875, + "learning_rate": 2.6350227546066218e-05, + "loss": 57.8581, + "step": 133630 + }, + { + "epoch": 0.5399225103730249, + "grad_norm": 794.2384643554688, + "learning_rate": 2.634674196922585e-05, + "loss": 47.7789, + "step": 133640 + }, + { + "epoch": 0.5399629116383925, + "grad_norm": 839.2406616210938, + "learning_rate": 2.6343256366130054e-05, + "loss": 49.6203, + "step": 133650 + }, + { + "epoch": 0.5400033129037601, + "grad_norm": 474.2958068847656, + "learning_rate": 2.633977073684679e-05, + "loss": 73.9534, + "step": 133660 + }, + { + "epoch": 0.5400437141691278, + "grad_norm": 611.3136596679688, + "learning_rate": 2.6336285081443996e-05, + "loss": 90.5097, + "step": 133670 + }, + { + "epoch": 0.5400841154344954, + "grad_norm": 1800.31884765625, + "learning_rate": 2.633279939998964e-05, + "loss": 66.3581, + "step": 133680 + }, + { + "epoch": 0.540124516699863, + "grad_norm": 437.4395751953125, + "learning_rate": 2.6329313692551672e-05, + "loss": 54.8113, + "step": 133690 + }, + { + "epoch": 0.5401649179652307, + "grad_norm": 891.3457641601562, + "learning_rate": 2.6325827959198045e-05, + "loss": 59.0138, + "step": 133700 + }, + { + "epoch": 0.5402053192305983, + "grad_norm": 971.3360595703125, + "learning_rate": 2.6322342199996726e-05, + "loss": 42.0446, + "step": 133710 + }, + { + "epoch": 0.540245720495966, + "grad_norm": 1470.095703125, + "learning_rate": 2.6318856415015664e-05, + "loss": 67.5125, + "step": 133720 + }, + { + "epoch": 0.5402861217613336, + "grad_norm": 553.43408203125, + "learning_rate": 2.631537060432282e-05, + "loss": 45.4572, + "step": 133730 + }, + { + "epoch": 0.5403265230267011, + "grad_norm": 1151.4344482421875, + "learning_rate": 2.631188476798614e-05, + "loss": 41.0355, + "step": 133740 + }, + { + "epoch": 0.5403669242920688, + "grad_norm": 1145.021728515625, + "learning_rate": 2.63083989060736e-05, + "loss": 48.7941, + "step": 133750 + }, + { + "epoch": 0.5404073255574364, + "grad_norm": 640.4161987304688, + "learning_rate": 2.6304913018653144e-05, + "loss": 58.756, + "step": 133760 + }, + { + "epoch": 0.5404477268228041, + "grad_norm": 887.5857543945312, + "learning_rate": 2.630142710579274e-05, + "loss": 54.0097, + "step": 133770 + }, + { + "epoch": 0.5404881280881717, + "grad_norm": 817.3486328125, + "learning_rate": 2.6297941167560346e-05, + "loss": 45.1971, + "step": 133780 + }, + { + "epoch": 0.5405285293535393, + "grad_norm": 959.50732421875, + "learning_rate": 2.6294455204023915e-05, + "loss": 43.2953, + "step": 133790 + }, + { + "epoch": 0.540568930618907, + "grad_norm": 315.968017578125, + "learning_rate": 2.6290969215251416e-05, + "loss": 76.2952, + "step": 133800 + }, + { + "epoch": 0.5406093318842746, + "grad_norm": 1908.3179931640625, + "learning_rate": 2.628748320131081e-05, + "loss": 71.9718, + "step": 133810 + }, + { + "epoch": 0.5406497331496423, + "grad_norm": 869.3490600585938, + "learning_rate": 2.6283997162270052e-05, + "loss": 48.5477, + "step": 133820 + }, + { + "epoch": 0.5406901344150099, + "grad_norm": 1390.5584716796875, + "learning_rate": 2.6280511098197113e-05, + "loss": 57.5297, + "step": 133830 + }, + { + "epoch": 0.5407305356803775, + "grad_norm": 1634.8409423828125, + "learning_rate": 2.627702500915995e-05, + "loss": 56.8467, + "step": 133840 + }, + { + "epoch": 0.5407709369457452, + "grad_norm": 796.09716796875, + "learning_rate": 2.6273538895226522e-05, + "loss": 43.6418, + "step": 133850 + }, + { + "epoch": 0.5408113382111128, + "grad_norm": 676.2175903320312, + "learning_rate": 2.6270052756464803e-05, + "loss": 61.2231, + "step": 133860 + }, + { + "epoch": 0.5408517394764804, + "grad_norm": 843.5305786132812, + "learning_rate": 2.626656659294275e-05, + "loss": 39.5511, + "step": 133870 + }, + { + "epoch": 0.540892140741848, + "grad_norm": 584.4107055664062, + "learning_rate": 2.6263080404728325e-05, + "loss": 40.8211, + "step": 133880 + }, + { + "epoch": 0.5409325420072156, + "grad_norm": 617.6050415039062, + "learning_rate": 2.62595941918895e-05, + "loss": 46.0637, + "step": 133890 + }, + { + "epoch": 0.5409729432725833, + "grad_norm": 323.1057434082031, + "learning_rate": 2.6256107954494242e-05, + "loss": 59.272, + "step": 133900 + }, + { + "epoch": 0.5410133445379509, + "grad_norm": 534.6497802734375, + "learning_rate": 2.6252621692610507e-05, + "loss": 47.1891, + "step": 133910 + }, + { + "epoch": 0.5410537458033186, + "grad_norm": 1699.9815673828125, + "learning_rate": 2.6249135406306273e-05, + "loss": 34.6852, + "step": 133920 + }, + { + "epoch": 0.5410941470686862, + "grad_norm": 595.4637451171875, + "learning_rate": 2.6245649095649494e-05, + "loss": 54.051, + "step": 133930 + }, + { + "epoch": 0.5411345483340538, + "grad_norm": 538.2552490234375, + "learning_rate": 2.6242162760708154e-05, + "loss": 35.2021, + "step": 133940 + }, + { + "epoch": 0.5411749495994215, + "grad_norm": 1009.6656494140625, + "learning_rate": 2.6238676401550207e-05, + "loss": 40.208, + "step": 133950 + }, + { + "epoch": 0.5412153508647891, + "grad_norm": 941.228515625, + "learning_rate": 2.623519001824362e-05, + "loss": 48.1016, + "step": 133960 + }, + { + "epoch": 0.5412557521301568, + "grad_norm": 884.5780029296875, + "learning_rate": 2.6231703610856373e-05, + "loss": 57.1634, + "step": 133970 + }, + { + "epoch": 0.5412961533955244, + "grad_norm": 670.623291015625, + "learning_rate": 2.6228217179456433e-05, + "loss": 56.2344, + "step": 133980 + }, + { + "epoch": 0.541336554660892, + "grad_norm": 432.82928466796875, + "learning_rate": 2.6224730724111758e-05, + "loss": 37.2939, + "step": 133990 + }, + { + "epoch": 0.5413769559262596, + "grad_norm": 505.16387939453125, + "learning_rate": 2.6221244244890336e-05, + "loss": 44.5701, + "step": 134000 } ], "logging_steps": 10,