{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 62447, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016013579515429084, "grad_norm": 19.96757698059082, "learning_rate": 4.003442960946414e-08, "loss": 6.6824, "step": 100 }, { "epoch": 0.0032027159030858167, "grad_norm": 20.086483001708984, "learning_rate": 8.006885921892828e-08, "loss": 6.6684, "step": 200 }, { "epoch": 0.004804073854628725, "grad_norm": 15.201075553894043, "learning_rate": 1.2010328882839244e-07, "loss": 6.5818, "step": 300 }, { "epoch": 0.006405431806171633, "grad_norm": 12.548062324523926, "learning_rate": 1.6013771843785657e-07, "loss": 6.409, "step": 400 }, { "epoch": 0.008006789757714542, "grad_norm": 10.694886207580566, "learning_rate": 2.0017214804732072e-07, "loss": 6.1352, "step": 500 }, { "epoch": 0.00960814770925745, "grad_norm": 8.358333587646484, "learning_rate": 2.402065776567849e-07, "loss": 5.7786, "step": 600 }, { "epoch": 0.011209505660800359, "grad_norm": 6.130667686462402, "learning_rate": 2.80241007266249e-07, "loss": 5.3755, "step": 700 }, { "epoch": 0.012810863612343267, "grad_norm": 3.611398935317993, "learning_rate": 3.2027543687571313e-07, "loss": 5.1389, "step": 800 }, { "epoch": 0.014412221563886175, "grad_norm": 2.8347551822662354, "learning_rate": 3.603098664851773e-07, "loss": 4.9266, "step": 900 }, { "epoch": 0.016013579515429085, "grad_norm": 21.159616470336914, "learning_rate": 4.0034429609464144e-07, "loss": 4.8398, "step": 1000 }, { "epoch": 0.017614937466971993, "grad_norm": 6.242217540740967, "learning_rate": 4.4037872570410557e-07, "loss": 4.7241, "step": 1100 }, { "epoch": 0.0192162954185149, "grad_norm": 3.52362322807312, "learning_rate": 4.804131553135698e-07, "loss": 4.6735, "step": 1200 }, { "epoch": 0.02081765337005781, "grad_norm": 2.831575393676758, "learning_rate": 5.204475849230339e-07, "loss": 4.5398, "step": 1300 }, { "epoch": 0.022419011321600717, "grad_norm": 11.5364351272583, "learning_rate": 5.60482014532498e-07, "loss": 4.5072, "step": 1400 }, { "epoch": 0.024020369273143626, "grad_norm": 3.1574575901031494, "learning_rate": 6.005164441419621e-07, "loss": 4.4694, "step": 1500 }, { "epoch": 0.025621727224686534, "grad_norm": 2.988778829574585, "learning_rate": 6.405508737514263e-07, "loss": 4.451, "step": 1600 }, { "epoch": 0.027223085176229442, "grad_norm": 2.388536214828491, "learning_rate": 6.805853033608904e-07, "loss": 4.3784, "step": 1700 }, { "epoch": 0.02882444312777235, "grad_norm": 8.523998260498047, "learning_rate": 7.206197329703546e-07, "loss": 4.3341, "step": 1800 }, { "epoch": 0.030425801079315258, "grad_norm": 5.110918045043945, "learning_rate": 7.606541625798188e-07, "loss": 4.2974, "step": 1900 }, { "epoch": 0.03202715903085817, "grad_norm": 2.1784770488739014, "learning_rate": 8.006885921892829e-07, "loss": 4.2403, "step": 2000 }, { "epoch": 0.033628516982401074, "grad_norm": 2.2727229595184326, "learning_rate": 8.407230217987469e-07, "loss": 4.1771, "step": 2100 }, { "epoch": 0.035229874933943986, "grad_norm": 2.410856008529663, "learning_rate": 8.807574514082111e-07, "loss": 4.1723, "step": 2200 }, { "epoch": 0.03683123288548689, "grad_norm": 7.233943462371826, "learning_rate": 9.207918810176753e-07, "loss": 4.1031, "step": 2300 }, { "epoch": 0.0384325908370298, "grad_norm": 9.451576232910156, "learning_rate": 9.608263106271395e-07, "loss": 4.0296, "step": 2400 }, { "epoch": 0.04003394878857271, "grad_norm": 5.198200225830078, "learning_rate": 1.0008607402366035e-06, "loss": 3.9371, "step": 2500 }, { "epoch": 0.04163530674011562, "grad_norm": 11.912164688110352, "learning_rate": 1.0408951698460678e-06, "loss": 3.8349, "step": 2600 }, { "epoch": 0.04323666469165852, "grad_norm": 6.008382320404053, "learning_rate": 1.0809295994555318e-06, "loss": 3.7505, "step": 2700 }, { "epoch": 0.044838022643201435, "grad_norm": 3.3153979778289795, "learning_rate": 1.120964029064996e-06, "loss": 3.6149, "step": 2800 }, { "epoch": 0.046439380594744346, "grad_norm": 8.011855125427246, "learning_rate": 1.16099845867446e-06, "loss": 3.5414, "step": 2900 }, { "epoch": 0.04804073854628725, "grad_norm": 3.550476312637329, "learning_rate": 1.2010328882839243e-06, "loss": 3.4248, "step": 3000 }, { "epoch": 0.04964209649783016, "grad_norm": 3.9144866466522217, "learning_rate": 1.2410673178933883e-06, "loss": 3.3224, "step": 3100 }, { "epoch": 0.05124345444937307, "grad_norm": 3.6054248809814453, "learning_rate": 1.2811017475028525e-06, "loss": 3.2983, "step": 3200 }, { "epoch": 0.05284481240091598, "grad_norm": 4.165266990661621, "learning_rate": 1.3211361771123166e-06, "loss": 3.1677, "step": 3300 }, { "epoch": 0.054446170352458884, "grad_norm": 4.654821872711182, "learning_rate": 1.3611706067217808e-06, "loss": 3.14, "step": 3400 }, { "epoch": 0.056047528304001795, "grad_norm": 3.641819715499878, "learning_rate": 1.4012050363312448e-06, "loss": 3.0439, "step": 3500 }, { "epoch": 0.0576488862555447, "grad_norm": 3.61091947555542, "learning_rate": 1.4412394659407093e-06, "loss": 2.9522, "step": 3600 }, { "epoch": 0.05925024420708761, "grad_norm": 22.04112434387207, "learning_rate": 1.4812738955501733e-06, "loss": 2.9255, "step": 3700 }, { "epoch": 0.060851602158630516, "grad_norm": 5.0808892250061035, "learning_rate": 1.5213083251596375e-06, "loss": 2.8402, "step": 3800 }, { "epoch": 0.06245296011017343, "grad_norm": 9.055444717407227, "learning_rate": 1.5613427547691015e-06, "loss": 2.8354, "step": 3900 }, { "epoch": 0.06405431806171634, "grad_norm": 3.44482684135437, "learning_rate": 1.6013771843785658e-06, "loss": 2.7592, "step": 4000 }, { "epoch": 0.06565567601325924, "grad_norm": 2.7728819847106934, "learning_rate": 1.6414116139880298e-06, "loss": 2.7746, "step": 4100 }, { "epoch": 0.06725703396480215, "grad_norm": 1.9306970834732056, "learning_rate": 1.6814460435974938e-06, "loss": 2.7233, "step": 4200 }, { "epoch": 0.06885839191634506, "grad_norm": 1.8614246845245361, "learning_rate": 1.7214804732069583e-06, "loss": 2.7021, "step": 4300 }, { "epoch": 0.07045974986788797, "grad_norm": 3.224013566970825, "learning_rate": 1.7615149028164223e-06, "loss": 2.6586, "step": 4400 }, { "epoch": 0.07206110781943088, "grad_norm": 4.159784317016602, "learning_rate": 1.8015493324258865e-06, "loss": 2.6666, "step": 4500 }, { "epoch": 0.07366246577097378, "grad_norm": 2.2219038009643555, "learning_rate": 1.8415837620353505e-06, "loss": 2.6465, "step": 4600 }, { "epoch": 0.07526382372251669, "grad_norm": 14.757235527038574, "learning_rate": 1.8816181916448148e-06, "loss": 2.6125, "step": 4700 }, { "epoch": 0.0768651816740596, "grad_norm": 1.881609559059143, "learning_rate": 1.921652621254279e-06, "loss": 2.5652, "step": 4800 }, { "epoch": 0.07846653962560252, "grad_norm": 1.9000244140625, "learning_rate": 1.9616870508637432e-06, "loss": 2.5676, "step": 4900 }, { "epoch": 0.08006789757714541, "grad_norm": 3.4342846870422363, "learning_rate": 2.001721480473207e-06, "loss": 2.5934, "step": 5000 }, { "epoch": 0.08166925552868833, "grad_norm": 3.2394461631774902, "learning_rate": 2.0417559100826713e-06, "loss": 2.5371, "step": 5100 }, { "epoch": 0.08327061348023124, "grad_norm": 2.726757287979126, "learning_rate": 2.0817903396921355e-06, "loss": 2.5211, "step": 5200 }, { "epoch": 0.08487197143177415, "grad_norm": 1.8385337591171265, "learning_rate": 2.1218247693015993e-06, "loss": 2.5449, "step": 5300 }, { "epoch": 0.08647332938331705, "grad_norm": 1.7317003011703491, "learning_rate": 2.1618591989110636e-06, "loss": 2.5368, "step": 5400 }, { "epoch": 0.08807468733485996, "grad_norm": 1.8202093839645386, "learning_rate": 2.201893628520528e-06, "loss": 2.4703, "step": 5500 }, { "epoch": 0.08967604528640287, "grad_norm": 1.627389669418335, "learning_rate": 2.241928058129992e-06, "loss": 2.4741, "step": 5600 }, { "epoch": 0.09127740323794578, "grad_norm": 3.039496660232544, "learning_rate": 2.2819624877394563e-06, "loss": 2.4966, "step": 5700 }, { "epoch": 0.09287876118948869, "grad_norm": 5.223389148712158, "learning_rate": 2.32199691734892e-06, "loss": 2.4383, "step": 5800 }, { "epoch": 0.09448011914103159, "grad_norm": 1.7681688070297241, "learning_rate": 2.3620313469583843e-06, "loss": 2.4656, "step": 5900 }, { "epoch": 0.0960814770925745, "grad_norm": 4.00803804397583, "learning_rate": 2.4020657765678486e-06, "loss": 2.481, "step": 6000 }, { "epoch": 0.09768283504411741, "grad_norm": 14.015419960021973, "learning_rate": 2.4421002061773128e-06, "loss": 2.4758, "step": 6100 }, { "epoch": 0.09928419299566033, "grad_norm": 3.860048294067383, "learning_rate": 2.4821346357867766e-06, "loss": 2.4548, "step": 6200 }, { "epoch": 0.10088555094720322, "grad_norm": 1.4068512916564941, "learning_rate": 2.5221690653962413e-06, "loss": 2.4428, "step": 6300 }, { "epoch": 0.10248690889874613, "grad_norm": 3.721557855606079, "learning_rate": 2.562203495005705e-06, "loss": 2.3956, "step": 6400 }, { "epoch": 0.10408826685028905, "grad_norm": 2.806149482727051, "learning_rate": 2.6022379246151693e-06, "loss": 2.3903, "step": 6500 }, { "epoch": 0.10568962480183196, "grad_norm": 2.8240647315979004, "learning_rate": 2.642272354224633e-06, "loss": 2.395, "step": 6600 }, { "epoch": 0.10729098275337486, "grad_norm": 1.7092350721359253, "learning_rate": 2.6823067838340978e-06, "loss": 2.4076, "step": 6700 }, { "epoch": 0.10889234070491777, "grad_norm": 1.814175009727478, "learning_rate": 2.7223412134435616e-06, "loss": 2.4055, "step": 6800 }, { "epoch": 0.11049369865646068, "grad_norm": 2.08941650390625, "learning_rate": 2.762375643053026e-06, "loss": 2.4097, "step": 6900 }, { "epoch": 0.11209505660800359, "grad_norm": 2.0335028171539307, "learning_rate": 2.8024100726624896e-06, "loss": 2.3769, "step": 7000 }, { "epoch": 0.1136964145595465, "grad_norm": 14.262283325195312, "learning_rate": 2.8424445022719543e-06, "loss": 2.3706, "step": 7100 }, { "epoch": 0.1152977725110894, "grad_norm": 2.324890375137329, "learning_rate": 2.8824789318814185e-06, "loss": 2.3688, "step": 7200 }, { "epoch": 0.11689913046263231, "grad_norm": 2.6902220249176025, "learning_rate": 2.9225133614908823e-06, "loss": 2.3829, "step": 7300 }, { "epoch": 0.11850048841417522, "grad_norm": 3.410318613052368, "learning_rate": 2.9625477911003466e-06, "loss": 2.3687, "step": 7400 }, { "epoch": 0.12010184636571813, "grad_norm": 1.4391207695007324, "learning_rate": 3.0025822207098104e-06, "loss": 2.3909, "step": 7500 }, { "epoch": 0.12170320431726103, "grad_norm": 11.690342903137207, "learning_rate": 3.042616650319275e-06, "loss": 2.3387, "step": 7600 }, { "epoch": 0.12330456226880394, "grad_norm": 1.5653709173202515, "learning_rate": 3.082651079928739e-06, "loss": 2.3451, "step": 7700 }, { "epoch": 0.12490592022034686, "grad_norm": 3.124866247177124, "learning_rate": 3.122685509538203e-06, "loss": 2.322, "step": 7800 }, { "epoch": 0.12650727817188975, "grad_norm": 12.413910865783691, "learning_rate": 3.162719939147667e-06, "loss": 2.3182, "step": 7900 }, { "epoch": 0.12810863612343268, "grad_norm": 1.7550314664840698, "learning_rate": 3.2027543687571315e-06, "loss": 2.3099, "step": 8000 }, { "epoch": 0.12970999407497558, "grad_norm": 1.9001699686050415, "learning_rate": 3.2427887983665958e-06, "loss": 2.3299, "step": 8100 }, { "epoch": 0.13131135202651847, "grad_norm": 1.4118369817733765, "learning_rate": 3.2828232279760596e-06, "loss": 2.3003, "step": 8200 }, { "epoch": 0.1329127099780614, "grad_norm": 3.046459913253784, "learning_rate": 3.322857657585524e-06, "loss": 2.3053, "step": 8300 }, { "epoch": 0.1345140679296043, "grad_norm": 6.424179553985596, "learning_rate": 3.3628920871949876e-06, "loss": 2.2845, "step": 8400 }, { "epoch": 0.13611542588114722, "grad_norm": 3.9462482929229736, "learning_rate": 3.4029265168044523e-06, "loss": 2.2821, "step": 8500 }, { "epoch": 0.13771678383269012, "grad_norm": 2.464116096496582, "learning_rate": 3.4429609464139165e-06, "loss": 2.3144, "step": 8600 }, { "epoch": 0.13931814178423302, "grad_norm": 17.63976287841797, "learning_rate": 3.4829953760233803e-06, "loss": 2.2811, "step": 8700 }, { "epoch": 0.14091949973577594, "grad_norm": 3.135732650756836, "learning_rate": 3.5230298056328446e-06, "loss": 2.2953, "step": 8800 }, { "epoch": 0.14252085768731884, "grad_norm": 4.162137031555176, "learning_rate": 3.563064235242309e-06, "loss": 2.2692, "step": 8900 }, { "epoch": 0.14412221563886177, "grad_norm": 6.429003715515137, "learning_rate": 3.603098664851773e-06, "loss": 2.2819, "step": 9000 }, { "epoch": 0.14572357359040466, "grad_norm": 6.803035736083984, "learning_rate": 3.643133094461237e-06, "loss": 2.2672, "step": 9100 }, { "epoch": 0.14732493154194756, "grad_norm": 15.847606658935547, "learning_rate": 3.683167524070701e-06, "loss": 2.26, "step": 9200 }, { "epoch": 0.1489262894934905, "grad_norm": 3.1911871433258057, "learning_rate": 3.723201953680165e-06, "loss": 2.2355, "step": 9300 }, { "epoch": 0.15052764744503339, "grad_norm": 1.6060032844543457, "learning_rate": 3.7632363832896296e-06, "loss": 2.2608, "step": 9400 }, { "epoch": 0.15212900539657628, "grad_norm": 1.5236974954605103, "learning_rate": 3.8032708128990938e-06, "loss": 2.2507, "step": 9500 }, { "epoch": 0.1537303633481192, "grad_norm": 8.704015731811523, "learning_rate": 3.843305242508558e-06, "loss": 2.2457, "step": 9600 }, { "epoch": 0.1553317212996621, "grad_norm": 4.1284918785095215, "learning_rate": 3.883339672118022e-06, "loss": 2.2321, "step": 9700 }, { "epoch": 0.15693307925120503, "grad_norm": 8.519213676452637, "learning_rate": 3.9233741017274865e-06, "loss": 2.2356, "step": 9800 }, { "epoch": 0.15853443720274793, "grad_norm": 6.228696823120117, "learning_rate": 3.96340853133695e-06, "loss": 2.2243, "step": 9900 }, { "epoch": 0.16013579515429083, "grad_norm": 2.693775177001953, "learning_rate": 4.003442960946414e-06, "loss": 2.2288, "step": 10000 }, { "epoch": 0.16173715310583375, "grad_norm": 8.416048049926758, "learning_rate": 4.043477390555878e-06, "loss": 2.2311, "step": 10100 }, { "epoch": 0.16333851105737665, "grad_norm": 1.5264601707458496, "learning_rate": 4.083511820165343e-06, "loss": 2.2186, "step": 10200 }, { "epoch": 0.16493986900891958, "grad_norm": 1.7846661806106567, "learning_rate": 4.123546249774807e-06, "loss": 2.2132, "step": 10300 }, { "epoch": 0.16654122696046247, "grad_norm": 3.9117202758789062, "learning_rate": 4.163580679384271e-06, "loss": 2.228, "step": 10400 }, { "epoch": 0.16814258491200537, "grad_norm": 4.531779766082764, "learning_rate": 4.203615108993735e-06, "loss": 2.2066, "step": 10500 }, { "epoch": 0.1697439428635483, "grad_norm": 2.1657228469848633, "learning_rate": 4.243649538603199e-06, "loss": 2.1929, "step": 10600 }, { "epoch": 0.1713453008150912, "grad_norm": 2.9067344665527344, "learning_rate": 4.283683968212663e-06, "loss": 2.2093, "step": 10700 }, { "epoch": 0.1729466587666341, "grad_norm": 3.7661423683166504, "learning_rate": 4.323718397822127e-06, "loss": 2.1919, "step": 10800 }, { "epoch": 0.17454801671817702, "grad_norm": 2.9169373512268066, "learning_rate": 4.363752827431592e-06, "loss": 2.2099, "step": 10900 }, { "epoch": 0.17614937466971992, "grad_norm": 2.1810638904571533, "learning_rate": 4.403787257041056e-06, "loss": 2.1923, "step": 11000 }, { "epoch": 0.17775073262126284, "grad_norm": 8.174213409423828, "learning_rate": 4.443821686650519e-06, "loss": 2.1886, "step": 11100 }, { "epoch": 0.17935209057280574, "grad_norm": 2.431321382522583, "learning_rate": 4.483856116259984e-06, "loss": 2.1991, "step": 11200 }, { "epoch": 0.18095344852434864, "grad_norm": 3.7426862716674805, "learning_rate": 4.523890545869448e-06, "loss": 2.1763, "step": 11300 }, { "epoch": 0.18255480647589156, "grad_norm": 2.5155022144317627, "learning_rate": 4.5639249754789125e-06, "loss": 2.1906, "step": 11400 }, { "epoch": 0.18415616442743446, "grad_norm": 1.7059454917907715, "learning_rate": 4.603959405088376e-06, "loss": 2.1872, "step": 11500 }, { "epoch": 0.18575752237897739, "grad_norm": 5.253864765167236, "learning_rate": 4.64399383469784e-06, "loss": 2.1889, "step": 11600 }, { "epoch": 0.18735888033052028, "grad_norm": 1.5918197631835938, "learning_rate": 4.684028264307305e-06, "loss": 2.1746, "step": 11700 }, { "epoch": 0.18896023828206318, "grad_norm": 10.147111892700195, "learning_rate": 4.724062693916769e-06, "loss": 2.1712, "step": 11800 }, { "epoch": 0.1905615962336061, "grad_norm": 4.3356781005859375, "learning_rate": 4.764097123526233e-06, "loss": 2.1815, "step": 11900 }, { "epoch": 0.192162954185149, "grad_norm": 10.20026683807373, "learning_rate": 4.804131553135697e-06, "loss": 2.176, "step": 12000 }, { "epoch": 0.1937643121366919, "grad_norm": 1.9123090505599976, "learning_rate": 4.844165982745162e-06, "loss": 2.1807, "step": 12100 }, { "epoch": 0.19536567008823483, "grad_norm": 1.6245704889297485, "learning_rate": 4.8842004123546256e-06, "loss": 2.1637, "step": 12200 }, { "epoch": 0.19696702803977773, "grad_norm": 5.880768299102783, "learning_rate": 4.924234841964089e-06, "loss": 2.1735, "step": 12300 }, { "epoch": 0.19856838599132065, "grad_norm": 5.809731960296631, "learning_rate": 4.964269271573553e-06, "loss": 2.1523, "step": 12400 }, { "epoch": 0.20016974394286355, "grad_norm": 1.827416181564331, "learning_rate": 5.004303701183018e-06, "loss": 2.1485, "step": 12500 }, { "epoch": 0.20177110189440645, "grad_norm": 2.386488437652588, "learning_rate": 5.0443381307924825e-06, "loss": 2.1641, "step": 12600 }, { "epoch": 0.20337245984594937, "grad_norm": 5.080982208251953, "learning_rate": 5.0843725604019455e-06, "loss": 2.1706, "step": 12700 }, { "epoch": 0.20497381779749227, "grad_norm": 6.828605651855469, "learning_rate": 5.12440699001141e-06, "loss": 2.1736, "step": 12800 }, { "epoch": 0.2065751757490352, "grad_norm": 2.243302822113037, "learning_rate": 5.164441419620875e-06, "loss": 2.1238, "step": 12900 }, { "epoch": 0.2081765337005781, "grad_norm": 3.8954567909240723, "learning_rate": 5.204475849230339e-06, "loss": 2.1461, "step": 13000 }, { "epoch": 0.209777891652121, "grad_norm": 3.563438653945923, "learning_rate": 5.244510278839802e-06, "loss": 2.1492, "step": 13100 }, { "epoch": 0.21137924960366392, "grad_norm": 2.1851043701171875, "learning_rate": 5.284544708449266e-06, "loss": 2.1407, "step": 13200 }, { "epoch": 0.2129806075552068, "grad_norm": 4.8792524337768555, "learning_rate": 5.324579138058731e-06, "loss": 2.1403, "step": 13300 }, { "epoch": 0.2145819655067497, "grad_norm": 4.021134376525879, "learning_rate": 5.3646135676681955e-06, "loss": 2.1628, "step": 13400 }, { "epoch": 0.21618332345829264, "grad_norm": 3.8988146781921387, "learning_rate": 5.4046479972776585e-06, "loss": 2.1425, "step": 13500 }, { "epoch": 0.21778468140983553, "grad_norm": 6.337070941925049, "learning_rate": 5.444682426887123e-06, "loss": 2.1493, "step": 13600 }, { "epoch": 0.21938603936137846, "grad_norm": 2.077366828918457, "learning_rate": 5.484716856496588e-06, "loss": 2.1264, "step": 13700 }, { "epoch": 0.22098739731292136, "grad_norm": 1.3507400751113892, "learning_rate": 5.524751286106052e-06, "loss": 2.1306, "step": 13800 }, { "epoch": 0.22258875526446426, "grad_norm": 1.5656003952026367, "learning_rate": 5.564785715715516e-06, "loss": 2.135, "step": 13900 }, { "epoch": 0.22419011321600718, "grad_norm": 3.315119981765747, "learning_rate": 5.604820145324979e-06, "loss": 2.1449, "step": 14000 }, { "epoch": 0.22579147116755008, "grad_norm": 1.677067518234253, "learning_rate": 5.644854574934444e-06, "loss": 2.1126, "step": 14100 }, { "epoch": 0.227392829119093, "grad_norm": 1.3107109069824219, "learning_rate": 5.6848890045439086e-06, "loss": 2.1415, "step": 14200 }, { "epoch": 0.2289941870706359, "grad_norm": 1.887251853942871, "learning_rate": 5.724923434153372e-06, "loss": 2.1312, "step": 14300 }, { "epoch": 0.2305955450221788, "grad_norm": 4.706649303436279, "learning_rate": 5.764957863762837e-06, "loss": 2.1417, "step": 14400 }, { "epoch": 0.23219690297372172, "grad_norm": 4.202969074249268, "learning_rate": 5.8049922933723e-06, "loss": 2.1403, "step": 14500 }, { "epoch": 0.23379826092526462, "grad_norm": 2.2349281311035156, "learning_rate": 5.845026722981765e-06, "loss": 2.1164, "step": 14600 }, { "epoch": 0.23539961887680752, "grad_norm": 1.7390815019607544, "learning_rate": 5.885061152591229e-06, "loss": 2.1313, "step": 14700 }, { "epoch": 0.23700097682835045, "grad_norm": 1.9534856081008911, "learning_rate": 5.925095582200693e-06, "loss": 2.1252, "step": 14800 }, { "epoch": 0.23860233477989334, "grad_norm": 1.7701072692871094, "learning_rate": 5.965130011810158e-06, "loss": 2.1207, "step": 14900 }, { "epoch": 0.24020369273143627, "grad_norm": 6.166327953338623, "learning_rate": 6.005164441419621e-06, "loss": 2.1079, "step": 15000 }, { "epoch": 0.24180505068297917, "grad_norm": 2.4361186027526855, "learning_rate": 6.045198871029085e-06, "loss": 2.114, "step": 15100 }, { "epoch": 0.24340640863452206, "grad_norm": 2.536973714828491, "learning_rate": 6.08523330063855e-06, "loss": 2.109, "step": 15200 }, { "epoch": 0.245007766586065, "grad_norm": 3.394212484359741, "learning_rate": 6.125267730248014e-06, "loss": 2.1193, "step": 15300 }, { "epoch": 0.2466091245376079, "grad_norm": 1.725258708000183, "learning_rate": 6.165302159857478e-06, "loss": 2.1238, "step": 15400 }, { "epoch": 0.2482104824891508, "grad_norm": 2.9132273197174072, "learning_rate": 6.205336589466942e-06, "loss": 2.115, "step": 15500 }, { "epoch": 0.2498118404406937, "grad_norm": 1.6105629205703735, "learning_rate": 6.245371019076406e-06, "loss": 2.1103, "step": 15600 }, { "epoch": 0.25141319839223664, "grad_norm": 1.4759615659713745, "learning_rate": 6.285405448685871e-06, "loss": 2.1018, "step": 15700 }, { "epoch": 0.2530145563437795, "grad_norm": 6.175992488861084, "learning_rate": 6.325439878295334e-06, "loss": 2.1052, "step": 15800 }, { "epoch": 0.25461591429532243, "grad_norm": 1.496497631072998, "learning_rate": 6.3654743079047984e-06, "loss": 2.1098, "step": 15900 }, { "epoch": 0.25621727224686536, "grad_norm": 2.9353444576263428, "learning_rate": 6.405508737514263e-06, "loss": 2.1142, "step": 16000 }, { "epoch": 0.2578186301984082, "grad_norm": 3.003761053085327, "learning_rate": 6.445543167123727e-06, "loss": 2.1096, "step": 16100 }, { "epoch": 0.25941998814995115, "grad_norm": 1.8897191286087036, "learning_rate": 6.4855775967331916e-06, "loss": 2.0977, "step": 16200 }, { "epoch": 0.2610213461014941, "grad_norm": 1.1225190162658691, "learning_rate": 6.5256120263426545e-06, "loss": 2.1022, "step": 16300 }, { "epoch": 0.26262270405303695, "grad_norm": 5.252044200897217, "learning_rate": 6.565646455952119e-06, "loss": 2.1068, "step": 16400 }, { "epoch": 0.2642240620045799, "grad_norm": 1.9852492809295654, "learning_rate": 6.605680885561584e-06, "loss": 2.0882, "step": 16500 }, { "epoch": 0.2658254199561228, "grad_norm": 1.1616008281707764, "learning_rate": 6.645715315171048e-06, "loss": 2.0944, "step": 16600 }, { "epoch": 0.2674267779076657, "grad_norm": 2.1226704120635986, "learning_rate": 6.685749744780512e-06, "loss": 2.0927, "step": 16700 }, { "epoch": 0.2690281358592086, "grad_norm": 1.4191474914550781, "learning_rate": 6.725784174389975e-06, "loss": 2.0998, "step": 16800 }, { "epoch": 0.2706294938107515, "grad_norm": 2.283435106277466, "learning_rate": 6.76581860399944e-06, "loss": 2.1157, "step": 16900 }, { "epoch": 0.27223085176229445, "grad_norm": 1.6899996995925903, "learning_rate": 6.805853033608905e-06, "loss": 2.0937, "step": 17000 }, { "epoch": 0.2738322097138373, "grad_norm": 1.3105698823928833, "learning_rate": 6.845887463218368e-06, "loss": 2.0545, "step": 17100 }, { "epoch": 0.27543356766538024, "grad_norm": 1.1776176691055298, "learning_rate": 6.885921892827833e-06, "loss": 2.0984, "step": 17200 }, { "epoch": 0.27703492561692317, "grad_norm": 1.651307225227356, "learning_rate": 6.925956322437296e-06, "loss": 2.0959, "step": 17300 }, { "epoch": 0.27863628356846604, "grad_norm": 1.7482041120529175, "learning_rate": 6.965990752046761e-06, "loss": 2.0636, "step": 17400 }, { "epoch": 0.28023764152000896, "grad_norm": 3.625835418701172, "learning_rate": 7.006025181656225e-06, "loss": 2.085, "step": 17500 }, { "epoch": 0.2818389994715519, "grad_norm": 1.6532440185546875, "learning_rate": 7.046059611265689e-06, "loss": 2.0883, "step": 17600 }, { "epoch": 0.28344035742309476, "grad_norm": 1.331597924232483, "learning_rate": 7.086094040875153e-06, "loss": 2.1034, "step": 17700 }, { "epoch": 0.2850417153746377, "grad_norm": 3.6023612022399902, "learning_rate": 7.126128470484618e-06, "loss": 2.0991, "step": 17800 }, { "epoch": 0.2866430733261806, "grad_norm": 1.4167087078094482, "learning_rate": 7.166162900094081e-06, "loss": 2.1057, "step": 17900 }, { "epoch": 0.28824443127772353, "grad_norm": 6.183845520019531, "learning_rate": 7.206197329703546e-06, "loss": 2.0951, "step": 18000 }, { "epoch": 0.2898457892292664, "grad_norm": 1.5191693305969238, "learning_rate": 7.246231759313009e-06, "loss": 2.062, "step": 18100 }, { "epoch": 0.29144714718080933, "grad_norm": 1.5019919872283936, "learning_rate": 7.286266188922474e-06, "loss": 2.073, "step": 18200 }, { "epoch": 0.29304850513235225, "grad_norm": 2.338139533996582, "learning_rate": 7.326300618531938e-06, "loss": 2.0935, "step": 18300 }, { "epoch": 0.2946498630838951, "grad_norm": 3.6389622688293457, "learning_rate": 7.366335048141402e-06, "loss": 2.0907, "step": 18400 }, { "epoch": 0.29625122103543805, "grad_norm": 1.3060230016708374, "learning_rate": 7.406369477750867e-06, "loss": 2.0691, "step": 18500 }, { "epoch": 0.297852578986981, "grad_norm": 2.181640863418579, "learning_rate": 7.44640390736033e-06, "loss": 2.0668, "step": 18600 }, { "epoch": 0.29945393693852385, "grad_norm": 2.1645591259002686, "learning_rate": 7.4864383369697944e-06, "loss": 2.0589, "step": 18700 }, { "epoch": 0.30105529489006677, "grad_norm": 2.522383451461792, "learning_rate": 7.526472766579259e-06, "loss": 2.0624, "step": 18800 }, { "epoch": 0.3026566528416097, "grad_norm": 2.0438318252563477, "learning_rate": 7.566507196188723e-06, "loss": 2.0756, "step": 18900 }, { "epoch": 0.30425801079315257, "grad_norm": 1.5602883100509644, "learning_rate": 7.6065416257981876e-06, "loss": 2.0539, "step": 19000 }, { "epoch": 0.3058593687446955, "grad_norm": 1.2384752035140991, "learning_rate": 7.64657605540765e-06, "loss": 2.0698, "step": 19100 }, { "epoch": 0.3074607266962384, "grad_norm": 3.290865659713745, "learning_rate": 7.686610485017116e-06, "loss": 2.0538, "step": 19200 }, { "epoch": 0.30906208464778134, "grad_norm": 1.9636443853378296, "learning_rate": 7.72664491462658e-06, "loss": 2.0679, "step": 19300 }, { "epoch": 0.3106634425993242, "grad_norm": 2.1679654121398926, "learning_rate": 7.766679344236044e-06, "loss": 2.0734, "step": 19400 }, { "epoch": 0.31226480055086714, "grad_norm": 2.441173553466797, "learning_rate": 7.806713773845507e-06, "loss": 2.0475, "step": 19500 }, { "epoch": 0.31386615850241006, "grad_norm": 1.2764122486114502, "learning_rate": 7.846748203454973e-06, "loss": 2.0773, "step": 19600 }, { "epoch": 0.31546751645395293, "grad_norm": 1.106123685836792, "learning_rate": 7.886782633064435e-06, "loss": 2.0645, "step": 19700 }, { "epoch": 0.31706887440549586, "grad_norm": 1.025707721710205, "learning_rate": 7.9268170626739e-06, "loss": 2.0643, "step": 19800 }, { "epoch": 0.3186702323570388, "grad_norm": 1.2565511465072632, "learning_rate": 7.966851492283364e-06, "loss": 2.0666, "step": 19900 }, { "epoch": 0.32027159030858166, "grad_norm": 1.2378392219543457, "learning_rate": 8.006885921892828e-06, "loss": 2.0601, "step": 20000 }, { "epoch": 0.3218729482601246, "grad_norm": 1.9206656217575073, "learning_rate": 8.046920351502294e-06, "loss": 2.0576, "step": 20100 }, { "epoch": 0.3234743062116675, "grad_norm": 1.6953002214431763, "learning_rate": 8.086954781111756e-06, "loss": 2.0502, "step": 20200 }, { "epoch": 0.3250756641632104, "grad_norm": 1.6600971221923828, "learning_rate": 8.126989210721221e-06, "loss": 2.0589, "step": 20300 }, { "epoch": 0.3266770221147533, "grad_norm": 2.360778331756592, "learning_rate": 8.167023640330685e-06, "loss": 2.0591, "step": 20400 }, { "epoch": 0.3282783800662962, "grad_norm": 1.5475653409957886, "learning_rate": 8.207058069940149e-06, "loss": 2.0703, "step": 20500 }, { "epoch": 0.32987973801783915, "grad_norm": 1.2999683618545532, "learning_rate": 8.247092499549614e-06, "loss": 2.0651, "step": 20600 }, { "epoch": 0.331481095969382, "grad_norm": 3.301884889602661, "learning_rate": 8.287126929159077e-06, "loss": 2.0485, "step": 20700 }, { "epoch": 0.33308245392092495, "grad_norm": 3.200942277908325, "learning_rate": 8.327161358768542e-06, "loss": 2.0444, "step": 20800 }, { "epoch": 0.3346838118724679, "grad_norm": 1.2649630308151245, "learning_rate": 8.367195788378006e-06, "loss": 2.0526, "step": 20900 }, { "epoch": 0.33628516982401074, "grad_norm": 1.187700867652893, "learning_rate": 8.40723021798747e-06, "loss": 2.0651, "step": 21000 }, { "epoch": 0.33788652777555367, "grad_norm": 1.5766338109970093, "learning_rate": 8.447264647596935e-06, "loss": 2.0575, "step": 21100 }, { "epoch": 0.3394878857270966, "grad_norm": 1.1678153276443481, "learning_rate": 8.487299077206397e-06, "loss": 2.0394, "step": 21200 }, { "epoch": 0.34108924367863946, "grad_norm": 1.978745698928833, "learning_rate": 8.527333506815863e-06, "loss": 2.0434, "step": 21300 }, { "epoch": 0.3426906016301824, "grad_norm": 1.311265230178833, "learning_rate": 8.567367936425327e-06, "loss": 2.0423, "step": 21400 }, { "epoch": 0.3442919595817253, "grad_norm": 1.4099359512329102, "learning_rate": 8.60740236603479e-06, "loss": 2.0375, "step": 21500 }, { "epoch": 0.3458933175332682, "grad_norm": 1.2521507740020752, "learning_rate": 8.647436795644254e-06, "loss": 2.0355, "step": 21600 }, { "epoch": 0.3474946754848111, "grad_norm": 2.544433832168579, "learning_rate": 8.687471225253718e-06, "loss": 2.0351, "step": 21700 }, { "epoch": 0.34909603343635404, "grad_norm": 1.6786710023880005, "learning_rate": 8.727505654863184e-06, "loss": 2.0544, "step": 21800 }, { "epoch": 0.35069739138789696, "grad_norm": 1.224026083946228, "learning_rate": 8.767540084472647e-06, "loss": 2.0406, "step": 21900 }, { "epoch": 0.35229874933943983, "grad_norm": 7.5012431144714355, "learning_rate": 8.807574514082111e-06, "loss": 2.0355, "step": 22000 }, { "epoch": 0.35390010729098276, "grad_norm": 1.4292916059494019, "learning_rate": 8.847608943691575e-06, "loss": 2.0445, "step": 22100 }, { "epoch": 0.3555014652425257, "grad_norm": 1.1762036085128784, "learning_rate": 8.887643373301039e-06, "loss": 2.0358, "step": 22200 }, { "epoch": 0.35710282319406855, "grad_norm": 1.1497453451156616, "learning_rate": 8.927677802910504e-06, "loss": 2.0411, "step": 22300 }, { "epoch": 0.3587041811456115, "grad_norm": 1.7819931507110596, "learning_rate": 8.967712232519968e-06, "loss": 2.0414, "step": 22400 }, { "epoch": 0.3603055390971544, "grad_norm": 4.624775409698486, "learning_rate": 9.007746662129432e-06, "loss": 2.0309, "step": 22500 }, { "epoch": 0.3619068970486973, "grad_norm": 1.5174845457077026, "learning_rate": 9.047781091738896e-06, "loss": 2.0494, "step": 22600 }, { "epoch": 0.3635082550002402, "grad_norm": 2.5349197387695312, "learning_rate": 9.08781552134836e-06, "loss": 2.0199, "step": 22700 }, { "epoch": 0.3651096129517831, "grad_norm": 1.4281384944915771, "learning_rate": 9.127849950957825e-06, "loss": 2.0461, "step": 22800 }, { "epoch": 0.366710970903326, "grad_norm": 1.4501956701278687, "learning_rate": 9.167884380567289e-06, "loss": 2.0275, "step": 22900 }, { "epoch": 0.3683123288548689, "grad_norm": 1.7848312854766846, "learning_rate": 9.207918810176753e-06, "loss": 2.0459, "step": 23000 }, { "epoch": 0.36991368680641185, "grad_norm": 1.2266578674316406, "learning_rate": 9.247953239786217e-06, "loss": 2.0382, "step": 23100 }, { "epoch": 0.37151504475795477, "grad_norm": 2.917593002319336, "learning_rate": 9.28798766939568e-06, "loss": 2.0338, "step": 23200 }, { "epoch": 0.37311640270949764, "grad_norm": 1.7669585943222046, "learning_rate": 9.328022099005146e-06, "loss": 2.0098, "step": 23300 }, { "epoch": 0.37471776066104057, "grad_norm": 1.3076069355010986, "learning_rate": 9.36805652861461e-06, "loss": 2.0259, "step": 23400 }, { "epoch": 0.3763191186125835, "grad_norm": 1.26585054397583, "learning_rate": 9.408090958224073e-06, "loss": 2.0096, "step": 23500 }, { "epoch": 0.37792047656412636, "grad_norm": 1.330881953239441, "learning_rate": 9.448125387833537e-06, "loss": 2.0141, "step": 23600 }, { "epoch": 0.3795218345156693, "grad_norm": 1.3129397630691528, "learning_rate": 9.488159817443003e-06, "loss": 2.0351, "step": 23700 }, { "epoch": 0.3811231924672122, "grad_norm": 2.2104837894439697, "learning_rate": 9.528194247052467e-06, "loss": 2.0458, "step": 23800 }, { "epoch": 0.3827245504187551, "grad_norm": 4.37896728515625, "learning_rate": 9.56822867666193e-06, "loss": 2.0432, "step": 23900 }, { "epoch": 0.384325908370298, "grad_norm": 1.4323294162750244, "learning_rate": 9.608263106271394e-06, "loss": 2.028, "step": 24000 }, { "epoch": 0.38592726632184093, "grad_norm": 2.277630567550659, "learning_rate": 9.648297535880858e-06, "loss": 2.012, "step": 24100 }, { "epoch": 0.3875286242733838, "grad_norm": 1.0068135261535645, "learning_rate": 9.688331965490324e-06, "loss": 2.0153, "step": 24200 }, { "epoch": 0.38912998222492673, "grad_norm": 1.464872121810913, "learning_rate": 9.728366395099786e-06, "loss": 2.0255, "step": 24300 }, { "epoch": 0.39073134017646965, "grad_norm": 1.6919342279434204, "learning_rate": 9.768400824709251e-06, "loss": 2.0146, "step": 24400 }, { "epoch": 0.3923326981280126, "grad_norm": 1.4236170053482056, "learning_rate": 9.808435254318715e-06, "loss": 2.0235, "step": 24500 }, { "epoch": 0.39393405607955545, "grad_norm": 1.2634207010269165, "learning_rate": 9.848469683928179e-06, "loss": 2.0067, "step": 24600 }, { "epoch": 0.3955354140310984, "grad_norm": 1.185770034790039, "learning_rate": 9.888504113537644e-06, "loss": 2.0249, "step": 24700 }, { "epoch": 0.3971367719826413, "grad_norm": 1.6554452180862427, "learning_rate": 9.928538543147106e-06, "loss": 2.0224, "step": 24800 }, { "epoch": 0.39873812993418417, "grad_norm": 1.7017241716384888, "learning_rate": 9.968572972756572e-06, "loss": 2.0138, "step": 24900 }, { "epoch": 0.4003394878857271, "grad_norm": 1.0250684022903442, "learning_rate": 1.0008607402366036e-05, "loss": 2.0082, "step": 25000 }, { "epoch": 0.40194084583727, "grad_norm": 1.3391590118408203, "learning_rate": 1.0048641831975501e-05, "loss": 2.008, "step": 25100 }, { "epoch": 0.4035422037888129, "grad_norm": 1.0555273294448853, "learning_rate": 1.0088676261584965e-05, "loss": 2.0199, "step": 25200 }, { "epoch": 0.4051435617403558, "grad_norm": 2.1245908737182617, "learning_rate": 1.0128710691194427e-05, "loss": 2.0141, "step": 25300 }, { "epoch": 0.40674491969189874, "grad_norm": 1.1639268398284912, "learning_rate": 1.0168745120803891e-05, "loss": 2.0274, "step": 25400 }, { "epoch": 0.4083462776434416, "grad_norm": 1.75816011428833, "learning_rate": 1.0208779550413356e-05, "loss": 2.0065, "step": 25500 }, { "epoch": 0.40994763559498454, "grad_norm": 3.2224700450897217, "learning_rate": 1.024881398002282e-05, "loss": 2.0036, "step": 25600 }, { "epoch": 0.41154899354652746, "grad_norm": 1.0586453676223755, "learning_rate": 1.0288848409632284e-05, "loss": 2.0216, "step": 25700 }, { "epoch": 0.4131503514980704, "grad_norm": 1.5636674165725708, "learning_rate": 1.032888283924175e-05, "loss": 2.0035, "step": 25800 }, { "epoch": 0.41475170944961326, "grad_norm": 1.287876009941101, "learning_rate": 1.0368917268851213e-05, "loss": 2.0033, "step": 25900 }, { "epoch": 0.4163530674011562, "grad_norm": 1.1676390171051025, "learning_rate": 1.0408951698460677e-05, "loss": 1.9948, "step": 26000 }, { "epoch": 0.4179544253526991, "grad_norm": 2.230921506881714, "learning_rate": 1.0448986128070143e-05, "loss": 1.9747, "step": 26100 }, { "epoch": 0.419555783304242, "grad_norm": 1.1102570295333862, "learning_rate": 1.0489020557679605e-05, "loss": 2.002, "step": 26200 }, { "epoch": 0.4211571412557849, "grad_norm": 12.577959060668945, "learning_rate": 1.0529054987289069e-05, "loss": 1.9873, "step": 26300 }, { "epoch": 0.42275849920732783, "grad_norm": 1.0285041332244873, "learning_rate": 1.0569089416898532e-05, "loss": 2.0182, "step": 26400 }, { "epoch": 0.4243598571588707, "grad_norm": 2.1250357627868652, "learning_rate": 1.0609123846507998e-05, "loss": 1.9949, "step": 26500 }, { "epoch": 0.4259612151104136, "grad_norm": 0.90369713306427, "learning_rate": 1.0649158276117462e-05, "loss": 2.0081, "step": 26600 }, { "epoch": 0.42756257306195655, "grad_norm": 1.0429993867874146, "learning_rate": 1.0689192705726926e-05, "loss": 2.0188, "step": 26700 }, { "epoch": 0.4291639310134994, "grad_norm": 1.2060284614562988, "learning_rate": 1.0729227135336391e-05, "loss": 1.9747, "step": 26800 }, { "epoch": 0.43076528896504235, "grad_norm": 1.7947618961334229, "learning_rate": 1.0769261564945855e-05, "loss": 1.9963, "step": 26900 }, { "epoch": 0.4323666469165853, "grad_norm": 0.970507025718689, "learning_rate": 1.0809295994555317e-05, "loss": 2.0089, "step": 27000 }, { "epoch": 0.4339680048681282, "grad_norm": 1.038913607597351, "learning_rate": 1.0849330424164784e-05, "loss": 1.9827, "step": 27100 }, { "epoch": 0.43556936281967107, "grad_norm": 2.165769100189209, "learning_rate": 1.0889364853774246e-05, "loss": 1.9961, "step": 27200 }, { "epoch": 0.437170720771214, "grad_norm": 1.196454644203186, "learning_rate": 1.092939928338371e-05, "loss": 2.0035, "step": 27300 }, { "epoch": 0.4387720787227569, "grad_norm": 0.956650972366333, "learning_rate": 1.0969433712993176e-05, "loss": 1.9841, "step": 27400 }, { "epoch": 0.4403734366742998, "grad_norm": 1.084486961364746, "learning_rate": 1.100946814260264e-05, "loss": 1.9789, "step": 27500 }, { "epoch": 0.4419747946258427, "grad_norm": 0.9682411551475525, "learning_rate": 1.1049502572212103e-05, "loss": 1.9999, "step": 27600 }, { "epoch": 0.44357615257738564, "grad_norm": 2.1347734928131104, "learning_rate": 1.1089537001821567e-05, "loss": 2.0091, "step": 27700 }, { "epoch": 0.4451775105289285, "grad_norm": 4.513906478881836, "learning_rate": 1.1129571431431033e-05, "loss": 1.9896, "step": 27800 }, { "epoch": 0.44677886848047144, "grad_norm": 1.6367132663726807, "learning_rate": 1.1169605861040496e-05, "loss": 1.9781, "step": 27900 }, { "epoch": 0.44838022643201436, "grad_norm": 1.0168904066085815, "learning_rate": 1.1209640290649958e-05, "loss": 1.9924, "step": 28000 }, { "epoch": 0.44998158438355723, "grad_norm": 1.7051305770874023, "learning_rate": 1.1249674720259424e-05, "loss": 1.9872, "step": 28100 }, { "epoch": 0.45158294233510016, "grad_norm": 0.9768884778022766, "learning_rate": 1.1289709149868888e-05, "loss": 1.9809, "step": 28200 }, { "epoch": 0.4531843002866431, "grad_norm": 1.0439552068710327, "learning_rate": 1.1329743579478352e-05, "loss": 1.999, "step": 28300 }, { "epoch": 0.454785658238186, "grad_norm": 0.9658423066139221, "learning_rate": 1.1369778009087817e-05, "loss": 2.0104, "step": 28400 }, { "epoch": 0.4563870161897289, "grad_norm": 0.9558666944503784, "learning_rate": 1.1409812438697281e-05, "loss": 2.0104, "step": 28500 }, { "epoch": 0.4579883741412718, "grad_norm": 1.591242790222168, "learning_rate": 1.1449846868306745e-05, "loss": 1.9888, "step": 28600 }, { "epoch": 0.45958973209281473, "grad_norm": 1.8828788995742798, "learning_rate": 1.148988129791621e-05, "loss": 1.9951, "step": 28700 }, { "epoch": 0.4611910900443576, "grad_norm": 1.1350332498550415, "learning_rate": 1.1529915727525674e-05, "loss": 1.9842, "step": 28800 }, { "epoch": 0.4627924479959005, "grad_norm": 1.6506210565567017, "learning_rate": 1.1569950157135136e-05, "loss": 1.9927, "step": 28900 }, { "epoch": 0.46439380594744345, "grad_norm": 1.0234204530715942, "learning_rate": 1.16099845867446e-05, "loss": 1.9981, "step": 29000 }, { "epoch": 0.4659951638989863, "grad_norm": 0.9220559597015381, "learning_rate": 1.1650019016354065e-05, "loss": 1.9772, "step": 29100 }, { "epoch": 0.46759652185052925, "grad_norm": 1.008548617362976, "learning_rate": 1.169005344596353e-05, "loss": 1.9885, "step": 29200 }, { "epoch": 0.46919787980207217, "grad_norm": 1.0374430418014526, "learning_rate": 1.1730087875572993e-05, "loss": 1.9901, "step": 29300 }, { "epoch": 0.47079923775361504, "grad_norm": 1.4683129787445068, "learning_rate": 1.1770122305182459e-05, "loss": 1.9905, "step": 29400 }, { "epoch": 0.47240059570515797, "grad_norm": 2.1045260429382324, "learning_rate": 1.1810156734791922e-05, "loss": 1.9764, "step": 29500 }, { "epoch": 0.4740019536567009, "grad_norm": 0.9143902063369751, "learning_rate": 1.1850191164401386e-05, "loss": 1.9914, "step": 29600 }, { "epoch": 0.4756033116082438, "grad_norm": 1.0126798152923584, "learning_rate": 1.1890225594010852e-05, "loss": 1.9559, "step": 29700 }, { "epoch": 0.4772046695597867, "grad_norm": 1.282818078994751, "learning_rate": 1.1930260023620316e-05, "loss": 1.9927, "step": 29800 }, { "epoch": 0.4788060275113296, "grad_norm": 1.2307484149932861, "learning_rate": 1.1970294453229778e-05, "loss": 1.9825, "step": 29900 }, { "epoch": 0.48040738546287254, "grad_norm": 1.429739236831665, "learning_rate": 1.2010328882839241e-05, "loss": 1.9616, "step": 30000 }, { "epoch": 0.4820087434144154, "grad_norm": 1.5777498483657837, "learning_rate": 1.2050363312448707e-05, "loss": 1.9821, "step": 30100 }, { "epoch": 0.48361010136595833, "grad_norm": 1.1172056198120117, "learning_rate": 1.209039774205817e-05, "loss": 1.9669, "step": 30200 }, { "epoch": 0.48521145931750126, "grad_norm": 1.8118427991867065, "learning_rate": 1.2130432171667635e-05, "loss": 1.9555, "step": 30300 }, { "epoch": 0.48681281726904413, "grad_norm": 5.031758785247803, "learning_rate": 1.21704666012771e-05, "loss": 1.958, "step": 30400 }, { "epoch": 0.48841417522058705, "grad_norm": 1.171064853668213, "learning_rate": 1.2210501030886564e-05, "loss": 1.9697, "step": 30500 }, { "epoch": 0.49001553317213, "grad_norm": 1.6317328214645386, "learning_rate": 1.2250535460496028e-05, "loss": 1.9722, "step": 30600 }, { "epoch": 0.49161689112367285, "grad_norm": 0.9671623110771179, "learning_rate": 1.2290569890105493e-05, "loss": 1.9659, "step": 30700 }, { "epoch": 0.4932182490752158, "grad_norm": 1.0588128566741943, "learning_rate": 1.2330604319714955e-05, "loss": 1.9534, "step": 30800 }, { "epoch": 0.4948196070267587, "grad_norm": 1.1236603260040283, "learning_rate": 1.237063874932442e-05, "loss": 1.9505, "step": 30900 }, { "epoch": 0.4964209649783016, "grad_norm": 1.175752878189087, "learning_rate": 1.2410673178933885e-05, "loss": 1.9712, "step": 31000 }, { "epoch": 0.4980223229298445, "grad_norm": 1.0395989418029785, "learning_rate": 1.2450707608543348e-05, "loss": 1.9493, "step": 31100 }, { "epoch": 0.4996236808813874, "grad_norm": 0.9693764448165894, "learning_rate": 1.2490742038152812e-05, "loss": 1.9581, "step": 31200 }, { "epoch": 0.5012250388329303, "grad_norm": 1.100197434425354, "learning_rate": 1.2530776467762276e-05, "loss": 1.955, "step": 31300 }, { "epoch": 0.5028263967844733, "grad_norm": 1.3823459148406982, "learning_rate": 1.2570810897371742e-05, "loss": 1.9734, "step": 31400 }, { "epoch": 0.5044277547360161, "grad_norm": 0.9062979221343994, "learning_rate": 1.2610845326981205e-05, "loss": 1.9612, "step": 31500 }, { "epoch": 0.506029112687559, "grad_norm": 1.0007665157318115, "learning_rate": 1.2650879756590668e-05, "loss": 1.9664, "step": 31600 }, { "epoch": 0.5076304706391019, "grad_norm": 0.9745628833770752, "learning_rate": 1.2690914186200135e-05, "loss": 1.9648, "step": 31700 }, { "epoch": 0.5092318285906449, "grad_norm": 1.407834768295288, "learning_rate": 1.2730948615809597e-05, "loss": 1.9562, "step": 31800 }, { "epoch": 0.5108331865421878, "grad_norm": 1.207322597503662, "learning_rate": 1.277098304541906e-05, "loss": 1.9696, "step": 31900 }, { "epoch": 0.5124345444937307, "grad_norm": 1.4670792818069458, "learning_rate": 1.2811017475028526e-05, "loss": 1.9524, "step": 32000 }, { "epoch": 0.5140359024452736, "grad_norm": 1.023777961730957, "learning_rate": 1.285105190463799e-05, "loss": 1.97, "step": 32100 }, { "epoch": 0.5156372603968165, "grad_norm": 0.9778289198875427, "learning_rate": 1.2891086334247454e-05, "loss": 1.9494, "step": 32200 }, { "epoch": 0.5172386183483594, "grad_norm": 0.8971097469329834, "learning_rate": 1.2931120763856918e-05, "loss": 1.9628, "step": 32300 }, { "epoch": 0.5188399762999023, "grad_norm": 1.8562573194503784, "learning_rate": 1.2971155193466383e-05, "loss": 1.9543, "step": 32400 }, { "epoch": 0.5204413342514452, "grad_norm": 1.7294055223464966, "learning_rate": 1.3011189623075847e-05, "loss": 1.9519, "step": 32500 }, { "epoch": 0.5220426922029882, "grad_norm": 1.2172763347625732, "learning_rate": 1.3051224052685309e-05, "loss": 1.9758, "step": 32600 }, { "epoch": 0.5236440501545311, "grad_norm": 1.144281268119812, "learning_rate": 1.3091258482294775e-05, "loss": 1.9589, "step": 32700 }, { "epoch": 0.5252454081060739, "grad_norm": 1.057813048362732, "learning_rate": 1.3131292911904238e-05, "loss": 1.9443, "step": 32800 }, { "epoch": 0.5268467660576168, "grad_norm": 1.297404170036316, "learning_rate": 1.3171327341513702e-05, "loss": 1.9614, "step": 32900 }, { "epoch": 0.5284481240091597, "grad_norm": 1.0840290784835815, "learning_rate": 1.3211361771123168e-05, "loss": 1.9633, "step": 33000 }, { "epoch": 0.5300494819607027, "grad_norm": 1.0041546821594238, "learning_rate": 1.3251396200732631e-05, "loss": 1.9484, "step": 33100 }, { "epoch": 0.5316508399122456, "grad_norm": 1.780435562133789, "learning_rate": 1.3291430630342095e-05, "loss": 1.9438, "step": 33200 }, { "epoch": 0.5332521978637885, "grad_norm": 0.9901188015937805, "learning_rate": 1.333146505995156e-05, "loss": 1.9384, "step": 33300 }, { "epoch": 0.5348535558153314, "grad_norm": 0.9118313789367676, "learning_rate": 1.3371499489561025e-05, "loss": 1.9507, "step": 33400 }, { "epoch": 0.5364549137668743, "grad_norm": 1.0270628929138184, "learning_rate": 1.3411533919170487e-05, "loss": 1.9685, "step": 33500 }, { "epoch": 0.5380562717184172, "grad_norm": 2.4503536224365234, "learning_rate": 1.345156834877995e-05, "loss": 1.9481, "step": 33600 }, { "epoch": 0.5396576296699601, "grad_norm": 1.1191452741622925, "learning_rate": 1.3491602778389416e-05, "loss": 1.9529, "step": 33700 }, { "epoch": 0.541258987621503, "grad_norm": 0.8804434537887573, "learning_rate": 1.353163720799888e-05, "loss": 1.9591, "step": 33800 }, { "epoch": 0.542860345573046, "grad_norm": 1.1734013557434082, "learning_rate": 1.3571671637608344e-05, "loss": 1.9643, "step": 33900 }, { "epoch": 0.5444617035245889, "grad_norm": 0.9487005472183228, "learning_rate": 1.361170606721781e-05, "loss": 1.9408, "step": 34000 }, { "epoch": 0.5460630614761317, "grad_norm": 1.025894045829773, "learning_rate": 1.3651740496827273e-05, "loss": 1.9682, "step": 34100 }, { "epoch": 0.5476644194276746, "grad_norm": 1.3745815753936768, "learning_rate": 1.3691774926436737e-05, "loss": 1.9441, "step": 34200 }, { "epoch": 0.5492657773792176, "grad_norm": 0.9772420525550842, "learning_rate": 1.3731809356046202e-05, "loss": 1.9593, "step": 34300 }, { "epoch": 0.5508671353307605, "grad_norm": 0.8825002908706665, "learning_rate": 1.3771843785655666e-05, "loss": 1.9413, "step": 34400 }, { "epoch": 0.5524684932823034, "grad_norm": 2.0654349327087402, "learning_rate": 1.3811878215265128e-05, "loss": 1.9478, "step": 34500 }, { "epoch": 0.5540698512338463, "grad_norm": 0.9932202696800232, "learning_rate": 1.3851912644874592e-05, "loss": 1.9529, "step": 34600 }, { "epoch": 0.5556712091853893, "grad_norm": 0.923985481262207, "learning_rate": 1.3891947074484058e-05, "loss": 1.9542, "step": 34700 }, { "epoch": 0.5572725671369321, "grad_norm": 1.2756383419036865, "learning_rate": 1.3931981504093521e-05, "loss": 1.9437, "step": 34800 }, { "epoch": 0.558873925088475, "grad_norm": 1.025530457496643, "learning_rate": 1.3972015933702985e-05, "loss": 1.9479, "step": 34900 }, { "epoch": 0.5604752830400179, "grad_norm": 0.9658239483833313, "learning_rate": 1.401205036331245e-05, "loss": 1.9392, "step": 35000 }, { "epoch": 0.5620766409915608, "grad_norm": 1.0094221830368042, "learning_rate": 1.4052084792921914e-05, "loss": 1.9311, "step": 35100 }, { "epoch": 0.5636779989431038, "grad_norm": 0.933716893196106, "learning_rate": 1.4092119222531378e-05, "loss": 1.9605, "step": 35200 }, { "epoch": 0.5652793568946467, "grad_norm": 1.0568841695785522, "learning_rate": 1.4132153652140844e-05, "loss": 1.9453, "step": 35300 }, { "epoch": 0.5668807148461895, "grad_norm": 0.9029392004013062, "learning_rate": 1.4172188081750306e-05, "loss": 1.9327, "step": 35400 }, { "epoch": 0.5684820727977324, "grad_norm": 0.9875580668449402, "learning_rate": 1.421222251135977e-05, "loss": 1.9405, "step": 35500 }, { "epoch": 0.5700834307492754, "grad_norm": 0.9351832270622253, "learning_rate": 1.4252256940969235e-05, "loss": 1.9527, "step": 35600 }, { "epoch": 0.5716847887008183, "grad_norm": 1.1400425434112549, "learning_rate": 1.4292291370578699e-05, "loss": 1.9451, "step": 35700 }, { "epoch": 0.5732861466523612, "grad_norm": 0.971022367477417, "learning_rate": 1.4332325800188163e-05, "loss": 1.9336, "step": 35800 }, { "epoch": 0.5748875046039041, "grad_norm": 0.8905283808708191, "learning_rate": 1.4372360229797627e-05, "loss": 1.9518, "step": 35900 }, { "epoch": 0.5764888625554471, "grad_norm": 1.2511688470840454, "learning_rate": 1.4412394659407092e-05, "loss": 1.9276, "step": 36000 }, { "epoch": 0.5780902205069899, "grad_norm": 1.2555015087127686, "learning_rate": 1.4452429089016556e-05, "loss": 1.9306, "step": 36100 }, { "epoch": 0.5796915784585328, "grad_norm": 2.5456793308258057, "learning_rate": 1.4492463518626018e-05, "loss": 1.9212, "step": 36200 }, { "epoch": 0.5812929364100757, "grad_norm": 5.189430236816406, "learning_rate": 1.4532497948235485e-05, "loss": 1.9298, "step": 36300 }, { "epoch": 0.5828942943616187, "grad_norm": 0.8082601428031921, "learning_rate": 1.4572532377844947e-05, "loss": 1.9289, "step": 36400 }, { "epoch": 0.5844956523131616, "grad_norm": 1.2962714433670044, "learning_rate": 1.4612566807454411e-05, "loss": 1.9303, "step": 36500 }, { "epoch": 0.5860970102647045, "grad_norm": 1.9360517263412476, "learning_rate": 1.4652601237063877e-05, "loss": 1.9155, "step": 36600 }, { "epoch": 0.5876983682162473, "grad_norm": 1.16732919216156, "learning_rate": 1.469263566667334e-05, "loss": 1.9132, "step": 36700 }, { "epoch": 0.5892997261677902, "grad_norm": 0.8907911777496338, "learning_rate": 1.4732670096282804e-05, "loss": 1.9312, "step": 36800 }, { "epoch": 0.5909010841193332, "grad_norm": 0.9275608062744141, "learning_rate": 1.477270452589227e-05, "loss": 1.9638, "step": 36900 }, { "epoch": 0.5925024420708761, "grad_norm": 1.2977879047393799, "learning_rate": 1.4812738955501734e-05, "loss": 1.9372, "step": 37000 }, { "epoch": 0.594103800022419, "grad_norm": 1.1967015266418457, "learning_rate": 1.4852773385111196e-05, "loss": 1.9319, "step": 37100 }, { "epoch": 0.595705157973962, "grad_norm": 1.0788534879684448, "learning_rate": 1.489280781472066e-05, "loss": 1.9326, "step": 37200 }, { "epoch": 0.5973065159255049, "grad_norm": 0.8467668890953064, "learning_rate": 1.4932842244330125e-05, "loss": 1.9238, "step": 37300 }, { "epoch": 0.5989078738770477, "grad_norm": 0.8952154517173767, "learning_rate": 1.4972876673939589e-05, "loss": 1.926, "step": 37400 }, { "epoch": 0.6005092318285906, "grad_norm": 0.8892629742622375, "learning_rate": 1.5012911103549053e-05, "loss": 1.943, "step": 37500 }, { "epoch": 0.6021105897801335, "grad_norm": 0.8832671642303467, "learning_rate": 1.5052945533158518e-05, "loss": 1.9035, "step": 37600 }, { "epoch": 0.6037119477316765, "grad_norm": 1.0101639032363892, "learning_rate": 1.5092979962767982e-05, "loss": 1.9282, "step": 37700 }, { "epoch": 0.6053133056832194, "grad_norm": 0.9980772733688354, "learning_rate": 1.5133014392377446e-05, "loss": 1.9334, "step": 37800 }, { "epoch": 0.6069146636347623, "grad_norm": 0.9352878332138062, "learning_rate": 1.5173048821986911e-05, "loss": 1.9286, "step": 37900 }, { "epoch": 0.6085160215863051, "grad_norm": 0.9329906105995178, "learning_rate": 1.5213083251596375e-05, "loss": 1.9133, "step": 38000 }, { "epoch": 0.6101173795378481, "grad_norm": 1.0744600296020508, "learning_rate": 1.5253117681205837e-05, "loss": 1.9431, "step": 38100 }, { "epoch": 0.611718737489391, "grad_norm": 1.1284574270248413, "learning_rate": 1.52931521108153e-05, "loss": 1.9236, "step": 38200 }, { "epoch": 0.6133200954409339, "grad_norm": 0.7931867241859436, "learning_rate": 1.5333186540424767e-05, "loss": 1.9239, "step": 38300 }, { "epoch": 0.6149214533924768, "grad_norm": 0.9535111784934998, "learning_rate": 1.5373220970034232e-05, "loss": 1.933, "step": 38400 }, { "epoch": 0.6165228113440198, "grad_norm": 1.1604766845703125, "learning_rate": 1.5413255399643694e-05, "loss": 1.9118, "step": 38500 }, { "epoch": 0.6181241692955627, "grad_norm": 0.9939236640930176, "learning_rate": 1.545328982925316e-05, "loss": 1.9004, "step": 38600 }, { "epoch": 0.6197255272471055, "grad_norm": 0.901757538318634, "learning_rate": 1.5493324258862622e-05, "loss": 1.9198, "step": 38700 }, { "epoch": 0.6213268851986484, "grad_norm": 1.034832239151001, "learning_rate": 1.5533358688472087e-05, "loss": 1.9175, "step": 38800 }, { "epoch": 0.6229282431501914, "grad_norm": 0.8186530470848083, "learning_rate": 1.5573393118081553e-05, "loss": 1.9098, "step": 38900 }, { "epoch": 0.6245296011017343, "grad_norm": 1.0724900960922241, "learning_rate": 1.5613427547691015e-05, "loss": 1.9143, "step": 39000 }, { "epoch": 0.6261309590532772, "grad_norm": 0.9440537691116333, "learning_rate": 1.565346197730048e-05, "loss": 1.9327, "step": 39100 }, { "epoch": 0.6277323170048201, "grad_norm": 0.9175347089767456, "learning_rate": 1.5693496406909946e-05, "loss": 1.9097, "step": 39200 }, { "epoch": 0.6293336749563629, "grad_norm": 1.075506567955017, "learning_rate": 1.5733530836519408e-05, "loss": 1.9148, "step": 39300 }, { "epoch": 0.6309350329079059, "grad_norm": 1.156162142753601, "learning_rate": 1.577356526612887e-05, "loss": 1.928, "step": 39400 }, { "epoch": 0.6325363908594488, "grad_norm": 1.2199561595916748, "learning_rate": 1.5813599695738336e-05, "loss": 1.9212, "step": 39500 }, { "epoch": 0.6341377488109917, "grad_norm": 1.088230848312378, "learning_rate": 1.58536341253478e-05, "loss": 1.9147, "step": 39600 }, { "epoch": 0.6357391067625346, "grad_norm": 0.911685049533844, "learning_rate": 1.5893668554957263e-05, "loss": 1.914, "step": 39700 }, { "epoch": 0.6373404647140776, "grad_norm": 0.8977714776992798, "learning_rate": 1.593370298456673e-05, "loss": 1.9212, "step": 39800 }, { "epoch": 0.6389418226656205, "grad_norm": 0.9816354513168335, "learning_rate": 1.5973737414176194e-05, "loss": 1.9046, "step": 39900 }, { "epoch": 0.6405431806171633, "grad_norm": 0.88201904296875, "learning_rate": 1.6013771843785656e-05, "loss": 1.9359, "step": 40000 }, { "epoch": 0.6421445385687062, "grad_norm": 0.9104109406471252, "learning_rate": 1.6053806273395122e-05, "loss": 1.9302, "step": 40100 }, { "epoch": 0.6437458965202492, "grad_norm": 1.5256859064102173, "learning_rate": 1.6093840703004587e-05, "loss": 1.9226, "step": 40200 }, { "epoch": 0.6453472544717921, "grad_norm": 0.8858827948570251, "learning_rate": 1.613387513261405e-05, "loss": 1.9143, "step": 40300 }, { "epoch": 0.646948612423335, "grad_norm": 1.480420470237732, "learning_rate": 1.617390956222351e-05, "loss": 1.9171, "step": 40400 }, { "epoch": 0.6485499703748779, "grad_norm": 0.9443252682685852, "learning_rate": 1.6213943991832977e-05, "loss": 1.9104, "step": 40500 }, { "epoch": 0.6501513283264208, "grad_norm": 1.4180731773376465, "learning_rate": 1.6253978421442443e-05, "loss": 1.9015, "step": 40600 }, { "epoch": 0.6517526862779637, "grad_norm": 1.0369699001312256, "learning_rate": 1.6294012851051905e-05, "loss": 1.9085, "step": 40700 }, { "epoch": 0.6533540442295066, "grad_norm": 1.0155749320983887, "learning_rate": 1.633404728066137e-05, "loss": 1.8968, "step": 40800 }, { "epoch": 0.6549554021810495, "grad_norm": 1.0214248895645142, "learning_rate": 1.6374081710270836e-05, "loss": 1.9109, "step": 40900 }, { "epoch": 0.6565567601325925, "grad_norm": 1.2233892679214478, "learning_rate": 1.6414116139880298e-05, "loss": 1.8968, "step": 41000 }, { "epoch": 0.6581581180841354, "grad_norm": 0.8677876591682434, "learning_rate": 1.6454150569489763e-05, "loss": 1.9121, "step": 41100 }, { "epoch": 0.6597594760356783, "grad_norm": 0.8257797956466675, "learning_rate": 1.649418499909923e-05, "loss": 1.9329, "step": 41200 }, { "epoch": 0.6613608339872211, "grad_norm": 0.904925525188446, "learning_rate": 1.653421942870869e-05, "loss": 1.8934, "step": 41300 }, { "epoch": 0.662962191938764, "grad_norm": 0.8754270672798157, "learning_rate": 1.6574253858318153e-05, "loss": 1.8885, "step": 41400 }, { "epoch": 0.664563549890307, "grad_norm": 0.9102962613105774, "learning_rate": 1.661428828792762e-05, "loss": 1.9046, "step": 41500 }, { "epoch": 0.6661649078418499, "grad_norm": 0.9199568033218384, "learning_rate": 1.6654322717537084e-05, "loss": 1.9122, "step": 41600 }, { "epoch": 0.6677662657933928, "grad_norm": 0.9582586288452148, "learning_rate": 1.6694357147146546e-05, "loss": 1.8959, "step": 41700 }, { "epoch": 0.6693676237449357, "grad_norm": 0.8151847124099731, "learning_rate": 1.6734391576756012e-05, "loss": 1.887, "step": 41800 }, { "epoch": 0.6709689816964786, "grad_norm": 0.9953237771987915, "learning_rate": 1.6774426006365477e-05, "loss": 1.9236, "step": 41900 }, { "epoch": 0.6725703396480215, "grad_norm": 1.465527057647705, "learning_rate": 1.681446043597494e-05, "loss": 1.9136, "step": 42000 }, { "epoch": 0.6741716975995644, "grad_norm": 0.9603108763694763, "learning_rate": 1.68544948655844e-05, "loss": 1.8941, "step": 42100 }, { "epoch": 0.6757730555511073, "grad_norm": 0.8624867796897888, "learning_rate": 1.689452929519387e-05, "loss": 1.905, "step": 42200 }, { "epoch": 0.6773744135026503, "grad_norm": 0.9774655699729919, "learning_rate": 1.6934563724803333e-05, "loss": 1.9156, "step": 42300 }, { "epoch": 0.6789757714541932, "grad_norm": 2.9199743270874023, "learning_rate": 1.6974598154412795e-05, "loss": 1.9126, "step": 42400 }, { "epoch": 0.6805771294057361, "grad_norm": 1.2201206684112549, "learning_rate": 1.701463258402226e-05, "loss": 1.8976, "step": 42500 }, { "epoch": 0.6821784873572789, "grad_norm": 1.0182702541351318, "learning_rate": 1.7054667013631726e-05, "loss": 1.8968, "step": 42600 }, { "epoch": 0.6837798453088219, "grad_norm": 1.134906530380249, "learning_rate": 1.7094701443241188e-05, "loss": 1.9361, "step": 42700 }, { "epoch": 0.6853812032603648, "grad_norm": 1.635399341583252, "learning_rate": 1.7134735872850653e-05, "loss": 1.919, "step": 42800 }, { "epoch": 0.6869825612119077, "grad_norm": 0.8835542798042297, "learning_rate": 1.717477030246012e-05, "loss": 1.8776, "step": 42900 }, { "epoch": 0.6885839191634506, "grad_norm": 0.9510149955749512, "learning_rate": 1.721480473206958e-05, "loss": 1.9036, "step": 43000 }, { "epoch": 0.6901852771149936, "grad_norm": 0.8410897850990295, "learning_rate": 1.7254839161679043e-05, "loss": 1.9091, "step": 43100 }, { "epoch": 0.6917866350665364, "grad_norm": 1.4297950267791748, "learning_rate": 1.729487359128851e-05, "loss": 1.8934, "step": 43200 }, { "epoch": 0.6933879930180793, "grad_norm": 0.9010776877403259, "learning_rate": 1.7334908020897974e-05, "loss": 1.8997, "step": 43300 }, { "epoch": 0.6949893509696222, "grad_norm": 0.8833039999008179, "learning_rate": 1.7374942450507436e-05, "loss": 1.892, "step": 43400 }, { "epoch": 0.6965907089211651, "grad_norm": 0.9560312032699585, "learning_rate": 1.74149768801169e-05, "loss": 1.8977, "step": 43500 }, { "epoch": 0.6981920668727081, "grad_norm": 0.8603575825691223, "learning_rate": 1.7455011309726367e-05, "loss": 1.8881, "step": 43600 }, { "epoch": 0.699793424824251, "grad_norm": 0.8545820116996765, "learning_rate": 1.749504573933583e-05, "loss": 1.8992, "step": 43700 }, { "epoch": 0.7013947827757939, "grad_norm": 1.6138850450515747, "learning_rate": 1.7535080168945295e-05, "loss": 1.8823, "step": 43800 }, { "epoch": 0.7029961407273367, "grad_norm": 1.339882731437683, "learning_rate": 1.757511459855476e-05, "loss": 1.8942, "step": 43900 }, { "epoch": 0.7045974986788797, "grad_norm": 0.8209664225578308, "learning_rate": 1.7615149028164222e-05, "loss": 1.885, "step": 44000 }, { "epoch": 0.7061988566304226, "grad_norm": 0.8096824884414673, "learning_rate": 1.7655183457773685e-05, "loss": 1.8738, "step": 44100 }, { "epoch": 0.7078002145819655, "grad_norm": 1.0560259819030762, "learning_rate": 1.769521788738315e-05, "loss": 1.8941, "step": 44200 }, { "epoch": 0.7094015725335084, "grad_norm": 1.1268258094787598, "learning_rate": 1.7735252316992616e-05, "loss": 1.8974, "step": 44300 }, { "epoch": 0.7110029304850514, "grad_norm": 0.9307839274406433, "learning_rate": 1.7775286746602078e-05, "loss": 1.9122, "step": 44400 }, { "epoch": 0.7126042884365942, "grad_norm": 1.0069445371627808, "learning_rate": 1.7815321176211543e-05, "loss": 1.8597, "step": 44500 }, { "epoch": 0.7142056463881371, "grad_norm": 1.2771753072738647, "learning_rate": 1.785535560582101e-05, "loss": 1.8819, "step": 44600 }, { "epoch": 0.71580700433968, "grad_norm": 0.7819973230361938, "learning_rate": 1.789539003543047e-05, "loss": 1.8722, "step": 44700 }, { "epoch": 0.717408362291223, "grad_norm": 0.8193828463554382, "learning_rate": 1.7935424465039936e-05, "loss": 1.8745, "step": 44800 }, { "epoch": 0.7190097202427659, "grad_norm": 0.7969743609428406, "learning_rate": 1.7975458894649402e-05, "loss": 1.911, "step": 44900 }, { "epoch": 0.7206110781943088, "grad_norm": 1.4411369562149048, "learning_rate": 1.8015493324258864e-05, "loss": 1.8763, "step": 45000 }, { "epoch": 0.7222124361458517, "grad_norm": 1.0016000270843506, "learning_rate": 1.805552775386833e-05, "loss": 1.8875, "step": 45100 }, { "epoch": 0.7238137940973945, "grad_norm": 0.8997382521629333, "learning_rate": 1.809556218347779e-05, "loss": 1.8766, "step": 45200 }, { "epoch": 0.7254151520489375, "grad_norm": 0.8878375291824341, "learning_rate": 1.8135596613087257e-05, "loss": 1.8803, "step": 45300 }, { "epoch": 0.7270165100004804, "grad_norm": 0.8563076853752136, "learning_rate": 1.817563104269672e-05, "loss": 1.8855, "step": 45400 }, { "epoch": 0.7286178679520233, "grad_norm": 1.020241618156433, "learning_rate": 1.8215665472306185e-05, "loss": 1.8583, "step": 45500 }, { "epoch": 0.7302192259035662, "grad_norm": 0.8296322822570801, "learning_rate": 1.825569990191565e-05, "loss": 1.8692, "step": 45600 }, { "epoch": 0.7318205838551092, "grad_norm": 1.9434692859649658, "learning_rate": 1.8295734331525112e-05, "loss": 1.8831, "step": 45700 }, { "epoch": 0.733421941806652, "grad_norm": 0.9088252782821655, "learning_rate": 1.8335768761134578e-05, "loss": 1.8868, "step": 45800 }, { "epoch": 0.7350232997581949, "grad_norm": 1.7532590627670288, "learning_rate": 1.837580319074404e-05, "loss": 1.8734, "step": 45900 }, { "epoch": 0.7366246577097378, "grad_norm": 0.9662244319915771, "learning_rate": 1.8415837620353505e-05, "loss": 1.888, "step": 46000 }, { "epoch": 0.7382260156612808, "grad_norm": 3.4851512908935547, "learning_rate": 1.845587204996297e-05, "loss": 1.8829, "step": 46100 }, { "epoch": 0.7398273736128237, "grad_norm": 0.9157941341400146, "learning_rate": 1.8495906479572433e-05, "loss": 1.9091, "step": 46200 }, { "epoch": 0.7414287315643666, "grad_norm": 0.8992369771003723, "learning_rate": 1.85359409091819e-05, "loss": 1.8932, "step": 46300 }, { "epoch": 0.7430300895159095, "grad_norm": 0.8611487150192261, "learning_rate": 1.857597533879136e-05, "loss": 1.8672, "step": 46400 }, { "epoch": 0.7446314474674524, "grad_norm": 1.0629839897155762, "learning_rate": 1.8616009768400826e-05, "loss": 1.8819, "step": 46500 }, { "epoch": 0.7462328054189953, "grad_norm": 0.8317407369613647, "learning_rate": 1.865604419801029e-05, "loss": 1.8684, "step": 46600 }, { "epoch": 0.7478341633705382, "grad_norm": 0.8102233409881592, "learning_rate": 1.8696078627619754e-05, "loss": 1.8796, "step": 46700 }, { "epoch": 0.7494355213220811, "grad_norm": 0.8077260255813599, "learning_rate": 1.873611305722922e-05, "loss": 1.8872, "step": 46800 }, { "epoch": 0.7510368792736241, "grad_norm": 0.9285743236541748, "learning_rate": 1.877614748683868e-05, "loss": 1.8828, "step": 46900 }, { "epoch": 0.752638237225167, "grad_norm": 0.835612416267395, "learning_rate": 1.8816181916448147e-05, "loss": 1.8685, "step": 47000 }, { "epoch": 0.7542395951767098, "grad_norm": 1.5960347652435303, "learning_rate": 1.8856216346057612e-05, "loss": 1.8705, "step": 47100 }, { "epoch": 0.7558409531282527, "grad_norm": 0.7755472660064697, "learning_rate": 1.8896250775667075e-05, "loss": 1.8851, "step": 47200 }, { "epoch": 0.7574423110797956, "grad_norm": 1.0042415857315063, "learning_rate": 1.893628520527654e-05, "loss": 1.855, "step": 47300 }, { "epoch": 0.7590436690313386, "grad_norm": 0.8991414904594421, "learning_rate": 1.8976319634886006e-05, "loss": 1.8569, "step": 47400 }, { "epoch": 0.7606450269828815, "grad_norm": 0.813565194606781, "learning_rate": 1.9016354064495468e-05, "loss": 1.8578, "step": 47500 }, { "epoch": 0.7622463849344244, "grad_norm": 0.7883344292640686, "learning_rate": 1.9056388494104933e-05, "loss": 1.8968, "step": 47600 }, { "epoch": 0.7638477428859674, "grad_norm": 1.0632473230361938, "learning_rate": 1.9096422923714395e-05, "loss": 1.879, "step": 47700 }, { "epoch": 0.7654491008375102, "grad_norm": 0.8479236364364624, "learning_rate": 1.913645735332386e-05, "loss": 1.8683, "step": 47800 }, { "epoch": 0.7670504587890531, "grad_norm": 0.871159553527832, "learning_rate": 1.9176491782933323e-05, "loss": 1.8653, "step": 47900 }, { "epoch": 0.768651816740596, "grad_norm": 0.8534667491912842, "learning_rate": 1.921652621254279e-05, "loss": 1.8507, "step": 48000 }, { "epoch": 0.7702531746921389, "grad_norm": 0.8931534290313721, "learning_rate": 1.9256560642152254e-05, "loss": 1.8625, "step": 48100 }, { "epoch": 0.7718545326436819, "grad_norm": 1.1518031358718872, "learning_rate": 1.9296595071761716e-05, "loss": 1.8786, "step": 48200 }, { "epoch": 0.7734558905952248, "grad_norm": 0.9310818910598755, "learning_rate": 1.933662950137118e-05, "loss": 1.8517, "step": 48300 }, { "epoch": 0.7750572485467676, "grad_norm": 1.314759612083435, "learning_rate": 1.9376663930980647e-05, "loss": 1.853, "step": 48400 }, { "epoch": 0.7766586064983105, "grad_norm": 0.8431141972541809, "learning_rate": 1.941669836059011e-05, "loss": 1.863, "step": 48500 }, { "epoch": 0.7782599644498535, "grad_norm": 0.90580815076828, "learning_rate": 1.945673279019957e-05, "loss": 1.8496, "step": 48600 }, { "epoch": 0.7798613224013964, "grad_norm": 1.0436537265777588, "learning_rate": 1.9496767219809037e-05, "loss": 1.8629, "step": 48700 }, { "epoch": 0.7814626803529393, "grad_norm": 0.8080843091011047, "learning_rate": 1.9536801649418502e-05, "loss": 1.857, "step": 48800 }, { "epoch": 0.7830640383044822, "grad_norm": 0.8750945925712585, "learning_rate": 1.9576836079027964e-05, "loss": 1.8725, "step": 48900 }, { "epoch": 0.7846653962560252, "grad_norm": 1.2619659900665283, "learning_rate": 1.961687050863743e-05, "loss": 1.8422, "step": 49000 }, { "epoch": 0.786266754207568, "grad_norm": 0.84897780418396, "learning_rate": 1.9656904938246895e-05, "loss": 1.8725, "step": 49100 }, { "epoch": 0.7878681121591109, "grad_norm": 0.7454677820205688, "learning_rate": 1.9696939367856358e-05, "loss": 1.8735, "step": 49200 }, { "epoch": 0.7894694701106538, "grad_norm": 0.8530156016349792, "learning_rate": 1.9736973797465823e-05, "loss": 1.8597, "step": 49300 }, { "epoch": 0.7910708280621968, "grad_norm": 0.9725930690765381, "learning_rate": 1.977700822707529e-05, "loss": 1.8515, "step": 49400 }, { "epoch": 0.7926721860137397, "grad_norm": 0.8235682249069214, "learning_rate": 1.981704265668475e-05, "loss": 1.8791, "step": 49500 }, { "epoch": 0.7942735439652826, "grad_norm": 0.9344043135643005, "learning_rate": 1.9857077086294213e-05, "loss": 1.8663, "step": 49600 }, { "epoch": 0.7958749019168254, "grad_norm": 0.9629167318344116, "learning_rate": 1.9897111515903678e-05, "loss": 1.8647, "step": 49700 }, { "epoch": 0.7974762598683683, "grad_norm": 0.7384589910507202, "learning_rate": 1.9937145945513144e-05, "loss": 1.8586, "step": 49800 }, { "epoch": 0.7990776178199113, "grad_norm": 1.069229245185852, "learning_rate": 1.9977180375122606e-05, "loss": 1.8622, "step": 49900 }, { "epoch": 0.8006789757714542, "grad_norm": 0.8724033236503601, "learning_rate": 1.9999415105482566e-05, "loss": 1.8602, "step": 50000 }, { "epoch": 0.8022803337229971, "grad_norm": 0.8449952602386475, "learning_rate": 1.9993531998299776e-05, "loss": 1.8321, "step": 50100 }, { "epoch": 0.80388169167454, "grad_norm": 0.8022226095199585, "learning_rate": 1.9981326651105962e-05, "loss": 1.8735, "step": 50200 }, { "epoch": 0.805483049626083, "grad_norm": 0.8840874433517456, "learning_rate": 1.9962806785408838e-05, "loss": 1.829, "step": 50300 }, { "epoch": 0.8070844075776258, "grad_norm": 1.0448670387268066, "learning_rate": 1.993798411749008e-05, "loss": 1.8595, "step": 50400 }, { "epoch": 0.8086857655291687, "grad_norm": 0.904961109161377, "learning_rate": 1.9906874350993245e-05, "loss": 1.8586, "step": 50500 }, { "epoch": 0.8102871234807116, "grad_norm": 2.3403968811035156, "learning_rate": 1.98694971669891e-05, "loss": 1.8492, "step": 50600 }, { "epoch": 0.8118884814322546, "grad_norm": 1.0114359855651855, "learning_rate": 1.9825876211524724e-05, "loss": 1.8609, "step": 50700 }, { "epoch": 0.8134898393837975, "grad_norm": 0.8057318329811096, "learning_rate": 1.977603908066426e-05, "loss": 1.829, "step": 50800 }, { "epoch": 0.8150911973353404, "grad_norm": 0.8821057677268982, "learning_rate": 1.9720017303030703e-05, "loss": 1.862, "step": 50900 }, { "epoch": 0.8166925552868832, "grad_norm": 2.767875909805298, "learning_rate": 1.9657846319859854e-05, "loss": 1.8678, "step": 51000 }, { "epoch": 0.8182939132384262, "grad_norm": 0.7666317820549011, "learning_rate": 1.9589565462579015e-05, "loss": 1.8621, "step": 51100 }, { "epoch": 0.8198952711899691, "grad_norm": 1.021729588508606, "learning_rate": 1.9515217927924633e-05, "loss": 1.8352, "step": 51200 }, { "epoch": 0.821496629141512, "grad_norm": 0.8282054662704468, "learning_rate": 1.943485075061461e-05, "loss": 1.8583, "step": 51300 }, { "epoch": 0.8230979870930549, "grad_norm": 3.0896732807159424, "learning_rate": 1.934851477359256e-05, "loss": 1.8394, "step": 51400 }, { "epoch": 0.8246993450445979, "grad_norm": 0.9474732279777527, "learning_rate": 1.9256264615862893e-05, "loss": 1.8682, "step": 51500 }, { "epoch": 0.8263007029961408, "grad_norm": 0.7810207009315491, "learning_rate": 1.9158158637937027e-05, "loss": 1.8337, "step": 51600 }, { "epoch": 0.8279020609476836, "grad_norm": 0.8208989500999451, "learning_rate": 1.9054258904912575e-05, "loss": 1.8367, "step": 51700 }, { "epoch": 0.8295034188992265, "grad_norm": 0.8764814138412476, "learning_rate": 1.89446311472089e-05, "loss": 1.8403, "step": 51800 }, { "epoch": 0.8311047768507694, "grad_norm": 1.1485708951950073, "learning_rate": 1.8829344718983903e-05, "loss": 1.8576, "step": 51900 }, { "epoch": 0.8327061348023124, "grad_norm": 1.06003737449646, "learning_rate": 1.8708472554258237e-05, "loss": 1.872, "step": 52000 }, { "epoch": 0.8343074927538553, "grad_norm": 0.8322979807853699, "learning_rate": 1.8582091120774855e-05, "loss": 1.859, "step": 52100 }, { "epoch": 0.8359088507053982, "grad_norm": 0.7536402940750122, "learning_rate": 1.845028037162298e-05, "loss": 1.8401, "step": 52200 }, { "epoch": 0.837510208656941, "grad_norm": 1.4201630353927612, "learning_rate": 1.83131236946571e-05, "loss": 1.8723, "step": 52300 }, { "epoch": 0.839111566608484, "grad_norm": 0.7676379680633545, "learning_rate": 1.8170707859743067e-05, "loss": 1.8572, "step": 52400 }, { "epoch": 0.8407129245600269, "grad_norm": 0.8063752055168152, "learning_rate": 1.8023122963864602e-05, "loss": 1.8469, "step": 52500 }, { "epoch": 0.8423142825115698, "grad_norm": 0.8385179042816162, "learning_rate": 1.787046237412493e-05, "loss": 1.8564, "step": 52600 }, { "epoch": 0.8439156404631127, "grad_norm": 0.8969714641571045, "learning_rate": 1.7712822668679682e-05, "loss": 1.8556, "step": 52700 }, { "epoch": 0.8455169984146557, "grad_norm": 1.184692621231079, "learning_rate": 1.7550303575638318e-05, "loss": 1.8423, "step": 52800 }, { "epoch": 0.8471183563661986, "grad_norm": 0.8388579487800598, "learning_rate": 1.7383007909972844e-05, "loss": 1.8157, "step": 52900 }, { "epoch": 0.8487197143177414, "grad_norm": 0.7864462733268738, "learning_rate": 1.721104150847362e-05, "loss": 1.8526, "step": 53000 }, { "epoch": 0.8503210722692843, "grad_norm": 0.86407071352005, "learning_rate": 1.703451316279353e-05, "loss": 1.8428, "step": 53100 }, { "epoch": 0.8519224302208273, "grad_norm": 0.8313634395599365, "learning_rate": 1.6853534550622722e-05, "loss": 1.8479, "step": 53200 }, { "epoch": 0.8535237881723702, "grad_norm": 1.4253445863723755, "learning_rate": 1.666822016503765e-05, "loss": 1.8275, "step": 53300 }, { "epoch": 0.8551251461239131, "grad_norm": 5.398781776428223, "learning_rate": 1.6478687242068904e-05, "loss": 1.854, "step": 53400 }, { "epoch": 0.856726504075456, "grad_norm": 1.7977509498596191, "learning_rate": 1.628505568653385e-05, "loss": 1.8339, "step": 53500 }, { "epoch": 0.8583278620269988, "grad_norm": 0.8206777572631836, "learning_rate": 1.6087447996180826e-05, "loss": 1.8511, "step": 53600 }, { "epoch": 0.8599292199785418, "grad_norm": 0.8535060286521912, "learning_rate": 1.5885989184193027e-05, "loss": 1.8586, "step": 53700 }, { "epoch": 0.8615305779300847, "grad_norm": 1.6550579071044922, "learning_rate": 1.5680806700101e-05, "loss": 1.8482, "step": 53800 }, { "epoch": 0.8631319358816276, "grad_norm": 0.8122648000717163, "learning_rate": 1.5472030349153854e-05, "loss": 1.8335, "step": 53900 }, { "epoch": 0.8647332938331705, "grad_norm": 0.7805556058883667, "learning_rate": 1.525979221020014e-05, "loss": 1.8252, "step": 54000 }, { "epoch": 0.8663346517847135, "grad_norm": 0.8546029329299927, "learning_rate": 1.5044226552130399e-05, "loss": 1.8353, "step": 54100 }, { "epoch": 0.8679360097362564, "grad_norm": 0.7961782217025757, "learning_rate": 1.4825469748934192e-05, "loss": 1.8348, "step": 54200 }, { "epoch": 0.8695373676877992, "grad_norm": 0.9392079710960388, "learning_rate": 1.4603660193425402e-05, "loss": 1.8205, "step": 54300 }, { "epoch": 0.8711387256393421, "grad_norm": 0.7852017283439636, "learning_rate": 1.4378938209690334e-05, "loss": 1.8327, "step": 54400 }, { "epoch": 0.8727400835908851, "grad_norm": 0.8385934829711914, "learning_rate": 1.4151445964314057e-05, "loss": 1.8383, "step": 54500 }, { "epoch": 0.874341441542428, "grad_norm": 0.7498407363891602, "learning_rate": 1.3921327376441087e-05, "loss": 1.8121, "step": 54600 }, { "epoch": 0.8759427994939709, "grad_norm": 0.8227770924568176, "learning_rate": 1.3688728026727369e-05, "loss": 1.8395, "step": 54700 }, { "epoch": 0.8775441574455138, "grad_norm": 0.911970317363739, "learning_rate": 1.3453795065241128e-05, "loss": 1.8262, "step": 54800 }, { "epoch": 0.8791455153970567, "grad_norm": 0.8143411874771118, "learning_rate": 1.3216677118370834e-05, "loss": 1.8571, "step": 54900 }, { "epoch": 0.8807468733485996, "grad_norm": 0.8301388621330261, "learning_rate": 1.2977524194799229e-05, "loss": 1.8435, "step": 55000 }, { "epoch": 0.8823482313001425, "grad_norm": 1.3477791547775269, "learning_rate": 1.2736487590602864e-05, "loss": 1.8372, "step": 55100 }, { "epoch": 0.8839495892516854, "grad_norm": 0.8804235458374023, "learning_rate": 1.2493719793537157e-05, "loss": 1.841, "step": 55200 }, { "epoch": 0.8855509472032284, "grad_norm": 0.7941620349884033, "learning_rate": 1.2249374386567598e-05, "loss": 1.8271, "step": 55300 }, { "epoch": 0.8871523051547713, "grad_norm": 0.8681734800338745, "learning_rate": 1.2003605950708059e-05, "loss": 1.8459, "step": 55400 }, { "epoch": 0.8887536631063142, "grad_norm": 0.7299553155899048, "learning_rate": 1.1756569967227716e-05, "loss": 1.8684, "step": 55500 }, { "epoch": 0.890355021057857, "grad_norm": 0.7805650234222412, "learning_rate": 1.1508422719288434e-05, "loss": 1.8113, "step": 55600 }, { "epoch": 0.8919563790094, "grad_norm": 0.7692527770996094, "learning_rate": 1.125932119307486e-05, "loss": 1.8252, "step": 55700 }, { "epoch": 0.8935577369609429, "grad_norm": 0.8291378021240234, "learning_rate": 1.1009422978479742e-05, "loss": 1.7992, "step": 55800 }, { "epoch": 0.8951590949124858, "grad_norm": 0.8779826164245605, "learning_rate": 1.0758886169407351e-05, "loss": 1.8336, "step": 55900 }, { "epoch": 0.8967604528640287, "grad_norm": 0.7980159521102905, "learning_rate": 1.050786926375801e-05, "loss": 1.8212, "step": 56000 }, { "epoch": 0.8983618108155716, "grad_norm": 3.2298014163970947, "learning_rate": 1.025653106315707e-05, "loss": 1.8188, "step": 56100 }, { "epoch": 0.8999631687671145, "grad_norm": 0.8914725184440613, "learning_rate": 1.0005030572491733e-05, "loss": 1.8387, "step": 56200 }, { "epoch": 0.9015645267186574, "grad_norm": 0.8599027395248413, "learning_rate": 9.753526899319275e-06, "loss": 1.8327, "step": 56300 }, { "epoch": 0.9031658846702003, "grad_norm": 0.9533581733703613, "learning_rate": 9.50217915321035e-06, "loss": 1.822, "step": 56400 }, { "epoch": 0.9047672426217432, "grad_norm": 0.8099405169487, "learning_rate": 9.251146345090958e-06, "loss": 1.8462, "step": 56500 }, { "epoch": 0.9063686005732862, "grad_norm": 0.8883758783340454, "learning_rate": 9.000587286646886e-06, "loss": 1.8184, "step": 56600 }, { "epoch": 0.9079699585248291, "grad_norm": 1.6830765008926392, "learning_rate": 8.750660489854142e-06, "loss": 1.82, "step": 56700 }, { "epoch": 0.909571316476372, "grad_norm": 1.2402883768081665, "learning_rate": 8.501524066699047e-06, "loss": 1.816, "step": 56800 }, { "epoch": 0.9111726744279148, "grad_norm": 0.8525800108909607, "learning_rate": 8.253335629151306e-06, "loss": 1.8248, "step": 56900 }, { "epoch": 0.9127740323794578, "grad_norm": 0.8562950491905212, "learning_rate": 8.006252189453485e-06, "loss": 1.8284, "step": 57000 }, { "epoch": 0.9143753903310007, "grad_norm": 0.7687914371490479, "learning_rate": 7.760430060789828e-06, "loss": 1.8198, "step": 57100 }, { "epoch": 0.9159767482825436, "grad_norm": 0.9463182091712952, "learning_rate": 7.51602475839736e-06, "loss": 1.8266, "step": 57200 }, { "epoch": 0.9175781062340865, "grad_norm": 1.0767518281936646, "learning_rate": 7.273190901181783e-06, "loss": 1.8054, "step": 57300 }, { "epoch": 0.9191794641856295, "grad_norm": 0.8242263197898865, "learning_rate": 7.032082113900434e-06, "loss": 1.8337, "step": 57400 }, { "epoch": 0.9207808221371723, "grad_norm": 0.7926039695739746, "learning_rate": 6.792850929974142e-06, "loss": 1.8144, "step": 57500 }, { "epoch": 0.9223821800887152, "grad_norm": 0.7732511162757874, "learning_rate": 6.55564869498956e-06, "loss": 1.804, "step": 57600 }, { "epoch": 0.9239835380402581, "grad_norm": 0.7959622144699097, "learning_rate": 6.32062547095288e-06, "loss": 1.8222, "step": 57700 }, { "epoch": 0.925584895991801, "grad_norm": 0.8663679957389832, "learning_rate": 6.087929941355671e-06, "loss": 1.8496, "step": 57800 }, { "epoch": 0.927186253943344, "grad_norm": 0.7793252468109131, "learning_rate": 5.857709317112736e-06, "loss": 1.8177, "step": 57900 }, { "epoch": 0.9287876118948869, "grad_norm": 0.9085448980331421, "learning_rate": 5.630109243431608e-06, "loss": 1.8193, "step": 58000 }, { "epoch": 0.9303889698464298, "grad_norm": 0.7569569945335388, "learning_rate": 5.4052737076725824e-06, "loss": 1.8196, "step": 58100 }, { "epoch": 0.9319903277979726, "grad_norm": 0.8424269556999207, "learning_rate": 5.1833449482574895e-06, "loss": 1.835, "step": 58200 }, { "epoch": 0.9335916857495156, "grad_norm": 0.8512621521949768, "learning_rate": 4.964463364685001e-06, "loss": 1.8145, "step": 58300 }, { "epoch": 0.9351930437010585, "grad_norm": 1.0519986152648926, "learning_rate": 4.748767428709187e-06, "loss": 1.8213, "step": 58400 }, { "epoch": 0.9367944016526014, "grad_norm": 0.7896735072135925, "learning_rate": 4.536393596737752e-06, "loss": 1.8243, "step": 58500 }, { "epoch": 0.9383957596041443, "grad_norm": 1.0739407539367676, "learning_rate": 4.327476223505136e-06, "loss": 1.832, "step": 58600 }, { "epoch": 0.9399971175556873, "grad_norm": 0.8374795913696289, "learning_rate": 4.12214747707527e-06, "loss": 1.8338, "step": 58700 }, { "epoch": 0.9415984755072301, "grad_norm": 1.0221420526504517, "learning_rate": 3.920537255227669e-06, "loss": 1.8101, "step": 58800 }, { "epoch": 0.943199833458773, "grad_norm": 0.8421764969825745, "learning_rate": 3.7227731032797853e-06, "loss": 1.8329, "step": 58900 }, { "epoch": 0.9448011914103159, "grad_norm": 0.7701355814933777, "learning_rate": 3.5289801333976102e-06, "loss": 1.8216, "step": 59000 }, { "epoch": 0.9464025493618589, "grad_norm": 0.7741368412971497, "learning_rate": 3.339280945445559e-06, "loss": 1.8272, "step": 59100 }, { "epoch": 0.9480039073134018, "grad_norm": 1.7360873222351074, "learning_rate": 3.1537955494257345e-06, "loss": 1.8372, "step": 59200 }, { "epoch": 0.9496052652649447, "grad_norm": 0.7760699987411499, "learning_rate": 2.972641289555616e-06, "loss": 1.8182, "step": 59300 }, { "epoch": 0.9512066232164876, "grad_norm": 0.7646809220314026, "learning_rate": 2.7959327700322036e-06, "loss": 1.8084, "step": 59400 }, { "epoch": 0.9528079811680304, "grad_norm": 0.9442381858825684, "learning_rate": 2.623781782529625e-06, "loss": 1.8239, "step": 59500 }, { "epoch": 0.9544093391195734, "grad_norm": 0.8009527325630188, "learning_rate": 2.4562972354759698e-06, "loss": 1.8272, "step": 59600 }, { "epoch": 0.9560106970711163, "grad_norm": 0.7591850757598877, "learning_rate": 2.293585085154252e-06, "loss": 1.8314, "step": 59700 }, { "epoch": 0.9576120550226592, "grad_norm": 0.7954255938529968, "learning_rate": 2.135748268670902e-06, "loss": 1.8341, "step": 59800 }, { "epoch": 0.9592134129742022, "grad_norm": 1.0002678632736206, "learning_rate": 1.9828866388343814e-06, "loss": 1.8075, "step": 59900 }, { "epoch": 0.9608147709257451, "grad_norm": 0.7856830954551697, "learning_rate": 1.8350969009849483e-06, "loss": 1.8005, "step": 60000 }, { "epoch": 0.9624161288772879, "grad_norm": 0.9126999378204346, "learning_rate": 1.6924725518156637e-06, "loss": 1.8277, "step": 60100 }, { "epoch": 0.9640174868288308, "grad_norm": 0.8106286525726318, "learning_rate": 1.5551038202232805e-06, "loss": 1.8108, "step": 60200 }, { "epoch": 0.9656188447803737, "grad_norm": 1.359531044960022, "learning_rate": 1.4230776102264454e-06, "loss": 1.8475, "step": 60300 }, { "epoch": 0.9672202027319167, "grad_norm": 0.7704586386680603, "learning_rate": 1.2964774459873364e-06, "loss": 1.8482, "step": 60400 }, { "epoch": 0.9688215606834596, "grad_norm": 0.7488996982574463, "learning_rate": 1.1753834189715019e-06, "loss": 1.8115, "step": 60500 }, { "epoch": 0.9704229186350025, "grad_norm": 1.662976861000061, "learning_rate": 1.059872137279342e-06, "loss": 1.8391, "step": 60600 }, { "epoch": 0.9720242765865454, "grad_norm": 1.0111815929412842, "learning_rate": 9.500166771812902e-07, "loss": 1.8161, "step": 60700 }, { "epoch": 0.9736256345380883, "grad_norm": 0.7973281145095825, "learning_rate": 8.458865368873204e-07, "loss": 1.8219, "step": 60800 }, { "epoch": 0.9752269924896312, "grad_norm": 0.8591629266738892, "learning_rate": 7.475475925800968e-07, "loss": 1.8399, "step": 60900 }, { "epoch": 0.9768283504411741, "grad_norm": 0.9209094047546387, "learning_rate": 6.550620567394883e-07, "loss": 1.8319, "step": 61000 }, { "epoch": 0.978429708392717, "grad_norm": 0.916976273059845, "learning_rate": 5.684884387849176e-07, "loss": 1.8189, "step": 61100 }, { "epoch": 0.98003106634426, "grad_norm": 0.950470507144928, "learning_rate": 4.878815080603372e-07, "loss": 1.8052, "step": 61200 }, { "epoch": 0.9816324242958029, "grad_norm": 0.7501734495162964, "learning_rate": 4.1329225918533277e-07, "loss": 1.8419, "step": 61300 }, { "epoch": 0.9832337822473457, "grad_norm": 0.8855769038200378, "learning_rate": 3.447678797942389e-07, "loss": 1.8168, "step": 61400 }, { "epoch": 0.9848351401988886, "grad_norm": 0.9513728618621826, "learning_rate": 2.823517206836701e-07, "loss": 1.8219, "step": 61500 }, { "epoch": 0.9864364981504316, "grad_norm": 0.9888412952423096, "learning_rate": 2.2608326838736817e-07, "loss": 1.8183, "step": 61600 }, { "epoch": 0.9880378561019745, "grad_norm": 0.8009938597679138, "learning_rate": 1.7599812019571395e-07, "loss": 1.8027, "step": 61700 }, { "epoch": 0.9896392140535174, "grad_norm": 0.9275427460670471, "learning_rate": 1.321279616356963e-07, "loss": 1.8145, "step": 61800 }, { "epoch": 0.9912405720050603, "grad_norm": 0.7663293480873108, "learning_rate": 9.450054642560102e-08, "loss": 1.8332, "step": 61900 }, { "epoch": 0.9928419299566033, "grad_norm": 0.7306997776031494, "learning_rate": 6.313967891707906e-08, "loss": 1.8059, "step": 62000 }, { "epoch": 0.9944432879081461, "grad_norm": 0.8004014492034912, "learning_rate": 3.806519903573502e-08, "loss": 1.8347, "step": 62100 }, { "epoch": 0.996044645859689, "grad_norm": 0.7328791618347168, "learning_rate": 1.9292969729719502e-08, "loss": 1.8156, "step": 62200 }, { "epoch": 0.9976460038112319, "grad_norm": 0.8255366086959839, "learning_rate": 6.834866934314344e-09, "loss": 1.8029, "step": 62300 }, { "epoch": 0.9992473617627748, "grad_norm": 0.8802406787872314, "learning_rate": 6.987720588080837e-10, "loss": 1.8173, "step": 62400 }, { "epoch": 1.0, "step": 62447, "total_flos": 7.631778497299481e+18, "train_loss": 2.1485319636500972, "train_runtime": 14119.5859, "train_samples_per_second": 35.382, "train_steps_per_second": 4.423 } ], "logging_steps": 100, "max_steps": 62447, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.631778497299481e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }