{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1257, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002386634844868735, "grad_norm": 27.54071617126465, "learning_rate": 2.3809523809523808e-06, "loss": 6.8548, "step": 3 }, { "epoch": 0.00477326968973747, "grad_norm": 16.49394989013672, "learning_rate": 4.7619047619047615e-06, "loss": 6.5867, "step": 6 }, { "epoch": 0.007159904534606206, "grad_norm": 19.11440658569336, "learning_rate": 7.142857142857143e-06, "loss": 6.2734, "step": 9 }, { "epoch": 0.00954653937947494, "grad_norm": 20.9352970123291, "learning_rate": 9.523809523809523e-06, "loss": 5.535, "step": 12 }, { "epoch": 0.011933174224343675, "grad_norm": 11.74897575378418, "learning_rate": 1.1904761904761905e-05, "loss": 5.2226, "step": 15 }, { "epoch": 0.014319809069212411, "grad_norm": 15.756311416625977, "learning_rate": 1.4285714285714285e-05, "loss": 4.9951, "step": 18 }, { "epoch": 0.016706443914081145, "grad_norm": 9.158567428588867, "learning_rate": 1.6666666666666667e-05, "loss": 4.6998, "step": 21 }, { "epoch": 0.01909307875894988, "grad_norm": 10.05123233795166, "learning_rate": 1.9047619047619046e-05, "loss": 4.4934, "step": 24 }, { "epoch": 0.021479713603818614, "grad_norm": 8.713202476501465, "learning_rate": 2.1428571428571428e-05, "loss": 4.4422, "step": 27 }, { "epoch": 0.02386634844868735, "grad_norm": 6.295734882354736, "learning_rate": 2.380952380952381e-05, "loss": 4.307, "step": 30 }, { "epoch": 0.026252983293556086, "grad_norm": 4.285629749298096, "learning_rate": 2.6190476190476192e-05, "loss": 3.9384, "step": 33 }, { "epoch": 0.028639618138424822, "grad_norm": 3.6104960441589355, "learning_rate": 2.857142857142857e-05, "loss": 4.0232, "step": 36 }, { "epoch": 0.031026252983293555, "grad_norm": 3.1964364051818848, "learning_rate": 3.095238095238095e-05, "loss": 4.0479, "step": 39 }, { "epoch": 0.03341288782816229, "grad_norm": 3.879831075668335, "learning_rate": 3.3333333333333335e-05, "loss": 3.8802, "step": 42 }, { "epoch": 0.03579952267303103, "grad_norm": 4.037631511688232, "learning_rate": 3.571428571428572e-05, "loss": 3.8634, "step": 45 }, { "epoch": 0.03818615751789976, "grad_norm": 2.8461647033691406, "learning_rate": 3.809523809523809e-05, "loss": 3.7112, "step": 48 }, { "epoch": 0.0405727923627685, "grad_norm": 3.655538558959961, "learning_rate": 4.047619047619048e-05, "loss": 3.6577, "step": 51 }, { "epoch": 0.04295942720763723, "grad_norm": 3.792799711227417, "learning_rate": 4.2857142857142856e-05, "loss": 3.6948, "step": 54 }, { "epoch": 0.045346062052505964, "grad_norm": 5.230392932891846, "learning_rate": 4.523809523809524e-05, "loss": 3.618, "step": 57 }, { "epoch": 0.0477326968973747, "grad_norm": 5.6567230224609375, "learning_rate": 4.761904761904762e-05, "loss": 3.4814, "step": 60 }, { "epoch": 0.050119331742243436, "grad_norm": 3.167008399963379, "learning_rate": 5e-05, "loss": 3.3859, "step": 63 }, { "epoch": 0.05250596658711217, "grad_norm": 4.307368278503418, "learning_rate": 5.2380952380952384e-05, "loss": 3.2242, "step": 66 }, { "epoch": 0.05489260143198091, "grad_norm": 3.390169382095337, "learning_rate": 5.4761904761904766e-05, "loss": 3.0145, "step": 69 }, { "epoch": 0.057279236276849645, "grad_norm": 3.3308513164520264, "learning_rate": 5.714285714285714e-05, "loss": 3.0705, "step": 72 }, { "epoch": 0.059665871121718374, "grad_norm": 3.2529304027557373, "learning_rate": 5.9523809523809524e-05, "loss": 3.0395, "step": 75 }, { "epoch": 0.06205250596658711, "grad_norm": 2.5623693466186523, "learning_rate": 6.19047619047619e-05, "loss": 2.9807, "step": 78 }, { "epoch": 0.06443914081145585, "grad_norm": 2.3922460079193115, "learning_rate": 6.428571428571429e-05, "loss": 2.8913, "step": 81 }, { "epoch": 0.06682577565632458, "grad_norm": 3.3650715351104736, "learning_rate": 6.666666666666667e-05, "loss": 2.7402, "step": 84 }, { "epoch": 0.06921241050119331, "grad_norm": 2.4342546463012695, "learning_rate": 6.904761904761905e-05, "loss": 2.7315, "step": 87 }, { "epoch": 0.07159904534606205, "grad_norm": 2.673886775970459, "learning_rate": 7.142857142857143e-05, "loss": 2.7205, "step": 90 }, { "epoch": 0.07398568019093078, "grad_norm": 3.11196231842041, "learning_rate": 7.380952380952382e-05, "loss": 2.7564, "step": 93 }, { "epoch": 0.07637231503579953, "grad_norm": 2.593316078186035, "learning_rate": 7.619047619047618e-05, "loss": 2.6578, "step": 96 }, { "epoch": 0.07875894988066826, "grad_norm": 24.880268096923828, "learning_rate": 7.857142857142858e-05, "loss": 2.6199, "step": 99 }, { "epoch": 0.081145584725537, "grad_norm": 4.8875274658203125, "learning_rate": 8.095238095238096e-05, "loss": 2.6157, "step": 102 }, { "epoch": 0.08353221957040573, "grad_norm": 18.756973266601562, "learning_rate": 8.333333333333334e-05, "loss": 2.6938, "step": 105 }, { "epoch": 0.08591885441527446, "grad_norm": 5.592845439910889, "learning_rate": 8.571428571428571e-05, "loss": 2.6238, "step": 108 }, { "epoch": 0.0883054892601432, "grad_norm": 2.847984790802002, "learning_rate": 8.80952380952381e-05, "loss": 2.5633, "step": 111 }, { "epoch": 0.09069212410501193, "grad_norm": 2.497422218322754, "learning_rate": 9.047619047619048e-05, "loss": 2.6791, "step": 114 }, { "epoch": 0.09307875894988067, "grad_norm": 3.0090177059173584, "learning_rate": 9.285714285714286e-05, "loss": 2.5656, "step": 117 }, { "epoch": 0.0954653937947494, "grad_norm": 4.84130859375, "learning_rate": 9.523809523809524e-05, "loss": 2.5623, "step": 120 }, { "epoch": 0.09785202863961814, "grad_norm": 3.03096866607666, "learning_rate": 9.761904761904762e-05, "loss": 2.581, "step": 123 }, { "epoch": 0.10023866348448687, "grad_norm": 3.2894864082336426, "learning_rate": 0.0001, "loss": 2.3262, "step": 126 }, { "epoch": 0.1026252983293556, "grad_norm": 3.5789895057678223, "learning_rate": 9.999961058466053e-05, "loss": 2.5495, "step": 129 }, { "epoch": 0.10501193317422435, "grad_norm": 2.6820790767669678, "learning_rate": 9.999844234470782e-05, "loss": 2.4322, "step": 132 }, { "epoch": 0.10739856801909307, "grad_norm": 2.5585029125213623, "learning_rate": 9.999649529833915e-05, "loss": 2.4225, "step": 135 }, { "epoch": 0.10978520286396182, "grad_norm": 2.9007081985473633, "learning_rate": 9.999376947588288e-05, "loss": 2.5102, "step": 138 }, { "epoch": 0.11217183770883055, "grad_norm": 3.485059976577759, "learning_rate": 9.999026491979808e-05, "loss": 2.4618, "step": 141 }, { "epoch": 0.11455847255369929, "grad_norm": 2.1991868019104004, "learning_rate": 9.99859816846739e-05, "loss": 2.4147, "step": 144 }, { "epoch": 0.11694510739856802, "grad_norm": 2.4278526306152344, "learning_rate": 9.998091983722863e-05, "loss": 2.406, "step": 147 }, { "epoch": 0.11933174224343675, "grad_norm": 2.217151403427124, "learning_rate": 9.99750794563087e-05, "loss": 2.2757, "step": 150 }, { "epoch": 0.12171837708830549, "grad_norm": 2.5492608547210693, "learning_rate": 9.996846063288747e-05, "loss": 2.3297, "step": 153 }, { "epoch": 0.12410501193317422, "grad_norm": 2.6618294715881348, "learning_rate": 9.996106347006379e-05, "loss": 2.4732, "step": 156 }, { "epoch": 0.12649164677804295, "grad_norm": 2.283311605453491, "learning_rate": 9.99528880830604e-05, "loss": 2.2756, "step": 159 }, { "epoch": 0.1288782816229117, "grad_norm": 2.560292959213257, "learning_rate": 9.994393459922218e-05, "loss": 2.2277, "step": 162 }, { "epoch": 0.13126491646778043, "grad_norm": 4.688173294067383, "learning_rate": 9.993420315801406e-05, "loss": 2.2035, "step": 165 }, { "epoch": 0.13365155131264916, "grad_norm": 2.796677350997925, "learning_rate": 9.992369391101895e-05, "loss": 2.2856, "step": 168 }, { "epoch": 0.1360381861575179, "grad_norm": 2.2355430126190186, "learning_rate": 9.991240702193532e-05, "loss": 2.4203, "step": 171 }, { "epoch": 0.13842482100238662, "grad_norm": 4.946108818054199, "learning_rate": 9.990034266657467e-05, "loss": 2.3584, "step": 174 }, { "epoch": 0.14081145584725538, "grad_norm": 2.3821146488189697, "learning_rate": 9.988750103285883e-05, "loss": 2.2558, "step": 177 }, { "epoch": 0.1431980906921241, "grad_norm": 2.569561719894409, "learning_rate": 9.987388232081694e-05, "loss": 2.2857, "step": 180 }, { "epoch": 0.14558472553699284, "grad_norm": 2.7219672203063965, "learning_rate": 9.985948674258243e-05, "loss": 2.1856, "step": 183 }, { "epoch": 0.14797136038186157, "grad_norm": 2.2356839179992676, "learning_rate": 9.984431452238967e-05, "loss": 2.2922, "step": 186 }, { "epoch": 0.15035799522673032, "grad_norm": 2.406996488571167, "learning_rate": 9.982836589657043e-05, "loss": 2.2222, "step": 189 }, { "epoch": 0.15274463007159905, "grad_norm": 2.0661559104919434, "learning_rate": 9.981164111355035e-05, "loss": 2.2842, "step": 192 }, { "epoch": 0.15513126491646778, "grad_norm": 2.405298948287964, "learning_rate": 9.979414043384485e-05, "loss": 2.2644, "step": 195 }, { "epoch": 0.1575178997613365, "grad_norm": 2.26259446144104, "learning_rate": 9.977586413005531e-05, "loss": 2.3201, "step": 198 }, { "epoch": 0.15990453460620524, "grad_norm": 2.7550411224365234, "learning_rate": 9.975681248686461e-05, "loss": 2.3025, "step": 201 }, { "epoch": 0.162291169451074, "grad_norm": 2.2119550704956055, "learning_rate": 9.973698580103285e-05, "loss": 2.2355, "step": 204 }, { "epoch": 0.16467780429594273, "grad_norm": 2.1809093952178955, "learning_rate": 9.971638438139266e-05, "loss": 2.3667, "step": 207 }, { "epoch": 0.16706443914081145, "grad_norm": 2.4465034008026123, "learning_rate": 9.96950085488444e-05, "loss": 2.1225, "step": 210 }, { "epoch": 0.16945107398568018, "grad_norm": 2.1557154655456543, "learning_rate": 9.967285863635112e-05, "loss": 2.3583, "step": 213 }, { "epoch": 0.1718377088305489, "grad_norm": 2.4219322204589844, "learning_rate": 9.964993498893349e-05, "loss": 2.2503, "step": 216 }, { "epoch": 0.17422434367541767, "grad_norm": 2.533348798751831, "learning_rate": 9.962623796366429e-05, "loss": 2.2262, "step": 219 }, { "epoch": 0.1766109785202864, "grad_norm": 2.264911413192749, "learning_rate": 9.960176792966289e-05, "loss": 2.202, "step": 222 }, { "epoch": 0.17899761336515513, "grad_norm": 1.754560112953186, "learning_rate": 9.95765252680896e-05, "loss": 2.2589, "step": 225 }, { "epoch": 0.18138424821002386, "grad_norm": 1.8504457473754883, "learning_rate": 9.95505103721396e-05, "loss": 2.2058, "step": 228 }, { "epoch": 0.18377088305489261, "grad_norm": 2.1537466049194336, "learning_rate": 9.952372364703687e-05, "loss": 2.3089, "step": 231 }, { "epoch": 0.18615751789976134, "grad_norm": 3.062028408050537, "learning_rate": 9.949616551002787e-05, "loss": 2.195, "step": 234 }, { "epoch": 0.18854415274463007, "grad_norm": 2.369779586791992, "learning_rate": 9.946783639037504e-05, "loss": 2.1499, "step": 237 }, { "epoch": 0.1909307875894988, "grad_norm": 2.2154617309570312, "learning_rate": 9.943873672935014e-05, "loss": 2.1925, "step": 240 }, { "epoch": 0.19331742243436753, "grad_norm": 2.502539873123169, "learning_rate": 9.940886698022734e-05, "loss": 2.0562, "step": 243 }, { "epoch": 0.1957040572792363, "grad_norm": 2.294062614440918, "learning_rate": 9.93782276082762e-05, "loss": 2.1681, "step": 246 }, { "epoch": 0.19809069212410502, "grad_norm": 2.0213210582733154, "learning_rate": 9.934681909075434e-05, "loss": 2.1332, "step": 249 }, { "epoch": 0.20047732696897375, "grad_norm": 2.3265905380249023, "learning_rate": 9.931464191690015e-05, "loss": 2.2054, "step": 252 }, { "epoch": 0.20286396181384247, "grad_norm": 1.9375816583633423, "learning_rate": 9.928169658792498e-05, "loss": 2.1961, "step": 255 }, { "epoch": 0.2052505966587112, "grad_norm": 2.081695318222046, "learning_rate": 9.924798361700553e-05, "loss": 2.1484, "step": 258 }, { "epoch": 0.20763723150357996, "grad_norm": 2.059936046600342, "learning_rate": 9.92135035292757e-05, "loss": 2.1565, "step": 261 }, { "epoch": 0.2100238663484487, "grad_norm": 1.9999018907546997, "learning_rate": 9.91782568618185e-05, "loss": 2.2006, "step": 264 }, { "epoch": 0.21241050119331742, "grad_norm": 1.8792791366577148, "learning_rate": 9.914224416365764e-05, "loss": 2.2193, "step": 267 }, { "epoch": 0.21479713603818615, "grad_norm": 1.8357597589492798, "learning_rate": 9.910546599574902e-05, "loss": 2.3437, "step": 270 }, { "epoch": 0.2171837708830549, "grad_norm": 2.886901378631592, "learning_rate": 9.906792293097194e-05, "loss": 2.1577, "step": 273 }, { "epoch": 0.21957040572792363, "grad_norm": 2.1063649654388428, "learning_rate": 9.90296155541202e-05, "loss": 2.1812, "step": 276 }, { "epoch": 0.22195704057279236, "grad_norm": 2.063605785369873, "learning_rate": 9.899054446189304e-05, "loss": 2.0393, "step": 279 }, { "epoch": 0.2243436754176611, "grad_norm": 2.5164003372192383, "learning_rate": 9.895071026288574e-05, "loss": 2.0649, "step": 282 }, { "epoch": 0.22673031026252982, "grad_norm": 2.180410385131836, "learning_rate": 9.891011357758022e-05, "loss": 2.0445, "step": 285 }, { "epoch": 0.22911694510739858, "grad_norm": 2.030186176300049, "learning_rate": 9.886875503833536e-05, "loss": 2.2003, "step": 288 }, { "epoch": 0.2315035799522673, "grad_norm": 2.293309450149536, "learning_rate": 9.882663528937717e-05, "loss": 2.1257, "step": 291 }, { "epoch": 0.23389021479713604, "grad_norm": 1.9527851343154907, "learning_rate": 9.87837549867887e-05, "loss": 1.9611, "step": 294 }, { "epoch": 0.23627684964200477, "grad_norm": 2.0777316093444824, "learning_rate": 9.87401147984998e-05, "loss": 2.2418, "step": 297 }, { "epoch": 0.2386634844868735, "grad_norm": 3.1081202030181885, "learning_rate": 9.869571540427689e-05, "loss": 2.1499, "step": 300 }, { "epoch": 0.24105011933174225, "grad_norm": 1.9104883670806885, "learning_rate": 9.865055749571215e-05, "loss": 2.2008, "step": 303 }, { "epoch": 0.24343675417661098, "grad_norm": 1.9581879377365112, "learning_rate": 9.860464177621284e-05, "loss": 1.9274, "step": 306 }, { "epoch": 0.2458233890214797, "grad_norm": 2.2974860668182373, "learning_rate": 9.855796896099045e-05, "loss": 2.1965, "step": 309 }, { "epoch": 0.24821002386634844, "grad_norm": 3.9561026096343994, "learning_rate": 9.851053977704931e-05, "loss": 1.9958, "step": 312 }, { "epoch": 0.25059665871121717, "grad_norm": 1.9768146276474, "learning_rate": 9.846235496317555e-05, "loss": 1.9944, "step": 315 }, { "epoch": 0.2529832935560859, "grad_norm": 1.7646844387054443, "learning_rate": 9.841341526992536e-05, "loss": 2.114, "step": 318 }, { "epoch": 0.2553699284009546, "grad_norm": 1.744960904121399, "learning_rate": 9.836372145961345e-05, "loss": 2.1758, "step": 321 }, { "epoch": 0.2577565632458234, "grad_norm": 1.8891479969024658, "learning_rate": 9.83132743063011e-05, "loss": 2.174, "step": 324 }, { "epoch": 0.26014319809069214, "grad_norm": 2.207690477371216, "learning_rate": 9.826207459578411e-05, "loss": 1.9944, "step": 327 }, { "epoch": 0.26252983293556087, "grad_norm": 2.369786500930786, "learning_rate": 9.821012312558058e-05, "loss": 1.9725, "step": 330 }, { "epoch": 0.2649164677804296, "grad_norm": 1.8267792463302612, "learning_rate": 9.815742070491852e-05, "loss": 2.1383, "step": 333 }, { "epoch": 0.26730310262529833, "grad_norm": 2.1697428226470947, "learning_rate": 9.810396815472314e-05, "loss": 2.036, "step": 336 }, { "epoch": 0.26968973747016706, "grad_norm": 2.0004632472991943, "learning_rate": 9.804976630760419e-05, "loss": 2.077, "step": 339 }, { "epoch": 0.2720763723150358, "grad_norm": 1.9695758819580078, "learning_rate": 9.799481600784286e-05, "loss": 1.9859, "step": 342 }, { "epoch": 0.2744630071599045, "grad_norm": 2.275029420852661, "learning_rate": 9.793911811137875e-05, "loss": 2.0383, "step": 345 }, { "epoch": 0.27684964200477324, "grad_norm": 2.0272061824798584, "learning_rate": 9.788267348579648e-05, "loss": 2.0818, "step": 348 }, { "epoch": 0.27923627684964203, "grad_norm": 1.8712291717529297, "learning_rate": 9.782548301031217e-05, "loss": 2.0953, "step": 351 }, { "epoch": 0.28162291169451076, "grad_norm": 2.198537826538086, "learning_rate": 9.776754757575975e-05, "loss": 2.0162, "step": 354 }, { "epoch": 0.2840095465393795, "grad_norm": 2.8860132694244385, "learning_rate": 9.770886808457709e-05, "loss": 2.1242, "step": 357 }, { "epoch": 0.2863961813842482, "grad_norm": 2.483168125152588, "learning_rate": 9.764944545079196e-05, "loss": 2.1847, "step": 360 }, { "epoch": 0.28878281622911695, "grad_norm": 2.067625045776367, "learning_rate": 9.758928060000778e-05, "loss": 2.1283, "step": 363 }, { "epoch": 0.2911694510739857, "grad_norm": 1.8211537599563599, "learning_rate": 9.752837446938915e-05, "loss": 1.9557, "step": 366 }, { "epoch": 0.2935560859188544, "grad_norm": 1.7822015285491943, "learning_rate": 9.746672800764735e-05, "loss": 1.9696, "step": 369 }, { "epoch": 0.29594272076372313, "grad_norm": 2.028804302215576, "learning_rate": 9.740434217502547e-05, "loss": 1.9916, "step": 372 }, { "epoch": 0.29832935560859186, "grad_norm": 1.9470692873001099, "learning_rate": 9.734121794328357e-05, "loss": 2.0649, "step": 375 }, { "epoch": 0.30071599045346065, "grad_norm": 2.003063201904297, "learning_rate": 9.727735629568336e-05, "loss": 2.0997, "step": 378 }, { "epoch": 0.3031026252983294, "grad_norm": 3.469019889831543, "learning_rate": 9.721275822697306e-05, "loss": 2.0181, "step": 381 }, { "epoch": 0.3054892601431981, "grad_norm": 1.79829740524292, "learning_rate": 9.714742474337186e-05, "loss": 2.1289, "step": 384 }, { "epoch": 0.30787589498806683, "grad_norm": 1.9561500549316406, "learning_rate": 9.708135686255416e-05, "loss": 2.2278, "step": 387 }, { "epoch": 0.31026252983293556, "grad_norm": 2.805483102798462, "learning_rate": 9.701455561363379e-05, "loss": 1.9664, "step": 390 }, { "epoch": 0.3126491646778043, "grad_norm": 1.7945538759231567, "learning_rate": 9.6947022037148e-05, "loss": 2.0011, "step": 393 }, { "epoch": 0.315035799522673, "grad_norm": 2.022679328918457, "learning_rate": 9.687875718504126e-05, "loss": 1.9724, "step": 396 }, { "epoch": 0.31742243436754175, "grad_norm": 3.296950340270996, "learning_rate": 9.680976212064874e-05, "loss": 2.0533, "step": 399 }, { "epoch": 0.3198090692124105, "grad_norm": 1.849898099899292, "learning_rate": 9.674003791867991e-05, "loss": 2.0683, "step": 402 }, { "epoch": 0.3221957040572792, "grad_norm": 2.1924362182617188, "learning_rate": 9.666958566520174e-05, "loss": 2.1004, "step": 405 }, { "epoch": 0.324582338902148, "grad_norm": 2.3722035884857178, "learning_rate": 9.659840645762175e-05, "loss": 2.131, "step": 408 }, { "epoch": 0.3269689737470167, "grad_norm": 2.3362410068511963, "learning_rate": 9.652650140467093e-05, "loss": 2.0556, "step": 411 }, { "epoch": 0.32935560859188545, "grad_norm": 1.6226763725280762, "learning_rate": 9.645387162638652e-05, "loss": 2.0418, "step": 414 }, { "epoch": 0.3317422434367542, "grad_norm": 1.9308342933654785, "learning_rate": 9.638051825409453e-05, "loss": 2.2221, "step": 417 }, { "epoch": 0.3341288782816229, "grad_norm": 1.530688762664795, "learning_rate": 9.630644243039207e-05, "loss": 1.9592, "step": 420 }, { "epoch": 0.33651551312649164, "grad_norm": 1.967718482017517, "learning_rate": 9.623164530912963e-05, "loss": 2.1817, "step": 423 }, { "epoch": 0.33890214797136037, "grad_norm": 1.803712248802185, "learning_rate": 9.615612805539305e-05, "loss": 1.9614, "step": 426 }, { "epoch": 0.3412887828162291, "grad_norm": 1.9774303436279297, "learning_rate": 9.607989184548543e-05, "loss": 2.0609, "step": 429 }, { "epoch": 0.3436754176610978, "grad_norm": 1.9333628416061401, "learning_rate": 9.600293786690872e-05, "loss": 2.0487, "step": 432 }, { "epoch": 0.3460620525059666, "grad_norm": 1.785407543182373, "learning_rate": 9.592526731834537e-05, "loss": 2.1801, "step": 435 }, { "epoch": 0.34844868735083534, "grad_norm": 2.0003998279571533, "learning_rate": 9.584688140963944e-05, "loss": 1.8782, "step": 438 }, { "epoch": 0.35083532219570407, "grad_norm": 1.7324368953704834, "learning_rate": 9.576778136177798e-05, "loss": 2.0483, "step": 441 }, { "epoch": 0.3532219570405728, "grad_norm": 1.8628923892974854, "learning_rate": 9.568796840687184e-05, "loss": 1.9283, "step": 444 }, { "epoch": 0.3556085918854415, "grad_norm": 1.822113037109375, "learning_rate": 9.560744378813659e-05, "loss": 2.0287, "step": 447 }, { "epoch": 0.35799522673031026, "grad_norm": 1.9748612642288208, "learning_rate": 9.552620875987311e-05, "loss": 1.9536, "step": 450 }, { "epoch": 0.360381861575179, "grad_norm": 1.875239372253418, "learning_rate": 9.544426458744804e-05, "loss": 1.8725, "step": 453 }, { "epoch": 0.3627684964200477, "grad_norm": 1.9234291315078735, "learning_rate": 9.536161254727408e-05, "loss": 1.9569, "step": 456 }, { "epoch": 0.36515513126491644, "grad_norm": 1.6600582599639893, "learning_rate": 9.527825392679012e-05, "loss": 1.9387, "step": 459 }, { "epoch": 0.36754176610978523, "grad_norm": 1.7599331140518188, "learning_rate": 9.51941900244412e-05, "loss": 1.941, "step": 462 }, { "epoch": 0.36992840095465396, "grad_norm": 1.6681755781173706, "learning_rate": 9.51094221496582e-05, "loss": 1.9404, "step": 465 }, { "epoch": 0.3723150357995227, "grad_norm": 1.8792699575424194, "learning_rate": 9.502395162283759e-05, "loss": 1.8169, "step": 468 }, { "epoch": 0.3747016706443914, "grad_norm": 1.6947994232177734, "learning_rate": 9.493777977532072e-05, "loss": 1.9733, "step": 471 }, { "epoch": 0.37708830548926014, "grad_norm": 1.909915804862976, "learning_rate": 9.485090794937319e-05, "loss": 1.9364, "step": 474 }, { "epoch": 0.3794749403341289, "grad_norm": 1.6911386251449585, "learning_rate": 9.476333749816382e-05, "loss": 1.8554, "step": 477 }, { "epoch": 0.3818615751789976, "grad_norm": 1.8519269227981567, "learning_rate": 9.467506978574371e-05, "loss": 1.8662, "step": 480 }, { "epoch": 0.38424821002386633, "grad_norm": 1.8122614622116089, "learning_rate": 9.45861061870249e-05, "loss": 2.0183, "step": 483 }, { "epoch": 0.38663484486873506, "grad_norm": 2.058873414993286, "learning_rate": 9.449644808775902e-05, "loss": 1.9499, "step": 486 }, { "epoch": 0.38902147971360385, "grad_norm": 1.720900297164917, "learning_rate": 9.44060968845156e-05, "loss": 1.9129, "step": 489 }, { "epoch": 0.3914081145584726, "grad_norm": 1.9670952558517456, "learning_rate": 9.431505398466045e-05, "loss": 2.0279, "step": 492 }, { "epoch": 0.3937947494033413, "grad_norm": 1.800014853477478, "learning_rate": 9.42233208063336e-05, "loss": 1.8313, "step": 495 }, { "epoch": 0.39618138424821003, "grad_norm": 2.6560916900634766, "learning_rate": 9.413089877842736e-05, "loss": 2.0691, "step": 498 }, { "epoch": 0.39856801909307876, "grad_norm": 1.8405070304870605, "learning_rate": 9.403778934056391e-05, "loss": 2.0424, "step": 501 }, { "epoch": 0.4009546539379475, "grad_norm": 1.798326849937439, "learning_rate": 9.394399394307303e-05, "loss": 2.2063, "step": 504 }, { "epoch": 0.4033412887828162, "grad_norm": 1.9824241399765015, "learning_rate": 9.384951404696933e-05, "loss": 1.861, "step": 507 }, { "epoch": 0.40572792362768495, "grad_norm": 2.272447109222412, "learning_rate": 9.375435112392969e-05, "loss": 2.0633, "step": 510 }, { "epoch": 0.4081145584725537, "grad_norm": 1.8447043895721436, "learning_rate": 9.365850665627016e-05, "loss": 1.9229, "step": 513 }, { "epoch": 0.4105011933174224, "grad_norm": 1.7020233869552612, "learning_rate": 9.356198213692297e-05, "loss": 1.8778, "step": 516 }, { "epoch": 0.4128878281622912, "grad_norm": 1.7794997692108154, "learning_rate": 9.346477906941331e-05, "loss": 1.833, "step": 519 }, { "epoch": 0.4152744630071599, "grad_norm": 1.775090217590332, "learning_rate": 9.336689896783573e-05, "loss": 1.8797, "step": 522 }, { "epoch": 0.41766109785202865, "grad_norm": 1.8821420669555664, "learning_rate": 9.32683433568308e-05, "loss": 1.9237, "step": 525 }, { "epoch": 0.4200477326968974, "grad_norm": 2.1922411918640137, "learning_rate": 9.316911377156117e-05, "loss": 1.9907, "step": 528 }, { "epoch": 0.4224343675417661, "grad_norm": 1.8223942518234253, "learning_rate": 9.306921175768775e-05, "loss": 1.924, "step": 531 }, { "epoch": 0.42482100238663484, "grad_norm": 1.8186813592910767, "learning_rate": 9.29686388713456e-05, "loss": 2.1474, "step": 534 }, { "epoch": 0.42720763723150357, "grad_norm": 1.6708881855010986, "learning_rate": 9.286739667911972e-05, "loss": 1.956, "step": 537 }, { "epoch": 0.4295942720763723, "grad_norm": 1.6984988451004028, "learning_rate": 9.276548675802059e-05, "loss": 1.9995, "step": 540 }, { "epoch": 0.431980906921241, "grad_norm": 1.5586644411087036, "learning_rate": 9.266291069545972e-05, "loss": 1.9233, "step": 543 }, { "epoch": 0.4343675417661098, "grad_norm": 1.7961502075195312, "learning_rate": 9.255967008922474e-05, "loss": 1.9941, "step": 546 }, { "epoch": 0.43675417661097854, "grad_norm": 1.4892523288726807, "learning_rate": 9.245576654745471e-05, "loss": 2.0961, "step": 549 }, { "epoch": 0.43914081145584727, "grad_norm": 1.635480523109436, "learning_rate": 9.235120168861496e-05, "loss": 1.8031, "step": 552 }, { "epoch": 0.441527446300716, "grad_norm": 2.0391037464141846, "learning_rate": 9.224597714147186e-05, "loss": 2.0117, "step": 555 }, { "epoch": 0.4439140811455847, "grad_norm": 1.8993330001831055, "learning_rate": 9.214009454506753e-05, "loss": 1.8346, "step": 558 }, { "epoch": 0.44630071599045346, "grad_norm": 1.830111026763916, "learning_rate": 9.203355554869428e-05, "loss": 1.9372, "step": 561 }, { "epoch": 0.4486873508353222, "grad_norm": 3.3967881202697754, "learning_rate": 9.192636181186888e-05, "loss": 1.7628, "step": 564 }, { "epoch": 0.4510739856801909, "grad_norm": 1.7912521362304688, "learning_rate": 9.181851500430673e-05, "loss": 1.8183, "step": 567 }, { "epoch": 0.45346062052505964, "grad_norm": 1.9313652515411377, "learning_rate": 9.171001680589588e-05, "loss": 1.8283, "step": 570 }, { "epoch": 0.45584725536992843, "grad_norm": 2.1597900390625, "learning_rate": 9.160086890667086e-05, "loss": 1.8924, "step": 573 }, { "epoch": 0.45823389021479716, "grad_norm": 1.6880937814712524, "learning_rate": 9.14910730067863e-05, "loss": 1.9477, "step": 576 }, { "epoch": 0.4606205250596659, "grad_norm": 1.6879347562789917, "learning_rate": 9.138063081649051e-05, "loss": 2.0598, "step": 579 }, { "epoch": 0.4630071599045346, "grad_norm": 1.8424838781356812, "learning_rate": 9.126954405609882e-05, "loss": 1.9193, "step": 582 }, { "epoch": 0.46539379474940334, "grad_norm": 2.013153076171875, "learning_rate": 9.115781445596676e-05, "loss": 1.7798, "step": 585 }, { "epoch": 0.4677804295942721, "grad_norm": 2.0057547092437744, "learning_rate": 9.104544375646313e-05, "loss": 2.1303, "step": 588 }, { "epoch": 0.4701670644391408, "grad_norm": 1.924677848815918, "learning_rate": 9.093243370794291e-05, "loss": 1.9005, "step": 591 }, { "epoch": 0.47255369928400953, "grad_norm": 1.8524523973464966, "learning_rate": 9.081878607071996e-05, "loss": 1.9822, "step": 594 }, { "epoch": 0.47494033412887826, "grad_norm": 1.9615447521209717, "learning_rate": 9.07045026150396e-05, "loss": 1.9297, "step": 597 }, { "epoch": 0.477326968973747, "grad_norm": 1.671190619468689, "learning_rate": 9.058958512105104e-05, "loss": 1.8514, "step": 600 }, { "epoch": 0.4797136038186158, "grad_norm": 3.258155107498169, "learning_rate": 9.047403537877971e-05, "loss": 1.8893, "step": 603 }, { "epoch": 0.4821002386634845, "grad_norm": 1.7724066972732544, "learning_rate": 9.035785518809927e-05, "loss": 1.9497, "step": 606 }, { "epoch": 0.48448687350835323, "grad_norm": 1.958068609237671, "learning_rate": 9.024104635870368e-05, "loss": 1.917, "step": 609 }, { "epoch": 0.48687350835322196, "grad_norm": 1.6382533311843872, "learning_rate": 9.012361071007891e-05, "loss": 1.9146, "step": 612 }, { "epoch": 0.4892601431980907, "grad_norm": 3.012364625930786, "learning_rate": 9.000555007147469e-05, "loss": 1.9391, "step": 615 }, { "epoch": 0.4916467780429594, "grad_norm": 2.1372387409210205, "learning_rate": 8.988686628187597e-05, "loss": 1.9188, "step": 618 }, { "epoch": 0.49403341288782815, "grad_norm": 1.587446689605713, "learning_rate": 8.976756118997427e-05, "loss": 1.9319, "step": 621 }, { "epoch": 0.4964200477326969, "grad_norm": 2.177863121032715, "learning_rate": 8.964763665413893e-05, "loss": 1.7627, "step": 624 }, { "epoch": 0.4988066825775656, "grad_norm": 1.8430842161178589, "learning_rate": 8.952709454238808e-05, "loss": 2.0253, "step": 627 }, { "epoch": 0.5011933174224343, "grad_norm": 1.6200464963912964, "learning_rate": 8.940593673235962e-05, "loss": 1.9912, "step": 630 }, { "epoch": 0.5035799522673031, "grad_norm": 1.8540515899658203, "learning_rate": 8.928416511128195e-05, "loss": 1.8121, "step": 633 }, { "epoch": 0.5059665871121718, "grad_norm": 1.7102622985839844, "learning_rate": 8.916178157594453e-05, "loss": 1.8878, "step": 636 }, { "epoch": 0.5083532219570406, "grad_norm": 1.7976316213607788, "learning_rate": 8.903878803266841e-05, "loss": 2.0787, "step": 639 }, { "epoch": 0.5107398568019093, "grad_norm": 1.524535894393921, "learning_rate": 8.891518639727649e-05, "loss": 1.772, "step": 642 }, { "epoch": 0.513126491646778, "grad_norm": 1.793609380722046, "learning_rate": 8.879097859506372e-05, "loss": 1.9283, "step": 645 }, { "epoch": 0.5155131264916468, "grad_norm": 1.700348973274231, "learning_rate": 8.866616656076696e-05, "loss": 1.8155, "step": 648 }, { "epoch": 0.5178997613365155, "grad_norm": 1.611598253250122, "learning_rate": 8.854075223853508e-05, "loss": 1.9446, "step": 651 }, { "epoch": 0.5202863961813843, "grad_norm": 1.7585581541061401, "learning_rate": 8.841473758189854e-05, "loss": 1.7997, "step": 654 }, { "epoch": 0.522673031026253, "grad_norm": 1.823596715927124, "learning_rate": 8.828812455373891e-05, "loss": 2.0054, "step": 657 }, { "epoch": 0.5250596658711217, "grad_norm": 1.7616968154907227, "learning_rate": 8.816091512625843e-05, "loss": 1.9281, "step": 660 }, { "epoch": 0.5274463007159904, "grad_norm": 2.0290961265563965, "learning_rate": 8.803311128094918e-05, "loss": 1.9559, "step": 663 }, { "epoch": 0.5298329355608592, "grad_norm": 1.957862377166748, "learning_rate": 8.790471500856228e-05, "loss": 2.0428, "step": 666 }, { "epoch": 0.5322195704057279, "grad_norm": 1.768784999847412, "learning_rate": 8.777572830907684e-05, "loss": 2.1063, "step": 669 }, { "epoch": 0.5346062052505967, "grad_norm": 1.6378014087677002, "learning_rate": 8.764615319166886e-05, "loss": 2.0242, "step": 672 }, { "epoch": 0.5369928400954654, "grad_norm": 1.6081739664077759, "learning_rate": 8.751599167467985e-05, "loss": 1.7975, "step": 675 }, { "epoch": 0.5393794749403341, "grad_norm": 1.6603517532348633, "learning_rate": 8.738524578558547e-05, "loss": 1.8832, "step": 678 }, { "epoch": 0.5417661097852029, "grad_norm": 1.7453463077545166, "learning_rate": 8.72539175609639e-05, "loss": 1.7962, "step": 681 }, { "epoch": 0.5441527446300716, "grad_norm": 1.584671139717102, "learning_rate": 8.712200904646416e-05, "loss": 1.7748, "step": 684 }, { "epoch": 0.5465393794749404, "grad_norm": 1.787086844444275, "learning_rate": 8.698952229677422e-05, "loss": 1.8125, "step": 687 }, { "epoch": 0.548926014319809, "grad_norm": 2.3501312732696533, "learning_rate": 8.685645937558896e-05, "loss": 2.0739, "step": 690 }, { "epoch": 0.5513126491646778, "grad_norm": 1.6689587831497192, "learning_rate": 8.67228223555781e-05, "loss": 1.823, "step": 693 }, { "epoch": 0.5536992840095465, "grad_norm": 1.7818080186843872, "learning_rate": 8.658861331835385e-05, "loss": 1.9059, "step": 696 }, { "epoch": 0.5560859188544153, "grad_norm": 1.8221330642700195, "learning_rate": 8.645383435443852e-05, "loss": 1.8294, "step": 699 }, { "epoch": 0.5584725536992841, "grad_norm": 1.7873986959457397, "learning_rate": 8.631848756323197e-05, "loss": 1.8954, "step": 702 }, { "epoch": 0.5608591885441527, "grad_norm": 1.6317650079727173, "learning_rate": 8.618257505297886e-05, "loss": 1.9043, "step": 705 }, { "epoch": 0.5632458233890215, "grad_norm": 1.8384369611740112, "learning_rate": 8.604609894073584e-05, "loss": 1.7076, "step": 708 }, { "epoch": 0.5656324582338902, "grad_norm": 1.737593412399292, "learning_rate": 8.590906135233854e-05, "loss": 1.8403, "step": 711 }, { "epoch": 0.568019093078759, "grad_norm": 1.7566007375717163, "learning_rate": 8.577146442236857e-05, "loss": 1.9017, "step": 714 }, { "epoch": 0.5704057279236276, "grad_norm": 1.785732626914978, "learning_rate": 8.563331029412012e-05, "loss": 1.93, "step": 717 }, { "epoch": 0.5727923627684964, "grad_norm": 2.0541484355926514, "learning_rate": 8.549460111956664e-05, "loss": 1.8559, "step": 720 }, { "epoch": 0.5751789976133651, "grad_norm": 2.421417474746704, "learning_rate": 8.535533905932738e-05, "loss": 1.8517, "step": 723 }, { "epoch": 0.5775656324582339, "grad_norm": 1.7755558490753174, "learning_rate": 8.521552628263362e-05, "loss": 1.8476, "step": 726 }, { "epoch": 0.5799522673031027, "grad_norm": 1.632957100868225, "learning_rate": 8.507516496729495e-05, "loss": 1.8007, "step": 729 }, { "epoch": 0.5823389021479713, "grad_norm": 1.681512475013733, "learning_rate": 8.493425729966534e-05, "loss": 1.921, "step": 732 }, { "epoch": 0.5847255369928401, "grad_norm": 1.6569448709487915, "learning_rate": 8.479280547460907e-05, "loss": 1.8165, "step": 735 }, { "epoch": 0.5871121718377088, "grad_norm": 1.630233645439148, "learning_rate": 8.465081169546659e-05, "loss": 1.9824, "step": 738 }, { "epoch": 0.5894988066825776, "grad_norm": 1.7391397953033447, "learning_rate": 8.450827817402011e-05, "loss": 1.9757, "step": 741 }, { "epoch": 0.5918854415274463, "grad_norm": 2.036689281463623, "learning_rate": 8.436520713045922e-05, "loss": 1.7685, "step": 744 }, { "epoch": 0.594272076372315, "grad_norm": 1.4583113193511963, "learning_rate": 8.422160079334628e-05, "loss": 1.8458, "step": 747 }, { "epoch": 0.5966587112171837, "grad_norm": 3.0061111450195312, "learning_rate": 8.40774613995817e-05, "loss": 1.895, "step": 750 }, { "epoch": 0.5990453460620525, "grad_norm": 1.791383147239685, "learning_rate": 8.393279119436912e-05, "loss": 1.9162, "step": 753 }, { "epoch": 0.6014319809069213, "grad_norm": 2.032726526260376, "learning_rate": 8.378759243118044e-05, "loss": 2.0624, "step": 756 }, { "epoch": 0.60381861575179, "grad_norm": 1.6108125448226929, "learning_rate": 8.364186737172068e-05, "loss": 1.807, "step": 759 }, { "epoch": 0.6062052505966588, "grad_norm": 1.748790979385376, "learning_rate": 8.349561828589277e-05, "loss": 2.0034, "step": 762 }, { "epoch": 0.6085918854415274, "grad_norm": 1.6538299322128296, "learning_rate": 8.33488474517622e-05, "loss": 1.8479, "step": 765 }, { "epoch": 0.6109785202863962, "grad_norm": 1.8360040187835693, "learning_rate": 8.320155715552155e-05, "loss": 1.7923, "step": 768 }, { "epoch": 0.6133651551312649, "grad_norm": 1.8506301641464233, "learning_rate": 8.305374969145488e-05, "loss": 1.9816, "step": 771 }, { "epoch": 0.6157517899761337, "grad_norm": 1.670559048652649, "learning_rate": 8.290542736190188e-05, "loss": 1.7492, "step": 774 }, { "epoch": 0.6181384248210023, "grad_norm": 1.654943585395813, "learning_rate": 8.275659247722222e-05, "loss": 1.7765, "step": 777 }, { "epoch": 0.6205250596658711, "grad_norm": 1.7641255855560303, "learning_rate": 8.260724735575933e-05, "loss": 1.8642, "step": 780 }, { "epoch": 0.6229116945107399, "grad_norm": 2.0606367588043213, "learning_rate": 8.24573943238045e-05, "loss": 1.938, "step": 783 }, { "epoch": 0.6252983293556086, "grad_norm": 1.590706706047058, "learning_rate": 8.230703571556048e-05, "loss": 1.7473, "step": 786 }, { "epoch": 0.6276849642004774, "grad_norm": 1.6293972730636597, "learning_rate": 8.215617387310524e-05, "loss": 1.803, "step": 789 }, { "epoch": 0.630071599045346, "grad_norm": 1.719766616821289, "learning_rate": 8.200481114635536e-05, "loss": 1.8393, "step": 792 }, { "epoch": 0.6324582338902148, "grad_norm": 1.6134871244430542, "learning_rate": 8.185294989302958e-05, "loss": 1.7842, "step": 795 }, { "epoch": 0.6348448687350835, "grad_norm": 1.6503950357437134, "learning_rate": 8.170059247861194e-05, "loss": 1.7826, "step": 798 }, { "epoch": 0.6372315035799523, "grad_norm": 1.8185105323791504, "learning_rate": 8.154774127631501e-05, "loss": 1.7388, "step": 801 }, { "epoch": 0.639618138424821, "grad_norm": 1.7190958261489868, "learning_rate": 8.139439866704293e-05, "loss": 1.833, "step": 804 }, { "epoch": 0.6420047732696897, "grad_norm": 1.6161839962005615, "learning_rate": 8.124056703935423e-05, "loss": 1.801, "step": 807 }, { "epoch": 0.6443914081145584, "grad_norm": 1.5883020162582397, "learning_rate": 8.108624878942477e-05, "loss": 1.8195, "step": 810 }, { "epoch": 0.6467780429594272, "grad_norm": 1.6530261039733887, "learning_rate": 8.093144632101026e-05, "loss": 1.7564, "step": 813 }, { "epoch": 0.649164677804296, "grad_norm": 1.6084448099136353, "learning_rate": 8.077616204540897e-05, "loss": 1.8133, "step": 816 }, { "epoch": 0.6515513126491647, "grad_norm": 1.791277289390564, "learning_rate": 8.062039838142402e-05, "loss": 1.7978, "step": 819 }, { "epoch": 0.6539379474940334, "grad_norm": 1.726449728012085, "learning_rate": 8.046415775532585e-05, "loss": 1.7988, "step": 822 }, { "epoch": 0.6563245823389021, "grad_norm": 1.9136977195739746, "learning_rate": 8.030744260081426e-05, "loss": 1.826, "step": 825 }, { "epoch": 0.6587112171837709, "grad_norm": 1.9753978252410889, "learning_rate": 8.015025535898073e-05, "loss": 1.8964, "step": 828 }, { "epoch": 0.6610978520286396, "grad_norm": 1.6367616653442383, "learning_rate": 7.999259847827015e-05, "loss": 1.8764, "step": 831 }, { "epoch": 0.6634844868735084, "grad_norm": 1.7397571802139282, "learning_rate": 7.983447441444281e-05, "loss": 1.72, "step": 834 }, { "epoch": 0.665871121718377, "grad_norm": 2.1970224380493164, "learning_rate": 7.967588563053616e-05, "loss": 1.7779, "step": 837 }, { "epoch": 0.6682577565632458, "grad_norm": 1.6252713203430176, "learning_rate": 7.951683459682641e-05, "loss": 1.8043, "step": 840 }, { "epoch": 0.6706443914081146, "grad_norm": 1.6703697443008423, "learning_rate": 7.935732379079008e-05, "loss": 2.0202, "step": 843 }, { "epoch": 0.6730310262529833, "grad_norm": 2.3959710597991943, "learning_rate": 7.919735569706533e-05, "loss": 1.9039, "step": 846 }, { "epoch": 0.6754176610978521, "grad_norm": 1.7018755674362183, "learning_rate": 7.903693280741331e-05, "loss": 1.8505, "step": 849 }, { "epoch": 0.6778042959427207, "grad_norm": 1.9524502754211426, "learning_rate": 7.887605762067945e-05, "loss": 1.7577, "step": 852 }, { "epoch": 0.6801909307875895, "grad_norm": 1.9558862447738647, "learning_rate": 7.871473264275429e-05, "loss": 1.749, "step": 855 }, { "epoch": 0.6825775656324582, "grad_norm": 1.880624532699585, "learning_rate": 7.855296038653475e-05, "loss": 2.0309, "step": 858 }, { "epoch": 0.684964200477327, "grad_norm": 1.5008827447891235, "learning_rate": 7.83907433718847e-05, "loss": 1.7858, "step": 861 }, { "epoch": 0.6873508353221957, "grad_norm": 1.7783069610595703, "learning_rate": 7.82280841255959e-05, "loss": 1.755, "step": 864 }, { "epoch": 0.6897374701670644, "grad_norm": 1.6861096620559692, "learning_rate": 7.80649851813486e-05, "loss": 1.912, "step": 867 }, { "epoch": 0.6921241050119332, "grad_norm": 1.5460323095321655, "learning_rate": 7.790144907967201e-05, "loss": 1.7781, "step": 870 }, { "epoch": 0.6945107398568019, "grad_norm": 1.703471302986145, "learning_rate": 7.773747836790481e-05, "loss": 1.9725, "step": 873 }, { "epoch": 0.6968973747016707, "grad_norm": 1.86622953414917, "learning_rate": 7.757307560015538e-05, "loss": 1.7674, "step": 876 }, { "epoch": 0.6992840095465394, "grad_norm": 1.678232192993164, "learning_rate": 7.740824333726213e-05, "loss": 1.731, "step": 879 }, { "epoch": 0.7016706443914081, "grad_norm": 1.9272021055221558, "learning_rate": 7.724298414675353e-05, "loss": 1.7698, "step": 882 }, { "epoch": 0.7040572792362768, "grad_norm": 1.73203444480896, "learning_rate": 7.707730060280812e-05, "loss": 1.913, "step": 885 }, { "epoch": 0.7064439140811456, "grad_norm": 1.5196224451065063, "learning_rate": 7.691119528621444e-05, "loss": 1.8283, "step": 888 }, { "epoch": 0.7088305489260143, "grad_norm": 1.798051357269287, "learning_rate": 7.674467078433081e-05, "loss": 1.9543, "step": 891 }, { "epoch": 0.711217183770883, "grad_norm": 1.6706217527389526, "learning_rate": 7.657772969104508e-05, "loss": 1.6487, "step": 894 }, { "epoch": 0.7136038186157518, "grad_norm": 1.788098692893982, "learning_rate": 7.641037460673412e-05, "loss": 1.6219, "step": 897 }, { "epoch": 0.7159904534606205, "grad_norm": 1.5081647634506226, "learning_rate": 7.624260813822342e-05, "loss": 1.7062, "step": 900 }, { "epoch": 0.7183770883054893, "grad_norm": 1.6908079385757446, "learning_rate": 7.607443289874642e-05, "loss": 1.8747, "step": 903 }, { "epoch": 0.720763723150358, "grad_norm": 1.4784934520721436, "learning_rate": 7.590585150790389e-05, "loss": 1.9055, "step": 906 }, { "epoch": 0.7231503579952268, "grad_norm": 1.4617854356765747, "learning_rate": 7.573686659162293e-05, "loss": 1.8485, "step": 909 }, { "epoch": 0.7255369928400954, "grad_norm": 1.7699053287506104, "learning_rate": 7.556748078211635e-05, "loss": 1.9475, "step": 912 }, { "epoch": 0.7279236276849642, "grad_norm": 1.409713625907898, "learning_rate": 7.53976967178414e-05, "loss": 1.8663, "step": 915 }, { "epoch": 0.7303102625298329, "grad_norm": 1.4845449924468994, "learning_rate": 7.522751704345887e-05, "loss": 1.8818, "step": 918 }, { "epoch": 0.7326968973747017, "grad_norm": 1.5580782890319824, "learning_rate": 7.505694440979178e-05, "loss": 1.8258, "step": 921 }, { "epoch": 0.7350835322195705, "grad_norm": 1.523646593093872, "learning_rate": 7.488598147378416e-05, "loss": 1.6651, "step": 924 }, { "epoch": 0.7374701670644391, "grad_norm": 1.6686230897903442, "learning_rate": 7.471463089845956e-05, "loss": 1.7406, "step": 927 }, { "epoch": 0.7398568019093079, "grad_norm": 1.5957698822021484, "learning_rate": 7.454289535287968e-05, "loss": 1.7128, "step": 930 }, { "epoch": 0.7422434367541766, "grad_norm": 1.6253639459609985, "learning_rate": 7.437077751210279e-05, "loss": 1.9308, "step": 933 }, { "epoch": 0.7446300715990454, "grad_norm": 1.6088500022888184, "learning_rate": 7.419828005714194e-05, "loss": 1.8083, "step": 936 }, { "epoch": 0.747016706443914, "grad_norm": 1.4673054218292236, "learning_rate": 7.402540567492337e-05, "loss": 1.7764, "step": 939 }, { "epoch": 0.7494033412887828, "grad_norm": 1.5151442289352417, "learning_rate": 7.385215705824449e-05, "loss": 1.974, "step": 942 }, { "epoch": 0.7517899761336515, "grad_norm": 1.7416108846664429, "learning_rate": 7.367853690573208e-05, "loss": 1.638, "step": 945 }, { "epoch": 0.7541766109785203, "grad_norm": 1.7864446640014648, "learning_rate": 7.350454792180016e-05, "loss": 1.7128, "step": 948 }, { "epoch": 0.7565632458233891, "grad_norm": 1.6676292419433594, "learning_rate": 7.333019281660789e-05, "loss": 1.9339, "step": 951 }, { "epoch": 0.7589498806682577, "grad_norm": 1.4791321754455566, "learning_rate": 7.31554743060174e-05, "loss": 1.7184, "step": 954 }, { "epoch": 0.7613365155131265, "grad_norm": 1.6596827507019043, "learning_rate": 7.298039511155138e-05, "loss": 1.7909, "step": 957 }, { "epoch": 0.7637231503579952, "grad_norm": 1.8340952396392822, "learning_rate": 7.280495796035079e-05, "loss": 1.8153, "step": 960 }, { "epoch": 0.766109785202864, "grad_norm": 1.6827830076217651, "learning_rate": 7.262916558513237e-05, "loss": 1.6396, "step": 963 }, { "epoch": 0.7684964200477327, "grad_norm": 2.0441389083862305, "learning_rate": 7.245302072414601e-05, "loss": 1.7888, "step": 966 }, { "epoch": 0.7708830548926014, "grad_norm": 1.436802864074707, "learning_rate": 7.227652612113213e-05, "loss": 1.6449, "step": 969 }, { "epoch": 0.7732696897374701, "grad_norm": 1.5358387231826782, "learning_rate": 7.209968452527896e-05, "loss": 1.742, "step": 972 }, { "epoch": 0.7756563245823389, "grad_norm": 1.583846092224121, "learning_rate": 7.192249869117971e-05, "loss": 1.8354, "step": 975 }, { "epoch": 0.7780429594272077, "grad_norm": 1.6381198167800903, "learning_rate": 7.174497137878966e-05, "loss": 1.7394, "step": 978 }, { "epoch": 0.7804295942720764, "grad_norm": 1.707513689994812, "learning_rate": 7.156710535338312e-05, "loss": 1.8892, "step": 981 }, { "epoch": 0.7828162291169452, "grad_norm": 1.7260991334915161, "learning_rate": 7.138890338551048e-05, "loss": 1.832, "step": 984 }, { "epoch": 0.7852028639618138, "grad_norm": 1.486690878868103, "learning_rate": 7.121036825095492e-05, "loss": 1.6662, "step": 987 }, { "epoch": 0.7875894988066826, "grad_norm": 1.551670789718628, "learning_rate": 7.103150273068921e-05, "loss": 1.7214, "step": 990 }, { "epoch": 0.7899761336515513, "grad_norm": 1.50273859500885, "learning_rate": 7.085230961083249e-05, "loss": 1.842, "step": 993 }, { "epoch": 0.7923627684964201, "grad_norm": 1.8055702447891235, "learning_rate": 7.067279168260671e-05, "loss": 1.809, "step": 996 }, { "epoch": 0.7947494033412887, "grad_norm": 1.560795545578003, "learning_rate": 7.04929517422933e-05, "loss": 1.7781, "step": 999 }, { "epoch": 0.7971360381861575, "grad_norm": 1.615140438079834, "learning_rate": 7.031279259118946e-05, "loss": 1.6276, "step": 1002 }, { "epoch": 0.7995226730310262, "grad_norm": 1.745884656906128, "learning_rate": 7.013231703556471e-05, "loss": 1.8841, "step": 1005 }, { "epoch": 0.801909307875895, "grad_norm": 1.553002953529358, "learning_rate": 6.995152788661705e-05, "loss": 1.772, "step": 1008 }, { "epoch": 0.8042959427207638, "grad_norm": 1.5847023725509644, "learning_rate": 6.977042796042917e-05, "loss": 1.7492, "step": 1011 }, { "epoch": 0.8066825775656324, "grad_norm": 1.5925689935684204, "learning_rate": 6.958902007792466e-05, "loss": 1.7523, "step": 1014 }, { "epoch": 0.8090692124105012, "grad_norm": 2.4262924194335938, "learning_rate": 6.940730706482399e-05, "loss": 1.8089, "step": 1017 }, { "epoch": 0.8114558472553699, "grad_norm": 1.591212272644043, "learning_rate": 6.922529175160054e-05, "loss": 1.6939, "step": 1020 }, { "epoch": 0.8138424821002387, "grad_norm": 1.5190821886062622, "learning_rate": 6.904297697343655e-05, "loss": 1.9102, "step": 1023 }, { "epoch": 0.8162291169451074, "grad_norm": 1.556463599205017, "learning_rate": 6.886036557017881e-05, "loss": 1.8486, "step": 1026 }, { "epoch": 0.8186157517899761, "grad_norm": 1.6876753568649292, "learning_rate": 6.867746038629462e-05, "loss": 1.9492, "step": 1029 }, { "epoch": 0.8210023866348448, "grad_norm": 1.4290738105773926, "learning_rate": 6.849426427082735e-05, "loss": 1.7682, "step": 1032 }, { "epoch": 0.8233890214797136, "grad_norm": 1.6473435163497925, "learning_rate": 6.83107800773521e-05, "loss": 1.8299, "step": 1035 }, { "epoch": 0.8257756563245824, "grad_norm": 1.6430293321609497, "learning_rate": 6.812701066393124e-05, "loss": 1.7187, "step": 1038 }, { "epoch": 0.8281622911694511, "grad_norm": 1.537840485572815, "learning_rate": 6.79429588930699e-05, "loss": 1.7417, "step": 1041 }, { "epoch": 0.8305489260143198, "grad_norm": 2.192531108856201, "learning_rate": 6.775862763167142e-05, "loss": 1.7398, "step": 1044 }, { "epoch": 0.8329355608591885, "grad_norm": 1.550441861152649, "learning_rate": 6.757401975099262e-05, "loss": 1.6647, "step": 1047 }, { "epoch": 0.8353221957040573, "grad_norm": 3.2861685752868652, "learning_rate": 6.738913812659912e-05, "loss": 1.8928, "step": 1050 }, { "epoch": 0.837708830548926, "grad_norm": 2.5351181030273438, "learning_rate": 6.720398563832055e-05, "loss": 1.7647, "step": 1053 }, { "epoch": 0.8400954653937948, "grad_norm": 1.4341228008270264, "learning_rate": 6.701856517020565e-05, "loss": 1.8559, "step": 1056 }, { "epoch": 0.8424821002386634, "grad_norm": 1.5583041906356812, "learning_rate": 6.683287961047742e-05, "loss": 1.9764, "step": 1059 }, { "epoch": 0.8448687350835322, "grad_norm": 1.6127870082855225, "learning_rate": 6.664693185148807e-05, "loss": 1.7269, "step": 1062 }, { "epoch": 0.847255369928401, "grad_norm": 1.662477731704712, "learning_rate": 6.646072478967397e-05, "loss": 1.8924, "step": 1065 }, { "epoch": 0.8496420047732697, "grad_norm": 1.673250675201416, "learning_rate": 6.627426132551058e-05, "loss": 1.7657, "step": 1068 }, { "epoch": 0.8520286396181385, "grad_norm": 1.5422923564910889, "learning_rate": 6.608754436346725e-05, "loss": 1.689, "step": 1071 }, { "epoch": 0.8544152744630071, "grad_norm": 1.7752854824066162, "learning_rate": 6.590057681196191e-05, "loss": 1.6743, "step": 1074 }, { "epoch": 0.8568019093078759, "grad_norm": 1.5732386112213135, "learning_rate": 6.571336158331589e-05, "loss": 1.967, "step": 1077 }, { "epoch": 0.8591885441527446, "grad_norm": 1.6915782690048218, "learning_rate": 6.552590159370844e-05, "loss": 1.6303, "step": 1080 }, { "epoch": 0.8615751789976134, "grad_norm": 1.5300819873809814, "learning_rate": 6.53381997631314e-05, "loss": 1.855, "step": 1083 }, { "epoch": 0.863961813842482, "grad_norm": 1.5509366989135742, "learning_rate": 6.515025901534364e-05, "loss": 1.7637, "step": 1086 }, { "epoch": 0.8663484486873508, "grad_norm": 1.4942635297775269, "learning_rate": 6.496208227782556e-05, "loss": 1.7456, "step": 1089 }, { "epoch": 0.8687350835322196, "grad_norm": 1.7065314054489136, "learning_rate": 6.477367248173352e-05, "loss": 1.7814, "step": 1092 }, { "epoch": 0.8711217183770883, "grad_norm": 1.5171153545379639, "learning_rate": 6.458503256185404e-05, "loss": 1.7815, "step": 1095 }, { "epoch": 0.8735083532219571, "grad_norm": 1.507451057434082, "learning_rate": 6.439616545655834e-05, "loss": 1.6747, "step": 1098 }, { "epoch": 0.8758949880668258, "grad_norm": 1.4318897724151611, "learning_rate": 6.420707410775626e-05, "loss": 1.8103, "step": 1101 }, { "epoch": 0.8782816229116945, "grad_norm": 1.7363917827606201, "learning_rate": 6.401776146085072e-05, "loss": 2.0862, "step": 1104 }, { "epoch": 0.8806682577565632, "grad_norm": 1.6215336322784424, "learning_rate": 6.382823046469167e-05, "loss": 1.831, "step": 1107 }, { "epoch": 0.883054892601432, "grad_norm": 1.4969305992126465, "learning_rate": 6.363848407153016e-05, "loss": 1.6292, "step": 1110 }, { "epoch": 0.8854415274463007, "grad_norm": 1.5934969186782837, "learning_rate": 6.344852523697247e-05, "loss": 1.8363, "step": 1113 }, { "epoch": 0.8878281622911695, "grad_norm": 1.6700190305709839, "learning_rate": 6.325835691993394e-05, "loss": 1.5579, "step": 1116 }, { "epoch": 0.8902147971360382, "grad_norm": 1.6834436655044556, "learning_rate": 6.306798208259297e-05, "loss": 1.7307, "step": 1119 }, { "epoch": 0.8926014319809069, "grad_norm": 1.5143299102783203, "learning_rate": 6.287740369034485e-05, "loss": 1.5429, "step": 1122 }, { "epoch": 0.8949880668257757, "grad_norm": 1.6106433868408203, "learning_rate": 6.26866247117555e-05, "loss": 1.6108, "step": 1125 }, { "epoch": 0.8973747016706444, "grad_norm": 1.6025004386901855, "learning_rate": 6.249564811851543e-05, "loss": 1.8183, "step": 1128 }, { "epoch": 0.8997613365155132, "grad_norm": 1.3973323106765747, "learning_rate": 6.230447688539316e-05, "loss": 1.6247, "step": 1131 }, { "epoch": 0.9021479713603818, "grad_norm": 1.7466994524002075, "learning_rate": 6.211311399018916e-05, "loss": 1.7925, "step": 1134 }, { "epoch": 0.9045346062052506, "grad_norm": 1.537386178970337, "learning_rate": 6.192156241368929e-05, "loss": 1.7553, "step": 1137 }, { "epoch": 0.9069212410501193, "grad_norm": 1.7690049409866333, "learning_rate": 6.172982513961845e-05, "loss": 1.7205, "step": 1140 }, { "epoch": 0.9093078758949881, "grad_norm": 1.7017157077789307, "learning_rate": 6.153790515459404e-05, "loss": 1.6503, "step": 1143 }, { "epoch": 0.9116945107398569, "grad_norm": 1.6713097095489502, "learning_rate": 6.13458054480795e-05, "loss": 1.8414, "step": 1146 }, { "epoch": 0.9140811455847255, "grad_norm": 1.5956037044525146, "learning_rate": 6.115352901233779e-05, "loss": 1.8079, "step": 1149 }, { "epoch": 0.9164677804295943, "grad_norm": 1.4910719394683838, "learning_rate": 6.096107884238458e-05, "loss": 1.6404, "step": 1152 }, { "epoch": 0.918854415274463, "grad_norm": 1.4896982908248901, "learning_rate": 6.0768457935941817e-05, "loss": 1.7845, "step": 1155 }, { "epoch": 0.9212410501193318, "grad_norm": 1.8766003847122192, "learning_rate": 6.0575669293390954e-05, "loss": 1.8104, "step": 1158 }, { "epoch": 0.9236276849642004, "grad_norm": 1.8520368337631226, "learning_rate": 6.038271591772615e-05, "loss": 1.8249, "step": 1161 }, { "epoch": 0.9260143198090692, "grad_norm": 1.7131208181381226, "learning_rate": 6.0189600814507604e-05, "loss": 1.8909, "step": 1164 }, { "epoch": 0.9284009546539379, "grad_norm": 1.3130649328231812, "learning_rate": 5.9996326991814654e-05, "loss": 1.5973, "step": 1167 }, { "epoch": 0.9307875894988067, "grad_norm": 1.7194204330444336, "learning_rate": 5.980289746019892e-05, "loss": 1.8653, "step": 1170 }, { "epoch": 0.9331742243436754, "grad_norm": 1.6134217977523804, "learning_rate": 5.9609315232637483e-05, "loss": 1.5751, "step": 1173 }, { "epoch": 0.9355608591885441, "grad_norm": 1.5093486309051514, "learning_rate": 5.941558332448589e-05, "loss": 1.6214, "step": 1176 }, { "epoch": 0.9379474940334129, "grad_norm": 1.4754223823547363, "learning_rate": 5.922170475343125e-05, "loss": 1.69, "step": 1179 }, { "epoch": 0.9403341288782816, "grad_norm": 1.7144575119018555, "learning_rate": 5.9027682539445104e-05, "loss": 1.6786, "step": 1182 }, { "epoch": 0.9427207637231504, "grad_norm": 1.453788161277771, "learning_rate": 5.883351970473654e-05, "loss": 1.8321, "step": 1185 }, { "epoch": 0.9451073985680191, "grad_norm": 1.688596248626709, "learning_rate": 5.863921927370498e-05, "loss": 1.7347, "step": 1188 }, { "epoch": 0.9474940334128878, "grad_norm": 1.600243330001831, "learning_rate": 5.8444784272893175e-05, "loss": 1.678, "step": 1191 }, { "epoch": 0.9498806682577565, "grad_norm": 1.6450378894805908, "learning_rate": 5.8250217730939973e-05, "loss": 1.6785, "step": 1194 }, { "epoch": 0.9522673031026253, "grad_norm": 1.597424030303955, "learning_rate": 5.8055522678533225e-05, "loss": 1.6594, "step": 1197 }, { "epoch": 0.954653937947494, "grad_norm": 1.7151094675064087, "learning_rate": 5.786070214836254e-05, "loss": 1.6255, "step": 1200 }, { "epoch": 0.9570405727923628, "grad_norm": 1.4792280197143555, "learning_rate": 5.7665759175072034e-05, "loss": 1.8538, "step": 1203 }, { "epoch": 0.9594272076372315, "grad_norm": 1.4509261846542358, "learning_rate": 5.747069679521305e-05, "loss": 1.7551, "step": 1206 }, { "epoch": 0.9618138424821002, "grad_norm": 1.5576130151748657, "learning_rate": 5.727551804719693e-05, "loss": 1.7022, "step": 1209 }, { "epoch": 0.964200477326969, "grad_norm": 1.6218217611312866, "learning_rate": 5.708022597124758e-05, "loss": 1.675, "step": 1212 }, { "epoch": 0.9665871121718377, "grad_norm": 1.6137958765029907, "learning_rate": 5.688482360935423e-05, "loss": 1.8624, "step": 1215 }, { "epoch": 0.9689737470167065, "grad_norm": 1.4816503524780273, "learning_rate": 5.668931400522396e-05, "loss": 1.791, "step": 1218 }, { "epoch": 0.9713603818615751, "grad_norm": 1.543349027633667, "learning_rate": 5.649370020423431e-05, "loss": 1.6761, "step": 1221 }, { "epoch": 0.9737470167064439, "grad_norm": 1.5985081195831299, "learning_rate": 5.629798525338589e-05, "loss": 1.6904, "step": 1224 }, { "epoch": 0.9761336515513126, "grad_norm": 1.6068087816238403, "learning_rate": 5.6102172201254835e-05, "loss": 1.7542, "step": 1227 }, { "epoch": 0.9785202863961814, "grad_norm": 1.3910298347473145, "learning_rate": 5.5906264097945407e-05, "loss": 1.7712, "step": 1230 }, { "epoch": 0.9809069212410502, "grad_norm": 1.47352135181427, "learning_rate": 5.5710263995042434e-05, "loss": 1.8361, "step": 1233 }, { "epoch": 0.9832935560859188, "grad_norm": 1.5060720443725586, "learning_rate": 5.551417494556376e-05, "loss": 1.7454, "step": 1236 }, { "epoch": 0.9856801909307876, "grad_norm": 1.4085941314697266, "learning_rate": 5.531800000391275e-05, "loss": 1.7465, "step": 1239 }, { "epoch": 0.9880668257756563, "grad_norm": 1.5695737600326538, "learning_rate": 5.5121742225830665e-05, "loss": 1.8416, "step": 1242 }, { "epoch": 0.9904534606205251, "grad_norm": 1.7752211093902588, "learning_rate": 5.4925404668349076e-05, "loss": 1.779, "step": 1245 }, { "epoch": 0.9928400954653938, "grad_norm": 1.3961496353149414, "learning_rate": 5.472899038974225e-05, "loss": 1.6866, "step": 1248 }, { "epoch": 0.9952267303102625, "grad_norm": 1.5418916940689087, "learning_rate": 5.45325024494795e-05, "loss": 1.7653, "step": 1251 }, { "epoch": 0.9976133651551312, "grad_norm": 1.6181163787841797, "learning_rate": 5.433594390817756e-05, "loss": 1.8697, "step": 1254 }, { "epoch": 1.0, "grad_norm": 2.188194990158081, "learning_rate": 5.413931782755283e-05, "loss": 1.8344, "step": 1257 } ], "logging_steps": 3, "max_steps": 2514, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1257, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.127117763309732e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }